def detect_withdrawn(self, tree, url):
    """Report whether the page's comments cell marks the paper as withdrawn.

    Queries ``tree`` for elements matching ``.tablecell.comments`` and
    searches the text content of the first match, case-insensitively, for
    the word "withdrawn". A notice naming ``url`` is printed on a hit.

    Args:
        tree: Parsed document handed to the ``CSSSelector`` query.
        url: Only used in the printed notice.

    Returns:
        True when the first matching element mentions "withdrawn";
        False otherwise, including when nothing matches the selector.
    """
    matches = CSSSelector(".tablecell.comments")(tree)
    if not matches:
        return False

    # Only the first matching element is inspected.
    comment_text = matches[0].text_content()
    if "withdrawn" not in comment_text.lower():
        return False

    print("Paper", url, "appears to be withdrawn!")
    return True
def detect_withdrawn(self, tree, url):
    """Report whether the page's comments cell marks the paper as withdrawn.

    Queries ``tree`` for elements matching ``.tablecell.comments`` and
    searches the text content of the first match, case-insensitively, for
    the word "withdrawn". A notice naming ``url`` is printed on a hit.

    Args:
        tree: Parsed document handed to the ``CSSSelector`` query.
        url: Only used in the printed notice.

    Returns:
        True when the first matching element mentions "withdrawn";
        False otherwise, including when nothing matches the selector.
    """
    matches = CSSSelector(".tablecell.comments")(tree)
    if not matches:
        return False

    # Only the first matching element is inspected.
    comment_text = matches[0].text_content()
    if "withdrawn" not in comment_text.lower():
        return False

    print("Paper", url, "appears to be withdrawn!")
    return True
def scrape_speakers(images, *, timeout=30):
    """Scrape the speaker entries from the voices page at ojibwe.lib.umn.edu.

    Downloads ``https://ojibwe.lib.umn.edu/about/voices``, parses it with
    ``html.fromstring`` and builds one dict per ``div.voice.row.full-row``
    element found inside the first ``div.col-md-9.col-sm-9.content``.

    Args:
        images: List of ``{'src': ..., 'alt': ...}`` dicts. It is mutated in
            place: each speaker's image dict is appended unless an equal
            dict is already present.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            server cannot block the call forever.

    Returns:
        A list of dicts with the keys ``primary_name``, ``ojibwe_name``,
        ``english_name``, ``initials``, ``image``, ``community``,
        ``region``, ``description`` and ``href``.

    Raises:
        IndexError: If the content container, a speaker's image, its
            ``h2`` or its ``h2 div`` is missing.
        ValueError: If a speaker does not have exactly four
            ``span.voice-field`` elements.
    """
    response = requests.get(
        'https://ojibwe.lib.umn.edu/about/voices', timeout=timeout)
    page = html.fromstring(response.text)
    content = CSSSelector("div.col-md-9.col-sm-9.content")(page)[0]

    # Build the per-speaker selectors once instead of on every iteration.
    select_image = CSSSelector("img.voice-image")
    select_fields = CSSSelector("div.voice-info-field span.voice-field")
    select_heading = CSSSelector("h2")
    select_initials = CSSSelector("h2 div")
    select_paragraphs = CSSSelector("p")

    retval = []
    for speaker in CSSSelector("div.voice.row.full-row")(content):
        img = select_image(speaker)[0]
        image = {'src': img.attrib.get('src'), 'alt': img.attrib.get('alt')}
        if image not in images:
            images.append(image)

        # The four fields are taken positionally, in document order.
        ojibwe, english, community, region = [
            field.text for field in select_fields(speaker)
        ]
        primary = select_heading(speaker)[0].text.strip()
        initials = select_initials(speaker)[0].text.strip()
        # The description keeps the markup of each <p>, minus tail text.
        description = ''.join(
            etree.tounicode(p, with_tail=False)
            for p in select_paragraphs(speaker)
        )

        retval.append({
            "primary_name": primary,
            "ojibwe_name": ojibwe,
            "english_name": english,
            "initials": initials,
            "image": image,
            "community": community,
            "region": region,
            "description": description,
            # Lower-cased name with spaces turned into dashes.
            "href": '/speaker/' + primary.lower().replace(" ", "-"),
        })
    return retval