예제 #1
0
 def detect_withdrawn(self, tree, url):
     """Report whether the page's comments cell mentions a withdrawal.

     Looks up elements matching the ``.tablecell.comments`` CSS selector in
     ``tree`` and inspects the text content of the first match.

     Args:
         tree: Parsed document the CSS selector is evaluated against.
         url: Identifier of the paper; only used in the printed notice.

     Returns:
         True when the first matching element's text contains "withdrawn"
         (case-insensitive); False when there is no match or the word is
         absent.
     """
     matches = CSSSelector(".tablecell.comments")(tree)
     if not matches:
         return False

     # Only the first comments cell is examined.
     comment_text = matches[0].text_content()
     if "withdrawn" not in comment_text.lower():
         return False

     print("Paper", url, "appears to be withdrawn!")
     return True
예제 #2
0
 def detect_withdrawn(self, tree, url):
     """Report whether the page's comments cell mentions a withdrawal.

     Looks up elements matching the ``.tablecell.comments`` CSS selector in
     ``tree`` and inspects the text content of the first match.

     Args:
         tree: Parsed document the CSS selector is evaluated against.
         url: Identifier of the paper; only used in the printed notice.

     Returns:
         True when the first matching element's text contains "withdrawn"
         (case-insensitive); False when there is no match or the word is
         absent.
     """
     matches = CSSSelector(".tablecell.comments")(tree)
     if not matches:
         return False

     # Only the first comments cell is examined.
     comment_text = matches[0].text_content()
     if "withdrawn" not in comment_text.lower():
         return False

     print("Paper", url, "appears to be withdrawn!")
     return True
예제 #3
0
def scrape_speakers(images, timeout=30):
    """Scrape speaker records from https://ojibwe.lib.umn.edu/about/voices.

    Each ``div.voice.row.full-row`` element inside the page's main content
    column is turned into one dictionary describing a speaker.

    Args:
        images: List that collects ``{'src': ..., 'alt': ...}`` image dicts.
            It is mutated in place: every speaker image not already present
            is appended.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled server would block the scraper
            indefinitely.

    Returns:
        A list of dicts with the keys ``primary_name``, ``ojibwe_name``,
        ``english_name``, ``initials``, ``image``, ``community``,
        ``region``, ``description`` and ``href``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            exceeds ``timeout``.
        IndexError: If an expected element (content column, speaker image,
            ``h2`` or ``h2 div``) is missing from the page.
        ValueError: If a speaker row does not contain exactly four
            ``span.voice-field`` elements.
    """
    response = requests.get(
        'https://ojibwe.lib.umn.edu/about/voices', timeout=timeout)
    page = html.fromstring(response.text)
    content = CSSSelector("div.col-md-9.col-sm-9.content")(page)[0]

    # Compile the per-speaker selectors once instead of on every iteration.
    select_image = CSSSelector("img.voice-image")
    select_fields = CSSSelector("div.voice-info-field span.voice-field")
    select_heading = CSSSelector("h2")
    select_initials = CSSSelector("h2 div")
    select_paragraphs = CSSSelector("p")

    retval = []
    for speaker in CSSSelector("div.voice.row.full-row")(content):
        image_element = select_image(speaker)[0]
        image = {
            'src': image_element.attrib.get('src'),
            'alt': image_element.attrib.get('alt'),
        }
        # Dicts are unhashable, so de-duplication relies on list membership.
        if image not in images:
            images.append(image)

        # The four field spans are taken in document order.
        ojibwe, english, community, region = [
            field.text for field in select_fields(speaker)
        ]

        # ``.text`` is only the text before the first child element, so the
        # nested initials <div> is not part of the primary name.
        primary = select_heading(speaker)[0].text.strip()
        initials = select_initials(speaker)[0].text.strip()

        # Keep the paragraph markup, dropping any text trailing each <p>.
        description = ''.join(
            etree.tounicode(paragraph, with_tail=False)
            for paragraph in select_paragraphs(speaker)
        )

        # Slug: lower-cased primary name with spaces replaced by hyphens.
        url = '/speaker/' + primary.lower().replace(" ", "-")

        retval.append({
            "primary_name": primary,
            "ojibwe_name": ojibwe,
            "english_name": english,
            "initials": initials,
            "image": image,
            "community": community,
            "region": region,
            "description": description,
            "href": url,
        })
    return retval