def extract(self, response, link):
    """Build a document record (url, title, text, date, source) for one page.

    The readable title and body are taken from readability's ``Document``,
    stripped of leftover markup with lxml and whitespace-condensed.

    The publication date comes from ``self.extra['dates']`` at the position
    of ``link`` in ``self.extra['links']``. When that lookup fails, the text
    of the page's first ``releaseDateline`` element is used instead. If no
    date string can be found or parsed, the current local time is used.

    Args:
        response: HTML of the page that was fetched from ``link``.
        link: URL of the page; must be an entry of ``self.extra['links']``.

    Returns:
        dict with the keys ``'url'``, ``'title'``, ``'text'``, ``'date'``
        and ``'source'``.

    Raises:
        ValueError: if ``link`` is not in ``self.extra['links']``.
        KeyError: if ``self.extra`` has no ``'links'`` or ``'source'`` entry.
    """
    # Get the relevant content using readability.
    readable = Document(response)
    body = readable.summary()
    title = readable.short_title()

    # Strip the extra html readability leaves in, like p tags.
    title = condense_whitespace(html.fromstring(title).text_content())
    body = condense_whitespace(html.fromstring(body).text_content())

    # Look the link up once; an unknown link has no source entry, so fail
    # before doing any further work instead of at the very end.
    links = self.extra['links']
    position = links.index(link)

    try:
        # NOTE: ``unicode`` is the Python 2 builtin this code relies on.
        raw_date = unicode(self.extra['dates'][position])
    except (KeyError, IndexError, TypeError, ValueError):
        # pr web rss feeds don't have pubdate: read the dateline from the
        # page itself and drop everything up to a parenthesised prefix.
        datelines = html.fromstring(response).find_class('releaseDateline')
        if datelines:
            raw_date = re.sub(r'.*\(.*\)', '', datelines[0].text_content())
        else:
            raw_date = None

    if raw_date is None:
        date = datetime.now()
    else:
        try:
            date = parse(raw_date)
        except (ValueError, OverflowError, TypeError, AttributeError):
            # Unparseable date text: fall back to "now" rather than failing.
            date = datetime.now()

    return {
        'url': link,
        'title': title,
        'text': body,
        'date': date,
        'source': self.extra['source'][position],
    }
Пример #2
0
def get_link_content(link, timeout=30):
    """Download ``link`` and return the text of its main readable content.

    The page is fetched with ``requests``, reduced to its main content by
    readability's ``Document.summary()``, stripped of markup with lxml and
    passed through ``condense_whitespace``.

    Args:
        link: URL of the page to download.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout an unresponsive host would block the caller
            indefinitely; pass ``None`` to wait without limit.

    Returns:
        The whitespace-condensed text of the page's readable body.

    Raises:
        requests.exceptions.RequestException: if the download fails or
            times out.
    """
    content = requests.get(link, timeout=timeout).content
    readable = Document(content)
    body = html.fromstring(readable.summary()).text_content()
    return condense_whitespace(body)