def extract(self, response, link):  #extract
    # for link in link_list:
    #     response = ulib.urlopen(link).read()
    #get relevant content using readability
    readable = Document(response)
    body = readable.summary()
    title = readable.short_title()
    #strip extra html readability leaves in, like p tags
    title = html.fromstring(title).text_content()
    body = html.fromstring(body).text_content()
    title = condense_whitespace(title)
    body = condense_whitespace(body)
    links = self.extra['links']
    try:
        #use the pubdate pulled from the feed, if one was captured
        d = unicode(self.extra['dates'][links.index(link)])
    except Exception:
        #pr web rss feeds don't have pubdate, so fall back to the
        #dateline embedded in the page itself
        html_body = html.fromstring(response)
        d = re.sub(r'.*\(.*\)', '',
                   html_body.find_class('releaseDateline')[0].text_content())
    try:
        date = parse(d)
    except Exception:
        #unparseable date string; default to the crawl time
        date = datetime.now()
    doc = {
        'url': link,
        'title': title,
        'text': body,
        'date': date,
        'source': self.extra['source'][links.index(link)]}
    return doc
def get_link_content(link):
    #fetch the page and return its readability-extracted body text
    content = requests.get(link).content
    readable = Document(content)
    body = html.fromstring(readable.summary()).text_content()
    return condense_whitespace(body)
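# Minimal usage sketch for get_link_content. The URL below is a hypothetical
# placeholder; any article or press-release page reachable over HTTP should
# work the same way.
if __name__ == '__main__':
    sample_link = 'http://www.example.com/sample-release.html'
    print(get_link_content(sample_link))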