Example #1
import os
import time

def worker(query, working, process, loops, page_limit, timeout=1):
    # 'path', 'crawl', and 'extract' are module-level names in the source project.
    if not os.path.exists(path + 'archive_dict.pkl'):
        # Initial crawl builds archive_dict.pkl, which article_extractor.py requires.
        crawl(query, path, page_limit)
    while working:
        for i in range(loops):
            crawl(query, path, page_limit)
            time.sleep(timeout)
        if process:
            extract(path)
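The source does not show how this worker is started. A minimal driver sketch, assuming the function lives in a module here called crawler_worker (a hypothetical name) and that passing working=True keeps the loop running for the life of the process:

import time
import threading

from crawler_worker import worker  # hypothetical module name

# Run the crawl loop in the background; a daemon thread lets the
# process exit without waiting for the otherwise endless while-loop.
t = threading.Thread(target=worker,
                     args=('climate change', True, True, 10, 5),
                     kwargs={'timeout': 1})
t.daemon = True
t.start()

time.sleep(60)  # let the worker crawl for a minute, then exit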
Example #2
def get(self):
    from bs4 import BeautifulSoup as bs
    from article_extractor import extract
    url = self.request.get('url')
    markup = util.url_fetch(url)  # project helper that fetches the page body
    soup = bs(markup, 'lxml')
    text = u""
    if soup.title:
        # Promote the page <title> to an <h1> above the extracted body.
        title = soup.title.string
        h1 = soup.new_tag('h1')
        h1.string = title
        text += unicode(h1)  # Python 2 snippet; use str(h1) on Python 3
    text += extract(markup, url)
    self.response.headers['Access-Control-Allow-Origin'] = '*'  # allow cross-origin reads
    self.response.write(text)
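The self.request.get / self.response.write pattern in Example #2 matches a webapp2 RequestHandler as used on Google App Engine. A minimal sketch of how such a handler might be mounted; the class name ExtractHandler and the /extract route are assumptions, not taken from the source:

import webapp2

class ExtractHandler(webapp2.RequestHandler):
    def get(self):
        # Body as in Example #2: fetch ?url=..., extract, write HTML back.
        self.response.write('...')

# e.g. GET /extract?url=http://example.com/some-article
app = webapp2.WSGIApplication([('/extract', ExtractHandler)], debug=True)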
Example #3
def fetch_normal():
    response = url_fetch(url, return_response_obj=True)
    # Accept only HTML payloads; drop any '; charset=...' suffix before comparing.
    if response and response.info().getheader('content-type', 'text/html').lower().split(';')[0].strip() == 'text/html':
        markup = response.read()
    else:
        print 'BAD MIME TYPE' if response else 'NO SUCCESSFUL RESPONSE'
        markup = None

    if markup:
        # Process markup: pull Open Graph metadata before running the extractor.
        markup_soup = BeautifulSoup(markup, 'lxml')
        og_title = find_meta_value(markup_soup, 'og:title')
        og_image = find_meta_value(markup_soup, 'og:image')
        og_description = find_meta_value(markup_soup, 'og:description')
        title_field = find_title(markup_soup)

        article.site_name = find_meta_value(markup_soup, 'og:site_name')

        # Find author:
        article.author = find_author(markup_soup)

        # Parse and process article content:
        content.html = article_extractor.extract(markup, article.url)
        doc_soup = BeautifulSoup(content.html, 'lxml')

        article.title = first_present([og_title, title_field, article.title])
        article.top_image = make_url_absolute(first_present([article.top_image, og_image]))

        populate_article_json(article, content)

        # Compute description: prefer og:description, fall back to extracted text.
        description = None
        if og_description and len(og_description.strip()):
            description = truncate(og_description.strip(), words=40)
        elif content.text and len(content.text.strip()) > 0:
            description = truncate(content.text, words=40)
        article.description = re.sub(r"[\r\n\t ]+", " ", description).strip() if description else None

        return True
    else:
        return False
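Example #3 relies on several helpers whose definitions are not shown (find_meta_value, first_present, find_title, find_author, truncate). A plausible sketch of the first two, assuming find_meta_value reads the content attribute of a matching <meta> tag and first_present returns the first non-blank candidate:

from bs4 import BeautifulSoup

def find_meta_value(soup, prop):
    # Look up <meta property="..."> first, then fall back to <meta name="...">.
    tag = (soup.find('meta', attrs={'property': prop})
           or soup.find('meta', attrs={'name': prop}))
    return tag.get('content') if tag else None

def first_present(candidates):
    # Return the first candidate that is non-None and non-blank.
    for c in candidates:
        if c and c.strip():
            return c
    return None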