def worker(query, working, process, loops, page_limit, timeout=1):
    """Repeatedly crawl for `query`, optionally running extraction after each batch.

    Seeds archive_dict.pkl with an initial crawl if it does not exist yet
    (article_extractor.py requires it), then, while `working` is truthy,
    performs `loops` crawls (sleeping `timeout` seconds after each) and —
    when `process` is set — extracts the crawled pages from `path`.
    """
    archive_pkl = path + 'archive_dict.pkl'
    if not os.path.exists(archive_pkl):
        # Seed crawl: builds archive_dict.pkl, required by article_extractor.py.
        crawl(query, path, page_limit)
    while working:
        done = 0
        while done < loops:
            crawl(query, path, page_limit)
            time.sleep(timeout)
            done += 1
        if process:
            extract(path)
def get(self):
    """Fetch the page at the `url` query parameter, extract its article body,
    and write it back (prefixed with an <h1> title when one is found) with a
    permissive CORS header.
    """
    from bs4 import BeautifulSoup as bs
    from article_extractor import extract
    url = self.request.get('url')
    # NOTE(review): assumes util.url_fetch returns markup usable by bs4
    # (may be None on failure) — confirm against util's contract.
    markup = util.url_fetch(url)
    soup = bs(markup, 'lxml')
    text = u""
    # FIX: soup.title.string is None for an empty <title> or a title containing
    # nested tags; assigning None to h1.string raises TypeError in bs4, so
    # guard on the string itself, not just the tag's presence.
    if soup.title and soup.title.string:
        title = soup.title.string
        h1 = soup.new_tag('h1')
        h1.string = title
        text += unicode(h1)
    # print create_soup_with_ids(markup).prettify()
    text += extract(markup, url)
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    self.response.write(text)
def fetch_normal(): response = url_fetch(url, return_response_obj=True) # print 'INFO', response.info() if response and response.info().getheader('content-type', 'text/html').lower().split(';')[0].strip() == 'text/html': markup = response.read() else: print 'BAD MIME TYPE' if response else 'NO SUCCESSFUL RESPONSE' markup = None if markup: # process markup: markup_soup = BeautifulSoup(markup, 'lxml') og_title = find_meta_value(markup_soup, 'og:title') og_image = find_meta_value(markup_soup, 'og:image') og_description = find_meta_value(markup_soup, 'og:description') title_field = find_title(markup_soup) article.site_name = find_meta_value(markup_soup, 'og:site_name') # find author: article.author = find_author(markup_soup) # parse and process article content: content.html = article_extractor.extract(markup, article.url) doc_soup = BeautifulSoup(content.html, 'lxml') article.title = first_present([og_title, title_field, article.title]) article.top_image = make_url_absolute(first_present([article.top_image, og_image])) populate_article_json(article, content) # compute description: description = None if og_description and len(og_description.strip()): description = truncate(og_description.strip(), words=40) elif content.text and len(content.text.strip()) > 0: description = truncate(content.text, words=40) article.description = re.sub(r"[\r\n\t ]+", " ", description).strip() if description else None return True else: return False