import logging
import re
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

from bs4 import BeautifulSoup

# Note: `q` (the URL queue) and `Page` (the storage model) are assumed to be
# defined elsewhere in this module.


def web_crawling(url):
    """
    Extract the page content for a URL, parse the HTML, store the text,
    and add every outgoing link to the Redis-backed URL queue.
    """
    logging.info('Extracting content for: %s', url)

    # Extract the page content
    try:
        page = urlopen(url)
        content = page.read()
    except (HTTPError, URLError):
        return

    logging.info('Starting to parse content for: %s', url)
    soup = BeautifulSoup(content, 'html.parser')

    # Parse and store the text content of the page, dropping non-content tags
    for s in soup(['style', 'script', '[document]', 'head', 'title']):
        s.extract()
    page = Page(url, soup.getText())
    page.save()
    logging.info('Stored content for: %s', url)

    # Find all absolute links and add them to the URL queue
    links = soup.find_all('a', attrs={'href': re.compile('^http://')})
    for link in links:
        href = link.get('href')
        q.put(href)
        logging.info('Added %s to URL queue for processing', href)

    logging.info('Finished parsing content for: %s', url)
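# The crawler above calls q.put(href), and its docstring mentions a Redis
# queue, but the queue object itself is not shown in this snippet. Below is
# a minimal sketch of what such a Redis-backed queue could look like, using
# the redis-py client; the class name CrawlQueue, the 'crawl:urls' key, and
# the connection settings are assumptions, not part of the original code.
import redis


class CrawlQueue:
    """A tiny FIFO queue on top of a Redis list, exposing put()/get()."""

    def __init__(self, key='crawl:urls', host='localhost', port=6379):
        self.key = key
        self.redis = redis.Redis(host=host, port=port)

    def put(self, url):
        # Push the URL onto the tail of the list.
        self.redis.rpush(self.key, url)

    def get(self, timeout=5):
        # Block until a URL is available or the timeout expires.
        item = self.redis.blpop(self.key, timeout=timeout)
        if item is None:
            return None
        # blpop returns a (key, value) tuple of bytes.
        return item[1].decode('utf-8')


# One possible way the module-level queue used by web_crawling could be built.
q = CrawlQueue()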
def search_result(search_query):
    """Return the stored pages that match the search query."""
    return Page.get(search_query)
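# A sketch of a worker loop that drives web_crawling from the queue: seed it
# with a start URL, then keep pulling URLs until the queue stays empty. The
# seed URL and the stop condition are illustrative assumptions only, and no
# deduplication of already-visited URLs is attempted here.
def crawl_worker(seed_url='http://example.com'):
    q.put(seed_url)
    while True:
        next_url = q.get()
        if next_url is None:
            # Nothing arrived within the timeout; assume the crawl is done.
            break
        web_crawling(next_url)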