def crawl():
    # Logger, Queues and get_b_soup come from the surrounding crawler code (not shown here)
    # Use the logger to debug the code
    Logger.debug('Hello google')
    # Fetch the URL and get the parsed HTML
    soup = get_b_soup('http://www.google.com')
    # Send the extracted data to the output pipeline
    Queues.send_to_output(soup.head.title.text)
    Queues.send_to_output('http://www.google.com' + soup.head.meta['content'])
import re

def collect(url_queue):
    # Use the logger to debug the code
    Logger.debug('Hello amazon')
    # base_path is expected to be defined at module level (the site's base URL)
    # Fetch the base URL and get the parsed HTML
    soup = get_b_soup(base_path)
    # Walk the parsed HTML and collect every product link
    for all_url in soup.findAll('a', href=re.compile('/gp/product/')):  # REQUIRED
        # Put each absolute URL on url_queue
        url_queue.put(base_path + all_url['href'])
import traceback

def crawl_thread(url, module):
    Logger.debug('Thread start')
    # Use the module's ENCODING if it declares one, otherwise let the parser decide
    encoding = getattr(module, 'ENCODING', None)
    try:
        soup = get_b_soup(url, encoding=encoding)
        module.crawl(soup)
    except Exception:
        # Log the failing URL and the full traceback, then stop this thread
        Logger.error('Crawl error url: ' + url)
        Logger.error(traceback.format_exc())
        return
    Logger.debug('Thread done')
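All three functions rely on a get_b_soup helper that this excerpt never defines. A minimal sketch of how such a helper could look, assuming it fetches the page with requests and parses it with BeautifulSoup (the real helper's implementation, signature, and error handling are not shown in the original):

import requests
from bs4 import BeautifulSoup

def get_b_soup(url, encoding=None):
    # Hypothetical stand-in for the crawler's helper; the actual implementation is not shown.
    # Fetch the page, optionally forcing the encoding a module declared via ENCODING,
    # and return the parsed BeautifulSoup tree.
    response = requests.get(url, timeout=10)
    if encoding:
        response.encoding = encoding
    return BeautifulSoup(response.text, 'html.parser')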