import time
from urlparse import urlparse

# Frontier, Store and MAX_COUNT are defined elsewhere in the project


class Crawler:
    '''
    Crawl the website and collect the text and the out links on each page
    '''

    def __init__(self):
        self.frontier = Frontier()
        self.count = 0
        self.last_domain = ''
        self.store = Store()

    def crawl(self):
        '''
        Pop a url from the frontier, get the header, html, text and out links,
        push the out links into the frontier and insert the page into elasticsearch
        :return: None
        '''
        while self.count < MAX_COUNT:
            url = self.frontier.pop_url()
            try:
                current_domain = urlparse(url).netloc
                # Lazily cache a robots.txt parser per domain
                if current_domain not in self.frontier.robot_dict and self.frontier.no_robot:
                    self.frontier.add_robot_dict(url)
                # Skip urls that robots.txt disallows for any user agent
                if current_domain in self.frontier.robot_dict and not (
                        self.frontier.robot_dict[current_domain].can_fetch('*', url)):
                    continue
            except Exception as e:
                print 'current_domain exception: {}'.format(e)
                continue

            print 'current url {}'.format(url)

            # Politeness: wait a second between consecutive requests to one domain
            if current_domain == self.last_domain:
                time.sleep(1)
            else:
                self.last_domain = current_domain

            try:
                header, raw_html = self.downloader(url)
            except Exception as e:
                print 'downloader exception: {}'.format(e)
                continue

            try:
                text, title, links = self.parse_url(url, raw_html)
            except Exception as e:
                print 'parse exception: {}'.format(e)
                continue
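    # ------------------------------------------------------------------
    # self.downloader is called above but not defined in this listing.
    # A minimal sketch of one possible implementation, assuming a plain
    # urllib2 GET that returns (headers, raw_html); the timeout value and
    # User-Agent string below are assumptions, not the original code.

    def downloader(self, url):
        import urllib2  # local to this sketch; move to the top of the file in practice
        request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        response = urllib2.urlopen(request, timeout=10)
        header = dict(response.info().items())  # response headers as a plain dict
        raw_html = response.read()
        return header, raw_html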
import time
import urllib2
from urlparse import urlparse

import robotexclusionrulesparser

# Frontier, Store and MAX_COUNT are defined elsewhere in the project


class Crawler:
    '''
    Crawl the website and collect the text and the out links on each page
    '''

    def __init__(self):
        self.count = 0
        self.last_domain = ''
        self.frontier = Frontier()
        self.store = Store()

    def initial_seeds(self):
        self.frontier.initial_queue()

    def parseRobot(self, domain):
        '''
        Fetch and parse robots.txt for a domain; return the parser and the
        crawl delay, or (None, None) if the file is missing or unparsable
        '''
        robot_url = 'http://' + domain + '/robots.txt'
        try:
            robot_file = urllib2.urlopen(robot_url).read()
            # Drop blank lines before parsing
            robot_content = ''
            for line in robot_file.split('\n'):
                if line.replace(' ', '') != '':
                    robot_content += line + '\n'
            robot_parser = robotexclusionrulesparser.RobotExclusionRulesParser()
            robot_parser.parse(robot_content)
            try:
                crawler_delay = robot_parser.get_crawl_delay('*')
            except Exception:
                crawler_delay = None
            return robot_parser, crawler_delay
        except Exception:
            return None, None

    def crawl(self):
        '''
        Pop a url from the frontier and get the header, html, text and out links.
        Push the out links into the frontier and insert the page into elasticsearch.
        '''
        while self.count < MAX_COUNT:
            level, url = self.frontier.pop_url()
            try:
                current_domain = urlparse(url).netloc
                # Parse robots.txt per domain (replaces the shared robot_dict
                # used in the earlier version above)
                robot_parser, crawler_delay = self.parseRobot(current_domain)
                if robot_parser is not None:
                    if not robot_parser.is_allowed('*', url):
                        print 'not allowed to crawl: {}'.format(url)
                        continue
                    if crawler_delay is not None:
                        time.sleep(crawler_delay)
            except Exception as e:
                print 'current_domain exception: {}'.format(e)
                print url
                continue

            # Politeness: wait a second between consecutive requests to one domain
            if current_domain == self.last_domain:
                time.sleep(1)
            else:
                self.last_domain = current_domain

            try:
                header, raw_html = self.downloader(url)
            except Exception as e:
                print 'downloader exception: {}'.format(e)
                continue

            try:
                text, title, links = self.parse_url(url, raw_html)
            except Exception as e:
                print 'parse exception: {}'.format(e)
                continue

            if text or links:
                self.count += 1
                out_links = []
                for link in links:
                    try:
                        # Stop pushing once the frontier is already large enough
                        if len(self.frontier.pq) > MAX_COUNT:
                            break
                        if self.frontier.check_push_url(link, url):
                            out_links.append(link)
                    except Exception:
                        continue
                print 'FINISHED: {}'.format(self.count)
                self.store.insert(self.count, url, header, title, text,
                                  raw_html, [], out_links, level)
                self.write_to_file(self.count, url, header, title, text,
                                   raw_html, out_links, level)
            else:
                continue

        # Flush accumulated state once the crawl budget is exhausted
        self.frontier.write_in_links()
        self.store.write_urls()
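    # ------------------------------------------------------------------
    # self.parse_url is called above but not defined in this listing.
    # A hedged sketch using BeautifulSoup (the parser choice is an
    # assumption): return the visible text, the title and the absolute
    # out links of the page.

    def parse_url(self, url, raw_html):
        # Imports local to this sketch; move to the top of the file in practice
        from urlparse import urljoin
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(raw_html, 'html.parser')
        title = soup.title.string.strip() if soup.title and soup.title.string else ''
        text = soup.get_text(separator=' ', strip=True)
        links = []
        for anchor in soup.find_all('a', href=True):
            link = urljoin(url, anchor['href'])  # resolve relative links
            if link.startswith('http'):
                links.append(link)
        return text, title, links


# Example driver for the class above, assuming the seed urls are loaded
# by initial_seeds(); the __main__ guard is an assumption, not original code.
if __name__ == '__main__':
    crawler = Crawler()
    crawler.initial_seeds()
    crawler.crawl()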