class Crawler():

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')

    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) != None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        while 1:
            url = self.queue.pop_url()
            print url
            if url == None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html != None:
                self.webpagedb.html2db(url, html)
                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
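# A minimal usage sketch for the Crawler above, assuming QueueDB, WebpageDB,
# DuplCheckDB, DownloadManager and WebPage are importable from this project and
# that re and time are imported at the top of the file; the seed URL and the
# rule patterns below are illustrative placeholders, not values from the
# original code.
if __name__ == "__main__":
    crawler = Crawler()
    # pages matching the key pattern may contribute links matching the value patterns
    crawler.add_rules({"^http://www\.cs\.colorado\.edu/.*$":
                           ["^http://www\.cs\.colorado\.edu/.+$"]})
    crawler.add_seeds(["http://www.cs.colorado.edu/"])
    crawler.start()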
class Crawler():

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.rules = {}
        self.dbop = OperatorDB()

    def add_seeds(self, links):
        self.dbop.add_seeds(links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) != None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        while 1:
            try:
                url = self.dbop.pop_url()
                print "url: %s" % url
                if url == None:
                    print "crawling task is done."
                    break
                error_msg, url, redirected_url, html = self.downloader.download(url)
                #print error_msg, url, redirected_url, html
                if html != None:
                    self.webpage = WebPage(url, html)
                    article = self.webpage.extract()
                    if len(article) > 5:
                        addtime = "%s %s" % (article[1], article[2])
                        self.dbop.html2db(url, html, article[0], addtime, article[3], article[5])
                    else:
                        self.dbop.html2db(url, html)
                        print
                    self.webpage.parse_links()
                    ruptn = self.get_patterns_from_rules(url)
                    links = self.webpage.filter_links(tags=['a'], str_patterns=ruptn)
                    self.add_seeds(links)
                self.mysleep(3)
            except Exception, err:
                print "!!error!! Exception happened! %s %s" % (url, err)
        self.dbop.close()

    # mysleep() is called from start() above; same helper as in the other Crawler variants.
    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
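# Sketch of the rule matching performed by get_patterns_from_rules() above, using
# only the standard-library re module; the URLs and patterns are invented for
# illustration and are not taken from the original project.
if __name__ == "__main__":
    import re
    demo_rules = {re.compile("^http://news\.example\.com/.*$"):
                      [re.compile("^http://news\.example\.com/\d{4}/.+\.html$")]}
    demo_url = "http://news.example.com/index.html"
    patns = []
    for page_pattern, link_patterns in demo_rules.items():
        if page_pattern.match(demo_url) is not None:  # listing pages contribute their link patterns
            patns.extend(link_patterns)
    print "%d link pattern(s) apply to %s" % (len(patns), demo_url)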
        form.action = urlparse.urljoin(self.url, form.action)
        return form.action, form.fields

    #
    def get_html(self):
        return self.html


if __name__ == "__main__":
    import time
    from downloader import DownloadManager

    downloader = DownloadManager()
    url = "http://www.cs.colorado.edu/"
    error_msg, url, redirected_url, html = downloader.download(url)
    print error_msg, url, redirected_url, len(html)
    time.sleep(2)
    page = WebPage(url, html)
    page.parse_links()
    links = page.filter_links(tags=["a"], str_patterns=["^(http://www\.cs\.colorado\.edu)(/info.+)$"])
    elements = page.doc.findall("./body//div")
    for e in elements:
        print "ELEMENTS =========================================="
        print lxml.html.tostring(e, pretty_print=True)
        print "ITEMS------------------------------------------"
        print e.items()
        print "TEXT-CONTENT-----------------------------------"
        print e.text_content()
        form = self.doc.forms[index]
        form.action = urlparse.urljoin(self.url, form.action)
        return form.action, form.fields

    #
    def get_html(self):
        return self.html


if __name__ == "__main__":
    import time
    from downloader import DownloadManager

    downloader = DownloadManager()
    url = "http://www.cs.colorado.edu/"
    error_msg, url, redirected_url, html = downloader.download(url)
    print error_msg, url, redirected_url, len(html)
    time.sleep(2)
    page = WebPage(url, html)
    page.parse_links()
    links = page.filter_links(tags=['a'], patterns=['^(http://www\.cs\.colorado\.edu)(/info.+)$'])
    elements = page.doc.findall('./body//div')
    for e in elements:
        print "ELEMENTS =========================================="
        print lxml.html.tostring(e, pretty_print=True)
        print "ITEMS------------------------------------------"
        print e.items()
        print "TEXT-CONTENT-----------------------------------"
        print e.text_content()
class Server(object):
    SHUTDOWN_TIMEOUT = 60

    def __init__(self):
        self._cfg = _load_cfg()
        try:
            with open(os.path.join(APP_PATH, 'data.json')) as json_fp:
                user_data = json.load(json_fp)
        except IOError:
            logging.warning('No user data')
            user_data = {}
        self._download_manager = DownloadManager(self._cfg['user'],
                                                 self._cfg['password'],
                                                 user_data.get('session_id'))

    def stop(self):
        self._bottle_server.stop()

    def _on_heartbeat_timeout(self, heartbeat):
        ''' Called if we haven't had a heartbeat in a while '''
        if any(self._download_manager.get_downloading()):
            logging.debug('No heartbeat but downloading.... Still alive')
            heartbeat.beat()
        else:
            logging.debug('No heartbeat, no downloads. Stopping...')
            self._bottle_server.stop()

    def _on_dl_finished(self, path, file_name):
        ''' TODO this needs sorting '''
        import shutil
        save_path = os.path.join(self._cfg['save_location'], file_name)
        logging.debug('Moving <%s> to <%s>' % (path, save_path,))
        shutil.move(path, save_path)

    def start_download(self, game, game_id, mod_id, file_id):
        self._download_manager.download(self._on_dl_finished, game, game_id, mod_id, file_id)

    def start_server(self, host, port):
        self._bottle_server = StoppableWSGIRefServer(host=host, port=port)
        hb = Heartbeat()
        install(partial(local_variable_plugin, {
            'cfg': self._cfg,
            'heartbeat': hb,
            'server': self,
            'download_manager': self._download_manager,
        }))
        hb_monitor = HeartbeatMonitor(hb, self.SHUTDOWN_TIMEOUT, self._on_heartbeat_timeout)
        hb_monitor.monitor()
        run(server=self._bottle_server)
        self._download_manager.stop()
        hb_monitor.stop()
        _save_cfg(self._cfg)
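# A minimal sketch of driving the Server above, assuming the module-level imports
# (os, json, logging, functools.partial, the bottle install/run helpers) and the
# project helpers _load_cfg, _save_cfg, APP_PATH, Heartbeat, HeartbeatMonitor,
# StoppableWSGIRefServer and local_variable_plugin are provided elsewhere in this
# project; the host and port are arbitrary example values.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    server = Server()
    # Blocks until a shutdown is requested or the heartbeat monitor times out,
    # then stops the download manager and persists the config.
    server.start_server('127.0.0.1', 8080)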
class Crawler(object):

    def __init__(self):
        super(Crawler, self).__init__()
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')

    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) != None:
                patns.extend(ru)
        return list(set(patns))

    def getlinks(self, url, html):
        self.webpage = WebPage(url, html)
        self.webpage.parse_links()
        ruptn = self.get_patterns_from_rules(url)
        #print ruptn
        links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
        return links

    def start(self):
        while 1:
            url = self.queue.pop_url()
            print url
            if url == None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html != None:
                self.webpagedb.html2db(url, html)
                links = self.getlinks(url, html)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
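# Sketch showing how the getlinks() refactoring above can be exercised on a single
# page without entering the start() loop; DownloadManager, WebPage and the *.db
# files are assumed to come from this project, and the URL and patterns are
# example values.
if __name__ == "__main__":
    crawler = Crawler()
    crawler.add_rules({"^http://www\.cs\.colorado\.edu/.*$":
                           ["^http://www\.cs\.colorado\.edu/.+$"]})
    url = "http://www.cs.colorado.edu/"
    error_msg, url, redirected_url, html = crawler.downloader.download(url)
    if html != None:
        print crawler.getlinks(url, html)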
class Crawler():

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}
        self.files = []
        self.file_rule = ".+"

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')
        self.repodb = RepoStateDB()

    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def set_file_rule(self, rule):
        self.file_rule = rule

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) != None:
                patns.extend(ru)
        return list(set(patns))

    def download_files(self, files):
        for f in files:
            #cmd = "wget --force-directories -c " + f + " -P " + config.repos_dir
            cmd = "wget -c " + f + " -P " + config.repos_dir
            ret_code = os.system(cmd)
            self.repodb.update(f, ret_code == 0)

    def start(self):
        while 1:
            url = self.queue.pop_url()
            print url
            if url == None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html != None:
                self.webpagedb.html2db(url, html)
                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                #print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                print links
                self.add_seeds(links)
                file_pattern = []
                file_pattern.append(re.compile(self.file_rule))
                files = self.webpage.filter_links(tags=['a'], patterns=file_pattern)
                self.files.append(files)
                #TODO: self.download_files(files)
                print files

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
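# Usage sketch for the file-downloading variant above, assuming wget is on the PATH
# and config.repos_dir points at a writable directory; the mirror URL and the
# tarball pattern are illustrative placeholders, not values from the original code.
if __name__ == "__main__":
    crawler = Crawler()
    crawler.add_rules({"^http://mirror\.example\.org/.*$":
                           ["^http://mirror\.example\.org/.+/$"]})
    # only links matching this pattern are collected into self.files
    crawler.set_file_rule("^http://mirror\.example\.org/.+\.tar\.gz$")
    crawler.add_seeds(["http://mirror.example.org/"])
    crawler.start()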