def process_last(self, user_jid, our_jid, sub, last):
    if last is not None:
        # We already know the feed's last item: store it, register the
        # subscription and ask the user to authorize our presence.
        sub["last"] = last
        yield UserSubscriptions(user_jid).subscribe(sub["url"])
        yield Subscription.create(sub)
        self._xmpp.send_presence(
            to=user_jid, from_=sub["jid"], type_="subscribe")
    else:
        # No last item yet: fetch the page, parse it and retry with the result.
        yield wait_for_host(sub["host"], level=2)
        try:
            page = yield get_page(sub["url"])
        except Exception:
            self._xmpp.send_message(
                to=user_jid, from_=get_full_jid(our_jid),
                body=u"Url check failed, subscription aborted. "
                     "Seems like not existing url.")
        else:
            parsed = yield self._worker.parse(sub, page)
            if "last" in parsed and parsed["last"] is not None:
                # The recursive call is a coroutine itself, so it must be yielded.
                yield self.process_last(user_jid, our_jid, sub, parsed["last"])
            else:
                self._xmpp.send_message(
                    to=user_jid, from_=get_full_jid(our_jid),
                    body=u"Page parsing failed, subscription aborted. "
                         "Seems like not existing url.")
def get_data(url, handler):
    # Generator-style coroutine (presumably decorated in the original module):
    # serve from cache unless debugging, otherwise fetch the page, run the
    # handler, cache the JSON-encoded result and return it.
    key = parser.convertUrl(url)
    if not options.DEBUG:
        cached = yield fetcher.get_data(key)
        if cached:
            return cached
    result = yield fetcher.get_page(url)
    ret = yield maybe_future(handler(result))
    ret = json.dumps(ret)
    yield fetcher.write_data(key, ret, options.CACHE_TIME)
    return ret
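# Hypothetical usage sketch, not from the original project: get_data() reads
# like a Tornado coroutine (yield for async calls, plain return for the
# result), so a request handler could look roughly like this, assuming
# get_data() is decorated with @gen.coroutine in its module. DataHandler and
# the inline handler callback are made-up names for illustration only.
from tornado import gen, web

class DataHandler(web.RequestHandler):
    @gen.coroutine
    def get(self):
        url = self.get_argument("url")
        # The handler callback gets the fetched page body and must return
        # something JSON-serializable; get_data() caches the dumped result.
        data = yield get_data(url, lambda page: {"length": len(page)})
        self.set_header("Content-Type", "application/json")
        self.write(data)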
def get_link_data(valid_link, link_file_prename):
    # Write the URL list to disk, then fetch each page and save it to
    # a numbered file next to the list.
    with open('%surl_list.html' % link_file_prename, 'wb+') as f:
        for url in valid_link:
            f.write(url + '\n')
    for num, url in enumerate(valid_link, start=1):
        print url
        html = fetcher.get_page(url)
        with open('%s%d.html' % (link_file_prename, num), 'wb+') as f:
            f.write(html)
def process_page(self, sub, last_modified):
    yield utils.wait_for_host(sub["host"])
    self.debug("HOST OK: %s (page)" % sub["url"])
    try:
        page = yield get_page(sub["url"])
    except NotFound:
        self.dead_url(sub)
    except Exception:
        err = traceback.format_exc()[:-1]
        self.bad_url(sub, err)
    else:
        parsed = yield self._worker.parse(sub, page)
        self.process_parsed(sub, parsed, last_modified)
    # We're done, decrement the open-connections count.
    self._conn_count -= 1
def crawl_web(environment, index, graph, seed, max_pages, max_depth):
    tocrawl = ['https://wikipedia.org/wiki/' + seed]
    next_depth = []
    crawled = []
    count = 0
    depth = 0
    while tocrawl and count < max_pages and depth <= max_depth:
        count += 1
        page = tocrawl.pop()
        if page not in crawled:
            links, page_content = get_page(environment, page, max_pages)
            add_page_to_index(index, page, page_content)
            graph[page] = links
            mergeLists(next_depth, links)
            crawled.append(page)
        if not tocrawl:
            tocrawl, next_depth = next_depth, []
            depth += 1
    return index, graph
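# The helpers below are NOT part of the snippet above: they are minimal
# sketches of what crawl_web() appears to expect. add_page_to_index() maps
# each word of a page to the URLs containing it, and mergeLists() extends the
# queue in place while skipping duplicates; the real project may implement
# both differently.
def add_page_to_index(index, url, content):
    # index: dict mapping keyword -> list of URLs whose content contains it.
    for word in content.split():
        index.setdefault(word, [])
        if url not in index[word]:
            index[word].append(url)


def mergeLists(target, links):
    # Append only links that are not already queued; mutates target in place.
    for link in links:
        if link not in target:
            target.append(link)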
import configparser
import datetime
import sys

import URLManager  # project-local module providing URLManager.URLManager
import fetcher     # project-local module providing fetcher.Fetcher


if __name__ == '__main__':
    config = configparser.ConfigParser()
    config.read('crawler.config')
    print("Master started. Initial page {}".format(
        config["SITE"]["initial_page"]))

    # init
    start_time = datetime.datetime.now()
    url_manager = URLManager.URLManager()
    url_manager.insert_url(config["SITE"]["initial_page"], 0, 0, -1)
    fetcher = fetcher.Fetcher(url_manager)
    end_time = datetime.datetime.now()
    delta = end_time - start_time
    print("Init time", delta)

    # start crawling
    while url_manager.has_next_url():  # TODO: change if parallel
        print("queue size", url_manager.get_size())
        next_url = url_manager.get_next_url()
        print("fetching", next_url)
        fetcher.get_page(next_url)
        print("done")
        sys.stdout.flush()
        sys.stderr.flush()