Example #1
def process_last(self, user_jid, our_jid, sub, last):
    if last is not None:
        sub["last"] = last
        yield UserSubscriptions(user_jid).subscribe(sub["url"])
        yield Subscription.create(sub)
        self._xmpp.send_presence(to=user_jid,
                                 from_=sub["jid"],
                                 type_="subscribe")
    else:
        yield wait_for_host(sub["host"], level=2)
        try:
            page = yield get_page(sub["url"])
        except Exception:
            self._xmpp.send_message(
                to=user_jid,
                from_=get_full_jid(our_jid),
                body=u"Url check failed, subscription aborted. "
                     u"Seems like not existing url.")
        else:
            parsed = yield self._worker.parse(sub, page)
            if "last" in parsed and parsed["last"] is not None:
                self.process_last(user_jid, our_jid, sub, parsed["last"])
            else:
                self._xmpp.send_message(
                    to=user_jid,
                    from_=get_full_jid(our_jid),
                    body=u"Page parsing failed, subscription aborted. "
                         u"Seems like not existing url.")
Example #3
def get_data(url, handler):
    key = parser.convertUrl(url)
    if not options.DEBUG:
        cached = yield fetcher.get_data(key)
        if cached:
            return cached

    result = yield fetcher.get_page(url)
    ret = yield maybe_future(handler(result))
    ret = json.dumps(ret)
    yield fetcher.write_data(key, ret, options.CACHE_TIME)
    return ret
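get_data caches the JSON-encoded result of handler(result) unless DEBUG is set, so repeated requests for the same URL hit the cache instead of the network. A sketch of how it might be exposed over HTTP, assuming it is decorated with tornado.gen.coroutine (the decorator is not shown above); DataHandler and parse_page are illustrative names, not project code:

from tornado import gen, web

def parse_page(page):
    # Hypothetical handler callback: reduce the fetched page to a JSON-friendly dict.
    return {"length": len(page)}

class DataHandler(web.RequestHandler):
    @gen.coroutine
    def get(self):
        url = self.get_argument("url")
        data = yield get_data(url, parse_page)   # already JSON-encoded by get_data
        self.set_header("Content-Type", "application/json")
        self.write(data)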
Example #5
def get_link_data(valid_link, link_file_prename):
    # Save the list of valid URLs, then fetch each page and store it on disk.
    with open('%surl_list.html' % link_file_prename, 'w') as f:
        for url in valid_link:
            f.write(url + '\n')
    for num, url in enumerate(valid_link, start=1):
        print(url)
        html = fetcher.get_page(url)
        with open('%s%d.html' % (link_file_prename, num), 'w') as f:
            f.write(html)
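Hypothetical usage, with made-up URLs and a made-up file prefix: this would write sample_url_list.html plus sample_1.html and sample_2.html.

get_link_data(["https://example.com/a", "https://example.com/b"], "sample_")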
Example #6
def process_page(self, sub, last_modified):
    yield utils.wait_for_host(sub["host"])
    self.debug("HOST OK: %s (page)" % sub["url"])
    try:
        page = yield get_page(sub["url"])
    except NotFound:
        self.dead_url(sub)
    except Exception:
        err = traceback.format_exc()[:-1]
        self.bad_url(sub, err)
    else:
        parsed = yield self._worker.parse(sub, page)
        self.process_parsed(sub, parsed, last_modified)
    # We're done, decrement the connection count
    self._conn_count -= 1
Example #8
def crawl_web(environment, index, graph, seed, max_pages, max_depth):
    tocrawl = ['https://wikipedia.org/wiki/' + seed]
    next_depth = []
    crawled = []
    count = 0
    depth = 0
    while tocrawl and count < max_pages and depth <= max_depth:
        count += 1
        page = tocrawl.pop()
        if page not in crawled:
            links, page_content = get_page(environment, page, max_pages)
            add_page_to_index(index, page, page_content)
            graph[page] = links
            mergeLists(next_depth, links)
            crawled.append(page)
        if not tocrawl:
            tocrawl, next_depth = next_depth, []
            depth = depth + 1
    return index, graph
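crawl_web does a breadth-first crawl, one depth level at a time, and relies on a few helpers that are not shown in the example. Plausible sketches of two of them, not the original project's code: add_page_to_index keeps a word-to-URL inverted index, and mergeLists appends only links that are not already queued.

def add_page_to_index(index, url, content):
    # Map every whitespace-separated token on the page to the URLs containing it.
    for word in content.split():
        urls = index.setdefault(word, [])
        if url not in urls:
            urls.append(url)

def mergeLists(target, links):
    # Extend target in place, skipping links that are already scheduled.
    for link in links:
        if link not in target:
            target.append(link)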
Example #9
if __name__ == '__main__':
    config = configparser.ConfigParser()
    config.read('crawler.config')

    print("Master started. Initial page {}".format(
        config["SITE"]["initial_page"]))

    # init
    start_time = datetime.datetime.now()

    url_manager = URLManager.URLManager()
    url_manager.insert_url(config["SITE"]["initial_page"], 0, 0, -1)

    fetcher = fetcher.Fetcher(url_manager)

    end_time = datetime.datetime.now()
    delta = end_time - start_time
    print("Init time", delta)

    # start crawling
    while url_manager.has_next_url():  # TODO: change if parallel
        print("queue size", url_manager.get_size())
        next_url = url_manager.get_next_url()

        print("fetching", next_url)
        fetcher.get_page(next_url)
        print("done")

        sys.stdout.flush()
        sys.stderr.flush()
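The script reads config["SITE"]["initial_page"] from a crawler.config file in the working directory. A minimal sketch of that file; the URL is a placeholder, and any other keys the project expects are not shown here:

[SITE]
initial_page = https://example.com/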