Exemplo n.º 1
0
def assign_url_info_defaults(url, url_info):
    """Fill ``url_info`` in place with the default bookkeeping fields for ``url``.

    ``_id`` is set to ``misc.md5(url)``. The fields ``created_time``,
    ``status_last_modified``, ``last_discovered`` and ``recrawl_time`` all
    receive the same value: the current UTC time passed through
    ``datetime2timestamp``. ``recrawl_priority`` is copied from
    ``url_info["crawl_priority"]``, so that key has to be present before the
    call. ``full_domain`` is the middle element of the 3-tuple returned by
    ``misc.get_url_domain_info(url)``.

    Nothing is returned; the caller's mapping is mutated.
    """
    url_info["_id"] = misc.md5(url)
    timestamp = datetime2timestamp(datetime.datetime.utcnow())

    # Fields that used to be initialised here but are currently disabled:
    # processed_count, last_processed, last_finished, expires, process_status.
    defaults = (
        ("created_time", timestamp),
        ("crawled_count", 0),
        ("url_class", None),
        ("error_messages", []),  # a fresh list on every call
        ("first_modified", None),
        ("last_modified", None),
        ("modified_count", 0),
        ("valid_link_count", None),
        ("retry_count", 0),
        ("status_last_modified", timestamp),
        ("encoding", None),
        ("encoding_created_time", None),
        ("redirect_url", None),
        ("doc", None),
        ("headers", None),
        ("md5", None),
        ("last_discovered", timestamp),
        ("discovered_count", 1),
        ("comments", ""),
        ("redirect_count", 0),
        ("recrawl_time", timestamp),
        ("recrawl_duration", 0),
    )
    for field_name, field_value in defaults:
        url_info[field_name] = field_value

    # Must come after the table above: it reads a key the caller supplied.
    url_info["recrawl_priority"] = url_info["crawl_priority"]

    _first, domain_name, _last = misc.get_url_domain_info(url)
    url_info["full_domain"] = domain_name
Exemplo n.º 2
0
    def test_url_dedup_type(self):
        """Check get/set on a cache data type configured as "redis/set".

        Three URL digests are stored and must all read back as ``True``.
        A fourth digest (the google URL with a trailing slash) must read as
        ``False`` before it is added, ``set(..., with_get=True)`` must return
        a falsy value while adding it, and it must read as ``True`` afterwards.
        """
        data_type = "url_dedup_test"
        cache_config = common_settings.redis_cache_config
        cache_config["validation_enabled"] = False
        cache_config["data_types"][data_type] = {"content_type" : "redis/set"}

        cache = common_settings.cache_client()
        # Start from an empty set so earlier runs cannot influence the result.
        cache.delete(data_type, None)

        known_urls = ["http://www.baidu.com", "http://www.google.com", "http://www.sina.com.cn"]
        for known_url in known_urls:
            cache.set(data_type, misc.md5(known_url))

        for known_url in known_urls:
            self.assertEqual(True, cache.get(data_type, misc.md5(known_url)))

        # Differs from the stored google URL only by the trailing slash.
        unseen_digest = misc.md5("http://www.google.com/")
        self.assertEqual(False, cache.get(data_type, unseen_digest))
        self.assertFalse(cache.set(data_type, unseen_digest, with_get=True))
        self.assertEqual(True, cache.get(data_type, unseen_digest))
Exemplo n.º 3
0
def get_result_by_url(url, start_index=1, page_type=1):
    """Look up the document in ``_db.results`` whose ``_id`` is ``misc.md5(url)``.

    When ``page_type`` equals 1 only one page is requested: the projection is
    a copy of ``_RESULT_META_FIELDS`` plus the content column named
    ``_CONTENT_COLUMN_NAME % start_index``. For any other ``page_type`` the
    projection is ``NO_ROW_ID``.

    Returns whatever ``find_one`` returns.
    """
    query = {'_id': misc.md5(url)}
    if page_type == 1:
        # Copy first so the shared module-level field map is never mutated.
        projection = copy.copy(_RESULT_META_FIELDS)
        projection[_CONTENT_COLUMN_NAME % start_index] = 1
    else:
        projection = NO_ROW_ID
    return _db.results.find_one(query, fields=projection)
Exemplo n.º 4
0
def save_handler_counts(handler_counts, type):
    """Save ``handler_counts`` to ``db.handlerStatistics`` with a timestamp.

    The document ``_id`` is ``misc.md5`` of the stringified timestamp, so the
    timestamp alone determines the id. ``type`` is stored as given (the
    parameter name shadows the builtin but is part of the signature).

    Returns the result of ``db.handlerStatistics.save``.
    """
    timestamp = datetime2timestamp(datetime.datetime.utcnow())
    record = {
        "_id": misc.md5(str(timestamp)),
        "datetime": timestamp,
        "handler_counts": handler_counts,
        "type": type,
    }
    return db.handlerStatistics.save(record)
Exemplo n.º 5
0
 def _generate_name(cls, id_generator, content_type, data_type, data_key):
     """Build the cache entry name for ``data_type`` / ``data_key``.

     * ``content_type == "redis/set"`` (``id_generator`` is ignored) or
       ``id_generator == "none"``: the name is ``data_type`` alone.
     * ``id_generator == "raw"``: ``"<data_type>:<data_key>"``.
     * ``id_generator == "md5"``: ``"<data_type>:<misc.md5(data_key)>"``.

     Raises ``Exception`` for any other ``id_generator``.
     """
     if content_type == "redis/set" or id_generator == "none":
         return data_type

     if id_generator == "raw":
         key_part = data_key
     elif id_generator == "md5":
         key_part = misc.md5(data_key)
     else:
         raise Exception("not supported id_generator %s" % id_generator)
     return ":".join([data_type, key_part])
Exemplo n.º 6
0
def save_crawl_domain_info(
        url, domain_type="full_domain", crawl_priority=-1, crawl_depth=-1,
        recrawl_details=False, recrawl_list=False, recrawl_undefined=False):
    """Save a whitelist entry for the domain of ``url`` to ``db.crawlDomainWhitelist``.

    ``domain_type`` selects which element of ``misc.get_url_domain_info(url)``
    is used as the domain: its position in ``common_settings.domain_types`` is
    the index into that result. A ``domain_type`` missing from that sequence
    makes ``index`` raise.

    For ``crawl_priority`` and ``crawl_depth``, -1 means auto config is needed.

    The ``_id`` is ``misc.md5(domain + domain_type)``, so saving the same
    domain/domain_type pair again overrides the earlier entry.
    """
    domain_parts = misc.get_url_domain_info(url)
    domain = domain_parts[common_settings.domain_types.index(domain_type)]

    entry = {
        "domain": domain,
        "domain_type": domain_type,
        "url": url,
        "crawl_priority": crawl_priority,
        "crawl_depth": crawl_depth,
        "recrawl_details": recrawl_details,
        "recrawl_list": recrawl_list,
        "recrawl_undefined": recrawl_undefined,
        "_id": misc.md5(''.join([domain, domain_type])),
    }

    # Note: save() will override a duplicate domain entry.
    db.crawlDomainWhitelist.save(entry)
Exemplo n.º 7
0
def update_transcode_result(result, pages, page_type='details', process_type='batch'):
    """Store the transcoded ``pages`` on ``result`` and upsert it into ``_db.results``.

    ``pages`` is an iterable of pages, each an iterable of nodes. Every page
    is serialised by concatenating ``p.tostring(node)`` for its nodes and
    stored under ``_CONTENT_COLUMN_NAME % n``, with ``n`` counting pages
    from 1.

    ``result`` is mutated in place and must already contain ``'url'``:
    ``statusCode`` comes from ``_TYPE_STATUSCODE_DICT`` (falling back to
    ``_DEFAULT_STATUSCODE`` for an unknown ``page_type``), ``_id`` is
    ``misc.md5(result['url'])`` and ``processType`` is ``process_type``.

    Nothing is returned.
    """
    for page_number, page in enumerate(pages, 1):
        # Join once per page rather than growing a string node by node.
        result[_CONTENT_COLUMN_NAME % page_number] = ''.join(
            p.tostring(node) for node in page)

    result['statusCode'] = _TYPE_STATUSCODE_DICT.get(page_type, _DEFAULT_STATUSCODE)
    result['_id'] = misc.md5(result['url'])
    result['processType'] = process_type

    # The whole result document is the update; upsert creates it when absent.
    _db.results.update({'_id': result['_id']}, result, upsert=True)
Exemplo n.º 8
0
    def build_crawler_request_msg(cls, url, url_info):
        """Build the crawler request for ``url`` as ``(message_type, message)``.

        The message starts as ``misc.clone_dict`` of ``url_info`` restricted
        to ``"url"`` and ``"page_last_modified"``, and gains:

        * ``__priority``: ``url_info["crawl_priority"]``.
        * ``meta``: ``misc.clone_dict`` of ``url_info`` restricted to
          ``common_settings.crawler_msg_meta_fields``.
        * ``__group_hash``: ``url_info["full_domain"]`` when
          ``common_settings.strong_politeness`` is truthy, otherwise
          ``misc.md5(url)``.

        The message type depends on ``url_info["crawl_type"]``: ``"static"``
        or ``"dynamic"``. Any other value raises ``Exception``.
        """
        message = misc.clone_dict(url_info, ["url", "page_last_modified"])
        message["__priority"] = url_info["crawl_priority"]
        message["meta"] = misc.clone_dict(url_info, common_settings.crawler_msg_meta_fields)
        message["__group_hash"] = (
            url_info["full_domain"] if common_settings.strong_politeness
            else misc.md5(url))

        crawl_type = url_info["crawl_type"]
        if crawl_type == "static":
            return "__internal_crawler_request", message
        if crawl_type == "dynamic":
            return "__internal_dynamic_crawler_request", message
        raise Exception("unsupported crawl_type %s" % crawl_type)
Exemplo n.º 9
0
def default_cond(url):
    """Return the query document ``{"_id": misc.md5(url)}`` for ``url``."""
    url_digest = misc.md5(url)
    return {"_id": url_digest}
Exemplo n.º 10
0
def save_offline_manipulation(manipulation, result, type):
    """Save an offline manipulation and its result to ``db.offlineManipulations``.

    The record is stamped with ``datetime.datetime.now()`` (local time, stored
    as a datetime object), and its ``_id`` is ``misc.md5`` of that value's
    string form. ``type`` is stored as given (the parameter name shadows the
    builtin but is part of the signature).

    Nothing is returned.
    """
    now = datetime.datetime.now()
    record = {"_id": misc.md5(str(now))}
    record["manipulation"] = manipulation
    record["result"] = result
    record["datetime"] = now
    record["type"] = type
    db.offlineManipulations.save(record)
Exemplo n.º 11
0
def save_redirect_url(url, redirect_url):
    """Save the redirect ``url`` -> ``redirect_url`` to ``_db.urlRedirects``.

    The document ``_id`` is ``misc.md5(url)``. ``created_time`` is the current
    UTC time passed through ``datetime2timestamp``.

    Nothing is returned.
    """
    created_at = datetime2timestamp(datetime.utcnow())
    redirect_record = {
        "_id": misc.md5(url),
        "url": url,
        "redirect_url": redirect_url,
        "created_time": created_at,
    }
    _db.urlRedirects.save(redirect_record)
Exemplo n.º 12
0
def save_heartbeat(message):
    """Stamp ``message`` in place and save it to ``db.heartbeats``.

    ``message["datetime"]`` is set to the current UTC time passed through
    ``datetime2timestamp``, and ``message["_id"]`` to ``misc.md5`` of that
    timestamp's string form. Any existing values under those keys are
    overwritten.

    Returns the result of ``db.heartbeats.save``.
    """
    timestamp = datetime2timestamp(datetime.datetime.utcnow())
    heartbeat_id = misc.md5(str(timestamp))
    message["_id"] = heartbeat_id
    message["datetime"] = timestamp
    return db.heartbeats.save(message)