def find_and_modify_url_info_md5(url, md5_hash):
    """Atomically record *md5_hash* for *url* and classify the outcome.

    Tries the cache first; on a cache miss falls back to a synchronous
    conditional store update, and on a cache hit propagates the change to
    the store asynchronously.

    Returns:
        0 -- duplicate md5 (the stored md5 already equals ``md5_hash``)
        1 -- md5 changed (a different md5 was previously recorded)
        2 -- first md5 recorded for this url
    """
    update_map = {"md5" : md5_hash}
    inc_map = None
    fields = ["md5"]
    url_info = UrlCacheClient.find_and_modify_url_info_by_not_md5(
        url, md5_hash, update_map, inc_map, fields)
    cond = default_cond(url)
    cond["md5"] = {"$ne" : md5_hash}
    if url_info is None:
        # Cache miss: perform the conditional update against the store.
        url_info = _cond_update_url_info(cond, update_map, inc_map, fields)
    elif url_info is False:
        # PEP 8: identity comparison for the False sentinel (was `== False`).
        # Cache hit but the md5 already matched -> treat as duplicate.
        url_info = None
    else:
        # Cache updated; bring the store up to date in the background.
        _async_update_url_info(cond, update_map, inc_map)
    if url_info is None:
        return 0  # duplicate md5
    elif url_info["md5"] is not None:
        return 1  # md5 changed
    else:
        return 2  # first md5
def update_url_info(url, update_map, inc_map = None):
    """Write-through update: push to the URL cache, then schedule the
    matching store update asynchronously."""
    UrlCacheClient.update_url_info(url, update_map, inc_map)
    _async_update_url_info(default_cond(url), update_map, inc_map)
def get_url_info(url, fields):
    """Read-through lookup: serve from the URL cache, falling back to the
    store on a cache miss."""
    cached = UrlCacheClient.get_url_info(url, fields)
    if cached is None:
        return _cond_get_url_info(default_cond(url), fields)
    return cached
def update_url_info_by_status(url, crawl_status, update_map, inc_map = None):
    """Update a url's info only while it is in *crawl_status*.

    Pushes the gated update to the cache, then schedules the matching
    status-conditioned store update asynchronously. Returns None, like
    the sibling ``update_url_info``.
    """
    # The cache call's return value was previously captured into an unused
    # `success` local; the function has always returned None, so drop it.
    UrlCacheClient.update_url_info_by_status(url, crawl_status, update_map, inc_map)
    cond = default_cond(url)
    cond["crawl_status"] = crawl_status
    _async_update_url_info(cond, update_map, inc_map)
def find_and_modify_url_info(url, update_map, inc_map, fields):
    """Atomic find-and-modify through the cache, falling back to the store.

    On a cache miss the conditional store update runs synchronously; on a
    cache hit the store is brought up to date asynchronously and the cached
    document is returned.
    """
    cond = default_cond(url)
    cached = UrlCacheClient.find_and_modify_url_info(url, update_map, inc_map, fields)
    if cached is None:
        return _cond_update_url_info(cond, update_map, inc_map, fields)
    _async_update_url_info(cond, update_map, inc_map)
    return cached
def get_url_info_by_status(url, crawl_status, fields):
    """Fetch a url's info only if it is in *crawl_status* (cache first).

    The cache signals "entry present but status mismatch" with False,
    which maps to None; a cache miss (None) falls back to the store.
    """
    url_info = UrlCacheClient.get_url_info_by_status(url, crawl_status, fields)
    if url_info == False:  # status gate failed in the cache
        return None
    if url_info is not None:
        return url_info
    cond = default_cond(url)
    cond["crawl_status"] = crawl_status
    return _cond_get_url_info(cond, fields)
def find_and_modify_url_info_by_status(url, crawl_status, update_map, inc_map, fields):
    """Atomic status-gated find-and-modify (cache first, store fallback).

    None from the cache means a miss -> run the conditional store update
    synchronously. False means the status gate failed -> return None.
    Otherwise return the cached document and update the store
    asynchronously.
    """
    cond = default_cond(url)
    cond["crawl_status"] = crawl_status
    cached = UrlCacheClient.find_and_modify_url_info_by_status(
        url, crawl_status, update_map, inc_map, fields)
    if cached is None:
        return _cond_update_url_info(cond, update_map, inc_map, fields)
    if cached == False:  # status gate failed in the cache
        return None
    _async_update_url_info(cond, update_map, inc_map)
    return cached
def get_url_info_meta(url, fields):
    """Fetch url meta info straight from the store (no cache layer),
    normalizing *fields* through make_fields first."""
    return _cond_get_url_info_meta(default_cond(url), make_fields(fields))
def update_raw_doc(url, update_map):
    """Apply *update_map* to the raw-document record matching *url*.

    NOTE(review): the original referenced a bare ``db`` handle, while every
    other store accessor in this file (remove_redirect_url, get_redirect_url,
    update_result) uses the module-level ``_db``. Aligned to ``_db`` for
    consistency — confirm no separate ``db`` global was intended.
    """
    _db.rawDocs.update(default_cond(url), update_map)
def remove_redirect_url(url):
    """Delete the redirect record stored for *url*."""
    cond = default_cond(url)
    _db.urlRedirects.remove(cond)
def get_redirect_url(url):
    """Return the redirect target recorded for *url*, or None if absent."""
    doc = _db.urlRedirects.find_one(default_cond(url), fields={"redirect_url" : 1})
    if doc is None:
        return None
    return doc["redirect_url"]
def update_result(url, update):
    """$set the fields in *update* on the result document matching *url*."""
    _db.results.update(default_cond(url), {'$set':update})