示例#1
0
def _cond_update_url_info(cond, update_map, inc_map = None, fields = ["_id"]):
    '''
    Update the url info selected by cond and return what the lookup yields.

    update_map and inc_map are handed to _make_update, which returns a pair
    of update documents. The first one is applied to db.urlRepository through
    find_and_modify; the second one is forwarded to
    crawlerMetadb.update_url_info_meta. Either of them may be None, in which
    case the corresponding step is skipped.

    Notes: fields are just fields from urlRepository, while
    update_map/inc_map can include metaUrlRepository fields. Updates for
    metaUrlRepository fields just support async mode.

    Returns the find_and_modify result when there is a urlRepository update,
    otherwise the result of _cond_get_url_info(cond, fields).

    NOTE(review): the default for fields is a list literal shared across
    calls -- confirm make_fields never mutates its argument.
    '''

    projection = make_fields(fields)
    repo_update, meta_update = _make_update(update_map, inc_map)

    # The meta-side update is issued first, regardless of the repository one.
    if meta_update is not None:
        crawlerMetadb.update_url_info_meta(cond, meta_update)

    # Nothing to write to urlRepository: fall back to a plain read.
    if repo_update is None:
        return _cond_get_url_info(cond, projection)

    return db.urlRepository.find_and_modify(cond, repo_update, fields=projection)
示例#2
0
def get_url_info_meta(url, fields):
    '''
    Look up the meta url info for url.

    The query condition comes from default_cond(url) and the requested fields
    are normalised with make_fields before both are passed on to
    _cond_get_url_info_meta, whose result is returned unchanged.
    '''
    return _cond_get_url_info_meta(default_cond(url), make_fields(fields))
示例#3
0
def get_url_infos(cond, fields):
    '''
    Query db.urlRepository with cond and return the result of find().

    fields is normalised with make_fields and passed as the fields= argument
    of the query.
    '''
    projection = make_fields(fields)
    url_repository = db.urlRepository
    return url_repository.find(cond, fields=projection)
示例#4
0
def get_raw_docs_by_statuses(statuses, fields):
    '''
    Query db.rawDocs for entries whose process_status is one of statuses.

    fields is normalised with make_fields and passed as the fields= argument
    of the query. Returns the result of find().
    '''
    projection = make_fields(fields)
    status_query = {"process_status": {"$in": statuses}}
    return db.rawDocs.find(status_query, fields=projection)
示例#5
0
def get_url_infos_by_statuses(statuses, fields):
    '''
    Query db.urlRepository for entries whose crawl_status is one of statuses.

    fields is normalised with make_fields and passed as the fields= argument
    of the query. Returns the result of find().
    '''
    projection = make_fields(fields)
    status_query = {"crawl_status": {"$in": statuses}}
    return db.urlRepository.find(status_query, fields=projection)
示例#6
0
def _cond_get_url_info(cond, fields):
    '''
    Query db.urlRepository with cond and return the result of find_one().

    fields is normalised with make_fields and passed as the fields= argument
    of the query.
    '''
    projection = make_fields(fields)
    url_repository = db.urlRepository
    return url_repository.find_one(cond, fields = projection)