Пример #1
0
def diff_collections(b1, b2, use_parallel=True, step=10000):
    """
    Compare the documents held by two backends and summarise the changes.

    b1 and b2 are expected to be instances of one of the supported backend
    classes in databuild.backend, e.g.,
        b1 = DocMongoDBBackend(c1)
        b2 = DocMongoDBBackend(c2)

    Ids are collected from both backends with ``get_id_list()``. Ids found
    in both are compared with ``_diff_doc_inner_worker``: in-process when
    ``use_parallel`` is false, otherwise in batches of ``step`` ids handed
    to ``run_jobs_on_ipythoncluster``. Progress is reported with print().

    Returns a dict with three keys:
        'update': whatever the doc comparison produced for the shared ids
                  (an empty list when there are no shared ids),
        'delete': sorted ids present only in b1,
        'add':    sorted ids present only in b2,
    or None when the parallel run returns an empty/falsy result.
    """
    ids_first = set(b1.get_id_list())
    ids_second = set(b2.get_id_list())
    print("Size of collection 1:\t", len(ids_first))
    print("Size of collection 2:\t", len(ids_second))

    only_first = ids_first - ids_second
    only_second = ids_second - ids_first
    shared = ids_first & ids_second
    print("# of docs found only in collection 1:\t", len(only_first))
    print("# of docs found only in collection 2:\t", len(only_second))
    print("# of docs found in both collections:\t", len(shared))

    print("Comparing matching docs...")
    changed = []
    if shared:
        shared_ids = list(shared)
        if use_parallel:
            # Imported lazily so the serial path does not need the
            # parallel helper module.
            from .parallel import run_jobs_on_ipythoncluster

            # Two directory levels above this file, with "/.." appended.
            base_dir = os.path.dirname(
                os.path.dirname(os.path.abspath(__file__)))
            worker_path = base_dir + "/.."

            # Each backend is described by a plain tuple rather than passed
            # as an object (presumably so the task can be shipped to the
            # cluster engines -- confirm against the parallel helper).
            first_spec = (get_mongodb_uri(b1),
                          b1.target_collection.database.name,
                          b1.target_name, b1.name)
            second_spec = (get_mongodb_uri(b2),
                           b2.target_collection.database.name,
                           b2.target_name, b2.name)

            # One task per slice of at most `step` ids.
            batches = []
            for start in range(0, len(shared_ids), step):
                batches.append((first_spec, second_spec,
                                shared_ids[start:start + step], worker_path))

            outcomes = run_jobs_on_ipythoncluster(_diff_doc_worker, batches)
            if not outcomes:
                print("Parallel jobs failed or were interrupted.")
                return None
            # Flatten the per-batch results into a single list.
            changed = [entry for outcome in outcomes for entry in outcome]
        else:
            changed = _diff_doc_inner_worker(b1, b2, shared_ids)

        print("Done. [{} docs changed]".format(len(changed)))

    return {
        'update': changed,
        'delete': sorted(only_first),
        'add': sorted(only_second),
    }
Пример #2
0
def _diff_doc_worker(args):
    """
    Diff one batch of ids between two backends described by plain tuples.

    ``args`` is a 4-tuple ``(first_spec, second_spec, ids, path)``. Each
    spec is unpacked positionally into ``get_backend``; the resulting
    backends and ``ids`` are passed to ``_diff_doc_inner_worker``, whose
    return value is returned unchanged. The fourth element is accepted but
    not used in this function.
    """
    first_spec, second_spec, batch_ids, _unused_path = args

    import importlib
    import biothings.utils.diff

    # Reload before importing the names below (presumably so a long-lived
    # worker process picks up the current module code -- confirm).
    importlib.reload(biothings.utils.diff)
    from biothings.utils.diff import _diff_doc_inner_worker, get_backend

    backend_first = get_backend(*first_spec)
    backend_second = get_backend(*second_spec)
    return _diff_doc_inner_worker(backend_first, backend_second, batch_ids)