def diff_collections(b1, b2, use_parallel=True, step=10000):
    """Compare two document collections and return their differences.

    b1, b2 are one of supported backend class in databuild.backend, e.g.,
        b1 = DocMongoDBBackend(c1)
        b2 = DocMongoDBBackend(c2)

    Parameters:
        use_parallel: when True, diff the common ids on an IPython cluster;
            otherwise run _diff_doc_inner_worker in-process.
        step: number of ids handed to each parallel job.

    Returns a dict:
        {'update': <diff records for ids present in both collections>,
         'delete': <sorted ids found only in collection 1>,
         'add':    <sorted ids found only in collection 2>}
    Returns None if the parallel jobs fail or are interrupted.
    """
    id_s1 = set(b1.get_id_list())
    id_s2 = set(b2.get_id_list())
    print("Size of collection 1:\t", len(id_s1))
    print("Size of collection 2:\t", len(id_s2))
    id_in_1 = id_s1 - id_s2
    id_in_2 = id_s2 - id_s1
    id_common = id_s1 & id_s2
    print("# of docs found only in collection 1:\t", len(id_in_1))
    print("# of docs found only in collection 2:\t", len(id_in_2))
    print("# of docs found in both collections:\t", len(id_common))
    print("Comparing matching docs...")
    _updates = []
    if id_common:
        if not use_parallel:
            _updates = _diff_doc_inner_worker(b1, b2, list(id_common))
        else:
            from .parallel import run_jobs_on_ipythoncluster
            # Package root path, shipped to remote workers so they can
            # import this package on the cluster engines.
            _path = os.path.split(os.path.split(
                os.path.abspath(__file__))[0])[0] + "/.."
            id_common = list(id_common)
            # Pack each backend into a picklable tuple so it can be
            # re-created on the remote side (see get_backend).
            _b1 = (get_mongodb_uri(b1), b1.target_collection.database.name,
                   b1.target_name, b1.name)
            _b2 = (get_mongodb_uri(b2), b2.target_collection.database.name,
                   b2.target_name, b2.name)
            # One task per chunk of `step` common ids.
            task_li = [(_b1, _b2, id_common[i:i + step], _path)
                       for i in range(0, len(id_common), step)]
            job_results = run_jobs_on_ipythoncluster(_diff_doc_worker, task_li)
            # NOTE: original code redundantly reset _updates here; it is
            # already [] from the initialization above.
            if job_results:
                for res in job_results:
                    _updates.extend(res)
            else:
                print("Parallel jobs failed or were interrupted.")
                return None
    print("Done. [{} docs changed]".format(len(_updates)))
    # sorted() on an empty set already yields [], so no emptiness guard
    # is needed.
    _deletes = sorted(id_in_1)
    _adds = sorted(id_in_2)
    changes = {'update': _updates, 'delete': _deletes, 'add': _adds}
    return changes
def _diff_doc_worker(args):
    """Cluster-side worker: rebuild both backends and diff a batch of ids.

    *args* is a 4-tuple ``(packed_b1, packed_b2, ids, path)`` where each
    ``packed_b*`` is the picklable backend tuple produced by
    ``diff_collections`` and *ids* is the list of document ids to compare.
    Returns the list of update records from ``_diff_doc_inner_worker``.
    """
    packed_b1, packed_b2, ids, _path = args
    # Reload the diff module on the remote engine so any code changes are
    # picked up without restarting the cluster.
    import biothings.utils.diff
    import importlib
    importlib.reload(biothings.utils.diff)
    from biothings.utils.diff import _diff_doc_inner_worker, get_backend
    backend1 = get_backend(*packed_b1)
    backend2 = get_backend(*packed_b2)
    return _diff_doc_inner_worker(backend1, backend2, ids)