def diff_collections2(b1, b2, result_dir, step=10000):
    '''b2 is new collection, b1 is old collection'''
    DIFFFILE_PATH = '/home/kevinxin/diff_result/'
    DATA_FOLDER = os.path.join(DIFFFILE_PATH, result_dir)
    if not os.path.exists(DATA_FOLDER):
        os.mkdir(DATA_FOLDER)
    data_new = doc_feeder(b2.target_collection, step=step, inbatch=True, fields=[])
    data_old = doc_feeder(b1.target_collection, step=step, inbatch=True, fields=[])
    cnt = 0
    cnt_update = 0
    cnt_add = 0
    cnt_delete = 0
    for _batch in data_new:
        cnt += 1
        id_list_new = [_doc['_id'] for _doc in _batch]
        docs_common = b1.target_collection.find({'_id': {'$in': id_list_new}}, projection=[])
        ids_common = [_doc['_id'] for _doc in docs_common]
        id_in_new = list(set(id_list_new) - set(ids_common))
        _updates = []
        if len(ids_common) > 0:
            _updates = _diff_doc_inner_worker2(b1, b2, list(ids_common), fastdiff=True)
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {'add': id_in_new,
                   'update': _updates,
                   'delete': [],
                   'source': b2.target_collection.name,
                   'timestamp': get_timestamp()}
        if len(_updates) != 0 or len(id_in_new) != 0:
            dump(_result, file_name)
            print("(Updated: {}, Added: {})".format(len(_updates), len(id_in_new)), end='')
            cnt_update += len(_updates)
            cnt_add += len(id_in_new)
    print("Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}".format(cnt_update, cnt_add))
    print("=" * 100)
    for _batch in data_old:
        cnt += 1
        id_list_old = [_doc['_id'] for _doc in _batch]
        docs_common = b2.target_collection.find({'_id': {'$in': id_list_old}}, projection=[])
        ids_common = [_doc['_id'] for _doc in docs_common]
        id_in_old = list(set(id_list_old) - set(ids_common))
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {'delete': id_in_old,
                   'add': [],
                   'update': [],
                   'source': b2.target_collection.name,
                   'timestamp': get_timestamp()}
        if len(id_in_old) != 0:
            dump(_result, file_name)
            print("(Deleted: {})".format(len(id_in_old)), end='')
            cnt_delete += len(id_in_old)
    print("Finished calculating diff for the old collection. Total number of docs deleted: {}".format(cnt_delete))
    print("=" * 100)
    print("Summary: (Updated: {}, Added: {}, Deleted: {})".format(cnt_update, cnt_add, cnt_delete))
def test_0_registerdid_batch(self):
    # create 2000 accounts (batch_amount_i * batch_amount_j)
    # send 1 etp to each account
    # issue a did for each account
    now = common.get_timestamp()

    def get_account_name(i, j):
        return "Account_%s_%s" % (i, j)

    def get_did_symbol(i, j):
        return "tempdid_%s_%s.%s" % (i, j, now)

    lastwords = []
    addresses = []
    batch_amount_i, batch_amount_j = 200, 10  # total 2000

    def get_lastword(i, j):
        return lastwords[i * batch_amount_j + j]

    def get_address(i, j):
        return addresses[i * batch_amount_j + j]

    def get_did_count():
        ec, message = mvs_rpc.list_dids()
        self.assertEqual(ec, 0)
        return len(message["dids"])

    try:
        for i in xrange(batch_amount_i):
            receivers = {}
            for j in xrange(batch_amount_j):
                ec, (lastword, address) = mvs_rpc.easy_new_account(get_account_name(i, j), "123456")
                self.assertEqual(ec, 0)
                lastwords.append(lastword)
                addresses.append(address)
                receivers[address] = 10**8  # 1 etp for each account
            Alice.sendmore_etp(receivers)
            Alice.mining()

        previous = get_did_count()
        for i in xrange(batch_amount_i):
            for j in xrange(batch_amount_j):
                ec, message = mvs_rpc.register_did(get_account_name(i, j), "123456", get_address(i, j), get_did_symbol(i, j))
                self.assertEqual(ec, 0)

        current = 0
        mine_round = 0
        while current < previous + batch_amount_i * batch_amount_j:
            Alice.mining()
            current = get_did_count()
            mine_round += 1
        self.assertEqual(mine_round, 2)
    finally:
        for i in xrange(batch_amount_i):
            for j in xrange(batch_amount_j):
                ec, message = mvs_rpc.delete_account(get_account_name(i, j), "123456", get_lastword(i, j))
                self.assertEqual(ec, 0, message)
def make_plots(y, min_n_of_epochs):
    import matplotlib.pyplot as plt
    from os.path import join, exists
    from os import makedirs
    import sys
    sys.path.append('..')
    import configuration.model as config
    from utils.common import get_timestamp

    timestamp = get_timestamp()
    # create the directory in which all the plots will be stored
    storing_directory = join(config.path_for_storing, str(timestamp) + "_iter_middle_misclass")
    # if the directory doesn't exist (which should always be the case), create it
    if not exists(storing_directory):
        makedirs(storing_directory)

    # for each experiment that has been run
    for current_idx, experiment in enumerate(y):
        # if it has seen more than min_n_of_epochs epochs
        if len(experiment) > min_n_of_epochs:
            # save a plot of the misclass error w.r.t. the number of epochs seen
            plt.plot(range(1, len(experiment) + 1), experiment, 'r-')
            plt.xlabel("number of epochs seen")
            plt.ylabel("misclass error")
            plt.savefig(join(storing_directory, 'epochs_misclass_model' + str(current_idx) + ".png"),
                        bbox_inches='tight')
            plt.clf()
def diff_collections(b1, b2, use_parallel=True, step=10000):
    """
    b1, b2 are one of the supported backend classes in databuild.backend, e.g.,
        b1 = GeneDocMongoDBBackend(c1)
        b2 = GeneDocMongoDBBackend(c2)
    """
    id_s1 = set(b1.get_id_list())
    id_s2 = set(b2.get_id_list())
    print("Size of collection 1:\t", len(id_s1))
    print("Size of collection 2:\t", len(id_s2))
    id_in_1 = id_s1 - id_s2
    id_in_2 = id_s2 - id_s1
    id_common = id_s1 & id_s2
    print("# of docs found only in collection 1:\t", len(id_in_1))
    print("# of docs found only in collection 2:\t", len(id_in_2))
    print("# of docs found in both collections:\t", len(id_common))
    print("Comparing matching docs...")
    _updates = []
    if len(id_common) > 0:
        if not use_parallel:
            _updates = _diff_doc_inner_worker2(b1, b2, list(id_common))
        else:
            from utils.parallel import run_jobs_on_ipythoncluster
            _path = os.path.split(os.path.split(os.path.abspath(__file__))[0])[0]
            id_common = list(id_common)
            # b1_target_collection = b1.target_collection.name
            # b2_es_index = b2.target_esidxer.ES_INDEX_NAME
            _b1 = (b1.target_name, b1.name)
            _b2 = (b2.target_name, b2.name)
            task_li = [(_b1, _b2, id_common[i: i + step], _path) for i in range(0, len(id_common), step)]
            job_results = run_jobs_on_ipythoncluster(_diff_doc_inner_worker2, task_li)
            _updates = []
            if job_results:
                for res in job_results:
                    _updates.extend(res)
            else:
                print("Parallel jobs failed or were interrupted.")
                return None
        print("Done. [{} docs changed]".format(len(_updates)))
    _deletes = []
    if len(id_in_1) > 0:
        _deletes = sorted(id_in_1)
    _adds = []
    if len(id_in_2) > 0:
        _adds = sorted(id_in_2)
    changes = {'update': _updates,
               'delete': _deletes,
               'add': _adds,
               'source': b2.target_collection.name,
               'timestamp': get_timestamp()}
    return changes
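A minimal usage sketch for diff_collections above. The backend class and import path follow the docstring example (GeneDocMongoDBBackend from databuild.backend); the client setup and the database/collection names are illustrative assumptions, not taken from the original project.

from pymongo import MongoClient
from databuild.backend import GeneDocMongoDBBackend   # import path assumed from the docstring

client = MongoClient()
b1 = GeneDocMongoDBBackend(client['genedoc']['genedoc_build_old'])   # old collection (hypothetical name)
b2 = GeneDocMongoDBBackend(client['genedoc']['genedoc_build_new'])   # new collection (hypothetical name)

changes = diff_collections(b1, b2, use_parallel=False)
if changes is not None:
    # the returned dict carries lists of updated/added/deleted ids plus source and timestamp
    print("updated: {}, added: {}, deleted: {}".format(
        len(changes['update']), len(changes['add']), len(changes['delete'])))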
def _download(__metadata__):
    from utils.dataload import download as _download
    output_folder = os.path.join(os.path.split(DATA_FOLDER)[0], get_timestamp())
    for species in ['human', 'mouse', 'yeast']:
        url = __metadata__['__url_{}__'.format(species)]
        output_file = 'CPDB_pathways_genes_{}.tab'.format(species)
        _download(url, output_folder, output_file)
def objective_function(samp):
    current_time = get_timestamp()
    k = config.number_of_cross_validation_parts
    yaml_scheme_path = config.yaml_skelton_path
    data_scheme_yaml = config.data_yaml_scheme
    dataset_dict = config.data_dict
    seed = config.seed
    return CrossValidator.run(k=k, model_dictionary=samp,
                              data_yaml_scheme_path=data_scheme_yaml,
                              dataset_files=dataset_dict,
                              seed=seed)
def switch_collection(self):
    '''After a successful loading, rename temp_collection to the regular collection name,
       and rename the existing collection to a temp name for archiving purposes.
    '''
    if self.temp_collection and self.temp_collection.count() > 0:
        if self.src_db[self.src_name].count() > 0:
            # rename the existing collection for archiving
            new_name = '_'.join([self.src_name, 'archive', get_timestamp(), get_random_string()])
            self.src_db[self.src_name].rename(new_name, dropTarget=True)
        self.temp_collection.rename(self.src_name)
    else:
        return None
def test_2_didsend_more(self):
    receivers = {
        Bob.mainaddress(): 100000,
        'Invalid_address_' + common.get_timestamp(): 100001,
        Dale.mainaddress(): 100002,
        Eric.did_symbol: 100003,
    }
    # invalid address or did among the receivers
    ec, message = mvs_rpc.didsendmore(Alice.name, Alice.password, receivers, Alice.did_symbol)
    self.assertEqual(ec, 7006, message)
def switch_collection(self):
    '''After a successful loading, rename temp_collection to the regular collection name,
       and rename the existing collection to a temp name for archiving purposes.
    '''
    if self.temp_collection and self.temp_collection.count() > 0:
        if self.collection.count() > 0:
            # rename the existing collection for archiving
            new_name = '_'.join([self.__collection__, 'archive', get_timestamp(), get_random_string()])
            self.collection.rename(new_name, dropTarget=True)
        self.temp_collection.rename(self.__collection__)
    else:
        print("Error: load data first.")
def make_plots(y):
    import matplotlib.pyplot as plt
    from os.path import join
    import sys
    sys.path.append('..')
    import configuration.model as config
    from utils.common import get_timestamp

    timestamp = get_timestamp()
    plt.plot(range(1, len(y) + 1), y, 'r-')
    plt.xlabel("number of models seen by hyperopt")
    plt.ylabel("misclass error")
    plt.savefig(join(config.path_for_storing, timestamp + 'hyp_model_misclass.png'), bbox_inches='tight')
    plt.clf()
def backup_src_configs():
    import json
    import os
    from utils.common import send_s3_file, get_timestamp, DateTimeJSONEncoder

    db = get_src_db()
    for cfg in ['src_dump', 'src_master', 'src_build']:
        xli = list(db[cfg].find())
        bakfile = '/tmp/{}_{}.json'.format(cfg, get_timestamp())
        bak_f = file(bakfile, 'w')
        json.dump(xli, bak_f, cls=DateTimeJSONEncoder, indent=2)
        bak_f.close()
        bakfile_key = 'genedoc_src_config_bk/' + os.path.split(bakfile)[1]
        print('Saving to S3: "{}"... '.format(bakfile_key), end='')
        send_s3_file(bakfile, bakfile_key, overwrite=True)
        os.remove(bakfile)
        print('Done.')
def test_7_change_did(self):
    '''modify did between Zac's addresses'''
    temp_did = "ZAC.DIID@" + common.get_timestamp()
    Alice.send_etp(Zac.mainaddress(), 10**8)
    Alice.mining()

    ec, message = mvs_rpc.register_did(Zac.name, Zac.password, Zac.mainaddress(), temp_did)
    self.assertEqual(ec, 0, message)
    Alice.mining()

    # not enough balance: unspent = 0, payment = 10000
    ec, message = mvs_rpc.change_did(Zac.name, Zac.password, Zac.addresslist[1], temp_did)
    self.assertEqual(ec, 3302, message)

    Alice.send_etp(Zac.addresslist[1], 10**4)
    Alice.mining()

    ec, message = mvs_rpc.change_did(Zac.name, Zac.password, Zac.addresslist[1], temp_did)
    self.assertEqual(ec, 0, message)
    Alice.mining()

    ec, message = mvs_rpc.list_dids(Zac.name, Zac.password)
    self.assertEqual(ec, 0, message)
    self.assertEqual(message['dids'][0]['symbol'], temp_did, message)
    self.assertEqual(message['dids'][0]['address'], Zac.addresslist[1], message)

    # confirm the modification procedure via list_didaddresses
    ec, message = mvs_rpc.list_didaddresses(temp_did)
    self.assertEqual(ec, 0, message)
    self.assertEqual(message['addresses'][0]["address"], Zac.addresslist[1])
    self.assertEqual(message['addresses'][0]["status"], "current")
    self.assertEqual(message['addresses'][1]["address"], Zac.addresslist[0])
    self.assertEqual(message['addresses'][1]["status"], "history")
def get_sample_experiment():
    from hyperopt.pyll.stochastic import sample
    from pylearn2.config import yaml_parse
    from os.path import join
    import sys
    sys.path.append('..')
    from hyperopt_api.parser import build
    from yaml_parser import yaml_parser as yp
    from hyperopt_api.search_space import get_search_space
    import configuration.model as config
    from utils.common import get_timestamp

    # prepare all variables that don't need to be updated with each iteration
    spa = get_search_space()    # define search space over possible models
    path = config.data_path

    # obtain the yaml skelton
    with open(config.yaml_skelton_path) as f:
        default_string = f.read()

    samp = sample(spa)  # generate a sample (will give a description of a model)
    mod = build(samp)   # based on the generated description, build an object that will fit into the yaml parser

    # define weight decay parameters; they depend on the number of layers (there is one parameter for each layer)
    weight_decay_coeffs = yp.parse_weight_decay(mod)

    # generate a filename to store the best model
    pkl_filename = join(config.path_for_storing, get_timestamp() + "_best.pkl")

    # create a dictionary with hyperparameters
    hyper_params = {'model': yp.parse_to_yaml(mod),
                    'path': yp.parse_to_yaml(path),
                    'weight_decay_coeffs': weight_decay_coeffs,
                    'pkl_filename': pkl_filename}

    # fill the yaml skelton with the hyperparameters
    yaml_string = default_string % hyper_params
    network = yaml_parse.load(yaml_string)
    return network
__author__ = 'agnieszka'
from os.path import join
from pylearn2.config import yaml_parse
import sys
sys.path.append('..')
from utils.common import notify
import configuration.model as config
import models
import yaml_parser as yp
from utils.common import get_timestamp

current_time = get_timestamp()

# setting parameters of the convolutional layer
con = models.ConvElemwise()
con.layer_name = "h0"
con.output_channels = 1
con.kernel_shape = [2, 2]
con.nonlinearity = models.TanhConvNonlinearity()
con.irange = 0.1
con.pool_shape = [2, 3]

# setting parameters of the softmax layer
sof = models.Softmax()
sof.n_classes = 2
sof.layer_name = "softmax"
sof.irange = 0.1

# creating the list of layers
layers = [con, sof]
t = Terminal()
import configuration.model as config

# prepare all variables that don't need to be updated with each iteration
spa = get_search_space()    # define search space over possible models

# define data paths
path = config.data_path

# obtain the yaml skelton
with open(config.yaml_skelton_path) as f:
    default_string = f.read()

# for each sample that will be generated from the search space
for i in xrange(20):
    timestamp = get_timestamp()
    print t.bold_red('ITERATION:'), t.bold_red(str(i)), "started at: ", timestamp

    samp = sample(spa)  # generate a sample (will give a description of a model)
    print t.bold_cyan('SAMP'), samp

    mod = build(samp)   # based on the generated description, build an object that will fit into the yaml parser
    print t.bold_blue('MODEL'), mod

    # define weight decay parameters; they depend on the number of layers (there is one parameter for each layer)
    weight_decay_coeffs = yp.parse_weight_decay(mod)

    # generate a filename to store the best model
    pkl_filename = join(config.path_for_storing, timestamp + "best_" + str(i) + '_' + ".pkl")
def _get_target_name(self):
    return 'genedoc_{}_{}_{}'.format(self._build_config['name'],
                                     get_timestamp(),
                                     get_random_string()).lower()
def test_9_change_did_multisig(self):
    did_normal_symbal = "Zac@" + common.get_timestamp()
    Alice.send_etp(Zac.mainaddress(), 10**8)
    Alice.mining()

    ec, message = mvs_rpc.register_did(Zac.name, Zac.password, Zac.mainaddress(), did_normal_symbal)
    self.assertEqual(ec, 0, message)
    Alice.mining()

    group = [Alice, Cindy, Dale, Frank, Zac]
    did_symbol = '@'.join(r.name for r in group) + common.get_timestamp()
    for i, role in enumerate(group):
        addr = role.new_multisigaddress(
            "Alice & Cindy & Zac's Multisig-DID", group[:i] + group[i + 1:], 3)

    Alice.send_etp(addr, (10**9))
    Alice.mining()

    ec, message = mvs_rpc.register_did(group[0].name, group[0].password, addr, did_symbol)
    self.assertEqual(ec, 0, message)
    ec, message = mvs_rpc.sign_multisigtx(group[1].name, group[1].password, message)
    self.assertEqual(ec, 0, message)
    ec, message = mvs_rpc.sign_multisigtx(group[2].name, group[2].password, message, True)
    self.assertEqual(ec, 0, message)
    Alice.mining()

    # did not found
    ec, message = mvs_rpc.change_did(Zac.name, Zac.password, Zac.mainaddress(), common.get_timestamp())
    self.assertEqual(ec, 7006, message)

    # did not owned by the account
    ec, message = mvs_rpc.change_did(Bob.name, Bob.password, Bob.mainaddress(), did_symbol)
    self.assertEqual(ec, 7009, message)

    # did address invalid
    ec, message = mvs_rpc.change_did(Zac.name, Zac.password, "Test" * 20, did_symbol)
    self.assertEqual(ec, 4012, message)

    # address not owned by the account
    ec, message = mvs_rpc.change_did(Zac.name, Zac.password, Bob.mainaddress(), did_symbol)
    self.assertEqual(ec, 4003, message)

    # address is already bound to a did
    ec, message = mvs_rpc.change_did(Zac.name, Zac.password, Zac.mainaddress(), did_symbol)
    self.assertEqual(ec, 7002, message)

    # not enough balance: unspent = 0, payment = 10000
    ec, message = mvs_rpc.change_did(Zac.name, Zac.password, Zac.addresslist[1], did_symbol)
    self.assertEqual(ec, 3302, message)

    Alice.send_etp(Zac.addresslist[1], 10**5)
    Alice.mining()

    # signature count must be no less than 3
    ec, message = mvs_rpc.change_did(Zac.name, Zac.password, Zac.addresslist[1], did_symbol)
    self.assertEqual(ec, 0, message)

    # cannot transfer to another multi-signature address
    ec, message = mvs_rpc.sign_multisigtx(group[0].name, group[0].password, message, True)
    self.assertEqual(ec, 5304, message)

    group_new = [Bob, Dale, Zac]
    for i, role in enumerate(group_new):
        addr_new = role.new_multisigaddress(
            "Bob & Dale & Zac's Multisig-DID", group_new[:i] + group_new[i + 1:], 2)

    Alice.send_etp(addr_new, (10**6))
    Alice.mining()

    ec, message = mvs_rpc.change_did(Zac.name, Zac.password, addr_new, did_symbol)
    self.assertEqual(ec, 7010, message)

    ec, message = mvs_rpc.change_did(Zac.name, Zac.password, Zac.addresslist[1], did_symbol)
    self.assertEqual(ec, 0, message)
    ec, message = mvs_rpc.sign_multisigtx(group[0].name, group[0].password, message)
    self.assertEqual(ec, 0, message)
    ec, message = mvs_rpc.sign_multisigtx(group[1].name, group[1].password, message, True)
    self.assertEqual(ec, 0, message)

    self.assertNotEqual(
        Zac.get_didaddress(symbol=did_symbol),
        Zac.addresslist[1],
        "Failed when modifying did address from a multi-signature to a multi-signature address"
    )
def run(k, model_dictionary, data_yaml_scheme_path, dataset_files, seed=1337):
    print "CV_ENTER"
    assert k >= 3  # we need to have at least 3 sets: train, validation, test

    # obtain the yaml skelton
    with open(config.yaml_skelton_path) as f:
        default_string = f.read()

    with open(data_yaml_scheme_path) as f:
        data_yaml_scheme = f.read()

    list_of_scores = []
    for i in xrange(k):
        current_time = get_timestamp()

        # calculate which parts to choose
        validation_part = [i]
        test_part = [0]
        if i < k - 1:
            test_part = [i + 1]
        train_parts = [x for x in xrange(k) if x not in validation_part and x not in test_part]

        train_data_string = data_yaml_scheme % {
            "path": dataset_files["labeled_paths"],
            "y_val": dataset_files["labeled_values"],
            "cv": [k, train_parts],
            "seed": seed,
            "middle_path": dataset_files["middle_paths"],
            "middle_val": dataset_files["middle_values"],
        }

        # we don't want any unlabelled examples in the validation or test data
        validation_data_string = data_yaml_scheme % {
            "path": dataset_files["labeled_paths"],
            "y_val": dataset_files["labeled_values"],
            "cv": [k, validation_part],
            "seed": seed,
            "middle_path": [],
            "middle_val": [],
        }

        test_data_string = data_yaml_scheme % {
            "path": dataset_files["labeled_paths"],
            "y_val": dataset_files["labeled_values"],
            "cv": [k, test_part],
            "seed": seed,
            "middle_path": [],
            "middle_val": [],
        }

        mod = build(model_dictionary)  # based on the generated description, build an object that will fit into the yaml parser
        print t.bold_cyan("SAMP"), model_dictionary
        print t.bold_blue("MODEL"), mod

        # define weight decay parameters; they depend on the number of layers (there is one parameter for each layer)
        weight_decay_coeffs = yp.parse_weight_decay(mod)

        # generate a filename to store the best model
        pkl_filename = join(config.path_for_storing, current_time + "_best.pkl")

        # create a dictionary with hyperparameters
        hyper_params = {
            "model": yp.parse_to_yaml(mod),
            "train_data": train_data_string,
            "validation_data": validation_data_string,
            "weight_decay_coeffs": weight_decay_coeffs,
            "pkl_filename": pkl_filename,
        }
        yaml_string = default_string % hyper_params

        network = None
        # misclass_error = 1
        # f1_score_error = 1
        roc_score = 0
        try:
            # create the model based on the yaml
            network = yaml_parse.load(yaml_string)
            print t.bold_magenta("NETWORK"), type(network)
            # train the model
            network.main_loop()
        except BaseException:  # TODO: this exception is too broad
            # if an exception was thrown, save the yaml of the model that generated it
            with open(current_time + ".yaml", "w") as YAML_FILE:
                YAML_FILE.write(yaml_string)
            # write down the error description to a file
            with open(current_time + ".error", "w") as ERROR_FILE:
                ERROR_FILE.write(traceback.format_exc())
        finally:
            if network is not None:
                try:
                    from numpy import argmax
                    # run predictions to obtain a score for this model
                    test_data = yaml_parse.load(test_data_string)
                    best_model = serial.load("best_model_roc_youden.model")
                    roc_score_, threshold = roc_score_threshold_getter(network)
                    predictor = Predictor(best_model)
                    predictions = predictor.get_predictions(test_data.X)
                    fp = 0
                    fn = 0
                    tp = 0
                    tn = 0
                    for tr, pre in zip(test_data.y, predictions):
                        if pre[0][1] > threshold:
                            if tr[0] == 1:
                                tp += 1
                            else:
                                fp += 1
                        else:
                            if tr[0] == 0:
                                tn += 1
                            else:
                                fn += 1
                    roc_score = (float(tp) / (float(tp) + fn)) - (float(fp) / (float(fp) + tn))
                    list_of_scores.append(1 - roc_score)  # we want to maximise this score, but hyperopt minimises
                    print t.bold_red("_ROC: Best roc score for this model: " + str(roc_score))
                    print t.bold_red("_ROC: Obtained for threshold: " + str(threshold))
                    precision = float(tp) / (tp + fp)
                    recall = float(tp) / (tp + fn)
                    f1score = 0
                    if precision + recall != 0:
                        f1score = 2 * precision * recall / (precision + recall)
                    print "precision:", precision
                    print "recall:", recall
                    print "f1score", f1score
                except BaseException:  # TODO: this exception is too broad
                    with open(current_time + "_ROC_error", "w") as ERROR_FILE:
                        ERROR_FILE.write(traceback.format_exc())
                    # print t.bold_red("M_01: misclass_error for this model: " + str(misclass_error))
                    # return misclass_error
                    print t.bold_red("M_02: roc score error for this model: " + str(roc_score))
                    # list_of_scores.append(f1_score_error)

    m = mean(list_of_scores)
    print "CV_MEAN mean(1-ROC) on this architecture:", m
    return m
def diff_collections2(b1, b2, result_dir, use_parallel=True, step=10000):
    '''b2 is new collection, b1 is old collection'''
    if use_parallel:
        import multiprocessing
        from functools import partial
    DATA_FOLDER = result_dir
    data_new = doc_feeder(b2.target_collection, step=step, inbatch=True, fields={'_id': 1})
    data_old = doc_feeder(b1.target_collection, step=step, inbatch=True, fields={'_id': 1})
    cnt = 0
    cnt_update = 0
    cnt_add = 0
    cnt_delete = 0
    _timestamp = get_timestamp()
    if not os.path.exists(DATA_FOLDER):
        os.mkdir(DATA_FOLDER)
    for batch in data_new:
        cnt += 1
        id_list_new = [doc['_id'] for doc in batch]
        ids_common = [doc['_id'] for doc in
                      b1.target_collection.find({'_id': {'$in': id_list_new}}, {'_id': 1})]
        id_in_new = list(set(id_list_new) - set(ids_common))
        _updates = []
        if len(ids_common) > 0:
            if use_parallel:
                # guard against a zero chunk size when ids_common is smaller than the CPU count
                step = max(1, int(len(ids_common) / multiprocessing.cpu_count()))
                task_list = [ids_common[i:i + step] for i in range(0, len(ids_common), step)]
                pool = multiprocessing.Pool()
                partial_worker = partial(_diff_parallel_worker,
                                         b1.target_collection.name,
                                         b2.target_collection.name)
                results = pool.map(partial_worker, task_list)
                pool.close()
                pool.join()
                for result in results:
                    _updates += result
            else:
                _updates = _diff_doc_inner_worker2(b1, b2, list(ids_common))
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        _result = {'add': id_in_new,
                   'update': _updates,
                   'delete': [],
                   'source': b2.target_collection.name,
                   'timestamp': _timestamp}
        if len(_updates) != 0 or len(id_in_new) != 0:
            dump(_result, file_name)
            print("(Updated: {}, Added: {})".format(len(_updates), len(id_in_new)), end='')
            cnt_update += len(_updates)
            cnt_add += len(id_in_new)
    print("Finished calculating diff for the new collection. Total number of docs updated: {}, added: {}".format(cnt_update, cnt_add))
    print("=" * 100)
    for _batch in data_old:
        cnt += 1
        id_list_old = [_doc['_id'] for _doc in _batch]
        ids_common = [doc['_id'] for doc in
                      b2.target_collection.find({'_id': {'$in': id_list_old}}, {'_id': 1})]
        id_in_old = list(set(id_list_old) - set(ids_common))
        _result = {'delete': id_in_old,
                   'add': [],
                   'update': [],
                   'source': b2.target_collection.name,
                   'timestamp': _timestamp}
        file_name = DATA_FOLDER + '/' + str(cnt) + '.pyobj'
        if len(id_in_old) != 0:
            dump(_result, file_name)
            print("(Deleted: {})".format(len(id_in_old)), end='')
            cnt_delete += len(id_in_old)
    print("Finished calculating diff for the old collection. Total number of docs deleted: {}".format(cnt_delete))
    print("=" * 100)
    print("Summary: (Updated: {}, Added: {}, Deleted: {})".format(cnt_update, cnt_add, cnt_delete))
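The parallel branch in diff_collections2 above follows a standard chunk-and-pool pattern: split the shared ids into roughly cpu_count chunks, fan them out with multiprocessing.Pool and functools.partial, then merge the per-chunk results. A minimal self-contained sketch of that pattern in isolation; the worker and data below are stand-ins for illustration only, not the project's real _diff_parallel_worker.

import multiprocessing
from functools import partial

def _chunk_worker(min_len, ids):
    # stand-in for the real per-chunk diff worker; any picklable function works here
    return [i for i in ids if len(i) > min_len]

if __name__ == '__main__':
    ids = ['a', 'bb', 'ccc', 'dddd'] * 1000
    chunk_size = max(1, len(ids) // multiprocessing.cpu_count())
    tasks = [ids[i:i + chunk_size] for i in range(0, len(ids), chunk_size)]
    pool = multiprocessing.Pool()
    results = pool.map(partial(_chunk_worker, 2), tasks)
    pool.close()
    pool.join()
    merged = []
    for part in results:
        merged += part
    print(len(merged))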