def count_unique_people(tree_files):
    """Count the distinct people-dict keys found across all given trees.

    Every entry of ``tree_files`` is read with
    ``person.read_people(entry, clean=True)``; the keys of the dictionary
    it returns (its second return value) are pooled into a single set.

    The size of that set is printed as ``"<n> unique xrefs."`` and returned.
    """
    seen_xrefs = set()
    for tree_file in tree_files:
        _unused, people_by_xref = person.read_people(tree_file, clean=True)
        seen_xrefs.update(people_by_xref.keys())

    n_unique = len(seen_xrefs)
    print("{} unique xrefs.".format(n_unique))
    return n_unique
def _pickle_to_file(obj, path):
    """Pickle ``obj`` to ``path`` and always close the file afterwards.

    The parent directory of ``path`` is created first when it is missing.
    The file handle lives only inside this helper, so it never shows up in
    a caller's ``locals()`` (which is what the experiment functions pickle).
    """
    parent = os.path.dirname(path)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)
    with open(path, 'wb') as out_file:
        pickle.dump(obj, out_file)


def experiment_scalability(n_reps=1, n_trees=5, n_people=500,
                           varied_param='n_people', params=[50, 100, 500],
                           methods=('unary', 'LD', 'mKlau'), top_k_matches=5,
                           f=1.0, title='scalability', do_save=True,
                           rep_offset=0, subfolder=''):
    """Run ``merge_multiple`` while varying ``n_people`` or ``n_trees``.

    For every repetition and every value in ``params`` the varied parameter
    is set to that value, tree fragments are fetched with
    ``extract_ft.get_k_fragments``, read with ``person.read_people`` and
    indexed with ``create_index``.  ``merge_multiple`` is then called once
    per entry of ``methods``.  Its seven return values and the elapsed
    wall-clock time are stored in arrays of shape
    ``(len(methods), len(params), n_reps)``.  At the end the mean over the
    repetition axis of most of these arrays is printed.

    Args:
        n_reps: Number of repetitions.
        n_trees: Number of fragments requested (overwritten by each value
            of ``params`` when ``varied_param == 'n_trees'``).
        n_people: People per fragment requested (overwritten by each value
            of ``params`` when ``varied_param == 'n_people'``).
        varied_param: ``'n_people'`` or ``'n_trees'``.
        params: Values taken by the varied parameter.  The default list is
            only read, never modified.
        methods: Method names forwarded to ``merge_multiple`` as ``method``.
        top_k_matches: Forwarded to ``merge_multiple`` (third positional).
        f: Forwarded to ``merge_multiple`` as ``f``.
        title: Prefix of the result file names.
        do_save: Whether result files are written.
        rep_offset: Added to the repetition index in the fragment label.
        subfolder: Sub-directory of ``experiment_results`` for the output.

    Raises:
        ValueError: On the first loop iteration if ``varied_param`` is
            neither ``'n_people'`` nor ``'n_trees'``.

    NOTE: results are saved by pickling ``locals()``, so the local variable
    names below are the keys of the saved dictionary.  Do not rename them
    and do not bind unpicklable objects (file handles, modules, nested
    functions) to local names.

    NOTE(review): a second ``def experiment_scalability`` with the same
    parameters appears later in this file; if both are module-level, the
    later one is what the name resolves to.
    """
    nvv = len(params)
    result_shape = (len(methods), nvv, n_reps)
    res_precision = np.zeros(result_shape)
    res_recall = np.zeros(result_shape)
    res_fscore = np.zeros(result_shape)
    res_t = np.zeros(result_shape)
    res_iterations = np.zeros(result_shape)
    res_clusters = np.zeros(result_shape)
    res_lb = np.zeros(result_shape)
    res_ub = np.zeros(result_shape)

    t_beg = time.time()
    # Same text as str(now)[:19] with ' ' -> '_' and ':' removed.
    start_date_part = dt.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    dir_path = os.path.join("experiment_results", subfolder)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    fname0 = os.path.join(dir_path,
                          "{}_part_{}.pckl".format(title, start_date_part))

    for r in range(n_reps):
        print("\n--- Repetition {}. ---".format(r + 1))
        for i, param_val in enumerate(params):
            if varied_param == 'n_people':
                n_people = param_val
            elif varied_param == 'n_trees':
                n_trees = param_val
            else:
                raise ValueError(
                    'Invalid varied parameter: ' + varied_param)

            # Generate data.  The first attempt uses get_k_fragments'
            # own default for check_if_exists; if anything in the attempt
            # fails, a second attempt passes check_if_exists=False.
            for regenerate in (False, True):
                fragment_kwargs = (
                    {'check_if_exists': False} if regenerate else {})
                try:
                    tree_files = extract_ft.get_k_fragments(
                        n_trees, n_people,
                        label="first{}".format(r + rep_offset),
                        **fragment_kwargs)
                    people_index_tuples = []
                    for tf in tree_files:
                        people, people_dict = person.read_people(
                            tf, clean=True)
                        index = create_index(people)
                        people_index_tuples.append(
                            (people, index, people_dict))
                    uniq_people = count_unique_people(tree_files)
                except Exception:
                    # ``Exception`` (not a bare except) so Ctrl-C and
                    # SystemExit are not mistaken for bad data.  The
                    # exception is deliberately not bound to a name: in
                    # Python 2 it would stay in locals() and get pickled.
                    if regenerate:
                        raise
                else:
                    break

            print("\n rep={}, {}={}".format(r + 1, varied_param, param_val))
            for mi, m in enumerate(methods):
                print("\n rep={}, {}={}, method={}".format(
                    r + 1, varied_param, param_val, m))
                t0 = time.time()
                # The literal 10 is merge_multiple's second positional
                # argument; its meaning is defined by merge_multiple.
                precision, recall, fscore, n_clusters, lb, ub, iters = \
                    merge_multiple(people_index_tuples, 10, top_k_matches,
                                   method=m, uniq_people=uniq_people, f=f)
                res_precision[mi, i, r] = precision
                res_recall[mi, i, r] = recall
                res_fscore[mi, i, r] = fscore
                res_clusters[mi, i, r] = n_clusters
                res_t[mi, i, r] = time.time() - t0
                res_iterations[mi, i, r] = iters
                res_lb[mi, i, r] = lb
                res_ub[mi, i, r] = ub

        # Checkpoint after each repetition of a multi-repetition run.
        if do_save and n_reps > 1:
            _pickle_to_file(locals(), fname0)
            print("Wrote the results of repetition {} to: {}\n".format(
                r + 1, fname0))

    print("\nThe whole experiment took {:.2f} seconds.".format(
        time.time() - t_beg))
    date_part = dt.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    fname = os.path.join(dir_path, "{}_{}.pckl".format(title, date_part))
    if do_save:
        _pickle_to_file(locals(), fname)
        print("Wrote the results to: {}\n".format(fname))

    # Means over the repetition axis, one row per method.
    for label, values in (("F1 score:", res_fscore),
                          ("Precision:", res_precision),
                          ("Recall:", res_recall),
                          ("Time:", res_t),
                          ("Clusters:", res_clusters),
                          ("Lower bounds:", res_lb),
                          ("Upper bounds:", res_ub)):
        print("{} {}".format(label, np.mean(values, axis=2)))
def _pickle_to_file(obj, path):
    """Pickle ``obj`` to ``path`` and always close the file afterwards.

    The parent directory of ``path`` is created first when it is missing.
    The file handle lives only inside this helper, so it never shows up in
    a caller's ``locals()`` (which is what the experiment functions pickle).
    """
    parent = os.path.dirname(path)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)
    with open(path, 'wb') as out_file:
        pickle.dump(obj, out_file)


def experiment_multiple_trees(n_reps=1, n_trees=5, n_people=500,
                              methods=('unary', 'LD', 'mKlau'),
                              top_k_matches=5,
                              f_vals=(0.1, 0.5, 1, 1.5, 2),
                              title='genealogical', do_save=True,
                              dir_id=None, rep_offset=0):
    """Run ``merge_multiple`` on tree fragments for several ``f`` values.

    For every repetition, tree fragments are fetched with
    ``extract_ft.get_k_fragments``, read with ``person.read_people`` and
    indexed with ``create_index``.  ``merge_multiple`` is then called for
    each value in ``f_vals`` and each entry of ``methods``.  Its seven
    return values and the elapsed wall-clock time are stored in arrays of
    shape ``(len(methods), len(f_vals), n_reps)``.  At the end the mean
    over the repetition axis of most of these arrays is printed.

    Args:
        n_reps: Number of repetitions.
        n_trees: Number of fragments requested from ``get_k_fragments``.
        n_people: People per fragment requested from ``get_k_fragments``.
        methods: Method names forwarded to ``merge_multiple`` as ``method``.
            Methods whose name starts with ``'meLD'`` are only run for the
            first value of ``f_vals``.
        top_k_matches: Forwarded to ``merge_multiple`` (third positional).
        f_vals: Values forwarded to ``merge_multiple`` as ``f``.
        title: Prefix of the checkpoint file name; also passed to
            ``util.save_data``.
        do_save: Whether result files are written.
        dir_id: Formatted into the ``dir_name`` given to ``util.save_data``.
        rep_offset: Added to the repetition index in the fragment label.

    NOTE: results are saved by pickling ``locals()``, so the local variable
    names below are the keys of the saved dictionary.  Do not rename them
    and do not bind unpicklable objects (file handles, modules, nested
    functions) to local names.

    NOTE(review): a second ``def experiment_multiple_trees`` with a shorter
    parameter list (no ``title``, ``do_save``, ``dir_id``, ``rep_offset``)
    appears later in this file; if both are module-level, the later one is
    what the name resolves to.  Confirm which definition is intended.
    """
    nvv = len(f_vals)
    result_shape = (len(methods), nvv, n_reps)
    res_precision = np.zeros(result_shape)
    res_recall = np.zeros(result_shape)
    res_fscore = np.zeros(result_shape)
    res_t = np.zeros(result_shape)
    res_iterations = np.zeros(result_shape)
    res_clusters = np.zeros(result_shape)
    res_lb = np.zeros(result_shape)
    res_ub = np.zeros(result_shape)

    t_beg = time.time()
    # Same text as str(now)[:19] with ' ' -> '_' and ':' removed.
    start_date_part = dt.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    fname0 = os.path.join("experiment_results",
                          "{}_part_{}.pckl".format(title, start_date_part))

    for r in range(n_reps):
        print("\n--- Repetition {}. ---".format(r + 1))

        # Generate data
        tree_files = extract_ft.get_k_fragments(
            n_trees, n_people, label="first{}".format(r + rep_offset))
        people_index_tuples = []
        for tf in tree_files:
            people, people_dict = person.read_people(tf, clean=True)
            index = create_index(people)
            people_index_tuples.append((people, index, people_dict))
        uniq_people = count_unique_people(tree_files)

        for i, f in enumerate(f_vals):
            print("\n rep={}, f={}".format(r + 1, f))
            for mi, m in enumerate(methods):
                if m.startswith('meLD') and i > 0:
                    # No need to compute fixed entity method for
                    # different f values.
                    continue
                print("\n rep={}, f={}, method={}\n".format(r + 1, f, m))
                t0 = time.time()
                # The literal 10 is merge_multiple's second positional
                # argument; its meaning is defined by merge_multiple.
                precision, recall, fscore, n_clusters, lb, ub, iters = \
                    merge_multiple(people_index_tuples, 10, top_k_matches,
                                   method=m, uniq_people=uniq_people, f=f)
                res_precision[mi, i, r] = precision
                res_recall[mi, i, r] = recall
                res_fscore[mi, i, r] = fscore
                res_clusters[mi, i, r] = n_clusters
                res_t[mi, i, r] = time.time() - t0
                res_iterations[mi, i, r] = iters
                res_lb[mi, i, r] = lb
                res_ub[mi, i, r] = ub

        # Checkpoint after each repetition of a multi-repetition run.
        # The helper creates "experiment_results" if it does not exist.
        if do_save and n_reps > 1:
            _pickle_to_file(locals(), fname0)
            print("Wrote the results of repetition {} to: {}\n".format(
                r + 1, fname0))

    print("\nThe whole experiment took {:.2f} seconds.".format(
        time.time() - t_beg))
    if do_save:
        fname = util.save_data(locals(), title,
                               dir_name='genealogy{}'.format(str(dir_id)))
        print("Wrote the results to: {}".format(fname))

    # Means over the repetition axis, one row per method.
    for label, values in (("F1 score:", res_fscore),
                          ("Precision:", res_precision),
                          ("Recall:", res_recall),
                          ("Time:", res_t),
                          ("Clusters:", res_clusters),
                          ("Lower bounds:", res_lb),
                          ("Upper bounds:", res_ub)):
        print("{} {}".format(label, np.mean(values, axis=2)))
def _pickle_to_file(obj, path):
    """Pickle ``obj`` to ``path`` and always close the file afterwards.

    The parent directory of ``path`` is created first when it is missing.
    The file handle lives only inside this helper, so it never shows up in
    a caller's ``locals()`` (which is what the experiment functions pickle).
    """
    parent = os.path.dirname(path)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)
    with open(path, 'wb') as out_file:
        pickle.dump(obj, out_file)


def experiment_multiple_trees(n_reps=1, n_trees=5, n_people=500,
                              methods=('unary', 'LD', 'mKlau'),
                              top_k_matches=5,
                              f_vals=(0.1, 0.5, 1, 1.5, 2)):
    """Run ``merge_multiple`` on tree fragments for several ``f`` values.

    For every repetition, tree fragments are fetched with
    ``extract_ft.get_k_fragments``, read with ``person.read_people`` and
    indexed with ``create_index``.  ``merge_multiple`` is then called for
    each value in ``f_vals`` and each entry of ``methods``.  Its seven
    return values and the elapsed wall-clock time are stored in arrays of
    shape ``(len(methods), len(f_vals), n_reps)``.

    ``locals()`` is pickled to ``experiment_results/genealogical_<ts>.pckl``
    after every repetition (``<ts>`` = start time) and once more at the end
    (``<ts>`` = end time).  Finally the mean over the repetition axis of
    most result arrays is printed.

    Args:
        n_reps: Number of repetitions.
        n_trees: Number of fragments requested from ``get_k_fragments``.
        n_people: People per fragment requested from ``get_k_fragments``.
        methods: Method names forwarded to ``merge_multiple`` as ``method``.
        top_k_matches: Forwarded to ``merge_multiple`` (third positional).
        f_vals: Values forwarded to ``merge_multiple`` as ``f``.

    NOTE: because ``locals()`` is what gets pickled, the local variable
    names below are the keys of the saved dictionary.  Do not rename them
    and do not bind unpicklable objects (file handles, modules, nested
    functions) to local names.

    NOTE(review): an earlier ``def experiment_multiple_trees`` in this file
    accepts additional parameters (``title``, ``do_save``, ``dir_id``,
    ``rep_offset``); if both are module-level, this later definition
    replaces it.  Confirm which definition is intended.
    """
    nvv = len(f_vals)
    result_shape = (len(methods), nvv, n_reps)
    res_precision = np.zeros(result_shape)
    res_recall = np.zeros(result_shape)
    res_fscore = np.zeros(result_shape)
    res_t = np.zeros(result_shape)
    res_iterations = np.zeros(result_shape)
    res_clusters = np.zeros(result_shape)
    res_lb = np.zeros(result_shape)
    res_ub = np.zeros(result_shape)

    t_beg = time.time()
    # Same text as str(now)[:19] with ' ' -> '_' and ':' removed.
    start_date_part = dt.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    fname0 = os.path.join("experiment_results",
                          "genealogical_{}.pckl".format(start_date_part))

    for j in range(n_reps):
        print("\n--- Repetition {}. ---".format(j + 1))

        # Generate data
        tree_files = extract_ft.get_k_fragments(
            n_trees, n_people, label="first{}".format(j))
        people_index_tuples = []
        for tf in tree_files:
            people, people_dict = person.read_people(tf, clean=True)
            index = create_index(people)
            people_index_tuples.append((people, index, people_dict))
        uniq_people = count_unique_people(tree_files)

        for i, f in enumerate(f_vals):
            print("\n rep={}, f={}".format(j + 1, f))
            for mi, m in enumerate(methods):
                print("\n rep={}, f={}, method={}\n".format(j + 1, f, m))
                t0 = time.time()
                # The literal 10 is merge_multiple's second positional
                # argument; its meaning is defined by merge_multiple.
                precision, recall, fscore, n_clusters, lb, ub, iters = \
                    merge_multiple(people_index_tuples, 10, top_k_matches,
                                   method=m, uniq_people=uniq_people, f=f)
                res_precision[mi, i, j] = precision
                res_recall[mi, i, j] = recall
                res_fscore[mi, i, j] = fscore
                res_clusters[mi, i, j] = n_clusters
                res_t[mi, i, j] = time.time() - t0
                res_iterations[mi, i, j] = iters
                res_lb[mi, i, j] = lb
                res_ub[mi, i, j] = ub

        # Checkpoint after every repetition.  The helper creates
        # "experiment_results" if it does not exist.
        _pickle_to_file(locals(), fname0)
        print("Wrote the results of repetition {} to: {}\n".format(
            j + 1, fname0))

    print("\nThe whole experiment took {:.2f} seconds.".format(
        time.time() - t_beg))
    date_part = dt.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    fname = os.path.join("experiment_results",
                         "genealogical_{}.pckl".format(date_part))
    _pickle_to_file(locals(), fname)
    print("Wrote the results to: {}\n".format(fname))

    # Means over the repetition axis, one row per method.
    for label, values in (("F1 score:", res_fscore),
                          ("Precision:", res_precision),
                          ("Recall:", res_recall),
                          ("Time:", res_t),
                          ("Clusters:", res_clusters),
                          ("Lower bounds:", res_lb),
                          ("Upper bounds:", res_ub)):
        print("{} {}".format(label, np.mean(values, axis=2)))
def _pickle_to_file(obj, path):
    """Pickle ``obj`` to ``path`` and always close the file afterwards.

    The parent directory of ``path`` is created first when it is missing.
    The file handle lives only inside this helper, so it never shows up in
    a caller's ``locals()`` (which is what the experiment functions pickle).
    """
    parent = os.path.dirname(path)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)
    with open(path, 'wb') as out_file:
        pickle.dump(obj, out_file)


def experiment_scalability(n_reps=1, n_trees=5, n_people=500,
                           varied_param='n_people', params=[50, 100, 500],
                           methods=('unary', 'LD', 'mKlau'), top_k_matches=5,
                           f=1.0, title='scalability', do_save=True,
                           rep_offset=0, subfolder=''):
    """Run ``merge_multiple`` while varying ``n_people`` or ``n_trees``.

    For every repetition and every value in ``params`` the varied parameter
    is set to that value, tree fragments are fetched with
    ``extract_ft.get_k_fragments``, read with ``person.read_people`` and
    indexed with ``create_index``.  ``merge_multiple`` is then called once
    per entry of ``methods``.  Its seven return values and the elapsed
    wall-clock time are stored in arrays of shape
    ``(len(methods), len(params), n_reps)``.  At the end the mean over the
    repetition axis of most of these arrays is printed.

    Args:
        n_reps: Number of repetitions.
        n_trees: Number of fragments requested (overwritten by each value
            of ``params`` when ``varied_param == 'n_trees'``).
        n_people: People per fragment requested (overwritten by each value
            of ``params`` when ``varied_param == 'n_people'``).
        varied_param: ``'n_people'`` or ``'n_trees'``.
        params: Values taken by the varied parameter.  The default list is
            only read, never modified.
        methods: Method names forwarded to ``merge_multiple`` as ``method``.
        top_k_matches: Forwarded to ``merge_multiple`` (third positional).
        f: Forwarded to ``merge_multiple`` as ``f``.
        title: Prefix of the result file names.
        do_save: Whether result files are written.
        rep_offset: Added to the repetition index in the fragment label.
        subfolder: Sub-directory of ``experiment_results`` for the output.

    Raises:
        ValueError: On the first loop iteration if ``varied_param`` is
            neither ``'n_people'`` nor ``'n_trees'``.

    NOTE: results are saved by pickling ``locals()``, so the local variable
    names below are the keys of the saved dictionary.  Do not rename them
    and do not bind unpicklable objects (file handles, modules, nested
    functions) to local names.

    NOTE(review): an earlier ``def experiment_scalability`` with the same
    parameters exists in this file; if both are module-level, this later
    definition is the one the name resolves to.
    """
    nvv = len(params)
    result_shape = (len(methods), nvv, n_reps)
    res_precision = np.zeros(result_shape)
    res_recall = np.zeros(result_shape)
    res_fscore = np.zeros(result_shape)
    res_t = np.zeros(result_shape)
    res_iterations = np.zeros(result_shape)
    res_clusters = np.zeros(result_shape)
    res_lb = np.zeros(result_shape)
    res_ub = np.zeros(result_shape)

    t_beg = time.time()
    # Same text as str(now)[:19] with ' ' -> '_' and ':' removed.
    start_date_part = dt.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    dir_path = os.path.join("experiment_results", subfolder)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    fname0 = os.path.join(dir_path,
                          "{}_part_{}.pckl".format(title, start_date_part))

    for r in range(n_reps):
        print("\n--- Repetition {}. ---".format(r + 1))
        for i, param_val in enumerate(params):
            if varied_param == 'n_people':
                n_people = param_val
            elif varied_param == 'n_trees':
                n_trees = param_val
            else:
                raise ValueError(
                    'Invalid varied parameter: ' + varied_param)

            # Generate data.  The first attempt uses get_k_fragments'
            # own default for check_if_exists; if anything in the attempt
            # fails, a second attempt passes check_if_exists=False.
            for regenerate in (False, True):
                fragment_kwargs = (
                    {'check_if_exists': False} if regenerate else {})
                try:
                    tree_files = extract_ft.get_k_fragments(
                        n_trees, n_people,
                        label="first{}".format(r + rep_offset),
                        **fragment_kwargs)
                    people_index_tuples = []
                    for tf in tree_files:
                        people, people_dict = person.read_people(
                            tf, clean=True)
                        index = create_index(people)
                        people_index_tuples.append(
                            (people, index, people_dict))
                    uniq_people = count_unique_people(tree_files)
                except Exception:
                    # ``Exception`` (not a bare except) so Ctrl-C and
                    # SystemExit are not mistaken for bad data.  The
                    # exception is deliberately not bound to a name: in
                    # Python 2 it would stay in locals() and get pickled.
                    if regenerate:
                        raise
                else:
                    break

            print("\n rep={}, {}={}".format(r + 1, varied_param, param_val))
            for mi, m in enumerate(methods):
                print("\n rep={}, {}={}, method={}".format(
                    r + 1, varied_param, param_val, m))
                t0 = time.time()
                # The literal 10 is merge_multiple's second positional
                # argument; its meaning is defined by merge_multiple.
                precision, recall, fscore, n_clusters, lb, ub, iters = \
                    merge_multiple(people_index_tuples, 10, top_k_matches,
                                   method=m, uniq_people=uniq_people, f=f)
                res_precision[mi, i, r] = precision
                res_recall[mi, i, r] = recall
                res_fscore[mi, i, r] = fscore
                res_clusters[mi, i, r] = n_clusters
                res_t[mi, i, r] = time.time() - t0
                res_iterations[mi, i, r] = iters
                res_lb[mi, i, r] = lb
                res_ub[mi, i, r] = ub

        # Checkpoint after each repetition of a multi-repetition run.
        if do_save and n_reps > 1:
            _pickle_to_file(locals(), fname0)
            print("Wrote the results of repetition {} to: {}\n".format(
                r + 1, fname0))

    print("\nThe whole experiment took {:.2f} seconds.".format(
        time.time() - t_beg))
    date_part = dt.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    fname = os.path.join(dir_path, "{}_{}.pckl".format(title, date_part))
    if do_save:
        _pickle_to_file(locals(), fname)
        print("Wrote the results to: {}\n".format(fname))

    # Means over the repetition axis, one row per method.
    for label, values in (("F1 score:", res_fscore),
                          ("Precision:", res_precision),
                          ("Recall:", res_recall),
                          ("Time:", res_t),
                          ("Clusters:", res_clusters),
                          ("Lower bounds:", res_lb),
                          ("Upper bounds:", res_ub)):
        print("{} {}".format(label, np.mean(values, axis=2)))