예제 #1
0
def count_unique_people(tree_files):
    """Count the distinct person keys found across several tree files.

    Each entry of ``tree_files`` is passed to ``person.read_people`` with
    ``clean=True``; the keys of the second element of its return value are
    collected into one set.  The size of that set is printed and returned.

    Args:
        tree_files: Iterable of tree-file identifiers accepted by
            ``person.read_people``.

    Returns:
        The number of distinct keys over all files (0 for an empty iterable).
    """
    all_xrefs = set()
    for tf in tree_files:
        _, people_dict = person.read_people(tf, clean=True)
        # update() consumes the keys directly; no temporary set is needed.
        all_xrefs.update(people_dict.keys())
    n_unique = len(all_xrefs)
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("{} unique xrefs.".format(n_unique))
    return n_unique
예제 #2
0
def count_unique_people(tree_files):
    """Return how many distinct person keys occur in the given tree files.

    For every file, ``person.read_people(tf, clean=True)`` is called and the
    keys of the returned people dictionary are merged into a single set.
    The resulting count is also printed.

    Args:
        tree_files: Iterable of tree-file identifiers understood by
            ``person.read_people``.

    Returns:
        Number of distinct keys seen over all files; 0 when ``tree_files``
        is empty.
    """
    all_xrefs = set()
    for tf in tree_files:
        _, people_dict = person.read_people(tf, clean=True)
        all_xrefs.update(people_dict.keys())
    total = len(all_xrefs)
    # Parenthesised single-argument form works on both Python 2 and 3.
    print("{} unique xrefs.".format(total))
    return total
예제 #3
0
def _dump_pickle(obj, path):
    """Pickle ``obj`` into the file at ``path`` and close the file."""
    with open(path, 'wb') as out:
        pickle.dump(obj, out)


def experiment_scalability(n_reps=1, n_trees=5, n_people=500,
                           varied_param='n_people', params=[50,100,500],
                           methods=('unary', 'LD', 'mKlau'), top_k_matches=5,
                           f=1.0, title='scalability', do_save=True,
                           rep_offset=0, subfolder=''):
    """Run every merging method while varying tree size or tree count.

    For each repetition and each value in ``params``, tree fragments are
    obtained from ``extract_ft.get_k_fragments``, read with
    ``person.read_people``, indexed with ``create_index`` and merged with
    ``merge_multiple`` once per method.  The metrics returned by
    ``merge_multiple`` and the wall-clock time of each call are stored in
    arrays of shape ``(len(methods), len(params), n_reps)``.  The mean over
    repetitions of most of these arrays is printed at the end.

    Args:
        n_reps: Number of repetitions.
        n_trees: Number of trees requested from ``get_k_fragments``;
            overwritten by each value of ``params`` when
            ``varied_param == 'n_trees'``.
        n_people: People count requested from ``get_k_fragments``;
            overwritten by each value of ``params`` when
            ``varied_param == 'n_people'``.
        varied_param: ``'n_people'`` or ``'n_trees'``.
        params: Values taken by the varied parameter.  The list default is
            only read here, never mutated.
        methods: Method names forwarded to ``merge_multiple`` as ``method``.
        top_k_matches: Forwarded to ``merge_multiple`` as its third
            positional argument.
        f: Forwarded to ``merge_multiple`` as ``f``.
        title: Prefix of the pickle file names.
        do_save: Whether results are pickled.
        rep_offset: Added to the repetition index in the fragment label.
        subfolder: Sub-directory of ``experiment_results`` used for output.

    Returns:
        None.  Results are printed and, if ``do_save`` is true, pickled.

    Raises:
        Exception: If ``varied_param`` is neither ``'n_people'`` nor
            ``'n_trees'`` (raised on the first inner iteration).
    """
    # NOTE: ``locals()`` is pickled wholesale below, so the local variable
    # names are part of the output format.  Do not bind unpicklable objects
    # (file handles, modules, nested functions) to local names before the
    # final dump; that is why file handling lives in ``_dump_pickle``.
    nvv = len(params)
    res_precision = np.zeros((len(methods), nvv, n_reps))
    res_recall = np.zeros((len(methods), nvv, n_reps))
    res_fscore = np.zeros((len(methods), nvv, n_reps))
    res_t = np.zeros((len(methods), nvv, n_reps))
    res_iterations = np.zeros((len(methods), nvv, n_reps))
    res_clusters = np.zeros((len(methods), nvv, n_reps))
    res_lb = np.zeros((len(methods), nvv, n_reps))
    res_ub = np.zeros((len(methods), nvv, n_reps))
    t_beg = time.time()

    # Timestamp of the form YYYY-MM-DD_HHMMSS, used in the partial file name.
    start_date_part = dt.datetime.now().strftime('%Y-%m-%d_%H%M%S')
    dir_path = os.path.join("experiment_results", subfolder)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    fname0 = os.path.join(dir_path, "{}_part_{}.pckl".format(
        title, start_date_part))

    for r in range(n_reps):
        print("\n--- Repetition {}. ---".format(r+1))
        for i, param_val in enumerate(params):
            if varied_param == 'n_people':
                n_people = param_val
            elif varied_param == 'n_trees':
                n_trees = param_val
            else:
                raise Exception('Invalid varied parameter: ' + varied_param)
            # Generate data.  The loading code is kept inline (and repeated
            # in the fallback) so that the same local names end up in the
            # pickled ``locals()`` as before.
            try:
                tree_files = extract_ft.get_k_fragments(
                        n_trees, n_people, label="first{}".format(r+rep_offset))
                people_index_tuples = []
                for tf in tree_files:
                    people, people_dict = person.read_people(tf, clean=True)
                    index = create_index(people)
                    people_index_tuples.append((people, index, people_dict))
                uniq_people = count_unique_people(tree_files)
            except Exception:
                # If it fails, try generating new trees.  ``Exception``
                # rather than a bare ``except`` so that Ctrl-C and
                # SystemExit still abort the experiment.
                tree_files = extract_ft.get_k_fragments(
                        n_trees, n_people, label="first{}".format(r+rep_offset),
                        check_if_exists=False)
                people_index_tuples = []
                for tf in tree_files:
                    people, people_dict = person.read_people(tf, clean=True)
                    index = create_index(people)
                    people_index_tuples.append((people, index, people_dict))
                uniq_people = count_unique_people(tree_files)

            print("\n  rep={}, {}={}".format(r+1, varied_param, param_val))
            for mi, m in enumerate(methods):
                print("\n    rep={}, {}={}, method={}".format(
                    r+1, varied_param, param_val, m))
                t0 = time.time()
                # NOTE(review): the meaning of the literal 10 is defined by
                # merge_multiple, which is not visible here.
                precision, recall, fscore, n_clusters, lb, ub, iters = \
                    merge_multiple(people_index_tuples, 10, top_k_matches,
                                   method=m, uniq_people=uniq_people, f=f)
                res_precision[mi, i, r] = precision
                res_recall[mi, i, r] = recall
                res_fscore[mi, i, r] = fscore
                res_clusters[mi, i, r] = n_clusters
                res_t[mi, i, r] = time.time() - t0
                res_iterations[mi, i, r] = iters
                res_lb[mi, i, r] = lb
                res_ub[mi, i, r] = ub
        if do_save and n_reps > 1:
            # Checkpoint after every repetition; the same file is overwritten.
            _dump_pickle(locals(), fname0)
            print("Wrote the results of repetition {} to: {}\n".format(
                r+1, fname0))

    # NOTE(review): ``{:2f}`` means minimum width 2 with default precision;
    # ``{:.2f}`` may have been intended.  Left unchanged.
    print("\nThe whole experiment took {:2f} seconds.".format(
        time.time()-t_beg))
    date_part = dt.datetime.now().strftime('%Y-%m-%d_%H%M%S')
    fname = os.path.join(dir_path, "{}_{}.pckl".format(
        title, date_part))
    if do_save:
        _dump_pickle(locals(), fname)
        print("Wrote the results to: {}\n".format(fname))

    # Mean over the repetition axis, one line per metric.
    for label, values in (("F1 score:", res_fscore),
                          ("Precision:", res_precision),
                          ("Recall:", res_recall),
                          ("Time:", res_t),
                          ("Clusters:", res_clusters),
                          ("Lower bounds:", res_lb),
                          ("Upper bounds:", res_ub)):
        print("{} {}".format(label, np.mean(values, axis=2)))
예제 #4
0
def _dump_pickle(obj, path):
    """Pickle ``obj`` into the file at ``path`` and close the file."""
    with open(path, 'wb') as out:
        pickle.dump(obj, out)


def experiment_multiple_trees(n_reps=1, n_trees=5, n_people=500,
                              methods=('unary', 'LD', 'mKlau'),
                              top_k_matches=5, f_vals=(0.1, 0.5, 1, 1.5, 2),
                              title='genealogical', do_save=True, dir_id=None,
                              rep_offset=0):
    """Merge several family-tree fragments for every method and ``f`` value.

    In each repetition one set of fragments is obtained from
    ``extract_ft.get_k_fragments`` and read with ``person.read_people``;
    ``merge_multiple`` is then called for every combination of ``f_vals``
    and ``methods``.  Metrics and wall-clock times are stored in arrays of
    shape ``(len(methods), len(f_vals), n_reps)``; the mean over repetitions
    of most of them is printed at the end.

    Args:
        n_reps: Number of repetitions.
        n_trees: Number of trees requested from ``get_k_fragments``.
        n_people: People count requested from ``get_k_fragments``.
        methods: Method names forwarded to ``merge_multiple`` as ``method``.
            Names starting with ``'meLD'`` are only run for the first value
            of ``f_vals``; their remaining cells stay at the initial zero.
        top_k_matches: Forwarded to ``merge_multiple`` as its third
            positional argument.
        f_vals: Values forwarded to ``merge_multiple`` as ``f``.
        title: Prefix of the partial pickle file name; also passed to
            ``util.save_data``.
        do_save: Whether results are saved.
        dir_id: Formatted into the ``dir_name`` passed to
            ``util.save_data``.
        rep_offset: Added to the repetition index in the fragment label.

    Returns:
        None.  Results are printed and, if ``do_save`` is true, saved.
    """
    # NOTE: ``locals()`` is handed to pickle / util.save_data below, so the
    # local variable names are part of the output.  Do not bind unpicklable
    # objects (file handles, modules, nested functions) to local names.
    nvv = len(f_vals)
    res_precision = np.zeros((len(methods), nvv, n_reps))
    res_recall = np.zeros((len(methods), nvv, n_reps))
    res_fscore = np.zeros((len(methods), nvv, n_reps))
    res_t = np.zeros((len(methods), nvv, n_reps))
    res_iterations = np.zeros((len(methods), nvv, n_reps))
    res_clusters = np.zeros((len(methods), nvv, n_reps))
    res_lb = np.zeros((len(methods), nvv, n_reps))
    res_ub = np.zeros((len(methods), nvv, n_reps))
    t_beg = time.time()

    # Timestamp of the form YYYY-MM-DD_HHMMSS, used in the partial file name.
    start_date_part = dt.datetime.now().strftime('%Y-%m-%d_%H%M%S')
    fname0 = os.path.join("experiment_results", "{}_part_{}.pckl".format(
        title, start_date_part))
    # Partial results are only written when do_save and n_reps > 1; make
    # sure their directory exists so the first checkpoint cannot fail after
    # a full repetition of work.
    if do_save and n_reps > 1 and not os.path.exists(os.path.dirname(fname0)):
        os.makedirs(os.path.dirname(fname0))

    for r in range(n_reps):
        print("\n--- Repetition {}. ---".format(r+1))
        # Generate data
        tree_files = extract_ft.get_k_fragments(
                n_trees, n_people, label="first{}".format(r+rep_offset))
        people_index_tuples = []
        for tf in tree_files:
            people, people_dict = person.read_people(tf, clean=True)
            index = create_index(people)
            people_index_tuples.append((people, index, people_dict))
        uniq_people = count_unique_people(tree_files)

        for i, f in enumerate(f_vals):
            print("\n  rep={}, f={}".format(r+1, f))
            for mi, m in enumerate(methods):
                if m.startswith('meLD') and i > 0:
                    # No need to compute fixed entity method for different
                    # f values.  The skipped cells keep their zero value.
                    continue
                print("\n    rep={}, f={}, method={}\n".format(r+1, f, m))
                t0 = time.time()
                # NOTE(review): the meaning of the literal 10 is defined by
                # merge_multiple, which is not visible here.
                precision, recall, fscore, n_clusters, lb, ub, iters = \
                    merge_multiple(people_index_tuples, 10, top_k_matches,
                                   method=m, uniq_people=uniq_people, f=f)
                res_precision[mi, i, r] = precision
                res_recall[mi, i, r] = recall
                res_fscore[mi, i, r] = fscore
                res_clusters[mi, i, r] = n_clusters
                res_t[mi, i, r] = time.time() - t0
                res_iterations[mi, i, r] = iters
                res_lb[mi, i, r] = lb
                res_ub[mi, i, r] = ub
        if do_save and n_reps > 1:
            # Checkpoint after every repetition; the same file is overwritten.
            _dump_pickle(locals(), fname0)
            print("Wrote the results of repetition {} to: {}\n".format(
                r+1, fname0))

    # NOTE(review): ``{:2f}`` means minimum width 2 with default precision;
    # ``{:.2f}`` may have been intended.  Left unchanged.
    print("\nThe whole experiment took {:2f} seconds.".format(
        time.time()-t_beg))

    if do_save:
        # NOTE(review): with the default dir_id=None the directory name
        # becomes "genealogyNone" -- confirm this is intended.
        fname = util.save_data(locals(), title, dir_name='genealogy{}'.format(
            str(dir_id)))
        print("Wrote the results to: {}".format(fname))

    # Mean over the repetition axis, one line per metric.
    for label, values in (("F1 score:", res_fscore),
                          ("Precision:", res_precision),
                          ("Recall:", res_recall),
                          ("Time:", res_t),
                          ("Clusters:", res_clusters),
                          ("Lower bounds:", res_lb),
                          ("Upper bounds:", res_ub)):
        print("{} {}".format(label, np.mean(values, axis=2)))
예제 #5
0
def _dump_pickle(obj, path):
    """Pickle ``obj`` into the file at ``path`` and close the file."""
    with open(path, 'wb') as out:
        pickle.dump(obj, out)


def experiment_multiple_trees(n_reps=1,
                              n_trees=5,
                              n_people=500,
                              methods=('unary', 'LD', 'mKlau'),
                              top_k_matches=5,
                              f_vals=(0.1, 0.5, 1, 1.5, 2)):
    """Merge several family-tree fragments for every method and ``f`` value.

    In each repetition one set of fragments is obtained from
    ``extract_ft.get_k_fragments`` and read with ``person.read_people``;
    ``merge_multiple`` is then called for every combination of ``f_vals``
    and ``methods``.  Metrics and wall-clock times are stored in arrays of
    shape ``(len(methods), len(f_vals), n_reps)``.  All local variables are
    pickled after every repetition and once more at the end, and the mean
    over repetitions of most result arrays is printed.

    Args:
        n_reps: Number of repetitions.
        n_trees: Number of trees requested from ``get_k_fragments``.
        n_people: People count requested from ``get_k_fragments``.
        methods: Method names forwarded to ``merge_multiple`` as ``method``.
        top_k_matches: Forwarded to ``merge_multiple`` as its third
            positional argument.
        f_vals: Values forwarded to ``merge_multiple`` as ``f``.

    Returns:
        None.  Results are printed and pickled under ``experiment_results``.
    """
    # NOTE: ``locals()`` is pickled wholesale below, so the local variable
    # names are part of the output format.  Do not bind unpicklable objects
    # (file handles, modules, nested functions) to local names before the
    # final dump; that is why file handling lives in ``_dump_pickle``.
    nvv = len(f_vals)
    res_precision = np.zeros((len(methods), nvv, n_reps))
    res_recall = np.zeros((len(methods), nvv, n_reps))
    res_fscore = np.zeros((len(methods), nvv, n_reps))
    res_t = np.zeros((len(methods), nvv, n_reps))
    res_iterations = np.zeros((len(methods), nvv, n_reps))
    res_clusters = np.zeros((len(methods), nvv, n_reps))
    res_lb = np.zeros((len(methods), nvv, n_reps))
    res_ub = np.zeros((len(methods), nvv, n_reps))
    t_beg = time.time()

    # This function always saves, so create the output directory up front;
    # otherwise the first dump could fail after a full repetition of work.
    if not os.path.exists("experiment_results"):
        os.makedirs("experiment_results")

    # Timestamp of the form YYYY-MM-DD_HHMMSS, used in the partial file name.
    start_date_part = dt.datetime.now().strftime('%Y-%m-%d_%H%M%S')
    fname0 = os.path.join("experiment_results",
                          "genealogical_{}.pckl".format(start_date_part))

    for j in range(n_reps):
        print("\n--- Repetition {}. ---".format(j + 1))
        # Generate data
        tree_files = extract_ft.get_k_fragments(n_trees,
                                                n_people,
                                                label="first{}".format(j))
        people_index_tuples = []
        for tf in tree_files:
            people, people_dict = person.read_people(tf, clean=True)
            index = create_index(people)
            people_index_tuples.append((people, index, people_dict))
        uniq_people = count_unique_people(tree_files)

        for i, f in enumerate(f_vals):
            print("\n  rep={}, f={}".format(j + 1, f))
            for mi, m in enumerate(methods):
                print("\n    rep={}, f={}, method={}\n".format(j + 1, f, m))
                t0 = time.time()
                # NOTE(review): the meaning of the literal 10 is defined by
                # merge_multiple, which is not visible here.
                precision, recall, fscore, n_clusters, lb, ub, iters = \
                    merge_multiple(people_index_tuples, 10, top_k_matches,
                                   method=m, uniq_people=uniq_people, f=f)
                res_precision[mi, i, j] = precision
                res_recall[mi, i, j] = recall
                res_fscore[mi, i, j] = fscore
                res_clusters[mi, i, j] = n_clusters
                res_t[mi, i, j] = time.time() - t0
                res_iterations[mi, i, j] = iters
                res_lb[mi, i, j] = lb
                res_ub[mi, i, j] = ub
        # Checkpoint after every repetition; the same file is overwritten.
        _dump_pickle(locals(), fname0)
        print("Wrote the results of repetition {} to: {}\n".format(
            j + 1, fname0))

    # NOTE(review): ``{:2f}`` means minimum width 2 with default precision;
    # ``{:.2f}`` may have been intended.  Left unchanged.
    print("\nThe whole experiment took {:2f} seconds.".format(time.time() -
                                                              t_beg))
    # The final file uses the same name pattern as the checkpoint, with the
    # finishing time instead of the starting time.
    date_part = dt.datetime.now().strftime('%Y-%m-%d_%H%M%S')
    fname = os.path.join("experiment_results",
                         "genealogical_{}.pckl".format(date_part))
    _dump_pickle(locals(), fname)
    print("Wrote the results to: {}\n".format(fname))

    # Mean over the repetition axis, one line per metric.
    for label, values in (("F1 score:", res_fscore),
                          ("Precision:", res_precision),
                          ("Recall:", res_recall),
                          ("Time:", res_t),
                          ("Clusters:", res_clusters),
                          ("Lower bounds:", res_lb),
                          ("Upper bounds:", res_ub)):
        print("{} {}".format(label, np.mean(values, axis=2)))
예제 #6
0
def _dump_pickle(obj, path):
    """Pickle ``obj`` into the file at ``path`` and close the file."""
    with open(path, 'wb') as out:
        pickle.dump(obj, out)


def experiment_scalability(n_reps=1,
                           n_trees=5,
                           n_people=500,
                           varied_param='n_people',
                           params=[50, 100, 500],
                           methods=('unary', 'LD', 'mKlau'),
                           top_k_matches=5,
                           f=1.0,
                           title='scalability',
                           do_save=True,
                           rep_offset=0,
                           subfolder=''):
    """Run every merging method while varying tree size or tree count.

    For each repetition and each value in ``params``, tree fragments are
    obtained from ``extract_ft.get_k_fragments``, read with
    ``person.read_people``, indexed with ``create_index`` and merged with
    ``merge_multiple`` once per method.  The metrics returned by
    ``merge_multiple`` and the wall-clock time of each call are stored in
    arrays of shape ``(len(methods), len(params), n_reps)``.  The mean over
    repetitions of most of these arrays is printed at the end.

    Args:
        n_reps: Number of repetitions.
        n_trees: Number of trees requested from ``get_k_fragments``;
            overwritten by each value of ``params`` when
            ``varied_param == 'n_trees'``.
        n_people: People count requested from ``get_k_fragments``;
            overwritten by each value of ``params`` when
            ``varied_param == 'n_people'``.
        varied_param: ``'n_people'`` or ``'n_trees'``.
        params: Values taken by the varied parameter.  The list default is
            only read here, never mutated.
        methods: Method names forwarded to ``merge_multiple`` as ``method``.
        top_k_matches: Forwarded to ``merge_multiple`` as its third
            positional argument.
        f: Forwarded to ``merge_multiple`` as ``f``.
        title: Prefix of the pickle file names.
        do_save: Whether results are pickled.
        rep_offset: Added to the repetition index in the fragment label.
        subfolder: Sub-directory of ``experiment_results`` used for output.

    Returns:
        None.  Results are printed and, if ``do_save`` is true, pickled.

    Raises:
        Exception: If ``varied_param`` is neither ``'n_people'`` nor
            ``'n_trees'`` (raised on the first inner iteration).
    """
    # NOTE: ``locals()`` is pickled wholesale below, so the local variable
    # names are part of the output format.  Do not bind unpicklable objects
    # (file handles, modules, nested functions) to local names before the
    # final dump; that is why file handling lives in ``_dump_pickle``.
    nvv = len(params)
    res_precision = np.zeros((len(methods), nvv, n_reps))
    res_recall = np.zeros((len(methods), nvv, n_reps))
    res_fscore = np.zeros((len(methods), nvv, n_reps))
    res_t = np.zeros((len(methods), nvv, n_reps))
    res_iterations = np.zeros((len(methods), nvv, n_reps))
    res_clusters = np.zeros((len(methods), nvv, n_reps))
    res_lb = np.zeros((len(methods), nvv, n_reps))
    res_ub = np.zeros((len(methods), nvv, n_reps))
    t_beg = time.time()

    # Timestamp of the form YYYY-MM-DD_HHMMSS, used in the partial file name.
    start_date_part = dt.datetime.now().strftime('%Y-%m-%d_%H%M%S')
    dir_path = os.path.join("experiment_results", subfolder)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    fname0 = os.path.join(dir_path,
                          "{}_part_{}.pckl".format(title, start_date_part))

    for r in range(n_reps):
        print("\n--- Repetition {}. ---".format(r + 1))
        for i, param_val in enumerate(params):
            if varied_param == 'n_people':
                n_people = param_val
            elif varied_param == 'n_trees':
                n_trees = param_val
            else:
                raise Exception('Invalid varied parameter: ' + varied_param)
            # Generate data.  The loading code is kept inline (and repeated
            # in the fallback) so that the same local names end up in the
            # pickled ``locals()`` as before.
            try:
                tree_files = extract_ft.get_k_fragments(
                    n_trees, n_people, label="first{}".format(r + rep_offset))
                people_index_tuples = []
                for tf in tree_files:
                    people, people_dict = person.read_people(tf, clean=True)
                    index = create_index(people)
                    people_index_tuples.append((people, index, people_dict))
                uniq_people = count_unique_people(tree_files)
            except Exception:
                # If it fails, try generating new trees.  ``Exception``
                # rather than a bare ``except`` so that Ctrl-C and
                # SystemExit still abort the experiment.
                tree_files = extract_ft.get_k_fragments(
                    n_trees,
                    n_people,
                    label="first{}".format(r + rep_offset),
                    check_if_exists=False)
                people_index_tuples = []
                for tf in tree_files:
                    people, people_dict = person.read_people(tf, clean=True)
                    index = create_index(people)
                    people_index_tuples.append((people, index, people_dict))
                uniq_people = count_unique_people(tree_files)

            print("\n  rep={}, {}={}".format(r + 1, varied_param, param_val))
            for mi, m in enumerate(methods):
                print("\n    rep={}, {}={}, method={}".format(
                    r + 1, varied_param, param_val, m))
                t0 = time.time()
                # NOTE(review): the meaning of the literal 10 is defined by
                # merge_multiple, which is not visible here.
                precision, recall, fscore, n_clusters, lb, ub, iters = \
                    merge_multiple(people_index_tuples, 10, top_k_matches,
                                   method=m, uniq_people=uniq_people, f=f)
                res_precision[mi, i, r] = precision
                res_recall[mi, i, r] = recall
                res_fscore[mi, i, r] = fscore
                res_clusters[mi, i, r] = n_clusters
                res_t[mi, i, r] = time.time() - t0
                res_iterations[mi, i, r] = iters
                res_lb[mi, i, r] = lb
                res_ub[mi, i, r] = ub
        if do_save and n_reps > 1:
            # Checkpoint after every repetition; the same file is overwritten.
            _dump_pickle(locals(), fname0)
            print("Wrote the results of repetition {} to: {}\n".format(
                r + 1, fname0))

    # NOTE(review): ``{:2f}`` means minimum width 2 with default precision;
    # ``{:.2f}`` may have been intended.  Left unchanged.
    print("\nThe whole experiment took {:2f} seconds.".format(time.time() -
                                                              t_beg))
    date_part = dt.datetime.now().strftime('%Y-%m-%d_%H%M%S')
    fname = os.path.join(dir_path, "{}_{}.pckl".format(title, date_part))
    if do_save:
        _dump_pickle(locals(), fname)
        print("Wrote the results to: {}\n".format(fname))

    # Mean over the repetition axis, one line per metric.
    for label, values in (("F1 score:", res_fscore),
                          ("Precision:", res_precision),
                          ("Recall:", res_recall),
                          ("Time:", res_t),
                          ("Clusters:", res_clusters),
                          ("Lower bounds:", res_lb),
                          ("Upper bounds:", res_ub)):
        print("{} {}".format(label, np.mean(values, axis=2)))