Example #1
def group_edges(cs):
    plus = []
    minus = []
    pairs = []
    gc.disable()
    interval = 1000
    for current, cl1 in enumerate(cs.clusters):
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = cl1.out_edges
        for bib2 in xrange(len(cl1.out_edges)):
            val = pointers[bib2]
            if val[0] not in Bib_matrix.special_numbers:
                if val[0] > edge_cut_prob:
                    pairs.append((bib1, bib2, val))
            elif val[0] == Bib_matrix.special_symbols['+']:
                plus.append((bib1, bib2))
            elif val[0] == Bib_matrix.special_symbols['-']:
                minus.append((bib1, bib2))
            else:
                assert val[0] == Bib_matrix.special_symbols[None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")

    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d."
                     % (len(plus), len(minus), len(pairs)))
    gc.enable()
    return plus, minus, pairs
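The gc.disable()/gc.enable() bracket is a common CPython trick: switching the cyclic collector off during a heavy append loop avoids repeated collection passes over the growing lists. For the partitioning itself, here is a minimal self-contained sketch of the same idea; the marker values and the cut threshold are invented stand-ins for Bib_matrix.special_symbols, Bib_matrix.special_numbers and edge_cut_prob, which are not defined in these snippets.

# Hypothetical stand-ins for Bib_matrix.special_symbols / edge_cut_prob.
SPECIAL = {'+': -2.0, '-': -3.0, None: -1.0}
SPECIAL_NUMBERS = frozenset(SPECIAL.values())
EDGE_CUT_PROB = 0.3

def partition_edges(edges):
    """Split (bib1, bib2, value) edges into sure-plus, sure-minus and weighted."""
    plus, minus, pairs = [], [], []
    for bib1, bib2, val in edges:
        if val not in SPECIAL_NUMBERS:
            if val > EDGE_CUT_PROB:            # keep only confident value edges
                pairs.append((bib1, bib2, val))
        elif val == SPECIAL['+']:
            plus.append((bib1, bib2))
        elif val == SPECIAL['-']:
            minus.append((bib1, bib2))
        else:
            assert val == SPECIAL[None], "Invalid Edge"
    return plus, minus, pairs

print(partition_edges([(1, 2, 0.9), (1, 3, -2.0), (2, 3, 0.1)]))
# ([(1, 3)], [], [(1, 2, 0.9)]) -- the 0.1 edge falls below the cut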
Example #2
def delayed_cluster_sets_from_marktables(limit_to_surnames=False):
    # { name -> [(table, bibref)] }
    bibauthor_print('Delayed_cluster_set_from_marktables limited to %s' %
                    str(limit_to_surnames))

    name_buket = {}
    if limit_to_surnames:
        limit_to_surnames = set(
            [generate_last_name_cluster_str(s) for s in limit_to_surnames])

    for tab, ref, name in chain(izip(cycle((100, )), *izip(*get_bib10x())),
                                izip(cycle((700, )), *izip(*get_bib70x()))):
        name = generate_last_name_cluster_str(name)
        if limit_to_surnames and name not in limit_to_surnames:
            continue
        name_buket[name] = name_buket.get(name, []) + [(tab, ref)]

    bibauthor_print(
        'Delayed_cluster_set_from_marktables going to get %s signatures....' %
        str(len(name_buket)))

    all_refs = ((name, refs, len(list(get_signatures_from_bibrefs(refs))))
                for name, refs in name_buket.items())
    all_refs = sorted(all_refs, key=itemgetter(2))
    return ([
        delayed_create_from_mark(set(refs), name) for name, refs, _ in all_refs
    ], map(itemgetter(0), all_refs), map(itemgetter(2), all_refs))
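A note on the bucketing line: name_buket[name] = name_buket.get(name, []) + [(tab, ref)] rebuilds the whole list on every append, so grouping n signatures can cost quadratic time; collections.defaultdict does the same grouping with constant-time appends. A small sketch with made-up (table, bibref, name) rows and a toy normalizer standing in for generate_last_name_cluster_str:

from collections import defaultdict

rows = [(100, 11, 'Smith, J.'), (700, 42, 'Smith, John'), (100, 7, 'Doe, A.')]

name_bucket = defaultdict(list)
for tab, ref, name in rows:
    surname = name.split(',')[0].lower()   # toy stand-in for generate_last_name_cluster_str
    name_bucket[surname].append((tab, ref))

print(sorted(name_bucket.items()))
# [('doe', [(100, 7)]), ('smith', [(100, 11), (700, 42)])]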
Example #3
def create_matrix(cluster_set, force):
    bibs = cluster_set.num_all_bibs
    expected = bibs * (bibs - 1) / 2
    bibauthor_print("Start building matrix for %s. Total number of bibs: %d, "
                    "maximum number of comparisons: %d" %
                    (cluster_set.last_name, bibs, expected))

    return prepare_matirx(cluster_set, force)
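The expected count is simply the number of unordered bib pairs, C(n, 2) = n(n - 1)/2. Under Python 3 the plain / would yield a float here; floor division keeps the count exact. A quick sanity check:

from itertools import combinations

n = 5
assert n * (n - 1) // 2 == len(list(combinations(range(n), 2)))   # 10 unordered pairs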
Example #4
def create_matrix(cluster_set, force):
    bibs = cluster_set.num_all_bibs
    expected = bibs * (bibs - 1) / 2
    bibauthor_print("Start building matrix for %s. Total number of bibs: %d, "
                    "maximum number of comparisons: %d"
                    % (cluster_set.last_name, bibs, expected))

    return prepare_matirx(cluster_set, force)
Example #5
def main():
    """Main function """
    try:
        import invenio.bibauthorid_daemon as daemon
    except ImportError:
        bibauthor_print("Hmm...No Daemon process running.")
        return

    daemon.bibauthorid_daemon()
Example #7
def wedge_and_store(cluster_set):
    bibs = cluster_set.num_all_bibs
    expected = bibs * (bibs - 1) / 2
    bibauthor_print("Start working on %s. Total number of bibs: %d, "
                    "maximum number of comparisons: %d" %
                    (cluster_set.last_name, bibs, expected))

    wedge(cluster_set)
    remove_result_cluster(cluster_set.last_name)
    cluster_set.store()
    return True
Example #8
def tortoise_tweak_coefficient(lastnames, min_coef, max_coef, stepping, build_matrix=True):
    bibauthor_print('Coefficient tweaking!')
    bibauthor_print('Cluster sets from mark...')

    lnames = set([generate_last_name_cluster_str(n) for n in lastnames])
    coefficients = [x / 100. for x in range(int(min_coef * 100), int(max_coef * 100), int(stepping * 100))]

    if build_matrix:
        schedule_workers(_create_matrix, lnames)
    schedule_workers(_collect_statistics_lname_coeff, ((x, y) for x in lnames for y in coefficients))
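Since range() only accepts integers, the coefficients are built on a *100 integer grid and scaled back down. One consequence worth knowing: a stepping below 0.01 truncates to a step of zero and range() raises ValueError. A sketch of the sweep with illustrative bounds:

min_coef, max_coef, stepping = 0.1, 0.5, 0.1
coefficients = [x / 100. for x in range(int(min_coef * 100),
                                        int(max_coef * 100),
                                        int(stepping * 100))]
print(coefficients)   # [0.1, 0.2, 0.3, 0.4] -- the upper bound itself is excluded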
Example #9
def wedge_and_store(cluster_set):
    bibs = cluster_set.num_all_bibs
    expected = bibs * (bibs - 1) / 2
    bibauthor_print("Start working on %s. Total number of bibs: %d, "
                    "maximum number of comparisons: %d"
                    % (cluster_set.last_name, bibs, expected))

    wedge(cluster_set)
    remove_result_cluster(cluster_set.last_name)
    cluster_set.store()
    return True
Example #11
def tortoise(pure=False,
             force_matrix_creation=False,
             skip_matrix_creation=False,
             last_run=None):
    assert not force_matrix_creation or not skip_matrix_creation
    # The computation must be forced in case we want
    # to compute pure results
    force_matrix_creation = force_matrix_creation or pure

    if not skip_matrix_creation:
        bibauthor_print("Preparing cluster sets.")
        clusters, _lnames, sizes = delayed_cluster_sets_from_personid(pure, last_run)
        bibauthor_print("Building all matrices.")
        exit_statuses = schedule_create_matrix(
            clusters,
            sizes,
            force=force_matrix_creation)
        assert len(exit_statuses) == len(clusters)
        assert all(stat == os.EX_OK for stat in exit_statuses)

    bibauthor_print("Preparing cluster sets.")
    clusters, _lnames, sizes = delayed_cluster_sets_from_personid(pure, last_run)
    bibauthor_print("Starting disambiguation.")
    exit_statuses = schedule_wedge_and_store(
        clusters,
        sizes)
    assert len(exit_statuses) == len(clusters)
    assert all(stat == os.EX_OK for stat in exit_statuses)
Example #12
def tortoise(pure=False,
             force_matrix_creation=False,
             skip_matrix_creation=False,
             last_run=None):
    assert not force_matrix_creation or not skip_matrix_creation
    # The computation must be forced in case we want
    # to compute pure results
    force_matrix_creation = force_matrix_creation or pure

    if not skip_matrix_creation:
        bibauthor_print("Preparing cluster sets.")
        clusters, _lnames, sizes = delayed_cluster_sets_from_personid(
            pure, last_run)
        bibauthor_print("Building all matrices.")
        exit_statuses = schedule_create_matrix(clusters,
                                               sizes,
                                               force=force_matrix_creation)
        assert len(exit_statuses) == len(clusters)
        assert all(stat == os.EX_OK for stat in exit_statuses)

    bibauthor_print("Preparing cluster sets.")
    clusters, _lnames, sizes = delayed_cluster_sets_from_personid(
        pure, last_run)
    bibauthor_print("Starting disambiguation.")
    exit_statuses = schedule_wedge_and_store(clusters, sizes)
    assert len(exit_statuses) == len(clusters)
    assert all(stat == os.EX_OK for stat in exit_statuses)
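The schedule_create_matrix and schedule_wedge_and_store helpers are not shown in these examples, but the assertions suggest they run one worker per cluster set and return the children's exit codes. A minimal sketch of that pattern, assuming multiprocessing and a placeholder worker body (os.EX_OK is POSIX-only and equals 0):

import os
from multiprocessing import Process

def _worker(lname):
    pass   # placeholder for create_matrix / wedge_and_store work

def schedule(lnames):
    procs = [Process(target=_worker, args=(n,)) for n in lnames]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    return [p.exitcode for p in procs]

if __name__ == '__main__':
    exit_statuses = schedule(['smith', 'doe'])
    assert all(stat == os.EX_OK for stat in exit_statuses)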
Example #13
def prepare_matirx(cluster_set, force):
    if bconfig.DEBUG_CHECKS:
        assert cluster_set._debug_test_hate_relation()
        assert cluster_set._debug_duplicated_recs()

    matr = ProbabilityMatrix(cluster_set.last_name)
    matr.load(load_map=True, load_matrix=False)
    if not force and matr.is_up_to_date(cluster_set):
        bibauthor_print("Cluster %s is up-to-date and therefore will not be computed."
            % cluster_set.last_name)
        return False

    matr.load(load_map=False, load_matrix=True)
    matr.recalculate(cluster_set)
    matr.store()
    return True
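The two load() calls split the I/O deliberately: the lightweight map is enough for is_up_to_date, and the expensive matrix is read only once a recompute is unavoidable. A shape-only sketch of that lazy split; this LazyMatrix is invented for illustration and is not the real ProbabilityMatrix interface:

class LazyMatrix(object):
    def __init__(self, name):
        self.name = name
        self.map = None
        self.matrix = None

    def load(self, load_map=False, load_matrix=False):
        if load_map and self.map is None:
            self.map = {}        # cheap: read only the bib -> index map
        if load_matrix and self.matrix is None:
            self.matrix = []     # expensive: read the full probability matrix

m = LazyMatrix('smith')
m.load(load_map=True)            # enough for the freshness check
m.load(load_matrix=True)         # paid only when recomputing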
Example #14
def _create_matrix(lname):

    clusters, lnames, sizes = delayed_cluster_sets_from_marktables([lname])
    idx = lnames.index(lname)
    cluster = clusters[idx]
    size = sizes[idx]
    bibauthor_print("Found, %s. Total number of bibs: %d." % (lname, size))
    cluster_set = cluster()
    create_matrix(cluster_set, True)

    bibs = cluster_set.num_all_bibs
    expected = bibs * (bibs - 1) / 2
    bibauthor_print("Start working on %s. Total number of bibs: %d, "
                    "maximum number of comparisons: %d" %
                    (cluster_set.last_name, bibs, expected))
    cluster_set.store()
Example #15
def _create_matrix(lname):

    clusters, lnames, sizes = delayed_cluster_sets_from_marktables([lname])
    idx = lnames.index(lname)
    cluster = clusters[idx]
    size = sizes[idx]
    bibauthor_print("Found, %s. Total number of bibs: %d." % (lname, size))
    cluster_set = cluster()
    create_matrix(cluster_set, True)

    bibs = cluster_set.num_all_bibs
    expected = bibs * (bibs - 1) / 2
    bibauthor_print("Start working on %s. Total number of bibs: %d, "
                    "maximum number of comparisons: %d"
                    % (cluster_set.last_name, bibs, expected))
    cluster_set.store()
Example #16
def prepare_matirx(cluster_set, force):
    if bconfig.DEBUG_CHECKS:
        assert cluster_set._debug_test_hate_relation()
        assert cluster_set._debug_duplicated_recs()

    matr = ProbabilityMatrix(cluster_set.last_name)
    matr.load(load_map=True, load_matrix=False)
    if not force and matr.is_up_to_date(cluster_set):
        bibauthor_print(
            "Cluster %s is up-to-date and therefore will not be computed." %
            cluster_set.last_name)
        return False

    matr.load(load_map=False, load_matrix=True)
    matr.recalculate(cluster_set)
    matr.store()
    return True
Example #17
def tortoise_from_scratch():
    bibauthor_print("Preparing cluster sets.")
    cluster_sets, _lnames, sizes = delayed_cluster_sets_from_marktables()
    bibauthor_print("Building all matrices.")
    schedule_workers(lambda x: force_create_matrix(x, force=True), cluster_sets)

    empty_tortoise_results_table()

    bibauthor_print("Preparing cluster sets.")
    cluster_sets, _lnames, sizes = delayed_cluster_sets_from_marktables()
    bibauthor_print("Starting disambiguation.")
    schedule_workers(wedge, cluster_sets)
Example #19
def _collect_statistics_lname_coeff(params):
    lname = params[0]
    coeff = params[1]

    clusters, lnames, sizes = delayed_cluster_sets_from_marktables([lname])
    idx = lnames.index(lname)
    cluster = clusters[idx]
    size = sizes[idx]
    bibauthor_print("Found, %s. Total number of bibs: %d." % (lname, size))
    cluster_set = cluster()
    create_matrix(cluster_set, False)

    bibs = cluster_set.num_all_bibs
    expected = bibs * (bibs - 1) / 2
    bibauthor_print("Start working on %s. Total number of bibs: %d, "
                    "maximum number of comparisons: %d" %
                    (cluster_set.last_name, bibs, expected))

    wedge(cluster_set, True, coeff)
    remove_result_cluster(cluster_set.last_name)
Example #20
def _collect_statistics_lname_coeff(params):
    lname = params[0]
    coeff = params[1]

    clusters, lnames, sizes = delayed_cluster_sets_from_marktables([lname])
    idx = lnames.index(lname)
    cluster = clusters[idx]
    size = sizes[idx]
    bibauthor_print("Found, %s. Total number of bibs: %d." % (lname, size))
    cluster_set = cluster()
    create_matrix(cluster_set, False)

    bibs = cluster_set.num_all_bibs
    expected = bibs * (bibs - 1) / 2
    bibauthor_print("Start working on %s. Total number of bibs: %d, "
                    "maximum number of comparisons: %d"
                    % (cluster_set.last_name, bibs, expected))

    wedge(cluster_set, True, coeff)
    remove_result_cluster(cluster_set.last_name)
Example #21
def tortoise_last_name(name, from_mark=False, pure=False):
    assert not (from_mark and pure)

    lname = generate_last_name_cluster_str(name)

    if from_mark:
        clusters, lnames, sizes = delayed_cluster_sets_from_marktables()
    else:
        clusters, lnames, sizes = delayed_cluster_sets_from_personid(pure)

    try:
        idx = lnames.index(lname)
        cluster = clusters[idx]
        size = sizes[idx]
        bibauthor_print("Found, %s(%s). Total number of bibs: %d." % (name, lname, size))
        cluster_set = cluster()
        create_matrix(cluster_set, True)
        wedge_and_store(cluster_set)
    except (IndexError, ValueError):  # lnames.index raises ValueError when lname is absent
        bibauthor_print("Sorry, %s(%s) not found in the last name clusters" % (name, lname))
def delayed_cluster_sets_from_marktables(limit_to_surnames=False):
    # { name -> [(table, bibref)] }
    bibauthor_print('Delayed_cluster_set_from_marktables limited to %s' % str(limit_to_surnames))

    name_buket = {}
    if limit_to_surnames:
        limit_to_surnames = set([generate_last_name_cluster_str(s) for s in limit_to_surnames])

    for tab, ref, name in chain(izip(cycle((100,)), *izip(*get_bib10x())),
                                izip(cycle((700,)), *izip(*get_bib70x()))):
        name = generate_last_name_cluster_str(name)
        if limit_to_surnames and name not in limit_to_surnames:
            continue
        name_buket[name] = name_buket.get(name, []) + [(tab, ref)]

    bibauthor_print('Delayed_cluster_set_from_marktables going to get %s signatures....' % str(len(name_buket)))

    all_refs = ((name, refs, len(list(get_signatures_from_bibrefs(refs))))
                for name, refs in name_buket.items())
    all_refs = sorted(all_refs, key=itemgetter(2))
    return ([delayed_create_from_mark(set(refs), name) for name, refs, _ in all_refs],
             map(itemgetter(0), all_refs),
             map(itemgetter(2), all_refs))
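The function returns three parallel sequences (delayed cluster-set builders, surnames, sizes) ordered by size, which is why callers do lnames.index(lname) and then take clusters[idx] and sizes[idx]. The sorting idiom in miniature, with invented data:

from operator import itemgetter

all_refs = [('smith', set(['r1', 'r2']), 17), ('doe', set(['r3']), 3)]
all_refs = sorted(all_refs, key=itemgetter(2))       # smallest name clusters first
lnames = [name for name, _, _ in all_refs]
sizes = [size for _, _, size in all_refs]
print('%s %s' % (lnames, sizes))   # ['doe', 'smith'] [3, 17]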
Example #23
def tortoise_from_scratch():
    bibauthor_print("Preparing cluster sets.")
    cluster_sets, _lnames, sizes = delayed_cluster_sets_from_marktables()
    bibauthor_print("Building all matrices.")
    exit_statuses = schedule_create_matrix(cluster_sets, sizes, force=True)
    assert len(exit_statuses) == len(cluster_sets)
    assert all(stat == os.EX_OK for stat in exit_statuses)

    empty_results_table()

    bibauthor_print("Preparing cluster sets.")
    cluster_sets, _lnames, sizes = delayed_cluster_sets_from_marktables()
    bibauthor_print("Starting disambiguation.")
    exit_statuses = schedule_wedge_and_store(cluster_sets, sizes)
    assert len(exit_statuses) == len(cluster_sets)
    assert all(stat == os.EX_OK for stat in exit_statuses)
Example #25
def tortoise(pure=False,
             force_matrix_creation=False,
             skip_matrix_creation=False,
             last_run=None):
    assert not force_matrix_creation or not skip_matrix_creation
    # The computation must be forced in case we want
    # to compute pure results
    force_matrix_creation = force_matrix_creation or pure

    if not skip_matrix_creation:
        bibauthor_print("Preparing cluster sets.")
        clusters, _lnames, sizes = delayed_cluster_sets_from_personid(pure, last_run)
        bibauthor_print("Building all matrices.")
        schedule_workers(lambda x: force_create_matrix(x, force=force_matrix_creation), clusters)

    bibauthor_print("Preparing cluster sets.")
    clusters, _lnames, sizes = delayed_cluster_sets_from_personid(pure, last_run)
    bibauthor_print("Starting disambiguation.")
    schedule_workers(wedge_and_store, clusters)
Example #27
def _create_matrix(lname):

    clusters, lnames, sizes = delayed_cluster_sets_from_marktables([lname])
    try:
        idx = lnames.index(lname)
        cluster = clusters[idx]
        size = sizes[idx]
        bibauthor_print("Found, %s. Total number of bibs: %d." % (lname, size))
        cluster_set = cluster()
        create_matrix(cluster_set, False)

        bibs = cluster_set.num_all_bibs
        expected = bibs * (bibs - 1) / 2
        bibauthor_print("Start working on %s. Total number of bibs: %d, "
                        "maximum number of comparisons: %d"
                        % (cluster_set.last_name, bibs, expected))
        cluster_set.store()
    except (IndexError, ValueError):
        bibauthor_print("Sorry, %s not found in the last name clusters, not creating matrix" % (lname))
Example #29
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''

    bib_map = create_bib_2_cluster_dict(cluster_set)

    plus_edges, minus_edges, edges = group_edges(cluster_set)

    interval = 1000
    for i, (bib1, bib2) in enumerate(plus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(plus_edges), "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    interval = 1000
    for i, (bib1, bib2) in enumerate(minus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(minus_edges), "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=_edge_sorting, reverse=True)

    interval = 500000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, unused) in enumerate(edges):
        if (current % interval) == 0:
            update_status(float(current) / len(edges), "Wedge...")

        assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        idcl1 = cluster_set.clusters.index(cl1)
        idcl2 = cluster_set.clusters.index(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused))

            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " %  (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)
Example #30
def force_create_matrix(cluster_set, force):
    bibauthor_print("Building a cluster set.")
    return create_matrix(cluster_set(), force)
Example #31
def tortoise_last_name(name, from_mark=False, pure=False):
    bibauthor_print('Start working on %s' % name)
    assert not (from_mark and pure)

    lname = generate_last_name_cluster_str(name)

    if from_mark:
        bibauthor_print(' ... from mark!')
        clusters, lnames, sizes = delayed_cluster_sets_from_marktables([lname])
        bibauthor_print(' ... delayed done')
    else:
        bibauthor_print(' ... from pid, pure')
        clusters, lnames, sizes = delayed_cluster_sets_from_personid(pure)
        bibauthor_print(' ... delayed pure done!')

#    try:
    idx = lnames.index(lname)
    cluster = clusters[idx]
    size = sizes[idx]
    cluster_set = cluster()
    bibauthor_print("Found, %s(%s). Total number of bibs: %d." % (name, lname, size))
    create_matrix(cluster_set, True)
    wedge_and_store(cluster_set)
Example #32
def tortoise_last_name(name, from_mark=False, pure=False):
    bibauthor_print('Start working on %s' % name)
    assert not (from_mark and pure)

    lname = generate_last_name_cluster_str(name)

    if from_mark:
        bibauthor_print(' ... from mark!')
        clusters, lnames, sizes = delayed_cluster_sets_from_marktables([lname])
        bibauthor_print(' ... delayed done')
    else:
        bibauthor_print(' ... from pid, pure')
        clusters, lnames, sizes = delayed_cluster_sets_from_personid(pure)
        bibauthor_print(' ... delayed pure done!')

#    try:
    idx = lnames.index(lname)
    cluster = clusters[idx]
    size = sizes[idx]
    cluster_set = cluster()
    bibauthor_print("Found, %s(%s). Total number of bibs: %d." %
                    (name, lname, size))
    create_matrix(cluster_set, True)
    wedge_and_store(cluster_set)
Example #33
def group_sort_edges(cs, original_process_id):
    bibauthor_print("group_sort_edges spowned by %s" % original_process_id)

    plus_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_p_'+str(original_process_id),'w')
    minus_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_m_'+str(original_process_id),'w')
    pairs_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_temp_edges_cache_e_'+str(original_process_id),'w')
    data_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_data_'+str(original_process_id),'w')

    plus_count = 0
    minus_count = 0
    pairs_count = 0

    default_val = [0.,0.]
    #gc.disable()
    interval = 1000
    current = -1
    for cl1 in cs.clusters:
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = h5file[str(id(cl1))]
        for bib2 in xrange(len(h5file[str(id(cl1))])):
            val = pointers[bib2]
            #if val[0] not in Bib_matrix.special_numbers:
            #optimization: special numbers are assumed to be negative
            if val[0] >= 0:

                if val[0] > edge_cut_prob:
                    pairs_count += 1
                    pairs_fp.write(_pack_vals((bib1, bib2, val)))

            elif val[0] == Bib_matrix.special_symbols['+']:
                plus_count += 1
                plus_fp.write(_pack_vals((bib1, bib2, default_val)))

            elif val[0] == Bib_matrix.special_symbols['-']:
                minus_count += 1
                minus_fp.write(_pack_vals((bib1, bib2, default_val)))
            else:
                assert val[0] == Bib_matrix.special_symbols[None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")

    plus_fp.close()
    minus_fp.close()
    pairs_fp.close()

    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d."
                     % (plus_count, minus_count, pairs_count))
    #gc.enable()
    bibauthor_print("Sorting in-file value edges.")
    sortFileInPlace(bconfig.TORTOISE_FILES_PATH+'/wedge_temp_edges_cache_e_'+str(original_process_id),
                    bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_e_'+str(original_process_id),
                    lambda x: _edge_sorting(_unpack_vals(x)), reverse=True)

    os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_temp_edges_cache_e_'+str(original_process_id))

    bibauthor_print("Dumping egdes data to file...")
    cPickle.dump((plus_count, minus_count, pairs_count), data_fp)
    data_fp.close()
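_pack_vals and _unpack_vals are not shown in these examples. Since do_wedge later iterates the cache files line by line, a plausible encoding, assumed here and not necessarily the real Invenio format, is one whitespace-separated record per line:

def _pack_vals(edge):
    bib1, bib2, (v0, v1) = edge
    return '%d %d %r %r\n' % (bib1, bib2, v0, v1)

def _unpack_vals(line):
    bib1, bib2, v0, v1 = line.split()
    return int(bib1), int(bib2), (float(v0), float(v1))

line = _pack_vals((7, 9, (0.83, 0.5)))
assert _unpack_vals(line) == (7, 9, (0.83, 0.5))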
Example #34
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''

    bib_map = create_bib_2_cluster_dict(cluster_set)
    original_process_id = PID()
    #remember to close the files!
    #plus_edges_fp, len_plus, minus_edges_fp, len_minus, edges_fp, len_edges = group_sort_edges(cluster_set)

    p = Process(target=group_sort_edges, args=(cluster_set,original_process_id))
    p.start()
    p.join()

    plus_edges_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_p_'+str(original_process_id),'r')
    minus_edges_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_m_'+str(original_process_id),'r')
    edges_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_e_'+str(original_process_id),'r')
    data_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_data_'+str(original_process_id),'r')

    len_plus,len_minus,len_edges = cPickle.load(data_fp)
    data_fp.close()

    interval = 1000
    for i, s in enumerate(plus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_plus, "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    interval = 1000
    for i, s in enumerate(minus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_minus, "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    interval = 50000
    wedge_print("Wedge: New wedge, %d edges." % len_edges)
    current = -1
    for  s in edges_fp:
        v1, v2, unused = _unpack_vals(s)
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len_edges, "Wedge...")

        assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        #try using object ids instead of index to boost performances
        #idcl1 = cluster_set.clusters.index(cl1)
        #idcl2 = cluster_set.clusters.index(cl2)
        idcl1 = id(cl1)
        idcl2 = id(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused))

            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " %  (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)

    plus_edges_fp.close()
    minus_edges_fp.close()
    edges_fp.close()
    data_fp.close()

    try:
        os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_p_'+str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_m_'+str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_e_'+str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_data_'+str(original_process_id))
    except OSError:
        pass
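Running group_sort_edges in a child Process and join()ing it before reading the caches back reads like a memory-isolation trick: the child's peak allocations are returned to the OS when it exits, and the parent only ever loads the compact files. The shape of that pattern in miniature:

import os
from multiprocessing import Process

def heavy_step(path):
    # build something large here, write the compact result, then exit;
    # the child's memory is reclaimed by the OS when the process dies
    with open(path, 'w') as fp:
        fp.write('compact result\n')

if __name__ == '__main__':
    cache = '/tmp/wedge_cache_%d' % os.getpid()
    p = Process(target=heavy_step, args=(cache,))
    p.start()
    p.join()
    with open(cache) as fp:
        print(fp.read().strip())   # compact result
    os.remove(cache)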
Example #35
def group_sort_edges(cs, original_process_id):
    bibauthor_print("group_sort_edges spowned by %s" % original_process_id)

    plus_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' +
        str(original_process_id), 'w')
    minus_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' +
        str(original_process_id), 'w')
    pairs_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_temp_edges_cache_e_' +
        str(original_process_id), 'w')
    data_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' +
        str(original_process_id), 'w')

    plus_count = 0
    minus_count = 0
    pairs_count = 0

    default_val = [0., 0.]
    #gc.disable()
    interval = 1000
    current = -1
    for cl1 in cs.clusters:
        current += 1
        if (current % interval) == 0:
            update_status(
                float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = h5file[str(id(cl1))]
        for bib2 in xrange(len(h5file[str(id(cl1))])):
            val = pointers[bib2]
            #if val[0] not in Bib_matrix.special_numbers:
            #optimization: special numbers are assumed to be negative
            if val[0] >= 0:

                if val[0] > edge_cut_prob:
                    pairs_count += 1
                    pairs_fp.write(_pack_vals((bib1, bib2, val)))

            elif val[0] == Bib_matrix.special_symbols['+']:
                plus_count += 1
                plus_fp.write(_pack_vals((bib1, bib2, default_val)))

            elif val[0] == Bib_matrix.special_symbols['-']:
                minus_count += 1
                minus_fp.write(_pack_vals((bib1, bib2, default_val)))
            else:
                assert val[0] == Bib_matrix.special_symbols[None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")

    plus_fp.close()
    minus_fp.close()
    pairs_fp.close()

    bibauthor_print(
        "Positive edges: %d, Negative edges: %d, Value edges: %d." %
        (plus_count, minus_count, pairs_count))
    #gc.enable()
    bibauthor_print("Sorting in-file value edges.")
    sortFileInPlace(bconfig.TORTOISE_FILES_PATH +
                    '/wedge_temp_edges_cache_e_' + str(original_process_id),
                    bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' +
                    str(original_process_id),
                    lambda x: _edge_sorting(_unpack_vals(x)),
                    reverse=True)

    os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_temp_edges_cache_e_' +
              str(original_process_id))

    bibauthor_print("Dumping egdes data to file...")
    cPickle.dump((plus_count, minus_count, pairs_count), data_fp)
    data_fp.close()
Example #37
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''

    bib_map = create_bib_2_cluster_dict(cluster_set)
    original_process_id = PID()
    #remember to close the files!
    #plus_edges_fp, len_plus, minus_edges_fp, len_minus, edges_fp, len_edges = group_sort_edges(cluster_set)

    p = Process(target=group_sort_edges,
                args=(cluster_set, original_process_id))
    p.start()
    p.join()

    plus_edges_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' +
        str(original_process_id), 'r')
    minus_edges_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' +
        str(original_process_id), 'r')
    edges_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' +
        str(original_process_id), 'r')
    data_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' +
        str(original_process_id), 'r')

    len_plus, len_minus, len_edges = cPickle.load(data_fp)
    data_fp.close()

    interval = 1000
    for i, s in enumerate(plus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(
                float(i) / len_plus, "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    interval = 1000
    for i, s in enumerate(minus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_minus, "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    interval = 50000
    wedge_print("Wedge: New wedge, %d edges." % len_edges)
    current = -1
    for s in edges_fp:
        v1, v2, unused = _unpack_vals(s)
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len_edges, "Wedge...")

        assert unused != '+' and unused != '-', PID() + "Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        #try using object ids instead of index to boost performances
        #idcl1 = cluster_set.clusters.index(cl1)
        #idcl2 = cluster_set.clusters.index(cl2)
        idcl1 = id(cl1)
        idcl2 = id(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print(
            "Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)"
            % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(
                    cluster_set,
                    "/tmp/%s%d.dot" % (cluster_set.last_name, current),
                    bib_map, (v1, v2, unused))

            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s" %
                            (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " %
                            (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" %
                        (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" %
                        (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name,
                      bib_map)

    plus_edges_fp.close()
    minus_edges_fp.close()
    edges_fp.close()
    data_fp.close()

    try:
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' +
                  str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' +
                  str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' +
                  str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' +
                  str(original_process_id))
    except OSError:
        pass
Example #38
def force_wedge_and_store(cluster_set):
    bibauthor_print("Building a cluster set.")
    return wedge_and_store(cluster_set())
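Note the call in wedge_and_store(cluster_set()): cluster_set arrives as a zero-argument builder (the delayed cluster set produced by the earlier examples), so the expensive materialization happens inside the worker rather than in the parent. The idiom, with a hypothetical builder body:

def delayed_create_from_mark(refs, name):
    def build():
        # expensive construction deferred until the worker calls build()
        return {'last_name': name, 'refs': refs}
    return build

builder = delayed_create_from_mark(set(['r1', 'r2']), 'smith')
cluster_set = builder()           # materialized only when actually needed
print(cluster_set['last_name'])   # smith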