Example #1
def group_edges(cs):
    plus = []
    minus = []
    pairs = []
    gc.disable()
    for current, cl1 in enumerate(cs.clusters):
        update_status(
            float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = cl1.out_edges
        for bib2 in xrange(len(cl1.out_edges)):
            val = pointers[bib2]
            if val[0] not in Bib_matrix.special_numbers:
                if val[0] > edge_cut_prob:
                    pairs.append((bib1, bib2, val))
            elif val[0] == Bib_matrix.special_symbols['+']:
                plus.append((bib1, bib2))
            elif val[0] == Bib_matrix.special_symbols['-']:
                minus.append((bib1, bib2))
            else:
                assert val[0] == Bib_matrix.special_symbols[
                    None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")

    bibauthor_print(
        "Positive edges: %d, Negative edges: %d, Value edges: %d." %
        (len(plus), len(minus), len(pairs)))
    gc.enable()
    return plus, minus, pairs
Example #2
def group_edges(cs):
    plus = []
    minus = []
    pairs = []

    for current, cl1 in enumerate(cs.clusters):
        update_status(float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = cl1.out_edges
        for bib2 in xrange(len(cl1.out_edges)):
            val = pointers[bib2]
            if val[0] not in special_numbers:
                if val[0] > edge_cut_prob:
                    pairs.append((bib1, bib2, val))
            elif val[0] == special_symbols["+"]:
                plus.append((bib1, bib2))
            elif val[0] == special_symbols["-"]:
                minus.append((bib1, bib2))
            else:
                assert val[0] == special_symbols[None]

    update_status_final("Finished with the edge grouping.")

    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d." % (len(plus), len(minus), len(pairs)))
    return plus, minus, pairs
Example #3
def group_edges(cs):
    plus = []
    minus = []
    pairs = []
    gc.disable()
    interval = 1000
    for current, cl1 in enumerate(cs.clusters):
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = cl1.out_edges
        for bib2 in xrange(len(cl1.out_edges)):
            val = pointers[bib2]
            if val[0] not in Bib_matrix.special_numbers:
                if val[0] > edge_cut_prob:
                    pairs.append((bib1, bib2, val))
            elif val[0] == Bib_matrix.special_symbols['+']:
                plus.append((bib1, bib2))
            elif val[0] == Bib_matrix.special_symbols['-']:
                minus.append((bib1, bib2))
            else:
                assert val[0] == Bib_matrix.special_symbols[None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")

    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d."
                     % (len(plus), len(minus), len(pairs)))
    gc.enable()
    return plus, minus, pairs
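All three variants above split a cluster's out_edges into the same three buckets: edges already marked as the same author ('+'), edges marked as different authors ('-'), and plain probability edges that are kept only when they beat edge_cut_prob. The following is a minimal, self-contained sketch of that partitioning logic; the special_symbols/special_numbers values and edge_cut_prob used here are made-up stand-ins for the real Bib_matrix constants.

# Hypothetical stand-ins for Bib_matrix.special_symbols / special_numbers.
special_symbols = {'+': -3.0, '-': -2.0, None: -1.0}
special_numbers = frozenset(special_symbols.values())
edge_cut_prob = 0.3

def split_edges(bib1, out_edges):
    # Partition one cluster's edges into sure-plus, sure-minus and value pairs.
    plus, minus, pairs = [], [], []
    for bib2, val in enumerate(out_edges):
        if val[0] not in special_numbers:
            if val[0] > edge_cut_prob:          # keep only sufficiently likely edges
                pairs.append((bib1, bib2, val))
        elif val[0] == special_symbols['+']:
            plus.append((bib1, bib2))
        elif val[0] == special_symbols['-']:
            minus.append((bib1, bib2))
        else:
            assert val[0] == special_symbols[None], "Invalid Edge"
    return plus, minus, pairs

# One sure match, one sure non-match, one probable edge, one edge below the cut.
edges = [(special_symbols['+'], 0.0), (special_symbols['-'], 0.0),
         (0.8, 0.9), (0.1, 0.9)]
print(split_edges(0, edges))   # ([(0, 0)], [(0, 1)], [(0, 2, (0.8, 0.9))])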
Example #4
def main():
    """Main function """
    try:
        import bibauthorid_daemon as daemon
    except ImportError:
        bibauthor_print("Hmm...No Daemon process running.")
        return

    daemon.bibauthorid_daemon()
Example #5
def main():
    """Main function """
    try:
        import bibauthorid_daemon as daemon
    except ImportError:
        bibauthor_print("Hmm...No Daemon process running.")
        return

    daemon.bibauthorid_daemon()
Example #6
def tortoise_last_name(name, pure=False):
    lname = generate_last_name_cluster_str(name)

    names = create_lastname_list_from_personid()
    names = filter(lambda x: x[0] == name, names)

    if names:
        pids = names[0][1]
        bibauthor_print("Found %s(%s), %d pids" % (name, lname, len(pids)))
        disambiguate_last_name(pids, lname, pure, False)
    else:
        bibauthor_print("Sorry, %s(%s) not found in the last name clusters" % (name, lname))
Example #7
def disambiguate(cluster_set):
    '''
    Updates personid for the clusters in cluster_set, all of which
    share a common last name.
    '''
    bibs = sum(len(c.bibs) for c in cluster_set.clusters)
    expected = bibs * (bibs - 1) / 2
    bibauthor_print("Start working on %s. Total number of bibs: %d, "
                    "maximum number of comparisons: %d"
                     % (cluster_set.last_name, bibs, expected))

    wedge(cluster_set)
    cluster_set.store()
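The "maximum number of comparisons" reported above is simply the number of unordered pairs among the bibs, n * (n - 1) / 2, as computed in the expected variable. A quick illustration:

def max_comparisons(n_bibs):
    # Unordered pairs among n_bibs items: n * (n - 1) / 2.
    return n_bibs * (n_bibs - 1) // 2

print(max_comparisons(100))   # 4950 -> at most 4950 pairwise comparisons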
Example #8
def prepare_matirx(cluster_set, force):
    if bconfig.DEBUG_CHECKS:
        assert cluster_set._debug_test_hate_relation()
        assert cluster_set._debug_duplicated_recs()

    matr = ProbabilityMatrix()
    matr.load(cluster_set.last_name, load_map=True, load_matrix=False)
    if not force and matr.is_up_to_date(cluster_set):
        bibauthor_print("Cluster %s is up-to-date and therefore will not be computed."
            % cluster_set.last_name)
        # nothing to do
        return False

    matr.load(cluster_set.last_name, load_map=False, load_matrix=True)
    matr.recalculate(cluster_set)
    matr.store(cluster_set.last_name)
    return True
Example #9
def prepare_matirx(cluster_set, force):
    if bconfig.DEBUG_CHECKS:
        assert cluster_set._debug_test_hate_relation()
        assert cluster_set._debug_duplicated_recs()

    matr = ProbabilityMatrix()
    matr.load(cluster_set.last_name, load_map=True, load_matrix=False)
    if not force and matr.is_up_to_date(cluster_set):
        bibauthor_print(
            "Cluster %s is up-to-date and therefore will not be computed." %
            cluster_set.last_name)
        # nothing to do
        return False

    matr.load(cluster_set.last_name, load_map=False, load_matrix=True)
    matr.recalculate(cluster_set)
    matr.store(cluster_set.last_name)
    return True
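Both prepare_matirx variants follow the same control flow: load only the lightweight map first, return early when the stored matrix is already up to date (unless force is set), and only then load, recalculate and store the full matrix. Below is a schematic rendering of that flow; FakeMatrix is a stand-in written for this illustration, not the real ProbabilityMatrix API.

class FakeMatrix(object):
    # Stand-in for ProbabilityMatrix, kept only to show the control flow.
    def __init__(self, up_to_date):
        self._up_to_date = up_to_date
    def load(self, name, load_map=True, load_matrix=True):
        pass  # the real class would read the map and/or matrix from storage
    def is_up_to_date(self, cluster_set):
        return self._up_to_date
    def recalculate(self, cluster_set):
        pass
    def store(self, name):
        pass

def prepare(matr, cluster_set, force=False):
    matr.load("last_name", load_map=True, load_matrix=False)   # cheap check
    if not force and matr.is_up_to_date(cluster_set):
        return False                                           # nothing to do
    matr.load("last_name", load_map=False, load_matrix=True)   # full load
    matr.recalculate(cluster_set)
    matr.store("last_name")
    return True

print(prepare(FakeMatrix(up_to_date=True), cluster_set=None))   # False
print(prepare(FakeMatrix(up_to_date=False), cluster_set=None))  # True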
Example #10
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''
    def decide(cl1, cl2):
        score1 = compare_to(cl1, cl2)
        score2 = compare_to(cl2, cl1)

        return compare_to_final_bounds(score1, score2)

    def compare_to(cl1, cl2):
        pointers = [cl1.out_edges[v] for v in cl2.bibs]

        assert pointers, "Wedge: no edges between clusters!"
        vals, probs = zip(*pointers)

        avg = sum(vals) / len(vals)
        if avg > eps:
            nvals = ((val / avg)**prob for val, prob in pointers)
        else:
            return 0

        coeff = gini(nvals)

        weight = sum(starmap(mul, pointers)) / sum(probs)

        wedge_print("Wedge: Decide: vals = %s, probs = %s" %
                    (str(vals), str(probs)))
        wedge_print("Wedge: Decide: coeff = %f, weight = %f" % (coeff, weight))

        return coeff * weight

    def gini(arr):
        arr = sorted(arr, reverse=True)
        dividend = sum(starmap(mul, izip(arr, xrange(1, 2 * len(arr), 2))))
        divisor = len(arr) * sum(arr)
        return float(dividend) / divisor

    def compare_to_final_bounds(score1, score2):
        return score1 + score2 > bconfig.WEDGE_THRESHOLD

    def edge_sorting(edge):
        '''
        probability + certainty / 10
        '''
        return edge[2][0] + edge[2][1] / 10.

    bib_map = create_bib_2_cluster_dict(cluster_set)

    plus_edges, minus_edges, edges = group_edges(cluster_set)

    for i, (bib1, bib2) in enumerate(plus_edges):
        update_status(
            float(i) / len(plus_edges), "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    for i, (bib1, bib2) in enumerate(minus_edges):
        update_status(
            float(i) / len(minus_edges), "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=edge_sorting, reverse=True)

    interval = 1000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, unused) in enumerate(edges):
        if (current % interval) == 0:
            update_status(float(current) / len(edges), "Wedge...")

        assert unused != '+' and unused != '-', "Signed edge after filter!"
        wedge_print("Wedge: poped new edge: Verts = %s, %s Value = (%f, %f)" %
                    (v1, v2, unused[0], unused[1]))
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(
                    cluster_set,
                    "/tmp/%s%d.dot" % (cluster_set.last_name, current),
                    bib_map, (v1, v2, unused))

            if decide(cl1, cl2):
                wedge_print("Wedge: Joined!")
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled!")
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined!")
        else:
            wedge_print("Wedge: Clusters hate each other!")

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name,
                      bib_map)
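The gini helper inside compare_to can be exercised on its own. In the sketch below, zip and range are used instead of izip and xrange so it runs under either Python 2 or 3; equal inputs score exactly 1.0, while an input dominated by a single value scores lower, and since compare_to multiplies this coefficient into the weight, a skewed edge distribution pulls the final score down.

from itertools import starmap
from operator import mul

def gini(arr):
    # Same computation as the nested helper above, written Python-2/3 neutral.
    arr = sorted(arr, reverse=True)
    dividend = sum(starmap(mul, zip(arr, range(1, 2 * len(arr), 2))))
    divisor = len(arr) * sum(arr)
    return float(dividend) / divisor

print(gini([1.0, 1.0, 1.0, 1.0]))   # 1.0  -> values spread evenly
print(gini([1.0, 0.0, 0.0, 0.0]))   # 0.25 -> mass concentrated in one value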
Example #11
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''

    bib_map = create_bib_2_cluster_dict(cluster_set)

    plus_edges, minus_edges, edges = group_edges(cluster_set)

    interval = 1000
    for i, (bib1, bib2) in enumerate(plus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(plus_edges), "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    interval = 1000
    for i, (bib1, bib2) in enumerate(minus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(minus_edges), "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=_edge_sorting, reverse=True)

    interval = 500000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, unused) in enumerate(edges):
        if (current % interval) == 0:
            update_status(float(current) / len(edges), "Wedge...")

        assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        idcl1 = cluster_set.clusters.index(cl1)
        idcl2 = cluster_set.clusters.index(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused))

            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " %  (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)
Example #12
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''

    def decide(cl1, cl2):
        score1 = compare_to(cl1, cl2)
        score2 = compare_to(cl2, cl1)

        return compare_to_final_bounds(score1, score2)

    def compare_to(cl1, cl2):
        pointers = [cl1.out_edges[v] for v in cl2.bibs]

        assert pointers, "Wedge: no edges between clusters!"
        vals, probs = zip(*pointers)

        avg = sum(vals) / len(vals)
        if avg > eps:
            nvals = ((val / avg) ** prob for val, prob in pointers)
        else:
            return 0

        coeff = gini(nvals)

        weight = sum(starmap(mul, pointers)) / sum(probs)

        wedge_print("Wedge: Decide: vals = %s, probs = %s" % (str(vals), str(probs)))
        wedge_print("Wedge: Decide: coeff = %f, weight = %f" % (coeff, weight))

        return coeff * weight

    def gini(arr):
        arr = sorted(arr, reverse=True)
        dividend = sum(starmap(mul, izip(arr, xrange(1, 2 * len(arr), 2))))
        divisor = len(arr) * sum(arr)
        return float(dividend) / divisor

    def compare_to_final_bounds(score1, score2):
        return score1 + score2 > bconfig.WEDGE_THRESHOLD

    def edge_sorting(edge):
        '''
        probability + certainty / 10
        '''
        return edge[2][0] + edge[2][1] / 10.

    bib_map = create_bib_2_cluster_dict(cluster_set)

    plus_edges, minus_edges, edges = group_edges(cluster_set)

    for i, (bib1, bib2) in enumerate(plus_edges):
        update_status(float(i) / len(plus_edges), "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    for i, (bib1, bib2) in enumerate(minus_edges):
        update_status(float(i) / len(minus_edges), "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=edge_sorting, reverse=True)

    interval = 1000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, unused) in enumerate(edges):
        if (current % interval) == 0:
            update_status(float(current) / len(edges), "Wedge...")

        assert unused != '+' and unused != '-', "Signed edge after filter!"
        wedge_print("Wedge: poped new edge: Verts = %s, %s Value = (%f, %f)" % (v1, v2, unused[0], unused[1]))
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), cluster_set.mapping, (v1, v2, unused))

            if decide(cl1, cl2):
                wedge_print("Wedge: Joined!")
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled!")
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined!")
        else:
            wedge_print("Wedge: Clusters hate each other!")

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, cluster_set.mapping)