示例#1
0
def _decide(cl1, cl2):
    '''
    Decides whether two clusters should be joined.

    The score is the sum of _compare_to evaluated in both directions; the
    clusters qualify for joining when that sum is strictly greater than the
    module-level wedge_thrsh.

    @return: tuple (join decision, combined score)
    '''
    forward = _compare_to(cl1, cl2)
    backward = _compare_to(cl2, cl1)
    total = forward + backward

    message = "Wedge: _decide (%f+%f) = %f cmp to %f" % (
        forward, backward, total, wedge_thrsh)
    wedge_print(message)

    should_join = total > wedge_thrsh
    return should_join, total
示例#2
0
def wedge(cluster_set, report_cluster_status=False, force_wedge_thrsh=False):
    '''
    Runs the wedge algorithm on cluster_set, modifying it in place.

    Sets the module globals edge_cut_prob, wedge_thrsh and h5file, loads the
    ProbabilityMatrix for cluster_set.last_name, converts the cluster set,
    runs do_wedge on it and restores it afterwards. The HDF5 cache file opened
    for the run is always closed and deleted, even when a step fails.

    @param cluster_set: the cluster set to process
    @param report_cluster_status: when true, the pairwise compatibility of
        the final clusters is computed and dumped with SER into
        /tmp/baistats/cluster_status_report_pid_<pid>_lastname_<name>_thrsh_<thr>
    @param force_wedge_thrsh: threshold to use instead of
        bconfig.WEDGE_THRESHOLD; any falsy value (False, None, 0) selects the
        configured threshold
    '''
    global edge_cut_prob
    global wedge_thrsh
    global h5file

    if not force_wedge_thrsh:
        threshold = bconfig.WEDGE_THRESHOLD
    else:
        threshold = force_wedge_thrsh

    # The lower bound of the edges being processed by the wedge algorithm.
    edge_cut_prob = threshold / 4.
    wedge_thrsh = threshold

    matr = ProbabilityMatrix(cluster_set.last_name)
    matr.load()

    h5filepath = bconfig.TORTOISE_FILES_PATH+'wedge_cache_'+str(PID())
    h5file = h5py.File(h5filepath)

    try:
        convert_cluster_set(cluster_set, matr)
        del matr # be sure that this is the last reference!

        do_wedge(cluster_set)

        report = []
        if bconfig.DEBUG_WEDGE_PRINT_FINAL_CLUSTER_COMPATIBILITIES or report_cluster_status:
            msg = []
            for cl1 in cluster_set.clusters:
                for cl2 in cluster_set.clusters:
                    # visit every unordered pair only once
                    if cl2 > cl1:
                        id1 = cluster_set.clusters.index(cl1)
                        id2 = cluster_set.clusters.index(cl2)
                        c12 = _compare_to(cl1,cl2)
                        c21 = _compare_to(cl2,cl1)
                        report.append((id1,id2,c12+c21))
                        msg.append( ' %s vs %s : %s + %s = %s -- %s' %  (id1, id2, c12, c21, c12+c21, cl1.hates(cl2)))
            msg = 'Wedge final clusters for %s: \n' % str(wedge_thrsh) + '\n'.join(msg)
            if not bconfig.DEBUG_WEDGE_OUTPUT and bconfig.DEBUG_WEDGE_PRINT_FINAL_CLUSTER_COMPATIBILITIES:
                # Single-argument call form: prints the same text whether
                # print is the Python 2 statement or the print function.
                print('')
                print(msg)
                print('')
            wedge_print(msg)

        restore_cluster_set(cluster_set)

        if bconfig.DEBUG_CHECKS:
            assert cluster_set._debug_test_hate_relation()
            assert cluster_set._debug_duplicated_recs()

        if report_cluster_status:
            destfile = '/tmp/baistats/cluster_status_report_pid_%s_lastname_%s_thrsh_%s' % (str(PID()),str(cluster_set.last_name),str(wedge_thrsh))
            f = filehandler.open(destfile, 'w')
            try:
                SER.dump([wedge_thrsh,cluster_set.last_name,report,cluster_set.num_all_bibs],f)
            finally:
                # do not leak the report handle if the dump fails
                f.close()
        gc.collect()
    finally:
        # Release the cache on every exit path; previously an exception in
        # any step above left the HDF5 file open and on disk.
        try:
            h5file.close()
        finally:
            os.remove(h5filepath)
示例#3
0
def wedge(cluster_set, report_cluster_status=False, force_wedge_thrsh=False):
    '''
    Runs the wedge algorithm on cluster_set, modifying it in place.

    Sets the module globals edge_cut_prob, wedge_thrsh and h5file, loads the
    ProbabilityMatrix for cluster_set.last_name, converts the cluster set,
    runs do_wedge on it and restores it afterwards. The HDF5 cache file opened
    for the run is always closed and deleted, even when a step fails.

    @param cluster_set: the cluster set to process
    @param report_cluster_status: when true, the pairwise compatibility of
        the final clusters is computed and dumped with SER into
        /tmp/baistats/cluster_status_report_pid_<pid>_lastname_<name>_thrsh_<thr>
    @param force_wedge_thrsh: threshold to use instead of
        bconfig.WEDGE_THRESHOLD; any falsy value (False, None, 0) selects the
        configured threshold
    '''
    global edge_cut_prob
    global wedge_thrsh
    global h5file

    if not force_wedge_thrsh:
        threshold = bconfig.WEDGE_THRESHOLD
    else:
        threshold = force_wedge_thrsh

    # The lower bound of the edges being processed by the wedge algorithm.
    edge_cut_prob = threshold / 4.
    wedge_thrsh = threshold

    matr = ProbabilityMatrix(cluster_set.last_name)
    matr.load()

    h5filepath = bconfig.TORTOISE_FILES_PATH+'wedge_cache_'+str(PID())
    h5file = h5py.File(h5filepath)

    try:
        convert_cluster_set(cluster_set, matr)
        del matr # be sure that this is the last reference!

        do_wedge(cluster_set)

        report = []
        if bconfig.DEBUG_WEDGE_PRINT_FINAL_CLUSTER_COMPATIBILITIES or report_cluster_status:
            msg = []
            for cl1 in cluster_set.clusters:
                for cl2 in cluster_set.clusters:
                    # visit every unordered pair only once
                    if cl2 > cl1:
                        id1 = cluster_set.clusters.index(cl1)
                        id2 = cluster_set.clusters.index(cl2)
                        c12 = _compare_to(cl1,cl2)
                        c21 = _compare_to(cl2,cl1)
                        report.append((id1,id2,c12+c21))
                        msg.append( ' %s vs %s : %s + %s = %s -- %s' %  (id1, id2, c12, c21, c12+c21, cl1.hates(cl2)))
            msg = 'Wedge final clusters for %s: \n' % str(wedge_thrsh) + '\n'.join(msg)
            if not bconfig.DEBUG_WEDGE_OUTPUT and bconfig.DEBUG_WEDGE_PRINT_FINAL_CLUSTER_COMPATIBILITIES:
                # Single-argument call form: prints the same text whether
                # print is the Python 2 statement or the print function.
                print('')
                print(msg)
                print('')
            wedge_print(msg)

        restore_cluster_set(cluster_set)

        if bconfig.DEBUG_CHECKS:
            assert cluster_set._debug_test_hate_relation()
            assert cluster_set._debug_duplicated_recs()

        if report_cluster_status:
            destfile = '/tmp/baistats/cluster_status_report_pid_%s_lastname_%s_thrsh_%s' % (str(PID()),str(cluster_set.last_name),str(wedge_thrsh))
            f = filehandler.open(destfile, 'w')
            try:
                SER.dump([wedge_thrsh,cluster_set.last_name,report,cluster_set.num_all_bibs],f)
            finally:
                # do not leak the report handle if the dump fails
                f.close()
        gc.collect()
    finally:
        # Release the cache on every exit path; previously an exception in
        # any step above left the HDF5 file open and on disk.
        try:
            h5file.close()
        finally:
            os.remove(h5filepath)
示例#4
0
def _compare_to(cl1, cl2):
    '''
    Scores the edges going from cl1 to the bibs of cl2.

    The outgoing edges of cl1 are read from the module-level h5file under the
    key str(id(cl1)); every entry is a (value, probability) pair.

    @return: 0. if any value is SP_QUARREL, 0.5 if any value is SP_CONFIRM,
        0 if the mean value does not exceed eps, otherwise half of the
        _gini coefficient of the normalised values multiplied by the
        probability-weighted mean of the values.
    '''
    out_edges = h5file[str(id(cl1))]
    pointers = [out_edges[bib] for bib in cl2.bibs]
    assert pointers, PID() + "Wedge: no edges between clusters!"

    vals, probs = zip(*pointers)
    wedge_print("Wedge: _compare_to: vals = %s, probs = %s" %
                (str(vals), str(probs)))

    # A single signed edge settles the comparison outright.
    if SP_QUARREL in vals:
        wedge_print('Wedge: _compare_to: - edge present, returning 0')
        return 0.

    if SP_CONFIRM in vals:
        wedge_print('Wedge: _compare_to: + edge present, returning 0.5')
        return 0.5

    avg = sum(vals) / len(vals)
    if not avg > eps:
        wedge_print(
            "Wedge: _compare_to: vals too low to compare, skipping")
        return 0

    # Normalise every value by the mean and damp it by its probability.
    nvals = [(val / avg)**prob for val, prob in pointers]
    coeff = _gini(nvals)
    weight = sum(starmap(mul, pointers)) / sum(probs)
    ret = (coeff * weight) / 2.

    assert ret <= 0.5, PID() + 'COMPARE_TO big value returned ret %s coeff %s weight %s nvals %s vals %s prob %s' % (
        ret, coeff, weight, nvals, vals, probs)

    wedge_print(
        "Wedge: _compare_to: coeff = %f, weight = %f, retval = %f" %
        (coeff, weight, ret))
    return ret
示例#5
0
def _compare_to(cl1, cl2):
    '''
    Computes the one-directional compatibility score of cl1 towards cl2.

    For each bib of cl2 the matching (value, probability) entry is fetched
    from the h5file group stored under str(id(cl1)).

    @return: 0. when an SP_QUARREL value is present, 0.5 when an SP_CONFIRM
        value is present, 0 when the average value is not above eps, and
        otherwise (gini coefficient * weighted mean value) / 2.
    '''
    edges_of_cl1 = h5file[str(id(cl1))]
    pointers = [edges_of_cl1[bib] for bib in cl2.bibs]

    assert pointers, PID()+"Wedge: no edges between clusters!"
    vals, probs = zip(*pointers)

    header = "Wedge: _compare_to: vals = %s, probs = %s" % (str(vals), str(probs))
    wedge_print(header)

    if SP_QUARREL in vals:
        wedge_print('Wedge: _compare_to: - edge present, returning 0')
        return 0.

    if SP_CONFIRM in vals:
        wedge_print('Wedge: _compare_to: + edge present, returning 0.5')
        return 0.5

    avg = sum(vals) / len(vals)
    if not avg > eps:
        wedge_print("Wedge: _compare_to: vals too low to compare, skipping")
        return 0

    nvals = []
    for val, prob in pointers:
        nvals.append((val / avg) ** prob)

    coeff = _gini(nvals)

    # probability-weighted mean of the raw values
    weighted_total = sum(val * prob for val, prob in pointers)
    weight = weighted_total / sum(probs)

    ret = (coeff * weight) / 2.

    assert ret <= 0.5, PID()+'COMPARE_TO big value returned ret %s coeff %s weight %s nvals %s vals %s prob %s' % (ret, coeff, weight, nvals, vals, probs)

    summary = "Wedge: _compare_to: coeff = %f, weight = %f, retval = %f" % (coeff, weight, ret)
    wedge_print(summary)

    return ret
示例#6
0
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.

    group_sort_edges is run in a child process; afterwards four cache files
    named after the parent PID (presumably written by that child) are read:
    the '+' edges, the '-' edges, the remaining value edges and a pickled
    tuple with the three edge counts. The cache files are closed and removed
    on every exit path.

    @param cluster_set: the cluster set to rearrange in place
    @param deep_debug: when true, a .dot file is exported to /tmp for every
        processed edge and once more at the end
    '''

    bib_map = create_bib_2_cluster_dict(cluster_set)
    original_process_id = PID()

    p = Process(target=group_sort_edges, args=(cluster_set,original_process_id))
    p.start()
    p.join()

    # One path per cache file, built once instead of at every use.
    cache_tags = ('p', 'm', 'e', 'data')
    cache_paths = {}
    for tag in cache_tags:
        cache_paths[tag] = (bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_'
                            + tag + '_' + str(original_process_id))

    open_fps = {}
    try:
        # Handles are registered as they are opened so that a failing open()
        # still gets the already opened ones closed in the finally clause.
        for tag in cache_tags:
            open_fps[tag] = open(cache_paths[tag], 'r')
        plus_edges_fp = open_fps['p']
        minus_edges_fp = open_fps['m']
        edges_fp = open_fps['e']

        len_plus,len_minus,len_edges = cPickle.load(open_fps['data'])
        open_fps.pop('data').close()

        interval = 1000
        for i, s in enumerate(plus_edges_fp):
            bib1, bib2, unused = _unpack_vals(s)
            if (i % interval) == 0:
                update_status(float(i) / len_plus, "Agglomerating obvious clusters...")
            cl1 = bib_map[bib1]
            cl2 = bib_map[bib2]
            if cl1 != cl2 and not cl1.hates(cl2):
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                # every bib of the absorbed cluster now lives in cl1
                for v in cl2.bibs:
                    bib_map[v] = cl1
        update_status_final("Agglomerating obvious clusters done.")

        interval = 1000
        for i, s in enumerate(minus_edges_fp):
            bib1, bib2, unused = _unpack_vals(s)
            if (i % interval) == 0:
                update_status(float(i) / len_minus, "Dividing obvious clusters...")
            cl1 = bib_map[bib1]
            cl2 = bib_map[bib2]
            if cl1 != cl2 and not cl1.hates(cl2):
                cl1.quarrel(cl2)
        update_status_final("Dividing obvious clusters done.")

        interval = 50000
        wedge_print("Wedge: New wedge, %d edges." % len_edges)
        for current, s in enumerate(edges_fp):
            v1, v2, edge_val = _unpack_vals(s)
            if (current % interval) == 0:
                update_status(float(current) / len_edges, "Wedge...")

            # str() keeps the message buildable whether PID() yields a
            # string or a number.
            assert edge_val != '+' and edge_val != '-', str(PID())+"Signed edge after filter!"
            cl1 = bib_map[v1]
            cl2 = bib_map[v2]
            #try using object ids instead of index to boost performances
            idcl1 = id(cl1)
            idcl2 = id(cl2)

            #keep the ids low!
            if idcl1 > idcl2:
                idcl1, idcl2 = idcl2, idcl1
                cl1, cl2 = cl2, cl1

            wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, edge_val[0], edge_val[1]))

            if cl1 != cl2 and not cl1.hates(cl2):
                if deep_debug:
                    export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, edge_val))

                decision, value = _decide(cl1, cl2)
                if decision:
                    wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
                    join(cl1, cl2)
                    cluster_set.clusters.remove(cl2)
                    for v in cl2.bibs:
                        bib_map[v] = cl1
                else:
                    wedge_print("Wedge: Quarreled %s from %s with %s " %  (idcl1, idcl2, value))
                    cl1.quarrel(cl2)
            elif cl1 == cl2:
                wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
            else:
                wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

        update_status_final("Wedge done.")
        bibauthor_print("")

        if deep_debug:
            export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)
    finally:
        for fp in open_fps.values():
            fp.close()

        # Best-effort cleanup: each file is removed independently so that one
        # missing file does not keep the others on disk, and only filesystem
        # errors are ignored.
        for tag in cache_tags:
            try:
                os.remove(cache_paths[tag])
            except OSError:
                pass
示例#7
0
def _decide(cl1, cl2):
    '''
    Tells whether cl1 and cl2 are compatible enough to be merged.

    Both directed scores from _compare_to are added up and the sum is
    compared against the global wedge_thrsh.

    @return: (True if the sum is above the threshold, the sum itself)
    '''
    scores = (_compare_to(cl1, cl2), _compare_to(cl2, cl1))
    combined = scores[0] + scores[1]
    wedge_print("Wedge: _decide (%f+%f) = %f cmp to %f" % (scores + (combined, wedge_thrsh)))
    return combined > wedge_thrsh, combined
示例#8
0
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the
    probability_matrix, working on the in-memory edge lists returned by
    group_edges.

    Three passes are made: clusters connected by a plus edge are joined,
    clusters connected by a minus edge are made to quarrel, and the remaining
    value edges are processed in descending _edge_sorting order, letting
    _decide choose between joining and quarrelling.

    The deep debug option will produce a lot of output. Avoid using it with
    more than 20 bibs in the cluster set.
    '''

    bib_map = create_bib_2_cluster_dict(cluster_set)
    plus_edges, minus_edges, edges = group_edges(cluster_set)

    # Pass 1: join everything connected by a plus edge.
    step = 1000
    for pos, (bib1, bib2) in enumerate(plus_edges):
        if not pos % step:
            update_status(float(pos) / len(plus_edges), "Agglomerating obvious clusters...")
        keeper = bib_map[bib1]
        absorbed = bib_map[bib2]
        if keeper != absorbed and not keeper.hates(absorbed):
            join(keeper, absorbed)
            cluster_set.clusters.remove(absorbed)
            for bib in absorbed.bibs:
                bib_map[bib] = keeper
    update_status_final("Agglomerating obvious clusters done.")

    # Pass 2: separate everything connected by a minus edge.
    step = 1000
    for pos, (bib1, bib2) in enumerate(minus_edges):
        if not pos % step:
            update_status(float(pos) / len(minus_edges), "Dividing obvious clusters...")
        first = bib_map[bib1]
        second = bib_map[bib2]
        if first != second and not first.hates(second):
            first.quarrel(second)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    edges = list(edges)
    edges.sort(key=_edge_sorting, reverse=True)

    # Pass 3: let _decide settle the remaining value edges.
    step = 500000
    edge_count = len(edges)
    wedge_print("Wedge: New wedge, %d edges." % edge_count)
    for current, (v1, v2, unused) in enumerate(edges):
        if not current % step:
            update_status(float(current) / edge_count, "Wedge...")

        assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        idcl1 = cluster_set.clusters.index(cl1)
        idcl2 = cluster_set.clusters.index(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            (idcl1, cl1), (idcl2, cl2) = (idcl2, cl2), (idcl1, cl1)

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        mergeable = cl1 != cl2 and not cl1.hates(cl2)
        if not mergeable:
            # nothing to decide: report why and move on to the next edge
            if cl1 == cl2:
                wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
            else:
                wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))
            continue

        if deep_debug:
            export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused))

        decision, value = _decide(cl1, cl2)
        if not decision:
            wedge_print("Wedge: Quarreled %s from %s with %s " %  (idcl1, idcl2, value))
            cl1.quarrel(cl2)
            continue

        wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
        join(cl1, cl2)
        cluster_set.clusters.remove(cl2)
        for bib in cl2.bibs:
            bib_map[bib] = cl1

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)
示例#9
0
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.

    group_sort_edges is run in a child process; afterwards four cache files
    named after the parent PID (presumably written by that child) are read:
    the '+' edges, the '-' edges, the remaining value edges and a pickled
    tuple with the three edge counts. The cache files are closed and removed
    on every exit path.

    @param cluster_set: the cluster set to rearrange in place
    @param deep_debug: when true, a .dot file is exported to /tmp for every
        processed edge and once more at the end
    '''

    bib_map = create_bib_2_cluster_dict(cluster_set)
    original_process_id = PID()

    p = Process(target=group_sort_edges,
                args=(cluster_set, original_process_id))
    p.start()
    p.join()

    # One path per cache file, built once instead of at every use.
    cache_tags = ('p', 'm', 'e', 'data')
    cache_paths = {}
    for tag in cache_tags:
        cache_paths[tag] = (bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_'
                            + tag + '_' + str(original_process_id))

    open_fps = {}
    try:
        # Handles are registered as they are opened so that a failing open()
        # still gets the already opened ones closed in the finally clause.
        for tag in cache_tags:
            open_fps[tag] = open(cache_paths[tag], 'r')
        plus_edges_fp = open_fps['p']
        minus_edges_fp = open_fps['m']
        edges_fp = open_fps['e']

        len_plus, len_minus, len_edges = cPickle.load(open_fps['data'])
        open_fps.pop('data').close()

        interval = 1000
        for i, s in enumerate(plus_edges_fp):
            bib1, bib2, unused = _unpack_vals(s)
            if (i % interval) == 0:
                update_status(
                    float(i) / len_plus, "Agglomerating obvious clusters...")
            cl1 = bib_map[bib1]
            cl2 = bib_map[bib2]
            if cl1 != cl2 and not cl1.hates(cl2):
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                # every bib of the absorbed cluster now lives in cl1
                for v in cl2.bibs:
                    bib_map[v] = cl1
        update_status_final("Agglomerating obvious clusters done.")

        interval = 1000
        for i, s in enumerate(minus_edges_fp):
            bib1, bib2, unused = _unpack_vals(s)
            if (i % interval) == 0:
                update_status(float(i) / len_minus, "Dividing obvious clusters...")
            cl1 = bib_map[bib1]
            cl2 = bib_map[bib2]
            if cl1 != cl2 and not cl1.hates(cl2):
                cl1.quarrel(cl2)
        update_status_final("Dividing obvious clusters done.")

        interval = 50000
        wedge_print("Wedge: New wedge, %d edges." % len_edges)
        for current, s in enumerate(edges_fp):
            v1, v2, edge_val = _unpack_vals(s)
            if (current % interval) == 0:
                update_status(float(current) / len_edges, "Wedge...")

            # str() keeps the message buildable whether PID() yields a
            # string or a number.
            assert edge_val != '+' and edge_val != '-', str(
                PID()) + "Signed edge after filter!"
            cl1 = bib_map[v1]
            cl2 = bib_map[v2]
            #try using object ids instead of index to boost performances
            idcl1 = id(cl1)
            idcl2 = id(cl2)

            #keep the ids low!
            if idcl1 > idcl2:
                idcl1, idcl2 = idcl2, idcl1
                cl1, cl2 = cl2, cl1

            wedge_print(
                "Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)"
                % (idcl1, idcl2, v1, v2, edge_val[0], edge_val[1]))

            if cl1 != cl2 and not cl1.hates(cl2):
                if deep_debug:
                    export_to_dot(
                        cluster_set,
                        "/tmp/%s%d.dot" % (cluster_set.last_name, current),
                        bib_map, (v1, v2, edge_val))

                decision, value = _decide(cl1, cl2)
                if decision:
                    wedge_print("Wedge: Joined %s to %s with %s" %
                                (idcl1, idcl2, value))
                    join(cl1, cl2)
                    cluster_set.clusters.remove(cl2)
                    for v in cl2.bibs:
                        bib_map[v] = cl1
                else:
                    wedge_print("Wedge: Quarreled %s from %s with %s " %
                                (idcl1, idcl2, value))
                    cl1.quarrel(cl2)
            elif cl1 == cl2:
                wedge_print("Wedge: Clusters already joined! (%s,%s)" %
                            (idcl1, idcl2))
            else:
                wedge_print("Wedge: Clusters hate each other! (%s,%s)" %
                            (idcl1, idcl2))

        update_status_final("Wedge done.")
        bibauthor_print("")

        if deep_debug:
            export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name,
                          bib_map)
    finally:
        for fp in open_fps.values():
            fp.close()

        # Best-effort cleanup: each file is removed independently so that one
        # missing file does not keep the others on disk, and only filesystem
        # errors are ignored.
        for tag in cache_tags:
            try:
                os.remove(cache_paths[tag])
            except OSError:
                pass