Example #1
def group_edges(cs):
    plus = []
    minus = []
    pairs = []

    for current, cl1 in enumerate(cs.clusters):
        update_status(float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = cl1.out_edges
        for bib2 in xrange(len(cl1.out_edges)):
            val = pointers[bib2]
            if val[0] not in special_numbers:
                if val[0] > edge_cut_prob:
                    pairs.append((bib1, bib2, val))
            elif val[0] == special_symbols["+"]:
                plus.append((bib1, bib2))
            elif val[0] == special_symbols["-"]:
                minus.append((bib1, bib2))
            else:
                assert val[0] == special_symbols[None]

    update_status_final("Finished with the edge grouping.")

    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d." % (len(plus), len(minus), len(pairs)))
    return plus, minus, pairs
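
The three buckets map directly onto the matrix encoding: special codes in val[0] mark certain matches and non-matches, everything else is a probability kept only above the cut-off. A minimal, self-contained sketch of the same classification, with hypothetical stand-ins for the special symbols and the cut-off (the real values live in Bib_matrix and the module config):

# Hypothetical stand-ins; the real values come from Bib_matrix / config.
SPECIAL_SYMBOLS = {'+': -3.0, '-': -2.0, None: -1.0}
SPECIAL_NUMBERS = frozenset(SPECIAL_SYMBOLS.values())
EDGE_CUT_PROB = 0.3

def bucket_edge(bib1, bib2, val, plus, minus, pairs):
    # val is a (probability, certainty) pair; special codes hide in val[0]
    if val[0] not in SPECIAL_NUMBERS:
        if val[0] > EDGE_CUT_PROB:
            pairs.append((bib1, bib2, val))     # probabilistic edge, kept
    elif val[0] == SPECIAL_SYMBOLS['+']:
        plus.append((bib1, bib2))               # certain match
    elif val[0] == SPECIAL_SYMBOLS['-']:
        minus.append((bib1, bib2))              # certain non-match
    else:
        assert val[0] == SPECIAL_SYMBOLS[None], "Invalid Edge"

plus, minus, pairs = [], [], []
bucket_edge(0, 1, (0.9, 0.5), plus, minus, pairs)
assert pairs == [(0, 1, (0.9, 0.5))]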
Example #2
def group_edges(cs):
    plus = []
    minus = []
    pairs = []
    gc.disable()
    for current, cl1 in enumerate(cs.clusters):
        update_status(
            float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = cl1.out_edges
        for bib2 in xrange(len(cl1.out_edges)):
            val = pointers[bib2]
            if val[0] not in Bib_matrix.special_numbers:
                if val[0] > edge_cut_prob:
                    pairs.append((bib1, bib2, val))
            elif val[0] == Bib_matrix.special_symbols['+']:
                plus.append((bib1, bib2))
            elif val[0] == Bib_matrix.special_symbols['-']:
                minus.append((bib1, bib2))
            else:
                assert val[0] == Bib_matrix.special_symbols[
                    None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")

    bibauthor_print(
        "Positive edges: %d, Negative edges: %d, Value edges: %d." %
        (len(plus), len(minus), len(pairs)))
    gc.enable()
    return plus, minus, pairs
Example #3
def group_edges(cs):
    plus = []
    minus = []
    pairs = []
    gc.disable()
    interval = 1000
    for current, cl1 in enumerate(cs.clusters):
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = cl1.out_edges
        for bib2 in xrange(len(cl1.out_edges)):
            val = pointers[bib2]
            if val[0] not in Bib_matrix.special_numbers:
                if val[0] > edge_cut_prob:
                    pairs.append((bib1, bib2, val))
            elif val[0] == Bib_matrix.special_symbols['+']:
                plus.append((bib1, bib2))
            elif val[0] == Bib_matrix.special_symbols['-']:
                minus.append((bib1, bib2))
            else:
                assert val[0] == Bib_matrix.special_symbols[None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")

    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d."
                     % (len(plus), len(minus), len(pairs)))
    gc.enable()
    return plus, minus, pairs
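
The only change in this variant is that update_status fires once every interval iterations instead of on every cluster, which matters when len(cs.clusters) is large. The same throttling can be factored out; a small sketch (the callback below stands in for update_status):

def throttled(report, interval=1000):
    # Wrap a progress callback so it fires once per `interval` calls.
    def maybe_report(i, total, msg):
        if i % interval == 0:
            report(float(i) / total, msg)
    return maybe_report

progress = throttled(lambda frac, msg: None, interval=1000)
for i in range(5000):
    progress(i, 5000, "Grouping all edges...")  # fires at 0, 1000, 2000, ...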
Example #4
def convert_cluster_set(cs, prob_matr):
    '''
    Converts a normal cluster set to a wedge cluster set.
    @param cs: a cluster set to be converted
    @type cs: cluster set
    @return: a mapping from a number to a bibrefrec.
    '''
    gc.disable()

    # step 1:
    #    + Assign a number to each bibrefrec.
    #    + Replace the arrays of bibrefrecs with arrays of numbers.
    #    + Store the result and prepare it to be returned.
    result_mapping = []
    for clus in cs.clusters:
        start = len(result_mapping)
        result_mapping += list(clus.bibs)
        end = len(result_mapping)
        clus.bibs = range(start, end)

    assert len(result_mapping) == len(set(result_mapping)), PID()+"Cluster set conversion failed"
    assert len(result_mapping) == cs.num_all_bibs, PID()+"Cluster set conversion failed"

    cs.new2old = result_mapping

    # step 2:
    #    + Using the prob matrix, create a vector of values to all other bibs.
    #    + Meld those vectors into one for each cluster.

    special_symbols = Bib_matrix.special_symbols #locality optimization

    interval = 10000
    for current, c1 in enumerate(cs.clusters):
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Converting the cluster set...")

        assert len(c1.bibs) > 0, PID()+"Empty cluster sent to wedge"
        pointers = []

        for v1 in c1.bibs:
            pointer = numpy.ndarray(shape=(len(result_mapping), 2), dtype=float, order='C')
            pointer.fill(special_symbols[None])
            rm = result_mapping[v1] #locality optimization
            for c2 in cs.clusters:
                if c1 != c2 and not c1.hates(c2):
                    for v2 in c2.bibs:
                        val = prob_matr[rm, result_mapping[v2]]
                        try:
                            numb = special_symbols[val]
                            val = (numb, numb)
                        except KeyError:
                            pass
                        assert len(val) == 2, "Edge coding failed"
                        pointer[v2] = val
            pointers.append((pointer, 1))
        c1.out_edges = reduce(meld_edges, pointers)[0]

    update_status_final("Converting the cluster set done.")
    gc.enable()
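
Step 1 is just a dense renumbering: each bibrefrec gets an integer index, clusters keep ranges of those indices, and new2old translates back. A standalone sketch of the same idea, with clusters simplified to plain lists:

def renumber(clusters):
    # Replace every member with a dense integer id; return id -> member.
    new2old = []
    for clus in clusters:
        start = len(new2old)
        new2old.extend(clus)
        clus[:] = range(start, start + len(clus))
    return new2old

clusters = [['a', 'b'], ['c']]
new2old = renumber(clusters)
assert clusters == [[0, 1], [2]]
assert new2old == ['a', 'b', 'c']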
Example #5
def convert_cluster_set(cs, prob_matr):
    '''
    Converts a normal cluster set to a wedge cluster set.
    @param cs: a cluster set to be converted
    @type cs: cluster set
    @return: a mapping from a number to a bibrefrec.
    '''
    gc.disable()

    # step 1:
    #    + Assign a number to each bibrefrec.
    #    + Replace the arrays of bibrefrecs with arrays of numbers.
    #    + Store the result and prepare it to be returned.
    result_mapping = []
    for clus in cs.clusters:
        start = len(result_mapping)
        result_mapping += list(clus.bibs)
        end = len(result_mapping)
        clus.bibs = range(start, end)

    assert len(result_mapping) == len(set(result_mapping)), "Cluster set conversion failed"
    assert len(result_mapping) == cs.num_all_bibs, "Cluster set conversion failed"

    cs.new2old = result_mapping

    # step 2:
    #    + Using the prob matrix, create a vector of values to all other bibs.
    #    + Meld those vectors into one for each cluster.

    special_symbols = Bib_matrix.special_symbols #locality optimization

    for current, c1 in enumerate(cs.clusters):
        update_status(float(current) / len(cs.clusters), "Converting the cluster set...")

        assert len(c1.bibs) > 0, "Empty cluster sent to wedge"
        pointers = []

        for v1 in c1.bibs:
            pointer = numpy.ndarray(shape=(len(result_mapping), 2), dtype=float, order='C')
            pointer.fill(special_symbols[None])
            rm = result_mapping[v1] #locality optimization
            for c2 in cs.clusters:
                if c1 != c2 and not c1.hates(c2):
                    for v2 in c2.bibs:
                        val = prob_matr[rm, result_mapping[v2]]
                        try:
                            numb = special_symbols[val]
                            val = (numb, numb)
                        except KeyError:
                            pass
                        assert len(val) == 2, "Edge coding failed"
                        pointer[v2] = val
            pointers.append((pointer, 1))
        c1.out_edges = reduce(meld_edges, pointers)[0]

    update_status_final("Converting the cluster set done.")
    gc.enable()
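
Each inner pointer is a dense (n_bibs x 2) float array holding a (value, certainty) pair per target bib, pre-filled with the "no edge" code so that only reachable bibs are overwritten. A minimal sketch, with a hypothetical numeric code standing in for special_symbols[None]:

import numpy

SPECIAL_NONE = -1.0   # hypothetical stand-in for special_symbols[None]
n_bibs = 4

pointer = numpy.ndarray(shape=(n_bibs, 2), dtype=float, order='C')
pointer.fill(SPECIAL_NONE)      # every edge starts as "no edge"
pointer[2] = (0.8, 0.5)         # a real (value, certainty) pair for bib 2
assert list(pointer[0]) == [SPECIAL_NONE, SPECIAL_NONE]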
Example #6
    def recalculate(self, cluster_set):
        '''
        Constructs the probability matrix, reusing values from the
        previous matrix for pairs of bibs that are still up to date
        in the cache; all other pairs are recomputed.
        @param cluster_set: A cluster set object, used to initialize
        the matrix.
        '''
        last_cleaned = 0

        old_matrix = self._bib_matrix
        cached_bibs = self.__get_up_to_date_bibs()
        have_cached_bibs = bool(cached_bibs)
        self._bib_matrix = Bib_matrix(cluster_set)

        ncl = cluster_set.num_all_bibs
        expected = ((ncl * (ncl - 1)) / 2)
        if expected == 0:
            expected = 1

        cur_calc, opti = 0, 0
        for cl1 in cluster_set.clusters:
            update_status((float(opti) + cur_calc) / expected,
                          "Prob matrix: calc %d, opti %d." % (cur_calc, opti))

            #clean caches
            if cur_calc - last_cleaned > 2000000:
                clear_comparison_caches()
                last_cleaned = cur_calc

            for cl2 in cluster_set.clusters:
                if id(cl1) < id(cl2) and not cl1.hates(cl2):
                    for bib1 in cl1.bibs:
                        for bib2 in cl2.bibs:
                            if have_cached_bibs and bib1 in cached_bibs and bib2 in cached_bibs:
                                val = old_matrix[bib1, bib2]
                                if not val:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                else:
                                    opti += 1
                                    if bconfig.DEBUG_CHECKS:
                                        assert _debug_is_eq_v(
                                            val,
                                            compare_bibrefrecs(bib1, bib2))
                            else:
                                cur_calc += 1
                                val = compare_bibrefrecs(bib1, bib2)

                            self._bib_matrix[bib1, bib2] = val

        clear_comparison_caches()
        update_status_final("Matrix done. %d calc, %d opt." % (cur_calc, opti))
Example #7
def convert_cluster_set(cs, prob_matr):
    """
    Convertes a normal cluster set to a wedge clsuter set.
    @param cs: a cluster set to be converted
    @param type: cluster set
    @return: a mapping from a number to a bibrefrec.
    """

    # step 1:
    #    + Assign a number to each bibrefrec.
    #    + Replace the arrays of bibrefrecs with arrays of numbers.
    #    + Store the result and prepare it to be returned.

    result_mapping = []
    for clus in cs.clusters:
        start = len(result_mapping)
        result_mapping += list(clus.bibs)
        end = len(result_mapping)
        clus.bibs = range(start, end)

    assert len(result_mapping) == len(set(result_mapping))

    # step 2:
    #    + Using the prob matrix, create a vector of values to all other bibs.
    #    + Meld those vectors into one for each cluster.

    for current, c1 in enumerate(cs.clusters):
        update_status(float(current) / len(cs.clusters), "Converting the cluster set...")

        assert len(c1.bibs) > 0
        pointers = []

        for v1 in c1.bibs:
            pointer = numpy.ndarray(shape=(len(result_mapping), 2), dtype=float, order="C")
            pointer.fill(special_symbols[None])
            for c2 in cs.clusters:
                if c1 != c2 and not c1.hates(c2):
                    for v2 in c2.bibs:
                        val = prob_matr[result_mapping[v1], result_mapping[v2]]
                        if val in special_symbols:
                            numb = special_symbols[val]
                            val = (numb, numb)
                        assert len(val) == 2
                        pointer[v2] = val
            pointers.append((pointer, 1))

        c1.out_edges = reduce(meld_edges, pointers)[0]

    update_status_final("Converting the cluster set done.")

    return result_mapping
Example #8
    def recalculate(self, cluster_set):
        '''
        Constructs the probability matrix, reusing values from the
        previous matrix for pairs of bibs that are still up to date
        in the cache; all other pairs are recomputed.
        @param cluster_set: A cluster set object, used to initialize
        the matrix.
        '''
        last_cleaned = 0

        old_matrix = self._bib_matrix
        cached_bibs = self.__get_up_to_date_bibs()
        have_cached_bibs = bool(cached_bibs)
        self._bib_matrix = Bib_matrix(cluster_set)

        ncl = cluster_set.num_all_bibs
        expected = ((ncl * (ncl - 1)) / 2)
        if expected == 0:
            expected = 1

        cur_calc, opti = 0, 0
        for cl1 in cluster_set.clusters:
            update_status((float(opti) + cur_calc) / expected, "Prob matrix: calc %d, opti %d." % (cur_calc, opti))

            #clean caches
            if cur_calc - last_cleaned > 2000000:
                clear_comparison_caches()
                last_cleaned = cur_calc

            for cl2 in cluster_set.clusters:
                if id(cl1) < id(cl2) and not cl1.hates(cl2):
                    for bib1 in cl1.bibs:
                        for bib2 in cl2.bibs:
                            if have_cached_bibs and bib1 in cached_bibs and bib2 in cached_bibs:
                                val = old_matrix[bib1, bib2]
                                if not val:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                else:
                                    opti += 1
                                    if bconfig.DEBUG_CHECKS:
                                        assert _debug_is_eq_v(val, compare_bibrefrecs(bib1, bib2))
                            else:
                                cur_calc += 1
                                val = compare_bibrefrecs(bib1, bib2)

                            self._bib_matrix[bib1, bib2] = val

        clear_comparison_caches()
        update_status_final("Matrix done. %d calc, %d opt." % (cur_calc, opti))
Example #9
                pid = os.fork()
                if pid == 0: # child
                    os.nice(int((float(sizs[idx]) * 20.0 / biggest)))
                    run_job(job_idx)
                else: # parent
                    pid_2_idx[pid] = job_idx
                    assert free > sizs[job_idx]
                    free -= sizs[job_idx]
                    del free_idxs[idx]
            else:
                break

        pid, status = os.wait()
        assert pid in pid_2_idx
        idx = pid_2_idx[pid]
        freed = sizs[idx]
        done += freed
        ret_status[idx] = status
        free += freed
        del pid_2_idx[pid]
        update_status(done / total, "%d / %d" % (len(jobs) - len(free_idxs) - len(pid_2_idx), len(jobs)))

    update_status_final("%d / %d" % (len(jobs), len(jobs)))
    assert is_eq(free, initial)
    assert not pid_2_idx
    assert not free_idxs
    assert len(jobs) == len(sizs) == len(ret_status) == len(bibs)
    assert all(stat is not None for stat in ret_status)

    return ret_status
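
The fragment above is the middle of a fork-based scheduler: children run jobs and the parent reaps them with os.wait(), tracking pids in pid_2_idx. A compact, self-contained sketch of the same pattern (Unix-only; run_jobs and max_children are illustrative names, not the module's):

import os

def run_jobs(jobs, max_children=2):
    # Run each job in a forked child, at most max_children at a time;
    # collect os.wait() statuses by job index, as the fragment above does.
    pid_2_idx = {}
    statuses = [None] * len(jobs)
    next_idx = 0
    while next_idx < len(jobs) or pid_2_idx:
        while next_idx < len(jobs) and len(pid_2_idx) < max_children:
            pid = os.fork()
            if pid == 0:                # child: do the work and exit hard
                jobs[next_idx]()
                os._exit(0)
            pid_2_idx[pid] = next_idx   # parent: remember who runs what
            next_idx += 1
        pid, status = os.wait()
        statuses[pid_2_idx.pop(pid)] = status
    return statuses

statuses = run_jobs([lambda: None] * 4)
assert all(status == 0 for status in statuses)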
Example #10
def merge():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
    '''
    last_names = frozenset(name[0].split('.')[0] for name in get_existing_result_clusters())

    def get_free_pids():
        while True:
            yield get_new_personid()

    free_pids = get_free_pids()

    def try_move_signature(sig, target_pid):
        """
        """
        paps = get_signature_info(sig)
        claimed = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p: -2 < p[1] < 2, paps)
        rejected = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        if claimed or not assigned or assigned[0] == target_pid:
            return

        assert len(assigned) == 1

        if rejected:
            move_signature(sig, free_pids.next())
        else:
            conflicts = find_conflicts(sig, target_pid)
            if not conflicts:
                move_signature(sig, target_pid)
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    move_signature(sig, free_pids.next())
                else:
                    move_signature(conflicts[0][:3], free_pids.next())
                    move_signature(sig, target_pid)

    for idx, last in enumerate(last_names):
        update_status(float(idx) / len(last_names), "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4]) for row in get_lastname_results(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d)) for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            claim = []
            for d in ds:
                pid_flag = personid_from_signature(d)
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)
                    if flag > 1:
                        claim.append((d, pid))

            matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids] for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx]) for new_idx, old_idx, unused in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(imap(itemgetter(0), best_match))
        not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                try_move_signature(sig, pid)

    update_status_final()
    delete_empty_persons()
    update_personID_canonical_names()
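
maximized_mapping is used here as an assignment solver: given the score matrix matr (result clusters by old pids), it returns the row-to-column matching with maximal total score, as (row, col, score) triples. A brute-force sketch of that contract, workable only for small inputs and assuming no more rows than columns:

from itertools import permutations

def maximized_mapping(matrix):
    # Try every injective row -> column assignment and keep the best one;
    # returns (row, col, score) triples like the real helper appears to.
    n_rows, n_cols = len(matrix), len(matrix[0])
    best = max(permutations(range(n_cols), n_rows),
               key=lambda cols: sum(matrix[r][cols[r]] for r in range(n_rows)))
    return [(r, best[r], matrix[r][best[r]]) for r in range(n_rows)]

match = maximized_mapping([[3, 0], [2, 2]])
assert match == [(0, 0, 3), (1, 1, 2)]   # total 5 beats the alternative 0 + 2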
Example #11
def rabbit(bibrecs, check_invalid_papers=False):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        update_status(float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec))
        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_from_rec(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibrecref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                  for old in old_signatures] for new in new_signatures]

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix) if score > threshold]
        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

            if not matched_pids:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])

    update_status_final()

    if updated_pids: # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)
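
The heart of rabbit() is plain set arithmetic over (tag, ref) signatures: whatever is in the MARC data but not in the personid tables is new, and the reverse is stale and must be rematched. On toy data (the refs below are made up):

markrefs = frozenset([(100, 7), (700, 12), (700, 15)])
personidrefs = frozenset([(700, 12), (700, 99)])

new_signatures = list(markrefs - personidrefs)
old_signatures = list(personidrefs - markrefs)
assert sorted(new_signatures) == [(100, 7), (700, 15)]
assert old_signatures == [(700, 99)]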
Example #12
    def __init__(self, cluster_set, use_cache=False, save_cache=False):
        '''
        Constructs probability matrix. If use_cache is true, it will
        try to load old computations from the database. If save cache
        is true it will save the current results into the database.
        @param cluster_set: A cluster set object, used to initialize
        the matrix.
        '''
        def check_for_cleaning(cur_calc):
            if cur_calc % 10000000 == 0:
                clear_comparison_caches()

        self._bib_matrix = bib_matrix(cluster_set)

        old_matrix = bib_matrix()

        ncl = sum(len(cl.bibs) for cl in cluster_set.clusters)
        expected = ((ncl * (ncl - 1)) / 2)
        if expected == 0:
            expected = 1

        if use_cache and old_matrix.load(cluster_set.last_name):
            cached_bibs = set(filter_modified_record_ids(
                                  old_matrix.get_keys(),
                                  old_matrix.creation_time))
        else:
            cached_bibs = set()

        if save_cache:
            creation_time = get_sql_time()

        cur_calc, opti = 0, 0
        for cl1 in cluster_set.clusters:
            update_status((float(opti) + cur_calc) / expected, "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
            for cl2 in cluster_set.clusters:
                if id(cl1) < id(cl2) and not cl1.hates(cl2):
                    for bib1 in cl1.bibs:
                        for bib2 in cl2.bibs:
                            if bib1 in cached_bibs and bib2 in cached_bibs:
                                val = old_matrix[bib1, bib2]
                                if not val:
                                    cur_calc += 1
                                    check_for_cleaning(cur_calc)
                                    val = compare_bibrefrecs(bib1, bib2)
                                else:
                                    opti += 1
                                    if bconfig.DEBUG_CHECKS:
                                        assert _debug_is_eq_v(val, compare_bibrefrecs(bib1, bib2))
                            else:
                                cur_calc += 1
                                check_for_cleaning(cur_calc)
                                val = compare_bibrefrecs(bib1, bib2)

                            self._bib_matrix[bib1, bib2] = val

        clear_comparison_caches()

        if save_cache:
            update_status(1., "saving...")
            self._bib_matrix.store(cluster_set.last_name, creation_time)

        update_status_final("Matrix done. %d calc, %d opt." % (cur_calc, opti))
Example #13
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''
    def decide(cl1, cl2):
        score1 = compare_to(cl1, cl2)
        score2 = compare_to(cl2, cl1)

        return compare_to_final_bounds(score1, score2)

    def compare_to(cl1, cl2):
        pointers = [cl1.out_edges[v] for v in cl2.bibs]

        assert pointers, "Wedge: no edges between clusters!"
        vals, probs = zip(*pointers)

        avg = sum(vals) / len(vals)
        if avg > eps:
            nvals = ((val / avg)**prob for val, prob in pointers)
        else:
            return 0

        coeff = gini(nvals)

        weight = sum(starmap(mul, pointers)) / sum(probs)

        wedge_print("Wedge: Decide: vals = %s, probs = %s" %
                    (str(vals), str(probs)))
        wedge_print("Wedge: Decide: coeff = %f, weight = %f" % (coeff, weight))

        return coeff * weight

    def gini(arr):
        arr = sorted(arr, reverse=True)
        dividend = sum(starmap(mul, izip(arr, xrange(1, 2 * len(arr), 2))))
        divisor = len(arr) * sum(arr)
        return float(dividend) / divisor

    def compare_to_final_bounds(score1, score2):
        return score1 + score2 > bconfig.WEDGE_THRESHOLD

    def edge_sorting(edge):
        '''
        probability + certainty / 10
        '''
        return edge[2][0] + edge[2][1] / 10.

    bib_map = create_bib_2_cluster_dict(cluster_set)

    plus_edges, minus_edges, edges = group_edges(cluster_set)

    for i, (bib1, bib2) in enumerate(plus_edges):
        update_status(
            float(i) / len(plus_edges), "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    for i, (bib1, bib2) in enumerate(minus_edges):
        update_status(
            float(i) / len(minus_edges), "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=edge_sorting, reverse=True)

    interval = 1000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, unused) in enumerate(edges):
        if (current % interval) == 0:
            update_status(float(current) / len(edges), "Wedge...")

        assert unused != '+' and unused != '-', "Signed edge after filter!"
        wedge_print("Wedge: poped new edge: Verts = %s, %s Value = (%f, %f)" %
                    (v1, v2, unused[0], unused[1]))
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(
                    cluster_set,
                    "/tmp/%s%d.dot" % (cluster_set.last_name, current),
                    bib_map, (v1, v2, unused))

            if decide(cl1, cl2):
                wedge_print("Wedge: Joined!")
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled!")
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined!")
        else:
            wedge_print("Wedge: Clusters hate each other!")

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name,
                      bib_map)
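
gini() is self-contained enough to lift out and test. A version-neutral copy (zip/range in place of izip/xrange) showing what the weighting does: perfectly uniform values score 1.0, a single concentrated value scores 1/n:

def gini(arr):
    # Sort decreasing and weight by 1, 3, 5, ... before normalizing.
    arr = sorted(arr, reverse=True)
    dividend = sum(a * w for a, w in zip(arr, range(1, 2 * len(arr), 2)))
    return float(dividend) / (len(arr) * sum(arr))

assert gini([1.0, 1.0, 1.0]) == 1.0
assert abs(gini([1.0, 0.0, 0.0]) - 1.0 / 3) < 1e-9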
Example #14
                    os.nice(int((float(sizs[idx]) * 20.0 / biggest)))
                    run_job(job_idx)
                else:  # parent
                    pid_2_idx[pid] = job_idx
                    assert free > sizs[job_idx]
                    free -= sizs[job_idx]
                    del free_idxs[idx]
            else:
                break

        pid, status = os.wait()
        assert pid in pid_2_idx
        idx = pid_2_idx[pid]
        freed = sizs[idx]
        done += freed
        ret_status[idx] = status
        free += freed
        del pid_2_idx[pid]
        update_status(
            done / total, "%d / %d" %
            (len(jobs) - len(free_idxs) - len(pid_2_idx), len(jobs)))

    update_status_final("%d / %d" % (len(jobs), len(jobs)))
    assert is_eq(free, initial)
    assert not pid_2_idx
    assert not free_idxs
    assert len(jobs) == len(sizs) == len(ret_status) == len(bibs)
    assert all(stat is not None for stat in ret_status)

    return ret_status
Example #15
 def store(self, name):
     update_status(0., "Saving probability matrix...")
     self._bib_matrix.store(name)
     update_status_final("Probability matrix saved.")
Example #16
 def load(self, lname, load_map=True, load_matrix=True):
     update_status(0., "Loading probability matrix...")
     self._bib_matrix.load(lname, load_map, load_matrix)
     update_status_final("Probability matrix loaded.")
Example #17
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''

    def decide(cl1, cl2):
        score1 = compare_to(cl1, cl2)
        score2 = compare_to(cl2, cl1)

        return compare_to_final_bounds(score1, score2)

    def compare_to(cl1, cl2):
        pointers = [cl1.out_edges[v] for v in cl2.bibs]

        assert pointers, "Wedge: no edges between clusters!"
        vals, probs = zip(*pointers)

        avg = sum(vals) / len(vals)
        if avg > eps:
            nvals = ((val / avg) ** prob for val, prob in pointers)
        else:
            return 0

        coeff = gini(nvals)

        weight = sum(starmap(mul, pointers)) / sum(probs)

        wedge_print("Wedge: Decide: vals = %s, probs = %s" % (str(vals), str(probs)))
        wedge_print("Wedge: Decide: coeff = %f, weight = %f" % (coeff, weight))

        return coeff * weight

    def gini(arr):
        arr = sorted(arr, reverse=True)
        dividend = sum(starmap(mul, izip(arr, xrange(1, 2 * len(arr), 2))))
        divisor = len(arr) * sum(arr)
        return float(dividend) / divisor

    def compare_to_final_bounds(score1, score2):
        return score1 + score2 > bconfig.WEDGE_THRESHOLD

    def edge_sorting(edge):
        '''
        probability + certainty / 10
        '''
        return edge[2][0] + edge[2][1] / 10.

    bib_map = create_bib_2_cluster_dict(cluster_set)

    plus_edges, minus_edges, edges = group_edges(cluster_set)

    for i, (bib1, bib2) in enumerate(plus_edges):
        update_status(float(i) / len(plus_edges), "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    for i, (bib1, bib2) in enumerate(minus_edges):
        update_status(float(i) / len(minus_edges), "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=edge_sorting, reverse=True)

    interval = 1000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, unused) in enumerate(edges):
        if (current % interval) == 0:
            update_status(float(current) / len(edges), "Wedge...")

        assert unused != '+' and unused != '-', "Signed edge after filter!"
        wedge_print("Wedge: poped new edge: Verts = %s, %s Value = (%f, %f)" % (v1, v2, unused[0], unused[1]))
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), cluster_set.mapping, (v1, v2, unused))

            if decide(cl1, cl2):
                wedge_print("Wedge: Joined!")
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled!")
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined!")
        else:
            wedge_print("Wedge: Clusters hate each other!")

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, cluster_set.mapping)
Example #18
 def load(self, lname, load_map=True, load_matrix=True):
     update_status(0., "Loading probability matrix...")
     self._bib_matrix.load(lname, load_map, load_matrix)
     update_status_final("Probability matrix loaded.")
Example #19
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''

    bib_map = create_bib_2_cluster_dict(cluster_set)

    plus_edges, minus_edges, edges = group_edges(cluster_set)

    interval = 1000
    for i, (bib1, bib2) in enumerate(plus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(plus_edges), "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    interval = 1000
    for i, (bib1, bib2) in enumerate(minus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(minus_edges), "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=_edge_sorting, reverse=True)

    interval = 500000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, unused) in enumerate(edges):
        if (current % interval) == 0:
            update_status(float(current) / len(edges), "Wedge...")

        assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        idcl1 = cluster_set.clusters.index(cl1)
        idcl2 = cluster_set.clusters.index(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused))

            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " %  (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)
Example #20
 def store(self, name):
     update_status(0., "Saving probability matrix...")
     self._bib_matrix.store(name)
     update_status_final("Probability matrix saved.")