Пример #1
0
def group_edges(cs):
    plus = []
    minus = []
    pairs = []
    gc.disable()
    interval = 1000
    for current, cl1 in enumerate(cs.clusters):
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = cl1.out_edges
        for bib2 in xrange(len(cl1.out_edges)):
            val = pointers[bib2]
            if val[0] not in Bib_matrix.special_numbers:
                if val[0] > edge_cut_prob:
                    pairs.append((bib1, bib2, val))
            elif val[0] == Bib_matrix.special_symbols['+']:
                plus.append((bib1, bib2))
            elif val[0] == Bib_matrix.special_symbols['-']:
                minus.append((bib1, bib2))
            else:
                assert val[0] == Bib_matrix.special_symbols[None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")

    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d."
                     % (len(plus), len(minus), len(pairs)))
    gc.enable()
    return plus, minus, pairs
Пример #2
0
def convert_cluster_set(cs, prob_matr):
    '''
    Convertes a normal cluster set to a wedge clsuter set.
    @param cs: a cluster set to be converted
    @param type: cluster set
    @return: a mapping from a number to a bibrefrec.
    '''
    gc.disable()

    # step 1:
    #    + Assign a number to each bibrefrec.
    #    + Replace the arrays of bibrefrecs with arrays of numbers.
    #    + Store the result and prepare it to be returned.
    result_mapping = []
    for clus in cs.clusters:
        start = len(result_mapping)
        result_mapping += list(clus.bibs)
        end = len(result_mapping)
        clus.bibs = range(start, end)

    assert len(result_mapping) == len(set(result_mapping)), PID()+"Cluster set conversion failed"
    assert len(result_mapping) == cs.num_all_bibs, PID()+"Cluster set conversion failed"

    cs.new2old = result_mapping

    # step 2:
    #    + Using the prob matrix create a vector values to all other bibs.
    #    + Meld those vectors into one for each cluster.

    special_symbols = Bib_matrix.special_symbols #locality optimization

    interval = 10000
    for current, c1 in enumerate(cs.clusters):
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Converting the cluster set...")

        assert len(c1.bibs) > 0, PID()+"Empty cluster send to wedge"
        pointers = []

        for v1 in c1.bibs:
            pointer = numpy.ndarray(shape=(len(result_mapping), 2), dtype=float, order='C')
            pointer.fill(special_symbols[None])
            rm = result_mapping[v1] #locality optimization
            for c2 in cs.clusters:
                if c1 != c2 and not c1.hates(c2):
                    for v2 in c2.bibs:
                        val = prob_matr[rm, result_mapping[v2]]
                        try:
                            numb = special_symbols[val]
                            val = (numb, numb)
                        except KeyError:
                            pass
                        assert len(val) == 2, "Edge coding failed"
                        pointer[v2] = val
            pointers.append((pointer, 1))
        c1.out_edges = reduce(meld_edges, pointers)[0]

    update_status_final("Converting the cluster set done.")
    gc.enable()
Пример #3
0
    def recalculate(self, cluster_set):
        '''
        Constructs probability matrix. If use_cache is true, it will
        try to load old computations from the database. If save cache
        is true it will save the current results into the database.
        @param cluster_set: A cluster set object, used to initialize
        the matrix.
        '''
        last_cleaned = 0

        old_matrix = self._bib_matrix
        cached_bibs = self.__get_up_to_date_bibs()
        have_cached_bibs = bool(cached_bibs)
        self._bib_matrix = Bib_matrix(cluster_set)

        ncl = cluster_set.num_all_bibs
        expected = ((ncl * (ncl - 1)) / 2)
        if expected == 0:
            expected = 1

        cur_calc, opti, prints_counter = 0, 0, 0
        for cl1 in cluster_set.clusters:

            if cur_calc + opti - prints_counter > 100000:
                update_status(
                    (float(opti) + cur_calc) / expected,
                    "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
                prints_counter = cur_calc + opti

            #clean caches
            if cur_calc - last_cleaned > 2000000:
                clear_comparison_caches()
                last_cleaned = cur_calc

            for cl2 in cluster_set.clusters:
                if id(cl1) < id(cl2) and not cl1.hates(cl2):
                    for bib1 in cl1.bibs:
                        for bib2 in cl2.bibs:
                            if have_cached_bibs and bib1 in cached_bibs and bib2 in cached_bibs:
                                val = old_matrix[bib1, bib2]
                                if not val:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                else:
                                    opti += 1
                                    if bconfig.DEBUG_CHECKS:
                                        assert _debug_is_eq_v(
                                            val,
                                            compare_bibrefrecs(bib1, bib2))
                            else:
                                cur_calc += 1
                                val = compare_bibrefrecs(bib1, bib2)
                            self._bib_matrix[bib1, bib2] = val

        clear_comparison_caches()
        update_status_final("Matrix done. %d calc, %d opt." % (cur_calc, opti))
    def recalculate(self, cluster_set):
        '''
        Constructs probability matrix. If use_cache is true, it will
        try to load old computations from the database. If save cache
        is true it will save the current results into the database.
        @param cluster_set: A cluster set object, used to initialize
        the matrix.
        '''
        last_cleaned = 0

        old_matrix = self._bib_matrix
        cached_bibs = self.__get_up_to_date_bibs()
        have_cached_bibs = bool(cached_bibs)
        self._bib_matrix = Bib_matrix(cluster_set)

        ncl = cluster_set.num_all_bibs
        expected = ((ncl * (ncl - 1)) / 2)
        if expected == 0:
            expected = 1

        cur_calc, opti, prints_counter = 0, 0, 0
        for cl1 in cluster_set.clusters:

            if cur_calc+opti - prints_counter > 100000:
                update_status((float(opti) + cur_calc) / expected, "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
                prints_counter = cur_calc+opti

            #clean caches
            if cur_calc - last_cleaned > 2000000:
                clear_comparison_caches()
                last_cleaned = cur_calc

            for cl2 in cluster_set.clusters:
                if id(cl1) < id(cl2) and not cl1.hates(cl2):
                    for bib1 in cl1.bibs:
                        for bib2 in cl2.bibs:
                            if have_cached_bibs and bib1 in cached_bibs and bib2 in cached_bibs:
                                val = old_matrix[bib1, bib2]
                                if not val:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                else:
                                    opti += 1
                                    if bconfig.DEBUG_CHECKS:
                                        assert _debug_is_eq_v(val, compare_bibrefrecs(bib1, bib2))
                            else:
                                cur_calc += 1
                                val = compare_bibrefrecs(bib1, bib2)
                            self._bib_matrix[bib1, bib2] = val

        clear_comparison_caches()
        update_status_final("Matrix done. %d calc, %d opt." % (cur_calc, opti))
Пример #5
0
def merge_static_classy():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
        This function is static: if aid* tables are changed while it's running,
        probably everything will crash and a black hole will open, eating all your data.

        NOTE: this is more elegant that merge_static but much slower. Will have to be improved
               before it can replace it.
    '''
    class Sig(object):
        def __init__(self, bibrefrec, pid_flag):
            self.rejected = dict(filter(lambda p: p[1] <= -2, pid_flag))
            self.assigned = filter(lambda p: -2 < p[1] and p[1] < 2, pid_flag)
            self.claimed = filter(lambda p: 2 <= p[1], pid_flag)
            self.bibrefrec = bibrefrec

            assert self.invariant()

        def invariant(self):
            return len(self.assigned) + len(self.claimed) <= 1

        def empty(self):
            return not self.isclaimed and not self.isassigned

        def isclaimed(self):
            return len(self.claimed) == 1

        def get_claimed(self):
            return self.claimed[0][0]

        def get_assigned(self):
            return self.assigned[0][0]

        def isassigned(self):
            return len(self.assigned) == 1

        def isrejected(self, pid):
            return pid in self.rejected

        def change_pid(self, pid):
            assert self.invariant()
            assert self.isassigned()
            self.assigned = [(pid, 0)]
            move_signature(self.bibrefrec, pid)

    class Cluster(object):
        def __init__(self, pid, sigs):
            self.pid = pid

            self.sigs = dict(
                (sig.bibrefrec[2], sig) for sig in sigs if not sig.empty())

        def send_sig(self, other, sig):
            paper = sig.bibrefrec[2]
            assert paper in self.sigs and paper not in other.sigs

            del self.sigs[paper]
            other.sigs[paper] = sig

            if sig.isassigned():
                sig.change_pid(other.pid)

    last_names = frozenset(name[0].split('.')[0]
                           for name in get_existing_result_clusters())

    personid = get_bibrefrec_to_pid_flag_mapping()
    free_pids = backinterface_get_free_pids()

    for idx, last in enumerate(last_names):
        update_status(
            float(idx) / len(last_names),
            "Merging, %d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4])
                   for row in get_lastname_results(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d))
                   for k, d in groupby(sorted(results, key=itemgetter(0)),
                                       key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            for d in ds:
                pid_flag = filter(lambda x: x[1] > -2, personid.get(d, []))
                if pid_flag:
                    assert len(pid_flag) == 1
                    pid = pid_flag[0][0]
                    pids.append(pid)
                    old_pids.add(pid)

            matr.append(
                dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids]
                                        for row in matr])

        # [[bibrefrecs] -> pid]
        matched_clusters = [(results[new_idx][1], old_pids[old_idx])
                            for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(
            imap(itemgetter(0), best_match))
        not_matched_clusters = izip(
            (results[i][1] for i in not_matched_clusters), free_pids)

        # pid -> Cluster
        clusters = dict(
            (pid,
             Cluster(pid, [Sig(bib, personid.get(bib, [])) for bib in sigs]))
            for sigs, pid in chain(matched_clusters, not_matched_clusters))

        todo = clusters.items()
        for pid, clus in todo:
            assert clus.pid == pid

            for paper, sig in clus.sigs.items():
                if sig.isclaimed():
                    if sig.get_claimed() != pid:
                        target_clus = clusters[sig.get_claimed()]

                        if paper in target_clus.sigs:
                            new_clus = Cluster(free_pids.next(), [])
                            target_clus.send_sig(new_clus, target_clus[paper])
                            todo.append(new_clus)
                            clusters[new_clus.pid] = new_clus

                        assert paper not in target_clus.sigs
                        clus.send_sig(target_clus, sig)
                elif sig.get_assigned() != pid:
                    if not sig.isrejected(pid):
                        move_signature(sig.bibrefrec, pid)
                    else:
                        move_signature(sig.bibrefrec, free_pids.next())
                else:
                    assert not sig.isrejected(pid)

    update_status_final("Merging done.")

    update_status_final()
    delete_empty_persons()
    update_personID_canonical_names()
 def store(self, name):
     update_status(0., "Saving probability matrix...")
     self._bib_matrix.store(name)
     update_status_final("Probability matrix saved.")
Пример #7
0
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    override_stdout_config(stdout=True)

    files = ['/tmp/baistats/'+x for x in os.listdir('/tmp/baistats/') if x.startswith('cluster_status_report_pid')]
    fnum = float(len(files))
    quanta = .1/fnum


    total_stats = 0
    used_coeffs = set()
    used_clusters = set()

    #av_counter, avg, min, max, nclus, normalized_avg
    cluster_stats = defaultdict(lambda : defaultdict(lambda : [0.,0.,0.,0.,0.,0.]))
    coeff_stats = defaultdict(lambda : [0.,0.,0.,0.,0.,0.])


    def gen_graphs(only_synthetic=False):
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i,c in enumerate(cn):
                update_status(i/l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i,fi in enumerate(files):
        if generate_graphs:
            if i%1000 ==0:
                gen_graphs(True)

        f = open(fi,'r')
        status = i/fnum
        update_status(status, 'Loading '+ fi[fi.find('lastname')+9:])
        contents = SER.load(f)
        f.close()

        cur_coef = contents[0]
        cur_clust = contents[1]

        cur_maxlen = float(contents[3])

        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)

            update_status(status+0.2*quanta, '  Computing averages...')

            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))

            assert cur_clustnumber > 0 and cur_clustnumber < cur_maxlen, "Error, found log with strange clustnumber! %s %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_maxlen),
                                                                                                                          str(cur_clustnumber))

            if cur_coeffs:

                assert len(cur_coeffs) == cur_clen and cur_coeffs, "Error, there is a cluster witohut stuff? %s %s %s"% (str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all([x >= 0 and x <= 1 for x in cur_coeffs]), "Error, a coefficient is wrong here! Check me! %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_coeffs))

                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs)/cur_clen

                update_status(status+0.4*quanta, '  comulative per coeff...')

                avi = coeff_stats[cur_coef][0]
                #number of points
                coeff_stats[cur_coef][0] = avi+1
                #average of coefficients
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1]*avi + cur_avg)/(avi+1)
                #min coeff
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2], cur_min)
                #max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3], cur_max)
                #avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                #normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)


                update_status(status+0.6*quanta, '  comulative per cluster per coeff...')

                avi = cluster_stats[cur_clust][cur_coef][0]
                cluster_stats[cur_clust][cur_coef][0] = avi+1
                cluster_stats[cur_clust][cur_coef][1] = (cluster_stats[cur_clust][cur_coef][1]*avi + cur_avg)/(avi+1)
                cluster_stats[cur_clust][cur_coef][2] = min(cluster_stats[cur_clust][cur_coef][2], cur_min)
                cluster_stats[cur_clust][cur_coef][3] = max(cluster_stats[cur_clust][cur_coef][3], cur_max)
                cluster_stats[cur_clust][cur_coef][4] = (cluster_stats[cur_clust][cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                cluster_stats[cur_clust][cur_coef][5] = (cluster_stats[cur_clust][cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)

    update_status_final('Done!')

    if generate_graphs:
        gen_graphs()


    if pickle_output:
        update_status(0,'Dumping to file...')
        f = open(pickle_output,'w')
        SER.dump({'cluster_stats':dict((x,dict(cluster_stats[x])) for x in cluster_stats.iterkeys()), 'coeff_stats':dict((coeff_stats))}, f)
        f.close()
Пример #8
0
def group_sort_edges(cs, original_process_id):
    bibauthor_print("group_sort_edges spowned by %s" % original_process_id)

    plus_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_p_'+str(original_process_id),'w')
    minus_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_m_'+str(original_process_id),'w')
    pairs_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_temp_edges_cache_e_'+str(original_process_id),'w')
    data_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_data_'+str(original_process_id),'w')

    plus_count = 0
    minus_count = 0
    pairs_count = 0

    default_val = [0.,0.]
    #gc.disable()
    interval = 1000
    current = -1
    for cl1 in cs.clusters:
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = h5file[str(id(cl1))]
        for bib2 in xrange(len(h5file[str(id(cl1))])):
            val = pointers[bib2]
            #if val[0] not in Bib_matrix.special_numbers:
            #optimization: special numbers are assumed to be negative
            if val[0] >= 0:

                if val[0] > edge_cut_prob:
                    pairs_count += 1
                    pairs_fp.write(_pack_vals((bib1, bib2, val)))

            elif val[0] == Bib_matrix.special_symbols['+']:
                plus_count += 1
                plus_fp.write(_pack_vals((bib1, bib2, default_val)))

            elif val[0] == Bib_matrix.special_symbols['-']:
                minus_count += 1
                minus_fp.write(_pack_vals((bib1, bib2, default_val)))
            else:
                assert val[0] == Bib_matrix.special_symbols[None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")

    plus_fp.close()
    minus_fp.close()
    pairs_fp.close()

    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d."
                     % (plus_count, minus_count, pairs_count))
    #gc.enable()
    bibauthor_print("Sorting in-file value edges.")
    sortFileInPlace(bconfig.TORTOISE_FILES_PATH+'/wedge_temp_edges_cache_e_'+str(original_process_id),
                    bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_e_'+str(original_process_id),
                    lambda x: _edge_sorting(_unpack_vals(x)), reverse=True)

    os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_temp_edges_cache_e_'+str(original_process_id))

    bibauthor_print("Dumping egdes data to file...")
    cPickle.dump((plus_count, minus_count, pairs_count), data_fp)
    data_fp.close()
Пример #9
0
        raise Exception("""Error happened in convert_cluster_set with
                        v1: %s,
                        real_pointer: %s,
                        pointer: %s,
                        pointers: %s,
                        result_mapping: %s, index: %s,
                        len(real_pointer): %s,
                        len(pointer): %s,
                        len(pointers):  %s,
                        original_exception: %s
                        """%(str(v1),str(real_pointer),str(pointer), str(pointers),
                             str(result_mapping), str(index),
                             str(len(real_pointer)), str(len(pointer)),
                             str(len(pointers)), str(e)) )

    update_status_final("Converting the cluster set done.")
    #gc.enable()


def restore_cluster_set(cs):
    for cl in cs.clusters:
        cl.bibs = set(cs.new2old[b] for b in cl.bibs)
    cs.update_bibs()


def create_bib_2_cluster_dict(cs):
    '''
    Creates and returns a dictionary bibrefrec -> cluster.
    The cluster set must be converted!
    '''
    size = sum(len(cl.bibs) for cl in cs.clusters)
Пример #10
0
def merge_dynamic():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
        This function is dynamic: it allows aid* tables to be changed while it is still running,
        hence the claiming faciity for example can stay online during the merge. This comfort
        however is paid off in term of speed.
    '''
    last_names = frozenset(name[0].split('.')[0] for name in get_cluster_names())

    def get_free_pids():
        while True:
            yield get_free_author_id()

    free_pids = get_free_pids()

    def try_move_signature(sig, target_pid):
        """
        """
        paps = get_ordered_author_and_status_of_signature(sig)
        rejected = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p:-2 < p[1] and p[1] < 2, paps)
        claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        if claimed or not assigned or assigned[0] == target_pid:
            return

        assert len(assigned) == 1

        if int(target_pid) in [int(x[0]) for x in rejected]:
            move_signature(sig, free_pids.next())
        else:
            conflicts = get_signatures_of_paper_and_author(sig, target_pid)
            if not conflicts:
                move_signature(sig, target_pid)
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    move_signature(sig, free_pids.next())
                else:
                    move_signature(conflicts[0][:3], free_pids.next())
                    move_signature(sig, target_pid)

    for idx, last in enumerate(last_names):
        update_status(float(idx) / len(last_names), "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4]) for row in get_clusters_by_surname(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d)) for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = list()
            for d in ds:
                pid_flag = get_author_and_status_of_confirmed_paper(d)
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)

            matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        #best_match = cluster,pid_idx,n
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids] for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx]) for new_idx, old_idx, score in best_match if score > 0]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(imap(itemgetter(0), [x for x in best_match if x[2] > 0]))
        not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                try_move_signature(sig, pid)

    update_status_final()
    remove_empty_authors()
    update_canonical_names_of_authors()
Пример #11
0
                pid = os.fork()
                if pid == 0: # child
                    os.nice(int((float(sizs[idx]) * 20.0 / biggest)))
                    run_job(job_idx)
                else: # parent
                    pid_2_idx[pid] = job_idx
                    assert free > sizs[job_idx]
                    free -= sizs[job_idx]
                    del free_idxs[idx]
            else:
                break

        pid, status = os.wait()
        assert pid in pid_2_idx
        idx = pid_2_idx[pid]
        freed = sizs[idx]
        done += freed
        ret_status[idx] = status
        free += freed
        del pid_2_idx[pid]
        update_status(done / total, "%d / %d" % (len(jobs) - len(free_idxs) - len(pid_2_idx), len(jobs)))

    update_status_final("%d / %d" % (len(jobs), len(jobs)))
    assert is_eq(free, initial)
    assert not pid_2_idx
    assert not free_idxs
    assert len(jobs) == len(sizs) == len(ret_status) == len(bibs)
    assert all(stat != None for stat in ret_status)

    return ret_status
Пример #12
0
 def load(self, load_map=True, load_matrix=True):
     update_status(0., "Loading probability matrix...")
     self._bib_matrix.load()
     update_status_final("Probability matrix loaded.")
Пример #13
0
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    override_stdout_config(stdout=True)

    files = [
        '/tmp/baistats/' + x for x in os.listdir('/tmp/baistats/')
        if x.startswith('cluster_status_report_pid')
    ]
    fnum = float(len(files))
    quanta = .1 / fnum

    total_stats = 0
    used_coeffs = set()
    used_clusters = set()

    #av_counter, avg, min, max, nclus, normalized_avg
    cluster_stats = defaultdict(
        lambda: defaultdict(lambda: [0., 0., 0., 0., 0., 0.]))
    coeff_stats = defaultdict(lambda: [0., 0., 0., 0., 0., 0.])

    def gen_graphs(only_synthetic=False):
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i, c in enumerate(cn):
                update_status(i / l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i, fi in enumerate(files):
        if generate_graphs:
            if i % 1000 == 0:
                gen_graphs(True)

        f = open(fi, 'r')
        status = i / fnum
        update_status(status, 'Loading ' + fi[fi.find('lastname') + 9:])
        contents = SER.load(f)
        f.close()

        cur_coef = contents[0]
        cur_clust = contents[1]

        cur_maxlen = float(contents[3])

        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)

            update_status(status + 0.2 * quanta, '  Computing averages...')

            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))

            assert cur_clustnumber > 0 and cur_clustnumber < cur_maxlen, "Error, found log with strange clustnumber! %s %s %s %s" % (
                str(cur_clust), str(cur_coef), str(cur_maxlen),
                str(cur_clustnumber))

            if cur_coeffs:

                assert len(
                    cur_coeffs
                ) == cur_clen and cur_coeffs, "Error, there is a cluster witohut stuff? %s %s %s" % (
                    str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all(
                    [x >= 0 and x <= 1 for x in cur_coeffs]
                ), "Error, a coefficient is wrong here! Check me! %s %s %s" % (
                    str(cur_clust), str(cur_coef), str(cur_coeffs))

                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs) / cur_clen

                update_status(status + 0.4 * quanta,
                              '  comulative per coeff...')

                avi = coeff_stats[cur_coef][0]
                #number of points
                coeff_stats[cur_coef][0] = avi + 1
                #average of coefficients
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1] * avi +
                                            cur_avg) / (avi + 1)
                #min coeff
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2],
                                               cur_min)
                #max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3],
                                               cur_max)
                #avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4] * avi +
                                            cur_clustnumber) / (avi + 1)
                #normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5] * avi +
                                            cur_clustnumber / cur_maxlen) / (
                                                avi + 1)

                update_status(status + 0.6 * quanta,
                              '  comulative per cluster per coeff...')

                avi = cluster_stats[cur_clust][cur_coef][0]
                cluster_stats[cur_clust][cur_coef][0] = avi + 1
                cluster_stats[cur_clust][cur_coef][1] = (
                    cluster_stats[cur_clust][cur_coef][1] * avi +
                    cur_avg) / (avi + 1)
                cluster_stats[cur_clust][cur_coef][2] = min(
                    cluster_stats[cur_clust][cur_coef][2], cur_min)
                cluster_stats[cur_clust][cur_coef][3] = max(
                    cluster_stats[cur_clust][cur_coef][3], cur_max)
                cluster_stats[cur_clust][cur_coef][4] = (
                    cluster_stats[cur_clust][cur_coef][4] * avi +
                    cur_clustnumber) / (avi + 1)
                cluster_stats[cur_clust][cur_coef][5] = (
                    cluster_stats[cur_clust][cur_coef][5] * avi +
                    cur_clustnumber / cur_maxlen) / (avi + 1)

    update_status_final('Done!')

    if generate_graphs:
        gen_graphs()

    if pickle_output:
        update_status(0, 'Dumping to file...')
        f = open(pickle_output, 'w')
        SER.dump(
            {
                'cluster_stats':
                dict((x, dict(cluster_stats[x]))
                     for x in cluster_stats.iterkeys()),
                'coeff_stats':
                dict((coeff_stats))
            }, f)
        f.close()
Пример #14
0
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set acoarding to be values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''

    bib_map = create_bib_2_cluster_dict(cluster_set)

    plus_edges, minus_edges, edges = group_edges(cluster_set)

    interval = 1000
    for i, (bib1, bib2) in enumerate(plus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(plus_edges), "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    interval = 1000
    for i, (bib1, bib2) in enumerate(minus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(minus_edges), "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=_edge_sorting, reverse=True)

    interval = 500000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, unused) in enumerate(edges):
        if (current % interval) == 0:
            update_status(float(current) / len(edges), "Wedge...")

        assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        idcl1 = cluster_set.clusters.index(cl1)
        idcl2 = cluster_set.clusters.index(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused))

            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " %  (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)
Пример #15
0
def group_sort_edges(cs, original_process_id):
    bibauthor_print("group_sort_edges spowned by %s" % original_process_id)

    plus_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' +
        str(original_process_id), 'w')
    minus_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' +
        str(original_process_id), 'w')
    pairs_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_temp_edges_cache_e_' +
        str(original_process_id), 'w')
    data_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' +
        str(original_process_id), 'w')

    plus_count = 0
    minus_count = 0
    pairs_count = 0

    default_val = [0., 0.]
    #gc.disable()
    interval = 1000
    current = -1
    for cl1 in cs.clusters:
        current += 1
        if (current % interval) == 0:
            update_status(
                float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = h5file[str(id(cl1))]
        for bib2 in xrange(len(h5file[str(id(cl1))])):
            val = pointers[bib2]
            #if val[0] not in Bib_matrix.special_numbers:
            #optimization: special numbers are assumed to be negative
            if val[0] >= 0:

                if val[0] > edge_cut_prob:
                    pairs_count += 1
                    pairs_fp.write(_pack_vals((bib1, bib2, val)))

            elif val[0] == Bib_matrix.special_symbols['+']:
                plus_count += 1
                plus_fp.write(_pack_vals((bib1, bib2, default_val)))

            elif val[0] == Bib_matrix.special_symbols['-']:
                minus_count += 1
                minus_fp.write(_pack_vals((bib1, bib2, default_val)))
            else:
                assert val[0] == Bib_matrix.special_symbols[
                    None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")

    plus_fp.close()
    minus_fp.close()
    pairs_fp.close()

    bibauthor_print(
        "Positive edges: %d, Negative edges: %d, Value edges: %d." %
        (plus_count, minus_count, pairs_count))
    #gc.enable()
    bibauthor_print("Sorting in-file value edges.")
    sortFileInPlace(bconfig.TORTOISE_FILES_PATH +
                    '/wedge_temp_edges_cache_e_' + str(original_process_id),
                    bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' +
                    str(original_process_id),
                    lambda x: _edge_sorting(_unpack_vals(x)),
                    reverse=True)

    os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_temp_edges_cache_e_' +
              str(original_process_id))

    bibauthor_print("Dumping egdes data to file...")
    cPickle.dump((plus_count, minus_count, pairs_count), data_fp)
    data_fp.close()
Пример #16
0
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set acoarding to be values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''

    bib_map = create_bib_2_cluster_dict(cluster_set)
    original_process_id = PID()
    #remember to close the files!
    #plus_edges_fp, len_plus, minus_edges_fp, len_minus, edges_fp, len_edges = group_sort_edges(cluster_set)

    p = Process(target=group_sort_edges,
                args=(cluster_set, original_process_id))
    p.start()
    p.join()

    plus_edges_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' +
        str(original_process_id), 'r')
    minus_edges_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' +
        str(original_process_id), 'r')
    edges_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' +
        str(original_process_id), 'r')
    data_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' +
        str(original_process_id), 'r')

    len_plus, len_minus, len_edges = cPickle.load(data_fp)
    data_fp.close()

    interval = 1000
    for i, s in enumerate(plus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(
                float(i) / len_plus, "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    interval = 1000
    for i, s in enumerate(minus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_minus, "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    interval = 50000
    wedge_print("Wedge: New wedge, %d edges." % len_edges)
    current = -1
    for s in edges_fp:
        v1, v2, unused = _unpack_vals(s)
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len_edges, "Wedge...")

        assert unused != '+' and unused != '-', PID(
        ) + "Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        #try using object ids instead of index to boost performances
        #idcl1 = cluster_set.clusters.index(cl1)
        #idcl2 = cluster_set.clusters.index(cl2)
        idcl1 = id(cl1)
        idcl2 = id(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print(
            "Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)"
            % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(
                    cluster_set,
                    "/tmp/%s%d.dot" % (cluster_set.last_name, current),
                    bib_map, (v1, v2, unused))

            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s" %
                            (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " %
                            (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" %
                        (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" %
                        (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name,
                      bib_map)

    plus_edges_fp.close()
    minus_edges_fp.close()
    edges_fp.close()
    data_fp.close()

    try:
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' +
                  str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' +
                  str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' +
                  str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' +
                  str(original_process_id))
    except:
        pass
Пример #17
0
            """Error happened in convert_cluster_set with
                        v1: %s,
                        real_pointer: %s,
                        pointer: %s,
                        pointers: %s,
                        result_mapping: %s, index: %s,
                        len(real_pointer): %s,
                        len(pointer): %s,
                        len(pointers):  %s,
                        original_exception: %s
                        """ %
            (str(v1), str(real_pointer), str(pointer), str(pointers),
             str(result_mapping), str(index), str(len(real_pointer)),
             str(len(pointer)), str(len(pointers)), str(e)))

    update_status_final("Converting the cluster set done.")
    #gc.enable()


def restore_cluster_set(cs):
    for cl in cs.clusters:
        cl.bibs = set(cs.new2old[b] for b in cl.bibs)
    cs.update_bibs()


def create_bib_2_cluster_dict(cs):
    '''
    Creates and returns a dictionary bibrefrec -> cluster.
    The cluster set must be converted!
    '''
    size = sum(len(cl.bibs) for cl in cs.clusters)
Пример #18
0
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    import matplotlib.pyplot as plt
    plt.ioff()
    def _gen_plot(data, filename):
        plt.clf()
        ax = plt.subplot(111)
        ax.grid(visible=True)
        x = sorted(data.keys())

        w = [data[k][0] for k in x]
        try:
            wscf = max(w)
        except:
            wscf = 0
        w = [float(i)/wscf for i in w]
        y = [data[k][1] for k in x]
        maxi = [data[k][3] for k in x]
        mini = [data[k][2] for k in x]

        lengs = [data[k][4] for k in x]
        try:
            ml = float(max(lengs))
        except:
            ml = 1
        lengs = [k/ml for k in lengs]

        normalengs = [data[k][5] for k in x]

        ax.plot(x,y,'-o',label='avg')
        ax.plot(x,maxi,'-o', label='max')
        ax.plot(x,mini,'-o', label='min')
        ax.plot(x,w, '-x', label='norm %s' % str(wscf))
        ax.plot(x,lengs,'-o',label='acl %s' % str(int(ml)))
        ax.plot(x,normalengs, '-o', label='ncl')
        plt.ylim(ymax = 1., ymin = -0.01)
        plt.xlim(xmax = 1., xmin = -0.01)
        ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,ncol=6, mode="expand", borderaxespad=0.)
        plt.savefig(filename)

    override_stdout_config(stdout=True)

    files = ['/tmp/baistats/'+x for x in os.listdir('/tmp/baistats/') if x.startswith('cluster_status_report_pid')]
    fnum = float(len(files))
    quanta = .1/fnum


    total_stats = 0
    used_coeffs = set()
    used_clusters = set()

    #av_counter, avg, min, max, nclus, normalized_avg
    cluster_stats = defaultdict(lambda : defaultdict(lambda : [0.,0.,0.,0.,0.,0.]))
    coeff_stats = defaultdict(lambda : [0.,0.,0.,0.,0.,0.])


    def gen_graphs(only_synthetic=False):
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i,c in enumerate(cn):
                update_status(i/l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i,fi in enumerate(files):
        if generate_graphs:
            if i%1000 ==0:
                gen_graphs(True)

        f = filehandler.open(fi,'r')
        status = i/fnum
        update_status(status, 'Loading '+ fi[fi.find('lastname')+9:])
        contents = SER.load(f)
        f.close()

        cur_coef = contents[0]
        cur_clust = contents[1]

        cur_maxlen = float(contents[3])

        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)

            update_status(status+0.2*quanta, '  Computing averages...')

            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))

            assert cur_clustnumber > 0 and cur_clustnumber < cur_maxlen, "Error, found log with strange clustnumber! %s %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_maxlen),
                                                                                                                          str(cur_clustnumber))

            if cur_coeffs:

                assert len(cur_coeffs) == cur_clen and cur_coeffs, "Error, there is a cluster witohut stuff? %s %s %s"% (str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all([x >= 0 and x <= 1 for x in cur_coeffs]), "Error, a coefficient is wrong here! Check me! %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_coeffs))

                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs)/cur_clen

                update_status(status+0.4*quanta, '  comulative per coeff...')

                avi = coeff_stats[cur_coef][0]
                #number of points
                coeff_stats[cur_coef][0] = avi+1
                #average of coefficients
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1]*avi + cur_avg)/(avi+1)
                #min coeff
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2], cur_min)
                #max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3], cur_max)
                #avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                #normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)


                update_status(status+0.6*quanta, '  comulative per cluster per coeff...')

                avi = cluster_stats[cur_clust][cur_coef][0]
                cluster_stats[cur_clust][cur_coef][0] = avi+1
                cluster_stats[cur_clust][cur_coef][1] = (cluster_stats[cur_clust][cur_coef][1]*avi + cur_avg)/(avi+1)
                cluster_stats[cur_clust][cur_coef][2] = min(cluster_stats[cur_clust][cur_coef][2], cur_min)
                cluster_stats[cur_clust][cur_coef][3] = max(cluster_stats[cur_clust][cur_coef][3], cur_max)
                cluster_stats[cur_clust][cur_coef][4] = (cluster_stats[cur_clust][cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                cluster_stats[cur_clust][cur_coef][5] = (cluster_stats[cur_clust][cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)

    update_status_final('Done!')

    if generate_graphs:
        gen_graphs()


    if pickle_output:
        update_status(0,'Dumping to file...')
        f = open(pickle_output,'w')
        SER.dump({'cluster_stats':dict((x,dict(cluster_stats[x])) for x in cluster_stats.iterkeys()), 'coeff_stats':dict((coeff_stats))}, f)
        f.close()
Пример #19
0
class ProbabilityMatrix(object):
    '''
    This class contains and maintains the comparison
    between all virtual authors. It is able to write
    and read from the database and update the results.
    '''
    def __init__(self, name):
        self._bib_matrix = Bib_matrix(name)

    def load(self, load_map=True, load_matrix=True):
        update_status(0., "Loading probability matrix...")
        self._bib_matrix.load()
        update_status_final("Probability matrix loaded.")

    def store(self):
        update_status(0., "Saving probability matrix...")
        self._bib_matrix.store()
        update_status_final("Probability matrix saved.")

    def __getitem__(self, bibs):
        return self._bib_matrix[bibs[0], bibs[1]]

    def getitem_numeric(self, bibs):
        return self._bib_matrix.getitem_numeric(bibs)

    def __get_up_to_date_bibs(self, bib_matrix):
        return frozenset(
            get_modified_papers_before(bib_matrix.get_keys(),
                                       bib_matrix.creation_time))

    def is_up_to_date(self, cluster_set):
        return self.__get_up_to_date_bibs(self._bib_matrix) >= frozenset(
            cluster_set.all_bibs())

    def recalculate(self, cluster_set):
        '''
        Constructs probability matrix. If use_cache is true, it will
        try to load old computations from the database. If save cache
        is true it will save the current results into the database.
        @param cluster_set: A cluster set object, used to initialize
        the matrix.
        '''
        last_cleaned = 0
        self._bib_matrix.store()
        try:
            old_matrix = Bib_matrix(self._bib_matrix.name + 'copy')
            old_matrix.duplicate_existing(self._bib_matrix.name,
                                          self._bib_matrix.name + 'copy')
            old_matrix.load()
            cached_bibs = self.__get_up_to_date_bibs(old_matrix)
            have_cached_bibs = bool(cached_bibs)
        except IOError:
            old_matrix.destroy()
            cached_bibs = None
            have_cached_bibs = False

        self._bib_matrix.destroy()
        self._bib_matrix = Bib_matrix(cluster_set.last_name,
                                      cluster_set=cluster_set)

        ncl = cluster_set.num_all_bibs
        expected = ((ncl * (ncl - 1)) / 2)
        if expected == 0:
            expected = 1

        try:
            cur_calc, opti, prints_counter = 0, 0, 0
            for cl1 in cluster_set.clusters:

                if cur_calc + opti - prints_counter > 100000 or cur_calc == 0:
                    update_status(
                        (float(opti) + cur_calc) / expected,
                        "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
                    prints_counter = cur_calc + opti

    #            #clean caches
                if cur_calc - last_cleaned > 20000000:
                    gc.collect()
                    #                clear_comparison_caches()
                    last_cleaned = cur_calc

                for cl2 in cluster_set.clusters:
                    if id(cl1) < id(cl2) and not cl1.hates(cl2):
                        for bib1 in cl1.bibs:
                            for bib2 in cl2.bibs:
                                if have_cached_bibs:
                                    try:
                                        val = old_matrix[bib1, bib2]
                                        opti += 1
                                        if bconfig.DEBUG_CHECKS:
                                            assert _debug_is_eq_v(
                                                val,
                                                compare_bibrefrecs(bib1, bib2))
                                    except KeyError:
                                        cur_calc += 1
                                        val = compare_bibrefrecs(bib1, bib2)
                                    if not val:
                                        cur_calc += 1
                                        val = compare_bibrefrecs(bib1, bib2)
                                else:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                self._bib_matrix[bib1, bib2] = val

        except Exception, e:
            raise Exception("""Error happened in prob_matrix.recalculate with
            val:%s
            original_exception: %s
            """ % (str(val), str(e)))

        clear_comparison_caches()
        update_status_final("Matrix done. %d calc, %d opt." % (cur_calc, opti))
Пример #20
0
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    import matplotlib.pyplot as plt
    plt.ioff()
    def _gen_plot(data, filename):
        plt.clf()
        ax = plt.subplot(111)
        ax.grid(visible=True)
        x = sorted(data.keys())

        w = [data[k][0] for k in x]
        try:
            wscf = max(w)
        except:
            wscf = 0
        w = [float(i)/wscf for i in w]
        y = [data[k][1] for k in x]
        maxi = [data[k][3] for k in x]
        mini = [data[k][2] for k in x]

        lengs = [data[k][4] for k in x]
        try:
            ml = float(max(lengs))
        except:
            ml = 1
        lengs = [k/ml for k in lengs]

        normalengs = [data[k][5] for k in x]

        ax.plot(x,y,'-o',label='avg')
        ax.plot(x,maxi,'-o', label='max')
        ax.plot(x,mini,'-o', label='min')
        ax.plot(x,w, '-x', label='norm %s' % str(wscf))
        ax.plot(x,lengs,'-o',label='acl %s' % str(int(ml)))
        ax.plot(x,normalengs, '-o', label='ncl')
        plt.ylim(ymax = 1., ymin = -0.01)
        plt.xlim(xmax = 1., xmin = -0.01)
        ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,ncol=6, mode="expand", borderaxespad=0.)
        plt.savefig(filename)

    override_stdout_config(stdout=True)

    files = ['/tmp/baistats/'+x for x in os.listdir('/tmp/baistats/') if x.startswith('cluster_status_report_pid')]
    fnum = float(len(files))
    quanta = .1/fnum


    total_stats = 0
    used_coeffs = set()
    used_clusters = set()

    #av_counter, avg, min, max, nclus, normalized_avg
    cluster_stats = defaultdict(lambda : defaultdict(lambda : [0.,0.,0.,0.,0.,0.]))
    coeff_stats = defaultdict(lambda : [0.,0.,0.,0.,0.,0.])


    def gen_graphs(only_synthetic=False):
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i,c in enumerate(cn):
                update_status(i/l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i,fi in enumerate(files):
        if generate_graphs:
            if i%1000 ==0:
                gen_graphs(True)

        f = filehandler.open(fi,'r')
        status = i/fnum
        update_status(status, 'Loading '+ fi[fi.find('lastname')+9:])
        contents = SER.load(f)
        f.close()

        cur_coef = contents[0]
        cur_clust = contents[1]

        cur_maxlen = float(contents[3])

        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)

            update_status(status+0.2*quanta, '  Computing averages...')

            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))

            assert cur_clustnumber > 0 and cur_clustnumber < cur_maxlen, "Error, found log with strange clustnumber! %s %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_maxlen),
                                                                                                                          str(cur_clustnumber))

            if cur_coeffs:

                assert len(cur_coeffs) == cur_clen and cur_coeffs, "Error, there is a cluster witohut stuff? %s %s %s"% (str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all([x >= 0 and x <= 1 for x in cur_coeffs]), "Error, a coefficient is wrong here! Check me! %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_coeffs))

                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs)/cur_clen

                update_status(status+0.4*quanta, '  comulative per coeff...')

                avi = coeff_stats[cur_coef][0]
                #number of points
                coeff_stats[cur_coef][0] = avi+1
                #average of coefficients
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1]*avi + cur_avg)/(avi+1)
                #min coeff
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2], cur_min)
                #max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3], cur_max)
                #avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                #normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)


                update_status(status+0.6*quanta, '  comulative per cluster per coeff...')

                avi = cluster_stats[cur_clust][cur_coef][0]
                cluster_stats[cur_clust][cur_coef][0] = avi+1
                cluster_stats[cur_clust][cur_coef][1] = (cluster_stats[cur_clust][cur_coef][1]*avi + cur_avg)/(avi+1)
                cluster_stats[cur_clust][cur_coef][2] = min(cluster_stats[cur_clust][cur_coef][2], cur_min)
                cluster_stats[cur_clust][cur_coef][3] = max(cluster_stats[cur_clust][cur_coef][3], cur_max)
                cluster_stats[cur_clust][cur_coef][4] = (cluster_stats[cur_clust][cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                cluster_stats[cur_clust][cur_coef][5] = (cluster_stats[cur_clust][cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)

    update_status_final('Done!')

    if generate_graphs:
        gen_graphs()


    if pickle_output:
        update_status(0,'Dumping to file...')
        f = open(pickle_output,'w')
        SER.dump({'cluster_stats':dict((x,dict(cluster_stats[x])) for x in cluster_stats.iterkeys()), 'coeff_stats':dict((coeff_stats))}, f)
        f.close()
Пример #21
0
 def store(self):
     update_status(0., "Saving probability matrix...")
     self._bib_matrix.store()
     update_status_final("Probability matrix saved.")
Пример #22
0
def rabbit(bibrecs,
           check_invalid_papers=False,
           personids_to_update_extids=None):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_string_to_pid_dictionary()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_new_personid()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and len(bibrecs) >
            bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        update_status(
            float(idx) / len(bibrecs),
            "%d/%d current: %d" % (idx, len(bibrecs), rec))
        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        markrefs = frozenset(
            chain(
                izip(cycle([100]),
                     imap(itemgetter(0), get_authors_from_paper(rec))),
                izip(cycle([700]),
                     imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        personid_rows = [
            map(int, row[:3]) + [row[4]]
            for row in get_signatures_from_rec(rec)
        ]
        personidrefs_names = dict(
            ((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict(
            (new,
             create_normalized_name(
                 split_name_parts(get_name_by_bibrecref(new))))
            for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[
            compare_names(new_signatures_names[new], personidrefs_names[old])
            for old in old_signatures
        ] for new in new_signatures]

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]
        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(
            map(itemgetter(0), best_match))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = []
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id(sig + (rec, ))
                    if inspire_id:
                        matched_pids = list(
                            get_person_with_extid(inspire_id[0]))
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [
                p for p in matched_pids if int(p[0]) not in used_pids
            ]

            if not matched_pids:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids:  # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)
        update_personID_external_ids(
            updated_pids,
            limit_to_claimed_papers=bconfig.
            LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()
Пример #23
0
def merge_static():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
        This function is static: if aid* tables are changed while it's running,
        probably everything will crash and a black hole will open, eating all your data.
    '''
    last_names = frozenset(name[0].split('.')[0] for name in get_cluster_names())

    def get_free_pids():
        while True:
            yield get_free_author_id()

    free_pids = get_free_pids()

    current_mapping = get_paper_to_author_and_status_mapping()

    def move_sig_and_update_mapping(sig, old_pid_flag, new_pid_flag):
        move_signature(sig, new_pid_flag[0])
        current_mapping[sig].remove(old_pid_flag)
        current_mapping[sig].append(new_pid_flag)

    def try_move_signature(sig, target_pid):
        """
        """
        paps = current_mapping[sig]
        rejected = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p:-2 < p[1] and p[1] < 2, paps)
        claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        if claimed or not assigned or assigned[0] == target_pid:
            return

        assert len(assigned) == 1

        if rejected:
            newpid = free_pids.next()
            move_sig_and_update_mapping(sig, assigned[0], (newpid, assigned[0][1]))
        else:
            conflicts = get_signatures_of_paper_and_author(sig, target_pid)
            if not conflicts:
                move_sig_and_update_mapping(sig, assigned[0], (target_pid, assigned[0][1]))
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    newpid = free_pids.next()
                    move_sig_and_update_mapping(sig, assigned[0], (newpid, assigned[0][1]))
                else:
                    newpid = free_pids.next()
                    csig = tuple(conflicts[0][:3])
                    move_sig_and_update_mapping(csig, (target_pid, conflicts[0][3]), (newpid, conflicts[0][3]))
                    move_sig_and_update_mapping(sig, assigned[0], (target_pid, assigned[0][1]))

    for idx, last in enumerate(last_names):
        update_status(float(idx) / len(last_names), "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4]) for row in get_clusters_by_surname(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d)) for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            claim = []
            for d in ds:
                pid_flag = current_mapping.get(d, [])
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)
                    if flag > 1:
                        claim.append((d, pid))

            matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids] for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx]) for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(imap(itemgetter(0), best_match))
        not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                if sig in current_mapping:
                    if not pid in map(itemgetter(0), filter(lambda x: x[1] > -2, current_mapping[sig])):
                        try_move_signature(sig, pid)

    update_status_final()
    remove_empty_authors()
    update_canonical_names_of_authors()
Пример #24
0
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None, verbose=False):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    logfile = open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w')
    logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs)))

    def logwrite(msg, is_error):
        verb = 9
        if is_error or verbose:
            verb = 1
        write_message(msg, verbose=verb)

    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_to_authors_mapping()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_free_author_id()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_papers()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
        len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):

        logwrite("\nConsidering %s" % str(rec), False)

        if idx%200 == 0:
            task_sleep_now_if_required(True)

            update_status(float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec))
            task_update_progress("%d/%d current: %d" % (idx, len(bibrecs), rec))

        if rec in deleted:
            logwrite(" - Record was deleted, removing from pid and continuing with next record", True)
            remove_papers([rec])
            continue


        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_author_refs_of_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthor_refs_of_paper(rec)))))

        personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_of_paper(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                  for old in old_signatures] for new in new_signatures]

        logwrite(" - Old signatures: %s" % str(old_signatures), bool(old_signatures))
        logwrite(" - New signatures: %s" % str(new_signatures), bool(new_signatures))
        logwrite(" - Matrix: %s" % str(matrix), bool(matrix))

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix) if score > threshold]

        logwrite(" - Best match: %s " % str(best_match), bool(best_match))

        for new, old in best_match:
            logwrite(" - - Moving signature: %s on %s to %s as %s" % (old, rec, new, new_signatures_names[new]), True)
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        pids_having_rec = set([int(row[0]) for row in get_signatures_of_paper(rec)])
        logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = list()
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id_of_signature(sig + (rec,))
                    if inspire_id:
                        matched_pids = list(get_author_by_external_id(inspire_id[0]))
                        if matched_pids and int(matched_pids[0][0]) in pids_having_rec:
                            matched_pids = list()
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

            if not matched_pids or int(matched_pids[0][0]) in pids_having_rec:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])
                pids_having_rec.add(matched_pids[0][0])

        logwrite('Finished with %s' % str(rec), False)

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids: # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(updated_pids, limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()

    remove_empty_authors()
Пример #25
0
def merge_static_classy():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
        This function is static: if aid* tables are changed while it's running,
        probably everything will crash and a black hole will open, eating all your data.

        NOTE: this is more elegant that merge_static but much slower. Will have to be improved
               before it can replace it.
    '''
    class Sig(object):
        def __init__(self, bibrefrec, pid_flag):
            self.rejected = dict(filter(lambda p:                p[1] <= -2, pid_flag))
            self.assigned = filter(lambda p:-2 < p[1] and p[1] < 2, pid_flag)
            self.claimed = filter(lambda p:  2 <= p[1], pid_flag)
            self.bibrefrec = bibrefrec

            assert self.invariant()

        def invariant(self):
            return len(self.assigned) + len(self.claimed) <= 1

        def empty(self):
            return not self.isclaimed and not self.isassigned

        def isclaimed(self):
            return len(self.claimed) == 1

        def get_claimed(self):
            return self.claimed[0][0]

        def get_assigned(self):
            return self.assigned[0][0]

        def isassigned(self):
            return len(self.assigned) == 1

        def isrejected(self, pid):
            return pid in self.rejected

        def change_pid(self, pid):
            assert self.invariant()
            assert self.isassigned()
            self.assigned = [(pid, 0)]
            move_signature(self.bibrefrec, pid)

    class Cluster(object):
        def __init__(self, pid, sigs):
            self.pid = pid

            self.sigs = dict((sig.bibrefrec[2], sig) for sig in sigs if not sig.empty())

        def send_sig(self, other, sig):
            paper = sig.bibrefrec[2]
            assert paper in self.sigs and paper not in other.sigs

            del self.sigs[paper]
            other.sigs[paper] = sig

            if sig.isassigned():
                sig.change_pid(other.pid)

    last_names = frozenset(name[0].split('.')[0] for name in get_cluster_names())

    personid = get_paper_to_author_and_status_mapping()
    free_pids = backinterface_get_free_pids()

    for idx, last in enumerate(last_names):
        update_status(float(idx) / len(last_names), "Merging, %d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4]) for row in get_clusters_by_surname(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d)) for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            for d in ds:
                pid_flag = filter(lambda x: x[1] > -2, personid.get(d, []))
                if pid_flag:
                    assert len(pid_flag) == 1
                    pid = pid_flag[0][0]
                    pids.append(pid)
                    old_pids.add(pid)

            matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids] for row in matr])

        # [[bibrefrecs] -> pid]
        matched_clusters = [(results[new_idx][1], old_pids[old_idx]) for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(imap(itemgetter(0), best_match))
        not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids)

        # pid -> Cluster
        clusters = dict((pid, Cluster(pid, [Sig(bib, personid.get(bib, [])) for bib in sigs]))
                        for sigs, pid in chain(matched_clusters, not_matched_clusters))

        todo = clusters.items()
        for pid, clus in todo:
            assert clus.pid == pid

            for paper, sig in clus.sigs.items():
                if sig.isclaimed():
                    if sig.get_claimed() != pid:
                        target_clus = clusters[sig.get_claimed()]

                        if paper in target_clus.sigs:
                            new_clus = Cluster(free_pids.next(), [])
                            target_clus.send_sig(new_clus, target_clus[paper])
                            todo.append(new_clus)
                            clusters[new_clus.pid] = new_clus

                        assert paper not in target_clus.sigs
                        clus.send_sig(target_clus, sig)
                elif sig.get_assigned() != pid:
                    if not sig.isrejected(pid):
                        move_signature(sig.bibrefrec, pid)
                    else:
                        move_signature(sig.bibrefrec, free_pids.next())
                else:
                    assert not sig.isrejected(pid)

    update_status_final("Merging done.")

    update_status_final()
    remove_empty_authors()
    update_canonical_names_of_authors()
Пример #26
0
                    os.nice(int((float(sizs[idx]) * 20.0 / biggest)))
                    run_job(job_idx)
                else:  # parent
                    pid_2_idx[pid] = job_idx
                    assert free > sizs[job_idx]
                    free -= sizs[job_idx]
                    del free_idxs[idx]
            else:
                break

        pid, status = os.wait()
        assert pid in pid_2_idx
        idx = pid_2_idx[pid]
        freed = sizs[idx]
        done += freed
        ret_status[idx] = status
        free += freed
        del pid_2_idx[pid]
        update_status(
            done / total, "%d / %d" %
            (len(jobs) - len(free_idxs) - len(pid_2_idx), len(jobs)))

    update_status_final("%d / %d" % (len(jobs), len(jobs)))
    assert is_eq(free, initial)
    assert not pid_2_idx
    assert not free_idxs
    assert len(jobs) == len(sizs) == len(ret_status) == len(bibs)
    assert all(stat != None for stat in ret_status)

    return ret_status
Пример #27
0
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set acoarding to be values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''

    bib_map = create_bib_2_cluster_dict(cluster_set)
    original_process_id = PID()
    #remember to close the files!
    #plus_edges_fp, len_plus, minus_edges_fp, len_minus, edges_fp, len_edges = group_sort_edges(cluster_set)

    p = Process(target=group_sort_edges, args=(cluster_set,original_process_id))
    p.start()
    p.join()

    plus_edges_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_p_'+str(original_process_id),'r')
    minus_edges_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_m_'+str(original_process_id),'r')
    edges_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_e_'+str(original_process_id),'r')
    data_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_data_'+str(original_process_id),'r')

    len_plus,len_minus,len_edges = cPickle.load(data_fp)
    data_fp.close()

    interval = 1000
    for i, s in enumerate(plus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_plus, "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    interval = 1000
    for i, s in enumerate(minus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_minus, "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    interval = 50000
    wedge_print("Wedge: New wedge, %d edges." % len_edges)
    current = -1
    for  s in edges_fp:
        v1, v2, unused = _unpack_vals(s)
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len_edges, "Wedge...")

        assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        #try using object ids instead of index to boost performances
        #idcl1 = cluster_set.clusters.index(cl1)
        #idcl2 = cluster_set.clusters.index(cl2)
        idcl1 = id(cl1)
        idcl2 = id(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused))

            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " %  (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)

    plus_edges_fp.close()
    minus_edges_fp.close()
    edges_fp.close()
    data_fp.close()

    try:
        os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_p_'+str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_m_'+str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_e_'+str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_data_'+str(original_process_id))
    except:
        pass
Пример #28
0
def merge_static():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
        This function is static: if aid* tables are changed while it's running,
        probably everything will crash and a black hole will open, eating all your data.
    '''
    last_names = frozenset(name[0].split('.')[0]
                           for name in get_existing_result_clusters())

    def get_free_pids():
        while True:
            yield get_new_personid()

    free_pids = get_free_pids()

    current_mapping = get_bibrefrec_to_pid_flag_mapping()

    def move_sig_and_update_mapping(sig, old_pid_flag, new_pid_flag):
        move_signature(sig, new_pid_flag[0])
        current_mapping[sig].remove(old_pid_flag)
        current_mapping[sig].append(new_pid_flag)

    def try_move_signature(sig, target_pid):
        """
        """
        paps = current_mapping[sig]
        rejected = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p: -2 < p[1] and p[1] < 2, paps)
        claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        if claimed or not assigned or assigned[0] == target_pid:
            return

        assert len(assigned) == 1

        if rejected:
            newpid = free_pids.next()
            move_sig_and_update_mapping(sig, assigned[0],
                                        (newpid, assigned[0][1]))
        else:
            conflicts = find_conflicts(sig, target_pid)
            if not conflicts:
                move_sig_and_update_mapping(sig, assigned[0],
                                            (target_pid, assigned[0][1]))
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    newpid = free_pids.next()
                    move_sig_and_update_mapping(sig, assigned[0],
                                                (newpid, assigned[0][1]))
                else:
                    newpid = free_pids.next()
                    csig = tuple(conflicts[0][:3])
                    move_sig_and_update_mapping(csig,
                                                (target_pid, conflicts[0][3]),
                                                (newpid, conflicts[0][3]))
                    move_sig_and_update_mapping(sig, assigned[0],
                                                (target_pid, assigned[0][1]))

    for idx, last in enumerate(last_names):
        update_status(
            float(idx) / len(last_names),
            "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4])
                   for row in get_lastname_results(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d))
                   for k, d in groupby(sorted(results, key=itemgetter(0)),
                                       key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            claim = []
            for d in ds:
                pid_flag = current_mapping.get(d, [])
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)
                    if flag > 1:
                        claim.append((d, pid))

            matr.append(
                dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids]
                                        for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx])
                            for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(
            imap(itemgetter(0), best_match))
        not_matched_clusters = izip(
            (results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                if sig in current_mapping:
                    if not pid in map(
                            itemgetter(0),
                            filter(lambda x: x[1] > -2, current_mapping[sig])):
                        try_move_signature(sig, pid)

    update_status_final()
    delete_empty_persons()
    update_personID_canonical_names()
Пример #29
0
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_string_to_pid_dictionary()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_new_personid()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
        len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        update_status(float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec))
        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_from_rec(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibrecref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                  for old in old_signatures] for new in new_signatures]

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix) if score > threshold]
        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = []
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id(sig + (rec,))
                    if inspire_id:
                        matched_pids = list(get_person_with_extid(inspire_id[0]))
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

            if not matched_pids:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids: # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)
        update_personID_external_ids(updated_pids, limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()
Пример #30
0
def merge_dynamic():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
        This function is dynamic: it allows aid* tables to be changed while it is still running,
        hence the claiming faciity for example can stay online during the merge. This comfort
        however is paid off in term of speed.
    '''
    last_names = frozenset(name[0].split('.')[0]
                           for name in get_existing_result_clusters())

    def get_free_pids():
        while True:
            yield get_new_personid()

    free_pids = get_free_pids()

    def try_move_signature(sig, target_pid):
        """
        """
        paps = get_signature_info(sig)
        rejected = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p: -2 < p[1] and p[1] < 2, paps)
        claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        if claimed or not assigned or assigned[0] == target_pid:
            return

        assert len(assigned) == 1

        if rejected:
            move_signature(sig, free_pids.next())
        else:
            conflicts = find_conflicts(sig, target_pid)
            if not conflicts:
                move_signature(sig, target_pid)
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    move_signature(sig, free_pids.next())
                else:
                    move_signature(conflicts[0][:3], free_pids.next())
                    move_signature(sig, target_pid)

    for idx, last in enumerate(last_names):
        update_status(
            float(idx) / len(last_names),
            "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4])
                   for row in get_lastname_results(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d))
                   for k, d in groupby(sorted(results, key=itemgetter(0)),
                                       key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            claim = []
            for d in ds:
                pid_flag = personid_from_signature(d)
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)
                    if flag > 1:
                        claim.append((d, pid))

            matr.append(
                dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids]
                                        for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx])
                            for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(
            imap(itemgetter(0), best_match))
        not_matched_clusters = izip(
            (results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                try_move_signature(sig, pid)

    update_status_final()
    delete_empty_persons()
    update_personID_canonical_names()
Пример #31
0
def rabbit(bibrecs,
           check_invalid_papers=False,
           personids_to_update_extids=None,
           verbose=False):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    logfile = open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w')
    logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs)))

    def logwrite(msg, is_error):
        verb = 9
        if is_error or verbose:
            verb = 1
        write_message(msg, verbose=verb)

    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_to_authors_mapping()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_free_author_id()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_papers()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and len(bibrecs) >
            bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):

        logwrite("\nConsidering %s" % str(rec), False)

        if idx % 200 == 0:
            task_sleep_now_if_required(True)

            update_status(
                float(idx) / len(bibrecs),
                "%d/%d current: %d" % (idx, len(bibrecs), rec))
            task_update_progress("%d/%d current: %d" %
                                 (idx, len(bibrecs), rec))

        if rec in deleted:
            logwrite(
                " - Record was deleted, removing from pid and continuing with next record",
                True)
            remove_papers([rec])
            continue

        markrefs = frozenset(
            chain(
                izip(cycle([100]),
                     imap(itemgetter(0), get_author_refs_of_paper(rec))),
                izip(cycle([700]),
                     imap(itemgetter(0), get_coauthor_refs_of_paper(rec)))))

        personid_rows = [
            map(int, row[:3]) + [row[4]]
            for row in get_signatures_of_paper(rec)
        ]
        personidrefs_names = dict(
            ((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict(
            (new,
             create_normalized_name(split_name_parts(get_name_by_bibref(new))))
            for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[
            compare_names(new_signatures_names[new], personidrefs_names[old])
            for old in old_signatures
        ] for new in new_signatures]

        logwrite(" - Old signatures: %s" % str(old_signatures),
                 bool(old_signatures))
        logwrite(" - New signatures: %s" % str(new_signatures),
                 bool(new_signatures))
        logwrite(" - Matrix: %s" % str(matrix), bool(matrix))

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]

        logwrite(" - Best match: %s " % str(best_match), bool(best_match))

        for new, old in best_match:
            logwrite(
                " - - Moving signature: %s on %s to %s as %s" %
                (old, rec, new, new_signatures_names[new]), True)
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(
            map(itemgetter(0), best_match))

        pids_having_rec = set(
            [int(row[0]) for row in get_signatures_of_paper(rec)])
        logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = list()
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id_of_signature(sig + (rec, ))
                    if inspire_id:
                        matched_pids = list(
                            get_author_by_external_id(inspire_id[0]))
                        if matched_pids and int(
                                matched_pids[0][0]) in pids_having_rec:
                            matched_pids = list()
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [
                p for p in matched_pids if int(p[0]) not in used_pids
            ]

            if not matched_pids or int(matched_pids[0][0]) in pids_having_rec:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])
                pids_having_rec.add(matched_pids[0][0])

        logwrite('Finished with %s' % str(rec), False)

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids:  # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(
            updated_pids,
            limit_to_claimed_papers=bconfig.
            LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()

    remove_empty_authors()
Пример #32
0
 def load(self, lname, load_map=True, load_matrix=True):
     update_status(0., "Loading probability matrix...")
     self._bib_matrix.load(lname, load_map, load_matrix)
     update_status_final("Probability matrix loaded.")