Example #1
0
def group_edges(cs):
    plus = []
    minus = []
    pairs = []
    gc.disable()
    interval = 1000
    for current, cl1 in enumerate(cs.clusters):
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = cl1.out_edges
        for bib2 in xrange(len(cl1.out_edges)):
            val = pointers[bib2]
            if val[0] not in Bib_matrix.special_numbers:
                if val[0] > edge_cut_prob:
                    pairs.append((bib1, bib2, val))
            elif val[0] == Bib_matrix.special_symbols['+']:
                plus.append((bib1, bib2))
            elif val[0] == Bib_matrix.special_symbols['-']:
                minus.append((bib1, bib2))
            else:
                assert val[0] == Bib_matrix.special_symbols[None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")

    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d."
                     % (len(plus), len(minus), len(pairs)))
    gc.enable()
    return plus, minus, pairs
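
# Illustrative sketch (not part of the original module): group_edges above relies on
# Bib_matrix.special_numbers / special_symbols to tell real probability edges apart from
# the '+', '-' and None markers. The toy encoding and classifier below are assumptions
# made only to show how that branching works; the real Bib_matrix encoding may differ.
class _ToyBibMatrixSymbols(object):
    # hypothetical sentinel values standing in for Bib_matrix's encoding
    special_symbols = {None: -3.0, '+': -2.0, '-': -1.0}
    special_numbers = frozenset(special_symbols.values())

def _toy_classify_edge(val, edge_cut_prob=0.3):
    # return 'value', '+', '-' or None for a single out_edges entry (val[0], val[1]);
    # edge_cut_prob is a hypothetical default for the module-level global used above
    sym = _ToyBibMatrixSymbols.special_symbols
    if val[0] not in _ToyBibMatrixSymbols.special_numbers:
        return 'value' if val[0] > edge_cut_prob else None
    if val[0] == sym['+']:
        return '+'
    if val[0] == sym['-']:
        return '-'
    assert val[0] == sym[None], "Invalid Edge"
    return None
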
Example #2
0
def convert_cluster_set(cs, prob_matr):
    '''
    Converts a normal cluster set to a wedge cluster set.
    @param cs: a cluster set to be converted
    @type cs: cluster set
    @return: a mapping from a number to a bibrefrec.
    '''
    gc.disable()

    # step 1:
    #    + Assign a number to each bibrefrec.
    #    + Replace the arrays of bibrefrecs with arrays of numbers.
    #    + Store the result and prepare it to be returned.
    result_mapping = []
    for clus in cs.clusters:
        start = len(result_mapping)
        result_mapping += list(clus.bibs)
        end = len(result_mapping)
        clus.bibs = range(start, end)

    assert len(result_mapping) == len(set(result_mapping)), PID()+"Cluster set conversion failed"
    assert len(result_mapping) == cs.num_all_bibs, PID()+"Cluster set conversion failed"

    cs.new2old = result_mapping

    # step 2:
    #    + Using the prob matrix, create a vector of values to all other bibs.
    #    + Meld those vectors into one for each cluster.

    special_symbols = Bib_matrix.special_symbols #locality optimization

    interval = 10000
    for current, c1 in enumerate(cs.clusters):
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Converting the cluster set...")

        assert len(c1.bibs) > 0, PID()+"Empty cluster sent to wedge"
        pointers = []

        for v1 in c1.bibs:
            pointer = numpy.ndarray(shape=(len(result_mapping), 2), dtype=float, order='C')
            pointer.fill(special_symbols[None])
            rm = result_mapping[v1] #locality optimization
            for c2 in cs.clusters:
                if c1 != c2 and not c1.hates(c2):
                    for v2 in c2.bibs:
                        val = prob_matr[rm, result_mapping[v2]]
                        try:
                            numb = special_symbols[val]
                            val = (numb, numb)
                        except KeyError:
                            pass
                        assert len(val) == 2, "Edge coding failed"
                        pointer[v2] = val
            pointers.append((pointer, 1))
        c1.out_edges = reduce(meld_edges, pointers)[0]

    update_status_final("Converting the cluster set done.")
    gc.enable()
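
# Hedged sketch: convert_cluster_set reduces the per-bib (vector, weight) pairs with
# meld_edges and keeps element [0] of the result, so meld_edges presumably combines two
# such pairs into one. The element-wise weighted average below is only an assumption for
# illustration; the real meld_edges may treat the special-symbol entries differently.
import numpy

def _toy_meld_edges(a, b):
    vec_a, w_a = a
    vec_b, w_b = b
    total = float(w_a + w_b)
    merged = (vec_a * w_a + vec_b * w_b) / total  # element-wise weighted average
    return merged, total

# usage: reduce(_toy_meld_edges, [(vec1, 1), (vec2, 1), (vec3, 1)])[0]
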
 def gen_graphs(only_synthetic=False):
     update_status(0, 'Generating coefficients graph...')
     _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
     if not only_synthetic:
         cn = cluster_stats.keys()
         l = float(len(cn))
         for i,c in enumerate(cn):
             update_status(i/l, 'Generating name graphs... %s' % str(c))
             _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))
Example #4
0
 def gen_graphs(only_synthetic=False):
     update_status(0, 'Generating coefficients graph...')
     _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
     if not only_synthetic:
         cn = cluster_stats.keys()
         l = float(len(cn))
         for i, c in enumerate(cn):
             update_status(i / l, 'Generating name graphs... %s' % str(c))
             _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))
Example #5
0
    def recalculate(self, cluster_set):
        '''
        Constructs the probability matrix. If use_cache is true, it will
        try to load old computations from the database. If save cache
        is true, it will save the current results into the database.
        @param cluster_set: A cluster set object, used to initialize
        the matrix.
        '''
        last_cleaned = 0

        old_matrix = self._bib_matrix
        cached_bibs = self.__get_up_to_date_bibs()
        have_cached_bibs = bool(cached_bibs)
        self._bib_matrix = Bib_matrix(cluster_set)

        ncl = cluster_set.num_all_bibs
        expected = ((ncl * (ncl - 1)) / 2)
        if expected == 0:
            expected = 1

        cur_calc, opti, prints_counter = 0, 0, 0
        for cl1 in cluster_set.clusters:

            if cur_calc + opti - prints_counter > 100000:
                update_status(
                    (float(opti) + cur_calc) / expected,
                    "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
                prints_counter = cur_calc + opti

            #clean caches
            if cur_calc - last_cleaned > 2000000:
                clear_comparison_caches()
                last_cleaned = cur_calc

            for cl2 in cluster_set.clusters:
                if id(cl1) < id(cl2) and not cl1.hates(cl2):
                    for bib1 in cl1.bibs:
                        for bib2 in cl2.bibs:
                            if have_cached_bibs and bib1 in cached_bibs and bib2 in cached_bibs:
                                val = old_matrix[bib1, bib2]
                                if not val:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                else:
                                    opti += 1
                                    if bconfig.DEBUG_CHECKS:
                                        assert _debug_is_eq_v(
                                            val,
                                            compare_bibrefrecs(bib1, bib2))
                            else:
                                cur_calc += 1
                                val = compare_bibrefrecs(bib1, bib2)
                            self._bib_matrix[bib1, bib2] = val

        clear_comparison_caches()
        update_status_final("Matrix done. %d calc, %d opt." % (cur_calc, opti))
    def recalculate(self, cluster_set):
        '''
        Constructs the probability matrix. If use_cache is true, it will
        try to load old computations from the database. If save cache
        is true, it will save the current results into the database.
        @param cluster_set: A cluster set object, used to initialize
        the matrix.
        '''
        last_cleaned = 0

        old_matrix = self._bib_matrix
        cached_bibs = self.__get_up_to_date_bibs()
        have_cached_bibs = bool(cached_bibs)
        self._bib_matrix = Bib_matrix(cluster_set)

        ncl = cluster_set.num_all_bibs
        expected = ((ncl * (ncl - 1)) / 2)
        if expected == 0:
            expected = 1

        cur_calc, opti, prints_counter = 0, 0, 0
        for cl1 in cluster_set.clusters:

            if cur_calc+opti - prints_counter > 100000:
                update_status((float(opti) + cur_calc) / expected, "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
                prints_counter = cur_calc+opti

            #clean caches
            if cur_calc - last_cleaned > 2000000:
                clear_comparison_caches()
                last_cleaned = cur_calc

            for cl2 in cluster_set.clusters:
                if id(cl1) < id(cl2) and not cl1.hates(cl2):
                    for bib1 in cl1.bibs:
                        for bib2 in cl2.bibs:
                            if have_cached_bibs and bib1 in cached_bibs and bib2 in cached_bibs:
                                val = old_matrix[bib1, bib2]
                                if not val:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                else:
                                    opti += 1
                                    if bconfig.DEBUG_CHECKS:
                                        assert _debug_is_eq_v(val, compare_bibrefrecs(bib1, bib2))
                            else:
                                cur_calc += 1
                                val = compare_bibrefrecs(bib1, bib2)
                            self._bib_matrix[bib1, bib2] = val

        clear_comparison_caches()
        update_status_final("Matrix done. %d calc, %d opt." % (cur_calc, opti))
Example #7
0
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''

    bib_map = create_bib_2_cluster_dict(cluster_set)
    original_process_id = PID()
    #remember to close the files!
    #plus_edges_fp, len_plus, minus_edges_fp, len_minus, edges_fp, len_edges = group_sort_edges(cluster_set)

    p = Process(target=group_sort_edges,
                args=(cluster_set, original_process_id))
    p.start()
    p.join()

    plus_edges_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' +
        str(original_process_id), 'r')
    minus_edges_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' +
        str(original_process_id), 'r')
    edges_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' +
        str(original_process_id), 'r')
    data_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' +
        str(original_process_id), 'r')

    len_plus, len_minus, len_edges = cPickle.load(data_fp)
    data_fp.close()

    interval = 1000
    for i, s in enumerate(plus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(
                float(i) / len_plus, "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    interval = 1000
    for i, s in enumerate(minus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_minus, "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    interval = 50000
    wedge_print("Wedge: New wedge, %d edges." % len_edges)
    current = -1
    for s in edges_fp:
        v1, v2, unused = _unpack_vals(s)
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len_edges, "Wedge...")

        assert unused != '+' and unused != '-', PID() + "Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        #try using object ids instead of index to boost performance
        #idcl1 = cluster_set.clusters.index(cl1)
        #idcl2 = cluster_set.clusters.index(cl2)
        idcl1 = id(cl1)
        idcl2 = id(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print(
            "Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)"
            % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(
                    cluster_set,
                    "/tmp/%s%d.dot" % (cluster_set.last_name, current),
                    bib_map, (v1, v2, unused))

            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s" %
                            (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " %
                            (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" %
                        (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" %
                        (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name,
                      bib_map)

    plus_edges_fp.close()
    minus_edges_fp.close()
    edges_fp.close()
    data_fp.close()

    try:
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' +
                  str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' +
                  str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' +
                  str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' +
                  str(original_process_id))
    except:
        pass
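
# Hypothetical helpers (assumptions, not the original implementation): do_wedge reads one
# edge per line from the cache files written by group_sort_edges and expects
# _unpack_vals(line) to yield (bib1, bib2, value), where value is either the '+'/'-'
# marker or a (probability, certainty) pair. A minimal tab-separated encoding could be:
def _toy_pack_vals(bib1, bib2, value):
    if isinstance(value, str):  # '+' or '-' edge: the value carries no numbers
        return "%d\t%d\t%s\n" % (bib1, bib2, value)
    return "%d\t%d\t%f\t%f\n" % (bib1, bib2, value[0], value[1])

def _toy_unpack_vals(line):
    parts = line.rstrip("\n").split("\t")
    bib1, bib2 = int(parts[0]), int(parts[1])
    if len(parts) == 3:
        return bib1, bib2, parts[2]
    return bib1, bib2, (float(parts[2]), float(parts[3]))
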
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_string_to_pid_dictionary()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_new_personid()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
        len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        update_status(float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec))
        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_from_rec(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibrecref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                  for old in old_signatures] for new in new_signatures]

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix) if score > threshold]
        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = []
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id(sig + (rec,))
                    if inspire_id:
                        matched_pids = list(get_person_with_extid(inspire_id[0]))
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

            if not matched_pids:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids: # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)
        update_personID_external_ids(updated_pids, limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()
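
# Hedged sketch (assumes SciPy is available; the real maximized_mapping is not shown in
# this snippet): rabbit builds a |new_signatures| x |old_signatures| matrix of name
# comparison scores and expects maximized_mapping(matrix) to return (row, col, score)
# triples of a maximum-weight assignment. An equivalent can be built on the Hungarian
# algorithm by negating the scores:
import numpy
from scipy.optimize import linear_sum_assignment

def _toy_maximized_mapping(matrix):
    if not matrix or not matrix[0]:
        return []
    scores = numpy.asarray(matrix, dtype=float)
    rows, cols = linear_sum_assignment(-scores)  # negate to maximize the total score
    return [(int(r), int(c), float(scores[r, c])) for r, c in zip(rows, cols)]
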
Example #9
0
def convert_cluster_set(cs, prob_matr):
    '''
    Converts a normal cluster set to a wedge cluster set.
    @param cs: a cluster set to be converted
    @type cs: cluster set
    @return: a mapping from a number to a bibrefrec.
    '''
    #gc.disable()

    # step 1:
    #    + Assign a number to each bibrefrec.
    #    + Replace the arrays of bibrefrecs with arrays of numbers.
    #    + Store the result and prepare it to be returned.
    result_mapping = list()
    for clus in cs.clusters:
        start = len(result_mapping)
        result_mapping += list(clus.bibs)
        end = len(result_mapping)
        clus.bibs = range(start, end)

    assert len(result_mapping) == len(set(result_mapping)), PID()+"Cluster set conversion failed"
    assert len(result_mapping) == cs.num_all_bibs, PID()+"Cluster set conversion failed"

    cs.new2old = result_mapping

    # step 2:
    #    + Using the prob matrix, create a vector of values to all other bibs.
    #    + Meld those vectors into one for each cluster.

    special_symbols = Bib_matrix.special_symbols #locality optimization
    pb_getitem_numeric = prob_matr.getitem_numeric

    interval = 100
    gc.set_threshold(100,100,100)
    current = -1
    real_pointer = None
    try:
        for c1 in cs.clusters:
            gc.collect()
            current += 1
            if (current % interval) == 0:
                update_status(float(current) / len(cs.clusters), "Converting the cluster set...")

            assert len(c1.bibs) > 0, PID()+"Empty cluster sent to wedge"
            pointers = list()

            for v1 in c1.bibs:
                pointer = list()
                index = list()
                rm = result_mapping[v1] #locality optimization
                for c2 in cs.clusters:
                    if c1 != c2 and not c1.hates(c2):
                        pointer += [pb_getitem_numeric((rm, result_mapping[v2])) for v2 in c2.bibs]
                        index += c2.bibs
                if index and pointer:
                    real_pointer = numpy.ndarray(shape=(len(result_mapping), 2), dtype=float, order='C')
                    real_pointer.fill(special_symbols[None])
                    real_pointer[index] = pointer
                    pointers.append((real_pointer, 1))

            if pointers:
                out_edges = reduce(meld_edges, pointers)[0]
                h5file.create_dataset(str(id(c1)), (len(out_edges), 2), 'f')
                dset = h5file[str(id(c1))]
                dset[:] = out_edges
            else:
                h5file.create_dataset(str(id(c1)), (len(cs.clusters), 2), 'f')

    except Exception, e:
        raise Exception("""Error happened in convert_cluster_set with
                        v1: %s,
                        real_pointer: %s,
                        pointer: %s,
                        pointers: %s,
                        result_mapping: %s, index: %s,
                        len(real_pointer): %s,
                        len(pointer): %s,
                        len(pointers):  %s,
                        original_exception: %s
                        """%(str(v1),str(real_pointer),str(pointer), str(pointers),
                             str(result_mapping), str(index),
                             str(len(real_pointer)), str(len(pointer)),
                             str(len(pointers)), str(e)) )
Example #10
0
    def recalculate(self, cluster_set):
        '''
        Constructs the probability matrix. If use_cache is true, it will
        try to load old computations from the database. If save cache
        is true, it will save the current results into the database.
        @param cluster_set: A cluster set object, used to initialize
        the matrix.
        '''
        last_cleaned = 0
        self._bib_matrix.store()
        try:
            old_matrix = Bib_matrix(self._bib_matrix.name + 'copy')
            old_matrix.duplicate_existing(self._bib_matrix.name,
                                          self._bib_matrix.name + 'copy')
            old_matrix.load()
            cached_bibs = self.__get_up_to_date_bibs(old_matrix)
            have_cached_bibs = bool(cached_bibs)
        except IOError:
            old_matrix.destroy()
            cached_bibs = None
            have_cached_bibs = False

        self._bib_matrix.destroy()
        self._bib_matrix = Bib_matrix(cluster_set.last_name,
                                      cluster_set=cluster_set)

        ncl = cluster_set.num_all_bibs
        expected = ((ncl * (ncl - 1)) / 2)
        if expected == 0:
            expected = 1

        try:
            cur_calc, opti, prints_counter = 0, 0, 0
            for cl1 in cluster_set.clusters:

                if cur_calc + opti - prints_counter > 100000 or cur_calc == 0:
                    update_status(
                        (float(opti) + cur_calc) / expected,
                        "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
                    prints_counter = cur_calc + opti

    #            #clean caches
                if cur_calc - last_cleaned > 20000000:
                    gc.collect()
                    #                clear_comparison_caches()
                    last_cleaned = cur_calc

                for cl2 in cluster_set.clusters:
                    if id(cl1) < id(cl2) and not cl1.hates(cl2):
                        for bib1 in cl1.bibs:
                            for bib2 in cl2.bibs:
                                if have_cached_bibs:
                                    try:
                                        val = old_matrix[bib1, bib2]
                                        opti += 1
                                        if bconfig.DEBUG_CHECKS:
                                            assert _debug_is_eq_v(
                                                val,
                                                compare_bibrefrecs(bib1, bib2))
                                    except KeyError:
                                        cur_calc += 1
                                        val = compare_bibrefrecs(bib1, bib2)
                                    if not val:
                                        cur_calc += 1
                                        val = compare_bibrefrecs(bib1, bib2)
                                else:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                self._bib_matrix[bib1, bib2] = val

        except Exception, e:
            raise Exception("""Error happened in prob_matrix.recalculate with
            val:%s
            original_exception: %s
            """ % (str(val), str(e)))
Example #11
0
 def load(self, load_map=True, load_matrix=True):
     update_status(0., "Loading probability matrix...")
     self._bib_matrix.load()
     update_status_final("Probability matrix loaded.")
Example #12
0
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    override_stdout_config(stdout=True)

    files = [
        '/tmp/baistats/' + x for x in os.listdir('/tmp/baistats/')
        if x.startswith('cluster_status_report_pid')
    ]
    fnum = float(len(files))
    quanta = .1 / fnum

    total_stats = 0
    used_coeffs = set()
    used_clusters = set()

    #av_counter, avg, min, max, nclus, normalized_avg
    cluster_stats = defaultdict(
        lambda: defaultdict(lambda: [0., 0., 0., 0., 0., 0.]))
    coeff_stats = defaultdict(lambda: [0., 0., 0., 0., 0., 0.])

    def gen_graphs(only_synthetic=False):
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i, c in enumerate(cn):
                update_status(i / l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i, fi in enumerate(files):
        if generate_graphs:
            if i % 1000 == 0:
                gen_graphs(True)

        f = open(fi, 'r')
        status = i / fnum
        update_status(status, 'Loading ' + fi[fi.find('lastname') + 9:])
        contents = SER.load(f)
        f.close()

        cur_coef = contents[0]
        cur_clust = contents[1]

        cur_maxlen = float(contents[3])

        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)

            update_status(status + 0.2 * quanta, '  Computing averages...')

            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))

            assert cur_clustnumber > 0 and cur_clustnumber < cur_maxlen, "Error, found log with strange clustnumber! %s %s %s %s" % (
                str(cur_clust), str(cur_coef), str(cur_maxlen),
                str(cur_clustnumber))

            if cur_coeffs:

                assert len(cur_coeffs) == cur_clen and cur_coeffs, \
                    "Error, there is a cluster without stuff? %s %s %s" % (
                        str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all(0 <= x <= 1 for x in cur_coeffs), \
                    "Error, a coefficient is wrong here! Check me! %s %s %s" % (
                        str(cur_clust), str(cur_coef), str(cur_coeffs))

                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs) / cur_clen

                update_status(status + 0.4 * quanta,
                              '  cumulative per coeff...')

                avi = coeff_stats[cur_coef][0]
                #number of points
                coeff_stats[cur_coef][0] = avi + 1
                #average of coefficients
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1] * avi +
                                            cur_avg) / (avi + 1)
                #min coeff
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2],
                                               cur_min)
                #max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3],
                                               cur_max)
                #avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4] * avi +
                                            cur_clustnumber) / (avi + 1)
                #normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5] * avi +
                                            cur_clustnumber / cur_maxlen) / (
                                                avi + 1)

                update_status(status + 0.6 * quanta,
                              '  cumulative per cluster per coeff...')

                avi = cluster_stats[cur_clust][cur_coef][0]
                cluster_stats[cur_clust][cur_coef][0] = avi + 1
                cluster_stats[cur_clust][cur_coef][1] = (
                    cluster_stats[cur_clust][cur_coef][1] * avi +
                    cur_avg) / (avi + 1)
                cluster_stats[cur_clust][cur_coef][2] = min(
                    cluster_stats[cur_clust][cur_coef][2], cur_min)
                cluster_stats[cur_clust][cur_coef][3] = max(
                    cluster_stats[cur_clust][cur_coef][3], cur_max)
                cluster_stats[cur_clust][cur_coef][4] = (
                    cluster_stats[cur_clust][cur_coef][4] * avi +
                    cur_clustnumber) / (avi + 1)
                cluster_stats[cur_clust][cur_coef][5] = (
                    cluster_stats[cur_clust][cur_coef][5] * avi +
                    cur_clustnumber / cur_maxlen) / (avi + 1)

    update_status_final('Done!')

    if generate_graphs:
        gen_graphs()

    if pickle_output:
        update_status(0, 'Dumping to file...')
        f = open(pickle_output, 'w')
        SER.dump(
            {
                'cluster_stats':
                dict((x, dict(cluster_stats[x]))
                     for x in cluster_stats.iterkeys()),
                'coeff_stats':
                dict((coeff_stats))
            }, f)
        f.close()
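
# Illustration only: the coeff_stats / cluster_stats updates above fold each new file
# into a running average without storing past values, using
# avg_new = (avg_old * n + x) / (n + 1), where n is the count kept in slot 0.
def _toy_running_average(avg_old, n, x):
    # average of n previous observations plus one new observation x
    return (avg_old * n + x) / (n + 1.0)

# e.g. _toy_running_average(0.5, 4, 1.0) == 0.6
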
Example #13
0
def merge_dynamic():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
        This function is dynamic: it allows the aid* tables to be changed while it is still
        running, so the claiming facility, for example, can stay online during the merge.
        This comfort, however, is paid for in terms of speed.
    '''
    last_names = frozenset(name[0].split('.')[0]
                           for name in get_existing_result_clusters())

    def get_free_pids():
        while True:
            yield get_new_personid()

    free_pids = get_free_pids()

    def try_move_signature(sig, target_pid):
        """
        """
        paps = get_signature_info(sig)
        rejected = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p: -2 < p[1] and p[1] < 2, paps)
        claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        if claimed or not assigned or assigned[0] == target_pid:
            return

        assert len(assigned) == 1

        if rejected:
            move_signature(sig, free_pids.next())
        else:
            conflicts = find_conflicts(sig, target_pid)
            if not conflicts:
                move_signature(sig, target_pid)
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    move_signature(sig, free_pids.next())
                else:
                    move_signature(conflicts[0][:3], free_pids.next())
                    move_signature(sig, target_pid)

    for idx, last in enumerate(last_names):
        update_status(
            float(idx) / len(last_names),
            "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4])
                   for row in get_lastname_results(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d))
                   for k, d in groupby(sorted(results, key=itemgetter(0)),
                                       key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            claim = []
            for d in ds:
                pid_flag = personid_from_signature(d)
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)
                    if flag > 1:
                        claim.append((d, pid))

            matr.append(
                dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids]
                                        for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx])
                            for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(
            imap(itemgetter(0), best_match))
        not_matched_clusters = izip(
            (results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                try_move_signature(sig, pid)

    update_status_final()
    delete_empty_persons()
    update_personID_canonical_names()
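
# Illustration (not part of the original module): merge_dynamic regroups the
# (cluster number, bibrefrec) rows into [(cluster number, [bibrefrecs])] with
# sorted + groupby. The same pattern in isolation:
from itertools import groupby
from operator import itemgetter

def _toy_regroup(rows):
    # rows: iterable of (cluster_number, bibrefrec) pairs
    rows = sorted(rows, key=itemgetter(0))  # groupby only merges consecutive keys
    return [(k, [item[1] for item in grp]) for k, grp in groupby(rows, key=itemgetter(0))]

# _toy_regroup([(0, 'a'), (1, 'b'), (0, 'c')]) == [(0, ['a', 'c']), (1, ['b'])]
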
 def load(self, lname, load_map=True, load_matrix=True):
     update_status(0., "Loading probability matrix...")
     self._bib_matrix.load(lname, load_map, load_matrix)
     update_status_final("Probability matrix loaded.")
def main():
    """
    Reads the import file and verifies the MD5 hash.
    For each line in the import file:
        find new record from bibcode, find new ref from name on record
        find old row in personid tables
        copy row with new authorref (tab:bibref,rec) to temp table
    overwrite personid tables w/ temp table
    """
    ## create temporary tables...
    print "Creating temporary tables..."
    create_temp_pid_sql_table()
    create_temp_piddata_sql_table()
    create_temp_user_input_log_sql_table()
    
    ## fill temp tables w/ static values...
    print "Filling temporary tables with static, unchanged content"
    copy_unaltered_piddata_rows_to_temp()
    copy_unaltered_user_input_log_table_rows_to_temp()
    ## compile regexp for line break removal
    nlr = re.compile('[\n\r]+')

    #verify file integrity
    print ("Verifying file integrity of %s with"
           " MD5 checksum from %s" % (IMPORT_FILE_NAME, IMPORT_MD5_FILE_NAME))
    fp = open(IMPORT_FILE_NAME, "rb")
    fmd5 = md5_for_file(fp)
    fp.close()

    fp = open(IMPORT_MD5_FILE_NAME, "r")
    vmd5 = fp.read()
    fp.close()
    
    if not fmd5 == vmd5:
        print "WARNING: Detected a disturbance in the file. Will exit here."
        return

    total_lines = file_len()
    fp = open(IMPORT_FILE_NAME, "r")
    print "Processing file %s..." % IMPORT_FILE_NAME

    for index, line in enumerate(fp.readlines()):
#        if index == 100:
#            break
        if index % 5000 == 0:
            percent = float(index) / float(total_lines)
            update_status(percent, "%s of %s lines processed in %s" % (index, total_lines, IMPORT_FILE_NAME))

        new_ref = None
        tab1, old_ref, old_rec, tab2, enname, bibcode = line.split("    ")
        
        assert tab1 == tab2

        if tab1 == "table":
            continue

        name = base64.b64decode(enname)
#        name = nq.sub("", name)
        bibcode = nlr.sub("", bibcode)
        new_rec = get_bibrec_from_bibcode(bibcode)

        for ref in get_authorrefs_and_names_from_bibrec(new_rec):
#            refname = create_normalized_name(split_name_parts(ref[2]))
            refname = ref[2]

            if refname == name and str(ref[0]) == tab1:
                #MySQL equivalent: col_name COLLATE utf8_bin = 'Case SenSitive name'
                new_ref = ref[1]
        
        if not new_ref:
            print "WARN: Authorref not found for name %s on new record %s?!" % (name, new_rec)
            continue

        # get personid, flag, lcul and last_updated from old aidPERSONIDPAPERS
        old_data = find_old_pidtable_row(tab1, old_ref, old_rec)

        if old_data:
            ## prepare data in temporary tables...
            pid, flag, lcul, lupdate = old_data
            old_authorref = "%s:%s,%s" % (tab1, old_ref, old_rec)
            new_authorref = "%s:%s,%s" % (tab1, new_ref, new_rec)
            ## Transform the name into a more consistent form
            inname = create_normalized_name(split_name_parts(name))
            ## Insert transformed data into temp tables...
            insert_into_temp_table(pid, tab1, new_ref, new_rec, inname, flag, lcul, lupdate)
            update_temp_piddata_table(old_authorref, new_authorref)
            update_temp_user_input_log_table(old_authorref, new_authorref)
        else:
            print "WARN: %s does not exist in db!" % ([tab1, old_ref, old_rec])

        # The following is true only if applied on the same data set
        # Commented out by default. For testing/debug uses only
        try:
            if RUN_IN_TEST_MODE:
                assert str(old_rec) == str(new_rec)
                assert str(old_ref) == str(new_ref)
                pass
        except AssertionError, e:
            print "ERROR: ", e
            print "%s:%s,%s vs. %s:%s,%s on %s:%s" % (tab1, old_ref, old_rec, tab1, new_ref, new_rec, bibcode, name)
def check_table_integrity(table):
    """
    Check integrity of result table vs. original table. Only works when original
    data and result data are identical!

    @param table: the table to check: aidPIDTEMP, aidPIDUILTEMP or aidPIDDATATEMP
    @type table: str
    """
    if not RUN_IN_TEST_MODE:
        print "Integrity checks only run in TEST_MODE!"
        return

    check_passed = True
    odata = None
    rdata = None

    print "Checking table: %s" % table
    print "  |-- Getting original data..."

    if table == "aidPIDTEMP":
        odata = run_sql("select personid, bibref_table, bibref_value, bibrec, name, flag, "
                        "lcul, last_updated from aidPERSONIDPAPERS")
    elif table == "aidPIDUILTEMP":
        odata = run_sql("select id, transactionid, timestamp, userinfo, personid, "
                        "action, tag, value, comment from aidUSERINPUTLOG")
    elif table == "aidPIDDATATEMP":
        odata = run_sql("select personid, tag, data, opt1, opt2, opt3 "
                    "from aidPERSONIDDATA")
    else:
        print "No table specified for integrity check. Skipped."
        return

    print "  |-- Getting result data..."

    if table == "aidPIDTEMP":
        rdata = run_sql("select personid, bibref_table, bibref_value, bibrec, name, flag, "
                        "lcul, last_updated from aidPIDTEMP")
    elif table == "aidPIDUILTEMP":
        rdata = run_sql("select id, transactionid, timestamp, userinfo, personid, "
                        "action, tag, value, comment from aidPIDUILTEMP")
    elif table == "aidPIDDATATEMP":
        rdata = run_sql("select personid, tag, data, opt1, opt2, opt3 "
                    "from aidPIDDATATEMP")

    print "  |-- Checking..."
    rownum = float(len(rdata))
    odata = set(odata)

    if bool(len(odata)) ^ bool(len(rdata)):
        check_passed = False

    for index, res in enumerate(rdata):
        if not check_passed:
            print "odata xor rdata: %s" % (bool(len(odata)) ^ bool(len(rdata)))
            break

        if index % 1000 == 0:
            percent = float(index) / rownum
            update_status(percent, "%s of %s rows processed" % (index, rownum))

        if not (set([res]) & odata):
            check_passed = False
            print "Test failed for the following pair:\n res: %s and-op: %s" % (str(res), str((set(res) & odata)))
            break

    update_status(1., "Done checking %s\n" % table)


    if check_passed:
        print "  |-- OK"
    else:
        print "  |-- Data Integrity check failed!"
            update_temp_user_input_log_table(old_authorref, new_authorref)
        else:
            print "WARN: %s does not exist in db!" % ([tab1, old_ref, old_rec])

        # The following is true only if applied on the same data set
        # Commented out by default. For testing/debug uses only
        try:
            if RUN_IN_TEST_MODE:
                assert str(old_rec) == str(new_rec)
                assert str(old_ref) == str(new_ref)
                pass
        except AssertionError, e:
            print "ERROR: ", e
            print "%s:%s,%s vs. %s:%s,%s on %s:%s" % (tab1, old_ref, old_rec, tab1, new_ref, new_rec, bibcode, name)

    update_status(1., "Done importing from %s\n" % IMPORT_FILE_NAME)

    if RUN_IN_TEST_MODE:
        perform_integrity_checks()
    else:
        print "Copying NEW data from temp tables to original tables (destroying previous content!...)"
        print "Personid Papers table..."
        copy_temp_to_pid_table()
        print "Personid Data table..."
        copy_temp_piddata_table()
        print "User Input log table..."
        copy_temp_user_input_table_table()

    fp.close()

def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    import matplotlib.pyplot as plt
    plt.ioff()
    def _gen_plot(data, filename):
        plt.clf()
        ax = plt.subplot(111)
        ax.grid(visible=True)
        x = sorted(data.keys())

        w = [data[k][0] for k in x]
        try:
            wscf = max(w)
        except:
            wscf = 0
        w = [float(i)/wscf for i in w]
        y = [data[k][1] for k in x]
        maxi = [data[k][3] for k in x]
        mini = [data[k][2] for k in x]

        lengs = [data[k][4] for k in x]
        try:
            ml = float(max(lengs))
        except:
            ml = 1
        lengs = [k/ml for k in lengs]

        normalengs = [data[k][5] for k in x]

        ax.plot(x,y,'-o',label='avg')
        ax.plot(x,maxi,'-o', label='max')
        ax.plot(x,mini,'-o', label='min')
        ax.plot(x,w, '-x', label='norm %s' % str(wscf))
        ax.plot(x,lengs,'-o',label='acl %s' % str(int(ml)))
        ax.plot(x,normalengs, '-o', label='ncl')
        plt.ylim(ymax = 1., ymin = -0.01)
        plt.xlim(xmax = 1., xmin = -0.01)
        ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,ncol=6, mode="expand", borderaxespad=0.)
        plt.savefig(filename)

    override_stdout_config(stdout=True)

    files = ['/tmp/baistats/'+x for x in os.listdir('/tmp/baistats/') if x.startswith('cluster_status_report_pid')]
    fnum = float(len(files))
    quanta = .1/fnum


    total_stats = 0
    used_coeffs = set()
    used_clusters = set()

    #av_counter, avg, min, max, nclus, normalized_avg
    cluster_stats = defaultdict(lambda : defaultdict(lambda : [0.,0.,0.,0.,0.,0.]))
    coeff_stats = defaultdict(lambda : [0.,0.,0.,0.,0.,0.])


    def gen_graphs(only_synthetic=False):
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i,c in enumerate(cn):
                update_status(i/l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i,fi in enumerate(files):
        if generate_graphs:
            if i%1000 ==0:
                gen_graphs(True)

        f = filehandler.open(fi,'r')
        status = i/fnum
        update_status(status, 'Loading '+ fi[fi.find('lastname')+9:])
        contents = SER.load(f)
        f.close()

        cur_coef = contents[0]
        cur_clust = contents[1]

        cur_maxlen = float(contents[3])

        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)

            update_status(status+0.2*quanta, '  Computing averages...')

            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))

            assert cur_clustnumber > 0 and cur_clustnumber < cur_maxlen, "Error, found log with strange clustnumber! %s %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_maxlen),
                                                                                                                          str(cur_clustnumber))

            if cur_coeffs:

                assert len(cur_coeffs) == cur_clen and cur_coeffs, "Error, there is a cluster without stuff? %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all([x >= 0 and x <= 1 for x in cur_coeffs]), "Error, a coefficient is wrong here! Check me! %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_coeffs))

                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs)/cur_clen

                update_status(status+0.4*quanta, '  cumulative per coeff...')

                avi = coeff_stats[cur_coef][0]
                #number of points
                coeff_stats[cur_coef][0] = avi+1
                #average of coefficients
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1]*avi + cur_avg)/(avi+1)
                #min coeff
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2], cur_min)
                #max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3], cur_max)
                #avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                #normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)


                update_status(status+0.6*quanta, '  cumulative per cluster per coeff...')

                avi = cluster_stats[cur_clust][cur_coef][0]
                cluster_stats[cur_clust][cur_coef][0] = avi+1
                cluster_stats[cur_clust][cur_coef][1] = (cluster_stats[cur_clust][cur_coef][1]*avi + cur_avg)/(avi+1)
                cluster_stats[cur_clust][cur_coef][2] = min(cluster_stats[cur_clust][cur_coef][2], cur_min)
                cluster_stats[cur_clust][cur_coef][3] = max(cluster_stats[cur_clust][cur_coef][3], cur_max)
                cluster_stats[cur_clust][cur_coef][4] = (cluster_stats[cur_clust][cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                cluster_stats[cur_clust][cur_coef][5] = (cluster_stats[cur_clust][cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)

    update_status_final('Done!')

    if generate_graphs:
        gen_graphs()


    if pickle_output:
        update_status(0,'Dumping to file...')
        f = open(pickle_output,'w')
        SER.dump({'cluster_stats':dict((x,dict(cluster_stats[x])) for x in cluster_stats.iterkeys()), 'coeff_stats':dict((coeff_stats))}, f)
        f.close()
def main():
    """
    Main Function. Acquires data and constructs output format:
    
    <MARC Table> <ID in Table> <record ID> <MARC Table> <Name> <bibcode>
    
    Four (4) spaces "    " are used as a delimiter
    
    Stores the output format to the specified export file
    """
    pidrefs = {}
    records = None
    output = []
    output.append("table    bibref    bibrec    table    name    bibcode")

    if LIMIT_TO_RECORDS_IN_PERSONID:
        print "Finding record IDs from PersonID table..."
        pidrefs_sqldata = run_sql("select bibref_table, bibref_value, bibrec "
                                  "from aidPERSONIDPAPERS")
        
        for data in pidrefs_sqldata:
            tab, ref, rec = data

            if rec in pidrefs:
                pidrefs[rec].append((tab, ref))
            else:
                pidrefs[rec] = [(tab, ref)]

        records = pidrefs.keys()
    else:
        print "Finding ALL record IDs from bibrec table..."
        records = [p[0] for p in run_sql("select id from bibrec")]

    print "Collecting data for %s records..." % len(records)
    
    for index, bibrec in enumerate(records):
        if index % 1000 == 0:
            status = "%s of all %s records done." % (index, len(records))

            if index % 10000 == 0:
                fp = open(EXPORT_FILE_NAME, "w")
                fp.write("\n".join(output))
                fp.close()
                status = "Saving to %s..." % EXPORT_FILE_NAME

            percent = float(index) / len(records)
            update_status(percent, status)

        bibcode = get_bibcode_from_bibrec(bibrec)
        refs = None

        if LIMIT_TO_RECORDS_IN_PERSONID:
            try:
                temp_refs = pidrefs[bibrec]
                refs = []

                for tab, ref in temp_refs:
                    name = get_db_name_from_ref(tab, ref)
                    refs.append((tab, ref, name))

            except KeyError:
                print "No key %s in pidref!" % bibrec
        else:
            refs = get_authorrefs_and_names_from_bibrec(bibrec)

        for ref in refs:
            tab, tid, name = ref
            
            # b64 encode name to avoid data inconsistencies occurring with spaces and
            # special characters in names (e.g. \n \t \r \0 etc.)
            enname = base64.b64encode(name)
            #out = "%s:%s,%s;;;%s:\"%s\",%s" % (tab, tid, bibrec, tab, name, bibcode)
            #out = """%s    %s    %s    \"\"\"%s\"\"\"    %s    %s""" % (tab, tid, bibrec, tab, name, bibcode)
            out = """%s    %s    %s    %s    %s    %s""" % (tab, tid, bibrec, tab, enname, bibcode)
            output.append(out)

        # For testing, just write out the first 1000 lines...
#        if len(output) > 1000:
#            break

    # write to export file...
    fp = open(EXPORT_FILE_NAME, "w")
    fp.write("\n".join(output))
    fp.close()
    
    # md5 export file...
    fp = open(EXPORT_FILE_NAME, "rb")
    fmd5 = md5_for_file(fp)
    fp.close()
    
    # write to md5 file
    fp = open(EXPORT_FILE_NAME + ".md5", "w")
    fp.write(fmd5)
    fp.close()
    
    update_status(1., "Export OK.\n")
    
    print "Final Export written to %s" % EXPORT_FILE_NAME
    print "MD5 hash of export file saved to %s.md5" % EXPORT_FILE_NAME
    print "All Done."
Example #20
0
def merge_static():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
        This function is static: if aid* tables are changed while it's running,
        probably everything will crash and a black hole will open, eating all your data.
    '''
    last_names = frozenset(name[0].split('.')[0]
                           for name in get_existing_result_clusters())

    def get_free_pids():
        while True:
            yield get_new_personid()

    free_pids = get_free_pids()

    current_mapping = get_bibrefrec_to_pid_flag_mapping()

    def move_sig_and_update_mapping(sig, old_pid_flag, new_pid_flag):
        move_signature(sig, new_pid_flag[0])
        current_mapping[sig].remove(old_pid_flag)
        current_mapping[sig].append(new_pid_flag)

    def try_move_signature(sig, target_pid):
        """
        """
        paps = current_mapping[sig]
        rejected = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p: -2 < p[1] and p[1] < 2, paps)
        claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        if claimed or not assigned or assigned[0] == target_pid:
            return

        assert len(assigned) == 1

        if rejected:
            newpid = free_pids.next()
            move_sig_and_update_mapping(sig, assigned[0],
                                        (newpid, assigned[0][1]))
        else:
            conflicts = find_conflicts(sig, target_pid)
            if not conflicts:
                move_sig_and_update_mapping(sig, assigned[0],
                                            (target_pid, assigned[0][1]))
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    newpid = free_pids.next()
                    move_sig_and_update_mapping(sig, assigned[0],
                                                (newpid, assigned[0][1]))
                else:
                    newpid = free_pids.next()
                    csig = tuple(conflicts[0][:3])
                    move_sig_and_update_mapping(csig,
                                                (target_pid, conflicts[0][3]),
                                                (newpid, conflicts[0][3]))
                    move_sig_and_update_mapping(sig, assigned[0],
                                                (target_pid, assigned[0][1]))

    for idx, last in enumerate(last_names):
        update_status(
            float(idx) / len(last_names),
            "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4])
                   for row in get_lastname_results(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d))
                   for k, d in groupby(sorted(results, key=itemgetter(0)),
                                       key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            claim = []
            for d in ds:
                pid_flag = current_mapping.get(d, [])
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)
                    if flag > 1:
                        claim.append((d, pid))

            matr.append(
                dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids]
                                        for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx])
                            for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(
            imap(itemgetter(0), best_match))
        not_matched_clusters = izip(
            (results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                if sig in current_mapping:
                    if not pid in map(
                            itemgetter(0),
                            filter(lambda x: x[1] > -2, current_mapping[sig])):
                        try_move_signature(sig, pid)

    update_status_final()
    delete_empty_persons()
    update_personID_canonical_names()
Example #21
0
def convert_cluster_set(cs, prob_matr):
    '''
    Converts a normal cluster set to a wedge cluster set.
    @param cs: a cluster set to be converted
    @type cs: cluster set
    @return: a mapping from a number to a bibrefrec.
    '''
    #gc.disable()

    # step 1:
    #    + Assign a number to each bibrefrec.
    #    + Replace the arrays of bibrefrecs with arrays of numbers.
    #    + Store the result and prepare it to be returned.
    result_mapping = list()
    for clus in cs.clusters:
        start = len(result_mapping)
        result_mapping += list(clus.bibs)
        end = len(result_mapping)
        clus.bibs = range(start, end)

    assert len(result_mapping) == len(
        set(result_mapping)), PID() + "Cluster set conversion failed"
    assert len(result_mapping
               ) == cs.num_all_bibs, PID() + "Cluster set conversion failed"

    cs.new2old = result_mapping

    # step 2:
    #    + Using the prob matrix create a vector values to all other bibs.
    #    + Meld those vectors into one for each cluster.

    special_symbols = Bib_matrix.special_symbols  #locality optimization
    pb_getitem_numeric = prob_matr.getitem_numeric

    interval = 100
    gc.set_threshold(100, 100, 100)
    current = -1
    real_pointer = None
    try:
        for c1 in cs.clusters:
            gc.collect()
            current += 1
            if (current % interval) == 0:
                update_status(
                    float(current) / len(cs.clusters),
                    "Converting the cluster set...")

            assert len(c1.bibs) > 0, PID() + "Empty cluster sent to wedge"
            pointers = list()

            for v1 in c1.bibs:
                pointer = list()
                index = list()
                rm = result_mapping[v1]  #locality optimization
                for c2 in cs.clusters:
                    if c1 != c2 and not c1.hates(c2):
                        pointer += [
                            pb_getitem_numeric((rm, result_mapping[v2]))
                            for v2 in c2.bibs
                        ]
                        index += c2.bibs
                if index and pointer:
                    real_pointer = numpy.ndarray(shape=(len(result_mapping),
                                                        2),
                                                 dtype=float,
                                                 order='C')
                    real_pointer.fill(special_symbols[None])
                    real_pointer[index] = pointer
                    pointers.append((real_pointer, 1))

            if pointers:
                out_edges = reduce(meld_edges, pointers)[0]
                h5file.create_dataset(str(id(c1)), (len(out_edges), 2), 'f')
                dset = h5file[str(id(c1))]
                dset[:] = out_edges
            else:
                h5file.create_dataset(str(id(c1)), (len(cs.clusters), 2), 'f')

    except Exception, e:
        raise Exception(
            """Error happened in convert_cluster_set with
                        v1: %s,
                        real_pointer: %s,
                        pointer: %s,
                        pointers: %s,
                        result_mapping: %s, index: %s,
                        len(real_pointer): %s,
                        len(pointer): %s,
                        len(pointers):  %s,
                        original_exception: %s
                        """ %
            (str(v1), str(real_pointer), str(pointer), str(pointers),
             str(result_mapping), str(index), str(len(real_pointer)),
             str(len(pointer)), str(len(pointers)), str(e)))
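
convert_cluster_set above stores each cluster's melded out-edges in an HDF5 dataset keyed by str(id(cluster)), through a module-level h5file handle that is not defined in this snippet. A minimal sketch of the assumed setup, using h5py with a hypothetical path, and of reading a dataset back the way the edge-grouping step later does:

# Minimal sketch, assuming h5file is an h5py.File opened elsewhere in the
# module; the path and dataset key below are hypothetical.
import numpy
import h5py

h5file = h5py.File('/tmp/wedge_out_edges.h5', 'w')
out_edges = numpy.zeros((4, 2), dtype=float)                # stand-in for a cluster's melded edges
h5file.create_dataset('140231', (len(out_edges), 2), 'f')   # real key would be str(id(c1))
h5file['140231'][:] = out_edges
print(h5file['140231'][3])                                  # read one edge back, as the wedge step does
h5file.close()
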
Example #22
0
def merge_static_classy():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
        This function is static: if aid* tables are changed while it's running,
        probably everything will crash and a black hole will open, eating all your data.

        NOTE: this is more elegant than merge_static but much slower. It will have to be improved
              before it can replace it.
    '''
    class Sig(object):
        def __init__(self, bibrefrec, pid_flag):
            self.rejected = dict(filter(lambda p: p[1] <= -2, pid_flag))
            self.assigned = filter(lambda p: -2 < p[1] and p[1] < 2, pid_flag)
            self.claimed = filter(lambda p: 2 <= p[1], pid_flag)
            self.bibrefrec = bibrefrec

            assert self.invariant()

        def invariant(self):
            return len(self.assigned) + len(self.claimed) <= 1

        def empty(self):
            return not self.isclaimed() and not self.isassigned()

        def isclaimed(self):
            return len(self.claimed) == 1

        def get_claimed(self):
            return self.claimed[0][0]

        def get_assigned(self):
            return self.assigned[0][0]

        def isassigned(self):
            return len(self.assigned) == 1

        def isrejected(self, pid):
            return pid in self.rejected

        def change_pid(self, pid):
            assert self.invariant()
            assert self.isassigned()
            self.assigned = [(pid, 0)]
            move_signature(self.bibrefrec, pid)

    class Cluster(object):
        def __init__(self, pid, sigs):
            self.pid = pid

            self.sigs = dict(
                (sig.bibrefrec[2], sig) for sig in sigs if not sig.empty())

        def send_sig(self, other, sig):
            paper = sig.bibrefrec[2]
            assert paper in self.sigs and paper not in other.sigs

            del self.sigs[paper]
            other.sigs[paper] = sig

            if sig.isassigned():
                sig.change_pid(other.pid)

    last_names = frozenset(name[0].split('.')[0]
                           for name in get_existing_result_clusters())

    personid = get_bibrefrec_to_pid_flag_mapping()
    free_pids = backinterface_get_free_pids()

    for idx, last in enumerate(last_names):
        update_status(
            float(idx) / len(last_names),
            "Merging, %d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4])
                   for row in get_lastname_results(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d))
                   for k, d in groupby(sorted(results, key=itemgetter(0)),
                                       key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            for d in ds:
                pid_flag = filter(lambda x: x[1] > -2, personid.get(d, []))
                if pid_flag:
                    assert len(pid_flag) == 1
                    pid = pid_flag[0][0]
                    pids.append(pid)
                    old_pids.add(pid)

            matr.append(
                dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids]
                                        for row in matr])

        # [[bibrefrecs] -> pid]
        matched_clusters = [(results[new_idx][1], old_pids[old_idx])
                            for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(
            imap(itemgetter(0), best_match))
        not_matched_clusters = izip(
            (results[i][1] for i in not_matched_clusters), free_pids)

        # pid -> Cluster
        clusters = dict(
            (pid,
             Cluster(pid, [Sig(bib, personid.get(bib, [])) for bib in sigs]))
            for sigs, pid in chain(matched_clusters, not_matched_clusters))

        todo = clusters.items()
        for pid, clus in todo:
            assert clus.pid == pid

            for paper, sig in clus.sigs.items():
                if sig.isclaimed():
                    if sig.get_claimed() != pid:
                        target_clus = clusters[sig.get_claimed()]

                        if paper in target_clus.sigs:
                            new_clus = Cluster(free_pids.next(), [])
                            target_clus.send_sig(new_clus, target_clus.sigs[paper])
                            todo.append(new_clus)
                            clusters[new_clus.pid] = new_clus

                        assert paper not in target_clus.sigs
                        clus.send_sig(target_clus, sig)
                elif sig.get_assigned() != pid:
                    if not sig.isrejected(pid):
                        move_signature(sig.bibrefrec, pid)
                    else:
                        move_signature(sig.bibrefrec, free_pids.next())
                else:
                    assert not sig.isrejected(pid)

    update_status_final("Merging done.")

    update_status_final()
    delete_empty_persons()
    update_personID_canonical_names()
Example #23
0
def group_sort_edges(cs, original_process_id):
    bibauthor_print("group_sort_edges spowned by %s" % original_process_id)

    plus_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' +
        str(original_process_id), 'w')
    minus_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' +
        str(original_process_id), 'w')
    pairs_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_temp_edges_cache_e_' +
        str(original_process_id), 'w')
    data_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' +
        str(original_process_id), 'w')

    plus_count = 0
    minus_count = 0
    pairs_count = 0

    default_val = [0., 0.]
    #gc.disable()
    interval = 1000
    current = -1
    for cl1 in cs.clusters:
        current += 1
        if (current % interval) == 0:
            update_status(
                float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = h5file[str(id(cl1))]
        for bib2 in xrange(len(h5file[str(id(cl1))])):
            val = pointers[bib2]
            #if val[0] not in Bib_matrix.special_numbers:
            #optimization: special numbers are assumed to be negative
            if val[0] >= 0:

                if val[0] > edge_cut_prob:
                    pairs_count += 1
                    pairs_fp.write(_pack_vals((bib1, bib2, val)))

            elif val[0] == Bib_matrix.special_symbols['+']:
                plus_count += 1
                plus_fp.write(_pack_vals((bib1, bib2, default_val)))

            elif val[0] == Bib_matrix.special_symbols['-']:
                minus_count += 1
                minus_fp.write(_pack_vals((bib1, bib2, default_val)))
            else:
                assert val[0] == Bib_matrix.special_symbols[
                    None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")

    plus_fp.close()
    minus_fp.close()
    pairs_fp.close()

    bibauthor_print(
        "Positive edges: %d, Negative edges: %d, Value edges: %d." %
        (plus_count, minus_count, pairs_count))
    #gc.enable()
    bibauthor_print("Sorting in-file value edges.")
    sortFileInPlace(bconfig.TORTOISE_FILES_PATH +
                    '/wedge_temp_edges_cache_e_' + str(original_process_id),
                    bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' +
                    str(original_process_id),
                    lambda x: _edge_sorting(_unpack_vals(x)),
                    reverse=True)

    os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_temp_edges_cache_e_' +
              str(original_process_id))

    bibauthor_print("Dumping egdes data to file...")
    cPickle.dump((plus_count, minus_count, pairs_count), data_fp)
    data_fp.close()
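
The cache files written above are consumed line by line in do_wedge via _unpack_vals, and sorted with a per-line key; neither _pack_vals nor _unpack_vals appears in this snippet. A hypothetical line-oriented encoding that would be consistent with that usage:

# Hypothetical stand-ins for _pack_vals/_unpack_vals (their real definitions
# are not shown here): one whitespace-separated record per line, so the cache
# files can be iterated and sorted line by line as done above.
def _pack_vals(vals):
    bib1, bib2, val = vals
    return "%d %d %f %f\n" % (bib1, bib2, val[0], val[1])

def _unpack_vals(line):
    b1, b2, v0, v1 = line.split()
    return int(b1), int(b2), (float(v0), float(v1))
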
Example #24
0
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''

    bib_map = create_bib_2_cluster_dict(cluster_set)

    plus_edges, minus_edges, edges = group_edges(cluster_set)

    interval = 1000
    for i, (bib1, bib2) in enumerate(plus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(plus_edges), "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    interval = 1000
    for i, (bib1, bib2) in enumerate(minus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(minus_edges), "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=_edge_sorting, reverse=True)

    interval = 500000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, unused) in enumerate(edges):
        if (current % interval) == 0:
            update_status(float(current) / len(edges), "Wedge...")

        assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        idcl1 = cluster_set.clusters.index(cl1)
        idcl2 = cluster_set.clusters.index(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused))

            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " %  (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)
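
do_wedge relies on create_bib_2_cluster_dict to know which cluster currently holds each bib; that helper is not included in this snippet. A plausible sketch, consistent with how bib_map is updated after every join above:

# Sketch of what create_bib_2_cluster_dict is assumed to do: map every bib in
# the cluster set to the cluster object that currently contains it.
def create_bib_2_cluster_dict(cluster_set):
    return dict((bib, cl) for cl in cluster_set.clusters for bib in cl.bibs)
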
Example #25
0
def merge_static():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
        This function is static: if aid* tables are changed while it's running,
        probably everything will crash and a black hole will open, eating all your data.
    '''
    last_names = frozenset(name[0].split('.')[0] for name in get_cluster_names())

    def get_free_pids():
        while True:
            yield get_free_author_id()

    free_pids = get_free_pids()

    current_mapping = get_paper_to_author_and_status_mapping()

    def move_sig_and_update_mapping(sig, old_pid_flag, new_pid_flag):
        move_signature(sig, new_pid_flag[0])
        current_mapping[sig].remove(old_pid_flag)
        current_mapping[sig].append(new_pid_flag)

    def try_move_signature(sig, target_pid):
        """
        """
        paps = current_mapping[sig]
        rejected = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p:-2 < p[1] and p[1] < 2, paps)
        claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        if claimed or not assigned or assigned[0][0] == target_pid:
            return

        assert len(assigned) == 1

        if rejected:
            newpid = free_pids.next()
            move_sig_and_update_mapping(sig, assigned[0], (newpid, assigned[0][1]))
        else:
            conflicts = get_signatures_of_paper_and_author(sig, target_pid)
            if not conflicts:
                move_sig_and_update_mapping(sig, assigned[0], (target_pid, assigned[0][1]))
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    newpid = free_pids.next()
                    move_sig_and_update_mapping(sig, assigned[0], (newpid, assigned[0][1]))
                else:
                    newpid = free_pids.next()
                    csig = tuple(conflicts[0][:3])
                    move_sig_and_update_mapping(csig, (target_pid, conflicts[0][3]), (newpid, conflicts[0][3]))
                    move_sig_and_update_mapping(sig, assigned[0], (target_pid, assigned[0][1]))

    for idx, last in enumerate(last_names):
        update_status(float(idx) / len(last_names), "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4]) for row in get_clusters_by_surname(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d)) for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            claim = []
            for d in ds:
                pid_flag = current_mapping.get(d, [])
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)
                    if flag > 1:
                        claim.append((d, pid))

            matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids] for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx]) for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(imap(itemgetter(0), best_match))
        not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                if sig in current_mapping:
                    if not pid in map(itemgetter(0), filter(lambda x: x[1] > -2, current_mapping[sig])):
                        try_move_signature(sig, pid)

    update_status_final()
    remove_empty_authors()
    update_canonical_names_of_authors()
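
try_move_signature above classifies the (pid, flag) pairs of a signature purely by flag range: flag <= -2 means rejected, -2 < flag < 2 means assigned, flag >= 2 means claimed. A small standalone illustration of that partitioning:

# Illustration of the flag ranges used by try_move_signature above.
def partition_by_flag(pid_flags):
    rejected = [p for p in pid_flags if p[1] <= -2]
    assigned = [p for p in pid_flags if -2 < p[1] < 2]
    claimed = [p for p in pid_flags if p[1] >= 2]
    return rejected, assigned, claimed

print(partition_by_flag([(7, -2), (7, 0), (9, 2)]))
# -> ([(7, -2)], [(7, 0)], [(9, 2)])
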
Example #26
0
 def store(self):
     update_status(0., "Saving probability matrix...")
     self._bib_matrix.store()
     update_status_final("Probability matrix saved.")
Example #27
0
def merge_static_classy():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
        This function is static: if aid* tables are changed while it's running,
        probably everything will crash and a black hole will open, eating all your data.

        NOTE: this is more elegant than merge_static but much slower. It will have to be improved
              before it can replace it.
    '''
    class Sig(object):
        def __init__(self, bibrefrec, pid_flag):
            self.rejected = dict(filter(lambda p: p[1] <= -2, pid_flag))
            self.assigned = filter(lambda p: -2 < p[1] and p[1] < 2, pid_flag)
            self.claimed = filter(lambda p: 2 <= p[1], pid_flag)
            self.bibrefrec = bibrefrec

            assert self.invariant()

        def invariant(self):
            return len(self.assigned) + len(self.claimed) <= 1

        def empty(self):
            return not self.isclaimed() and not self.isassigned()

        def isclaimed(self):
            return len(self.claimed) == 1

        def get_claimed(self):
            return self.claimed[0][0]

        def get_assigned(self):
            return self.assigned[0][0]

        def isassigned(self):
            return len(self.assigned) == 1

        def isrejected(self, pid):
            return pid in self.rejected

        def change_pid(self, pid):
            assert self.invariant()
            assert self.isassigned()
            self.assigned = [(pid, 0)]
            move_signature(self.bibrefrec, pid)

    class Cluster(object):
        def __init__(self, pid, sigs):
            self.pid = pid

            self.sigs = dict((sig.bibrefrec[2], sig) for sig in sigs if not sig.empty())

        def send_sig(self, other, sig):
            paper = sig.bibrefrec[2]
            assert paper in self.sigs and paper not in other.sigs

            del self.sigs[paper]
            other.sigs[paper] = sig

            if sig.isassigned():
                sig.change_pid(other.pid)

    last_names = frozenset(name[0].split('.')[0] for name in get_cluster_names())

    personid = get_paper_to_author_and_status_mapping()
    free_pids = backinterface_get_free_pids()

    for idx, last in enumerate(last_names):
        update_status(float(idx) / len(last_names), "Merging, %d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4]) for row in get_clusters_by_surname(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d)) for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            for d in ds:
                pid_flag = filter(lambda x: x[1] > -2, personid.get(d, []))
                if pid_flag:
                    assert len(pid_flag) == 1
                    pid = pid_flag[0][0]
                    pids.append(pid)
                    old_pids.add(pid)

            matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids] for row in matr])

        # [[bibrefrecs] -> pid]
        matched_clusters = [(results[new_idx][1], old_pids[old_idx]) for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(imap(itemgetter(0), best_match))
        not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids)

        # pid -> Cluster
        clusters = dict((pid, Cluster(pid, [Sig(bib, personid.get(bib, [])) for bib in sigs]))
                        for sigs, pid in chain(matched_clusters, not_matched_clusters))

        todo = clusters.items()
        for pid, clus in todo:
            assert clus.pid == pid

            for paper, sig in clus.sigs.items():
                if sig.isclaimed():
                    if sig.get_claimed() != pid:
                        target_clus = clusters[sig.get_claimed()]

                        if paper in target_clus.sigs:
                            new_clus = Cluster(free_pids.next(), [])
                            target_clus.send_sig(new_clus, target_clus.sigs[paper])
                            todo.append(new_clus)
                            clusters[new_clus.pid] = new_clus

                        assert paper not in target_clus.sigs
                        clus.send_sig(target_clus, sig)
                elif sig.get_assigned() != pid:
                    if not sig.isrejected(pid):
                        move_signature(sig.bibrefrec, pid)
                    else:
                        move_signature(sig.bibrefrec, free_pids.next())
                else:
                    assert not sig.isrejected(pid)

    update_status_final("Merging done.")

    update_status_final()
    remove_empty_authors()
    update_canonical_names_of_authors()
Example #28
0
    #free = get_free_memory()
    initial = get_total_memory()
    free = initial
    output_killer = open(os.devnull, 'w')

    ret_status = [None] * len(jobs)
    bibs = sizs
    sizs = map(estimator, sizs)
    free_idxs = range(len(jobs))
    assert len(jobs) == len(sizs) == len(ret_status) == len(bibs) == len(free_idxs)

    done = 0.
    total = sum(sizs)
    biggest = max(sizs)

    update_status(0., "0 / %d" % len(jobs))
    too_big = [idx for idx in free_idxs if sizs[idx] > free]
    for idx in too_big:
        pid = os.fork()
        if pid == 0: # child
            run_job(idx)
        else: # parent
            done += sizs[idx]
            del free_idxs[idx]
            cpid, status = os.wait()
            update_status(done / total, "%d / %d" % (len(jobs) - len(free_idxs), len(jobs)))
            ret_status[idx] = status
            assert cpid == pid

    while free_idxs or pid_2_idx:
        while len(pid_2_idx) < max_workers:
Example #29
0
 def store(self, name):
     update_status(0., "Saving probability matrix...")
     self._bib_matrix.store(name)
     update_status_final("Probability matrix saved.")
Example #30
0
def merge_dynamic():
    '''
        This function merges aidPERSONIDPAPERS with aidRESULTS.
        Use it after tortoise.
        This function is dynamic: it allows aid* tables to be changed while it is still running,
        hence the claiming facility, for example, can stay online during the merge. This comfort,
        however, is paid for in terms of speed.
    '''
    last_names = frozenset(name[0].split('.')[0] for name in get_cluster_names())

    def get_free_pids():
        while True:
            yield get_free_author_id()

    free_pids = get_free_pids()

    def try_move_signature(sig, target_pid):
        """
        """
        paps = get_ordered_author_and_status_of_signature(sig)
        rejected = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p:-2 < p[1] and p[1] < 2, paps)
        claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        if claimed or not assigned or assigned[0][0] == target_pid:
            return

        assert len(assigned) == 1

        if int(target_pid) in [int(x[0]) for x in rejected]:
            move_signature(sig, free_pids.next())
        else:
            conflicts = get_signatures_of_paper_and_author(sig, target_pid)
            if not conflicts:
                move_signature(sig, target_pid)
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    move_signature(sig, free_pids.next())
                else:
                    move_signature(conflicts[0][:3], free_pids.next())
                    move_signature(sig, target_pid)

    for idx, last in enumerate(last_names):
        update_status(float(idx) / len(last_names), "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4]) for row in get_clusters_by_surname(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d)) for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = list()
            for d in ds:
                pid_flag = get_author_and_status_of_confirmed_paper(d)
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)

            matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        #best_match = cluster,pid_idx,n
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids] for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx]) for new_idx, old_idx, score in best_match if score > 0]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(imap(itemgetter(0), [x for x in best_match if x[2] > 0]))
        not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                try_move_signature(sig, pid)

    update_status_final()
    remove_empty_authors()
    update_canonical_names_of_authors()
Example #31
0
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    import matplotlib.pyplot as plt
    plt.ioff()
    def _gen_plot(data, filename):
        plt.clf()
        ax = plt.subplot(111)
        ax.grid(visible=True)
        x = sorted(data.keys())

        w = [data[k][0] for k in x]
        try:
            wscf = max(w)
        except:
            wscf = 0
        w = [float(i)/wscf for i in w]
        y = [data[k][1] for k in x]
        maxi = [data[k][3] for k in x]
        mini = [data[k][2] for k in x]

        lengs = [data[k][4] for k in x]
        try:
            ml = float(max(lengs))
        except:
            ml = 1
        lengs = [k/ml for k in lengs]

        normalengs = [data[k][5] for k in x]

        ax.plot(x,y,'-o',label='avg')
        ax.plot(x,maxi,'-o', label='max')
        ax.plot(x,mini,'-o', label='min')
        ax.plot(x,w, '-x', label='norm %s' % str(wscf))
        ax.plot(x,lengs,'-o',label='acl %s' % str(int(ml)))
        ax.plot(x,normalengs, '-o', label='ncl')
        plt.ylim(ymax = 1., ymin = -0.01)
        plt.xlim(xmax = 1., xmin = -0.01)
        ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,ncol=6, mode="expand", borderaxespad=0.)
        plt.savefig(filename)

    override_stdout_config(stdout=True)

    files = ['/tmp/baistats/'+x for x in os.listdir('/tmp/baistats/') if x.startswith('cluster_status_report_pid')]
    fnum = float(len(files))
    quanta = .1/fnum


    total_stats = 0
    used_coeffs = set()
    used_clusters = set()

    #av_counter, avg, min, max, nclus, normalized_avg
    cluster_stats = defaultdict(lambda : defaultdict(lambda : [0.,0.,0.,0.,0.,0.]))
    coeff_stats = defaultdict(lambda : [0.,0.,0.,0.,0.,0.])


    def gen_graphs(only_synthetic=False):
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i,c in enumerate(cn):
                update_status(i/l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i,fi in enumerate(files):
        if generate_graphs:
            if i%1000 ==0:
                gen_graphs(True)

        f = filehandler.open(fi,'r')
        status = i/fnum
        update_status(status, 'Loading '+ fi[fi.find('lastname')+9:])
        contents = SER.load(f)
        f.close()

        cur_coef = contents[0]
        cur_clust = contents[1]

        cur_maxlen = float(contents[3])

        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)

            update_status(status+0.2*quanta, '  Computing averages...')

            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))

            assert cur_clustnumber > 0 and cur_clustnumber < cur_maxlen, "Error, found log with strange clustnumber! %s %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_maxlen),
                                                                                                                          str(cur_clustnumber))

            if cur_coeffs:

                assert len(cur_coeffs) == cur_clen and cur_coeffs, "Error, there is a cluster without stuff? %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all([x >= 0 and x <= 1 for x in cur_coeffs]), "Error, a coefficient is wrong here! Check me! %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_coeffs))

                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs)/cur_clen

                update_status(status+0.4*quanta, '  cumulative per coeff...')

                avi = coeff_stats[cur_coef][0]
                #number of points
                coeff_stats[cur_coef][0] = avi+1
                #average of coefficients
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1]*avi + cur_avg)/(avi+1)
                #min coeff
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2], cur_min)
                #max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3], cur_max)
                #avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                #normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)


                update_status(status+0.6*quanta, '  cumulative per cluster per coeff...')

                avi = cluster_stats[cur_clust][cur_coef][0]
                cluster_stats[cur_clust][cur_coef][0] = avi+1
                cluster_stats[cur_clust][cur_coef][1] = (cluster_stats[cur_clust][cur_coef][1]*avi + cur_avg)/(avi+1)
                cluster_stats[cur_clust][cur_coef][2] = min(cluster_stats[cur_clust][cur_coef][2], cur_min)
                cluster_stats[cur_clust][cur_coef][3] = max(cluster_stats[cur_clust][cur_coef][3], cur_max)
                cluster_stats[cur_clust][cur_coef][4] = (cluster_stats[cur_clust][cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                cluster_stats[cur_clust][cur_coef][5] = (cluster_stats[cur_clust][cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)

    update_status_final('Done!')

    if generate_graphs:
        gen_graphs()


    if pickle_output:
        update_status(0,'Dumping to file...')
        f = open(pickle_output,'w')
        SER.dump({'cluster_stats':dict((x,dict(cluster_stats[x])) for x in cluster_stats.iterkeys()), 'coeff_stats':dict((coeff_stats))}, f)
        f.close()
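
The coeff_stats and cluster_stats updates above maintain running averages without keeping the samples: with n previous samples and mean m, adding a sample x gives mean (m*n + x) / (n + 1). A tiny sketch of that update rule:

# Running-average update used for coeff_stats/cluster_stats above:
# with count n and mean m, adding sample x yields mean (m*n + x) / (n + 1).
def update_running_mean(n, m, x):
    return n + 1, (m * n + x) / (n + 1.)

n, m = 0, 0.
for x in [0.2, 0.4, 0.9]:
    n, m = update_running_mean(n, m, x)
print(n, m)  # 3 samples, mean 0.5
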
Example #32
0
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set according to the values in the probability_matrix.
    The deep debug option will produce a lot of output. Avoid using it with more
    than 20 bibs in the cluster set.
    '''

    bib_map = create_bib_2_cluster_dict(cluster_set)
    original_process_id = PID()
    #remember to close the files!
    #plus_edges_fp, len_plus, minus_edges_fp, len_minus, edges_fp, len_edges = group_sort_edges(cluster_set)

    p = Process(target=group_sort_edges, args=(cluster_set,original_process_id))
    p.start()
    p.join()

    plus_edges_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_p_'+str(original_process_id),'r')
    minus_edges_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_m_'+str(original_process_id),'r')
    edges_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_e_'+str(original_process_id),'r')
    data_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_data_'+str(original_process_id),'r')

    len_plus,len_minus,len_edges = cPickle.load(data_fp)
    data_fp.close()

    interval = 1000
    for i, s in enumerate(plus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_plus, "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    interval = 1000
    for i, s in enumerate(minus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_minus, "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    interval = 50000
    wedge_print("Wedge: New wedge, %d edges." % len_edges)
    current = -1
    for  s in edges_fp:
        v1, v2, unused = _unpack_vals(s)
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len_edges, "Wedge...")

        assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        #try using object ids instead of index to boost performances
        #idcl1 = cluster_set.clusters.index(cl1)
        #idcl2 = cluster_set.clusters.index(cl2)
        idcl1 = id(cl1)
        idcl2 = id(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused))

            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " %  (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)

    plus_edges_fp.close()
    minus_edges_fp.close()
    edges_fp.close()
    data_fp.close()

    try:
        os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_p_'+str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_m_'+str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_e_'+str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_data_'+str(original_process_id))
    except:
        pass
Example #33
0
def rabbit(bibrecs,
           check_invalid_papers=False,
           personids_to_update_extids=None):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_string_to_pid_dictionary()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_new_personid()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and len(bibrecs) >
            bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        update_status(
            float(idx) / len(bibrecs),
            "%d/%d current: %d" % (idx, len(bibrecs), rec))
        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        markrefs = frozenset(
            chain(
                izip(cycle([100]),
                     imap(itemgetter(0), get_authors_from_paper(rec))),
                izip(cycle([700]),
                     imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        personid_rows = [
            map(int, row[:3]) + [row[4]]
            for row in get_signatures_from_rec(rec)
        ]
        personidrefs_names = dict(
            ((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict(
            (new,
             create_normalized_name(
                 split_name_parts(get_name_by_bibrecref(new))))
            for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[
            compare_names(new_signatures_names[new], personidrefs_names[old])
            for old in old_signatures
        ] for new in new_signatures]

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]
        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(
            map(itemgetter(0), best_match))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = []
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id(sig + (rec, ))
                    if inspire_id:
                        matched_pids = list(
                            get_person_with_extid(inspire_id[0]))
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [
                p for p in matched_pids if int(p[0]) not in used_pids
            ]

            if not matched_pids:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids:  # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)
        update_personID_external_ids(
            updated_pids,
            limit_to_claimed_papers=bconfig.
            LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()
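
rabbit wraps comp_names with cached_sym(lambda x: x), which is assumed to memoize a symmetric two-argument function so each pair of names is compared only once. cached_sym is not defined in this snippet; a hedged sketch of what such a decorator could look like:

# Hedged sketch of what cached_sym is assumed to do: memoize a symmetric
# two-argument function, normalizing the cache key so f(a, b) and f(b, a)
# share one entry. The real implementation is not part of this snippet.
def cached_sym(reduce_key):
    def decorator(func):
        cache = {}
        def wrapper(a, b):
            key = tuple(sorted((reduce_key(a), reduce_key(b))))
            if key not in cache:
                cache[key] = func(a, b)
            return cache[key]
        return wrapper
    return decorator
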
Example #34
0
def group_sort_edges(cs, original_process_id):
    bibauthor_print("group_sort_edges spowned by %s" % original_process_id)

    plus_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_p_'+str(original_process_id),'w')
    minus_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_m_'+str(original_process_id),'w')
    pairs_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_temp_edges_cache_e_'+str(original_process_id),'w')
    data_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_data_'+str(original_process_id),'w')

    plus_count = 0
    minus_count = 0
    pairs_count = 0

    default_val = [0.,0.]
    #gc.disable()
    interval = 1000
    current = -1
    for cl1 in cs.clusters:
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = h5file[str(id(cl1))]
        for bib2 in xrange(len(h5file[str(id(cl1))])):
            val = pointers[bib2]
            #if val[0] not in Bib_matrix.special_numbers:
            #optimization: special numbers are assumed to be negative
            if val[0] >= 0:

                if val[0] > edge_cut_prob:
                    pairs_count += 1
                    pairs_fp.write(_pack_vals((bib1, bib2, val)))

            elif val[0] == Bib_matrix.special_symbols['+']:
                plus_count += 1
                plus_fp.write(_pack_vals((bib1, bib2, default_val)))

            elif val[0] == Bib_matrix.special_symbols['-']:
                minus_count += 1
                minus_fp.write(_pack_vals((bib1, bib2, default_val)))
            else:
                assert val[0] == Bib_matrix.special_symbols[None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")

    plus_fp.close()
    minus_fp.close()
    pairs_fp.close()

    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d."
                     % (plus_count, minus_count, pairs_count))
    #gc.enable()
    bibauthor_print("Sorting in-file value edges.")
    sortFileInPlace(bconfig.TORTOISE_FILES_PATH+'/wedge_temp_edges_cache_e_'+str(original_process_id),
                    bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_e_'+str(original_process_id),
                    lambda x: _edge_sorting(_unpack_vals(x)), reverse=True)

    os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_temp_edges_cache_e_'+str(original_process_id))

    bibauthor_print("Dumping egdes data to file...")
    cPickle.dump((plus_count, minus_count, pairs_count), data_fp)
    data_fp.close()
Example #35
0
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None, verbose=False):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    logfile = open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w')
    logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs)))

    def logwrite(msg, is_error):
        verb = 9
        if is_error or verbose:
            verb = 1
        write_message(msg, verbose=verb)

    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_to_authors_mapping()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_free_author_id()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_papers()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
        len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):

        logwrite("\nConsidering %s" % str(rec), False)

        if idx%200 == 0:
            task_sleep_now_if_required(True)

            update_status(float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec))
            task_update_progress("%d/%d current: %d" % (idx, len(bibrecs), rec))

        if rec in deleted:
            logwrite(" - Record was deleted, removing from pid and continuing with next record", True)
            remove_papers([rec])
            continue


        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_author_refs_of_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthor_refs_of_paper(rec)))))

        personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_of_paper(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                  for old in old_signatures] for new in new_signatures]

        logwrite(" - Old signatures: %s" % str(old_signatures), bool(old_signatures))
        logwrite(" - New signatures: %s" % str(new_signatures), bool(new_signatures))
        logwrite(" - Matrix: %s" % str(matrix), bool(matrix))

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix) if score > threshold]

        logwrite(" - Best match: %s " % str(best_match), bool(best_match))

        for new, old in best_match:
            logwrite(" - - Moving signature: %s on %s to %s as %s" % (old, rec, new, new_signatures_names[new]), True)
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        pids_having_rec = set([int(row[0]) for row in get_signatures_of_paper(rec)])
        logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = list()
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id_of_signature(sig + (rec,))
                    if inspire_id:
                        matched_pids = list(get_author_by_external_id(inspire_id[0]))
                        if matched_pids and int(matched_pids[0][0]) in pids_having_rec:
                            matched_pids = list()
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

            if not matched_pids or int(matched_pids[0][0]) in pids_having_rec:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])
                pids_having_rec.add(matched_pids[0][0])

        logwrite('Finished with %s' % str(rec), False)

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids: # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(updated_pids, limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()

    remove_empty_authors()
Example #36
0
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    override_stdout_config(stdout=True)

    files = ['/tmp/baistats/'+x for x in os.listdir('/tmp/baistats/') if x.startswith('cluster_status_report_pid')]
    fnum = float(len(files))
    quanta = .1/fnum


    total_stats = 0
    used_coeffs = set()
    used_clusters = set()

    #av_counter, avg, min, max, nclus, normalized_avg
    cluster_stats = defaultdict(lambda : defaultdict(lambda : [0.,0.,0.,0.,0.,0.]))
    coeff_stats = defaultdict(lambda : [0.,0.,0.,0.,0.,0.])


    def gen_graphs(only_synthetic=False):
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i,c in enumerate(cn):
                update_status(i/l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i,fi in enumerate(files):
        if generate_graphs:
            if i%1000 ==0:
                gen_graphs(True)

        f = open(fi,'r')
        status = i/fnum
        update_status(status, 'Loading '+ fi[fi.find('lastname')+9:])
        contents = SER.load(f)
        f.close()

        cur_coef = contents[0]
        cur_clust = contents[1]

        cur_maxlen = float(contents[3])

        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)

            update_status(status+0.2*quanta, '  Computing averages...')

            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))

            assert cur_clustnumber > 0 and cur_clustnumber < cur_maxlen, "Error, found log with strange clustnumber! %s %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_maxlen),
                                                                                                                          str(cur_clustnumber))

            if cur_coeffs:

                assert len(cur_coeffs) == cur_clen and cur_coeffs, "Error, there is a cluster without stuff? %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all([x >= 0 and x <= 1 for x in cur_coeffs]), "Error, a coefficient is wrong here! Check me! %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_coeffs))

                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs)/cur_clen

                update_status(status+0.4*quanta, '  cumulative per coeff...')

                avi = coeff_stats[cur_coef][0]
                #number of points
                coeff_stats[cur_coef][0] = avi+1
                #average of coefficients
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1]*avi + cur_avg)/(avi+1)
                #min coeff
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2], cur_min)
                #max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3], cur_max)
                #avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                #normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)


                update_status(status+0.6*quanta, '  cumulative per cluster per coeff...')

                avi = cluster_stats[cur_clust][cur_coef][0]
                cluster_stats[cur_clust][cur_coef][0] = avi+1
                cluster_stats[cur_clust][cur_coef][1] = (cluster_stats[cur_clust][cur_coef][1]*avi + cur_avg)/(avi+1)
                cluster_stats[cur_clust][cur_coef][2] = min(cluster_stats[cur_clust][cur_coef][2], cur_min)
                cluster_stats[cur_clust][cur_coef][3] = max(cluster_stats[cur_clust][cur_coef][3], cur_max)
                cluster_stats[cur_clust][cur_coef][4] = (cluster_stats[cur_clust][cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                cluster_stats[cur_clust][cur_coef][5] = (cluster_stats[cur_clust][cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)

    update_status_final('Done!')

    if generate_graphs:
        gen_graphs()


    if pickle_output:
        update_status(0,'Dumping to file...')
        f = open(pickle_output,'w')
        SER.dump({'cluster_stats':dict((x,dict(cluster_stats[x])) for x in cluster_stats.iterkeys()), 'coeff_stats':dict((coeff_stats))}, f)
        f.close()
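
The per-coefficient statistics above are maintained as running aggregates rather than stored samples: each cluster report folds its average into the existing mean via (old_mean * n + x) / (n + 1). A minimal standalone sketch of that update, with illustrative names only:

# Running mean kept as [count, mean], like the first two slots of a
# coeff_stats entry (av_counter, avg).
stats = [0., 0.]

def push(stats, sample):
    avi = stats[0]                                # points seen so far
    stats[1] = (stats[1] * avi + sample) / (avi + 1)
    stats[0] = avi + 1

for s in (0.2, 0.4, 0.9):
    push(stats, s)
assert abs(stats[1] - 0.5) < 1e-9                 # (0.2 + 0.4 + 0.9) / 3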
Example #37
0
    initial = get_total_memory()
    free = initial
    output_killer = open(os.devnull, 'w')

    ret_status = [None] * len(jobs)
    bibs = sizs
    sizs = map(estimator, sizs)
    free_idxs = range(len(jobs))
    assert len(jobs) == len(sizs) == len(ret_status) == len(bibs) == len(
        free_idxs)

    done = 0.
    total = sum(sizs)
    biggest = max(sizs)

    update_status(0., "0 / %d" % len(jobs))
    too_big = [idx for idx in free_idxs if sizs[idx] > free]
    for idx in too_big:
        pid = os.fork()
        if pid == 0:  # child
            run_job(idx)
        else:  # parent
            done += sizs[idx]
            del free_idxs[idx]
            cpid, status = os.wait()
            update_status(done / total,
                          "%d / %d" % (len(jobs) - len(free_idxs), len(jobs)))
            ret_status[idx] = status
            assert cpid == pid

    while free_idxs or pid_2_idx:
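
In the fragment above, any job whose estimated size exceeds the free memory is forked on its own and waited for before the scheduler moves on. A minimal standalone sketch of that fork/wait pattern (POSIX only; run_child and the exit code are illustrative, not part of the module):

import os

def run_child():
    # Work that must happen only in the child process.
    os._exit(0)        # leave immediately, skipping the parent's cleanup handlers

pid = os.fork()
if pid == 0:                          # child
    run_child()
else:                                 # parent
    child_pid, status = os.wait()     # raw 16-bit wait status
    assert child_pid == pid
    exit_code = status >> 8           # same decoding as os.WEXITSTATUS(status)
    print("child %d exited with code %d" % (child_pid, exit_code))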
Example #38
0
def rabbit(bibrecs,
           check_invalid_papers=False,
           personids_to_update_extids=None,
           verbose=False):
    '''
    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @param check_invalid_papers: if True, also drop author data for records
        that are no longer valid papers
    @param personids_to_update_extids: extra author ids whose external ids
        should be refreshed even if this run does not touch them
    @param verbose: if True, log every message, not only errors
    @return: none
    '''
    logfile = open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w')
    logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs)))

    def logwrite(msg, is_error):
        verb = 9
        if is_error or verbose:
            verb = 1
        write_message(msg, verbose=verb)

    if bconfig.RABBIT_USE_CACHED_PID:
        PID_NAMES_CACHE = get_name_to_authors_mapping()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_free_author_id()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_papers()

        if not bibrecs:
            bibrecs = all_bibrecs

        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and len(bibrecs) >
            bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):

        logwrite("\nConsidering %s" % str(rec), False)

        if idx % 200 == 0:
            task_sleep_now_if_required(True)

            update_status(
                float(idx) / len(bibrecs),
                "%d/%d current: %d" % (idx, len(bibrecs), rec))
            task_update_progress("%d/%d current: %d" %
                                 (idx, len(bibrecs), rec))

        if rec in deleted:
            logwrite(
                " - Record was deleted, removing from pid and continuing with next record",
                True)
            remove_papers([rec])
            continue

        markrefs = frozenset(
            chain(
                izip(cycle([100]),
                     imap(itemgetter(0), get_author_refs_of_paper(rec))),
                izip(cycle([700]),
                     imap(itemgetter(0), get_coauthor_refs_of_paper(rec)))))

        personid_rows = [
            map(int, row[:3]) + [row[4]]
            for row in get_signatures_of_paper(rec)
        ]
        personidrefs_names = dict(
            ((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict(
            (new,
             create_normalized_name(split_name_parts(get_name_by_bibref(new))))
            for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[
            compare_names(new_signatures_names[new], personidrefs_names[old])
            for old in old_signatures
        ] for new in new_signatures]

        logwrite(" - Old signatures: %s" % str(old_signatures),
                 bool(old_signatures))
        logwrite(" - New signatures: %s" % str(new_signatures),
                 bool(new_signatures))
        logwrite(" - Matrix: %s" % str(matrix), bool(matrix))

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]

        logwrite(" - Best match: %s " % str(best_match), bool(best_match))

        for new, old in best_match:
            logwrite(
                " - - Moving signature: %s on %s to %s as %s" %
                (old, rec, new, new_signatures_names[new]), True)
            modify_signature(old, rec, new, new_signatures_names[new])

        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(
            map(itemgetter(0), best_match))

        pids_having_rec = set(
            [int(row[0]) for row in get_signatures_of_paper(rec)])
        logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = list()
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id_of_signature(sig + (rec, ))
                    if inspire_id:
                        matched_pids = list(
                            get_author_by_external_id(inspire_id[0]))
                        if matched_pids and int(
                                matched_pids[0][0]) in pids_having_rec:
                            matched_pids = list()
                if matched_pids:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_exact_name(name)
            matched_pids = [
                p for p in matched_pids if int(p[0]) not in used_pids
            ]

            if not matched_pids or int(matched_pids[0][0]) in pids_having_rec:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)

            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])
                pids_having_rec.add(matched_pids[0][0])

        logwrite('Finished with %s' % str(rec), False)

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids:  # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(
            updated_pids,
            limit_to_claimed_papers=bconfig.
            LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()

    remove_empty_authors()
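
A minimal usage sketch for rabbit(), assuming a legacy Invenio (Python 2) installation where the bibauthorid modules are importable; the import path and record ids are assumptions, not part of the listing:

# from invenio.bibauthorid_rabbit import rabbit   # import path assumed

recids = [123456, 123457]      # placeholder record ids
rabbit(recids, verbose=True)   # (re)assign the signatures of just these records

# Calling rabbit([]) instead falls back to every valid paper, because an
# empty bibrecs takes the get_all_valid_papers() branch above.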
    def recalculate(self, cluster_set):
        '''
        Constructs the probability matrix for the given cluster set.
        The current matrix is stored and duplicated into a cached copy
        first; if that copy can be loaded, its still up-to-date values
        are reused instead of being recomputed.
        @param cluster_set: A cluster set object, used to initialize
        the matrix.
        '''
        last_cleaned = 0
        self._bib_matrix.store()
        try:
            old_matrix = Bib_matrix(self._bib_matrix.name+'copy')
            old_matrix.duplicate_existing(self._bib_matrix.name, self._bib_matrix.name+'copy')
            old_matrix.load()
            cached_bibs = self.__get_up_to_date_bibs(old_matrix)
            have_cached_bibs = bool(cached_bibs)
        except IOError:
            old_matrix.destroy()
            cached_bibs = None
            have_cached_bibs = False

        self._bib_matrix.destroy()
        self._bib_matrix = Bib_matrix(cluster_set.last_name, cluster_set=cluster_set)

        ncl = cluster_set.num_all_bibs
        expected = ((ncl * (ncl - 1)) / 2)
        if expected == 0:
            expected = 1

        try:
            cur_calc, opti, prints_counter = 0, 0, 0
            for cl1 in cluster_set.clusters:

                if cur_calc+opti - prints_counter > 100000 or cur_calc == 0:
                    update_status((float(opti) + cur_calc) / expected, "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
                    prints_counter = cur_calc+opti

    #            #clean caches
                if cur_calc - last_cleaned > 20000000:
                    gc.collect()
    #                clear_comparison_caches()
                    last_cleaned = cur_calc

                for cl2 in cluster_set.clusters:
                    if id(cl1) < id(cl2) and not cl1.hates(cl2):
                        for bib1 in cl1.bibs:
                            for bib2 in cl2.bibs:
                                if have_cached_bibs:
                                    try:
                                        val = old_matrix[bib1, bib2]
                                        opti += 1
                                        if bconfig.DEBUG_CHECKS:
                                            assert _debug_is_eq_v(val, compare_bibrefrecs(bib1, bib2))
                                    except KeyError:
                                        cur_calc += 1
                                        val = compare_bibrefrecs(bib1, bib2)
                                    if not val:
                                        cur_calc += 1
                                        val = compare_bibrefrecs(bib1, bib2)
                                else:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                self._bib_matrix[bib1, bib2] = val

        except Exception, e:
            raise Exception("""Error happened in prob_matrix.recalculate with
            val:%s
            original_exception: %s
            """%(str(val),str(e)))