def group_edges(cs):
    '''
    Splits every out-edge of the wedge cluster set into three buckets:
    certainly-positive edges, certainly-negative edges, and plain value
    edges whose probability exceeds the cut-off.

    @param cs: a wedge cluster set; each cluster carries an out_edges array
    @return: (plus, minus, pairs) lists of edge tuples
    '''
    positive = []
    negative = []
    valued = []
    # Pause the garbage collector while the (potentially huge) edge lists
    # are appended to; re-enabled at the end.
    gc.disable()
    interval = 1000
    total = len(cs.clusters)
    for idx, cluster in enumerate(cs.clusters):
        if (idx % interval) == 0:
            update_status(float(idx) / total, "Grouping all edges...")
        # The representative bib of the cluster is its first element.
        left = tuple(cluster.bibs)[0]
        for right, edge in enumerate(cluster.out_edges):
            head = edge[0]
            if head not in Bib_matrix.special_numbers:
                # Ordinary probability edge: keep only those above the cut.
                if head > edge_cut_prob:
                    valued.append((left, right, edge))
            elif head == Bib_matrix.special_symbols['+']:
                positive.append((left, right))
            elif head == Bib_matrix.special_symbols['-']:
                negative.append((left, right))
            else:
                assert head == Bib_matrix.special_symbols[None], "Invalid Edge"
    update_status_final("Finished with the edge grouping.")
    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d." %
                    (len(positive), len(negative), len(valued)))
    gc.enable()
    return positive, negative, valued
def convert_cluster_set(cs, prob_matr):
    '''
    Converts a normal cluster set to a wedge cluster set, in place.

    After the call every cluster's bibs are small integers (indices into
    cs.new2old) and every cluster carries an out_edges vector holding the
    melded edge values towards every other bib.

    @param cs: a cluster set to be converted
    @param prob_matr: probability matrix indexed by pairs of bibrefrecs
    @return: None; the original bibrefrecs are stored in cs.new2old
    '''
    gc.disable()

    # step 1:
    #    + Assign a number to each bibrefrec.
    #    + Replace the arrays of bibrefrecs with arrays of numbers.
    #    + Store the result and prepare it to be returned.
    result_mapping = []
    for clus in cs.clusters:
        start = len(result_mapping)
        result_mapping += list(clus.bibs)
        end = len(result_mapping)
        # Each cluster now holds the contiguous index range of its bibs.
        clus.bibs = range(start, end)

    # Sanity: the numbering must be a bijection over all bibs.
    assert len(result_mapping) == len(set(result_mapping)), PID()+"Cluster set conversion failed"
    assert len(result_mapping) == cs.num_all_bibs, PID()+"Cluster set conversion failed"
    cs.new2old = result_mapping

    # step 2:
    #    + Using the prob matrix create a vector values to all other bibs.
    #    + Meld those vectors into one for each cluster.
    special_symbols = Bib_matrix.special_symbols #locality optimization
    interval = 10000
    for current, c1 in enumerate(cs.clusters):
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Converting the cluster set...")

        assert len(c1.bibs) > 0, PID()+"Empty cluster send to wedge"
        pointers = []

        for v1 in c1.bibs:
            # One full-length edge vector per bib, initialised to the
            # "no edge" special value.
            pointer = numpy.ndarray(shape=(len(result_mapping), 2), dtype=float, order='C')
            pointer.fill(special_symbols[None])
            rm = result_mapping[v1] #locality optimization
            for c2 in cs.clusters:
                # Only fill edges towards clusters we are not separated from.
                if c1 != c2 and not c1.hates(c2):
                    for v2 in c2.bibs:
                        val = prob_matr[rm, result_mapping[v2]]
                        try:
                            # Special matrix entries are re-encoded as a
                            # (symbol, symbol) pair; plain values pass through.
                            numb = special_symbols[val]
                            val = (numb, numb)
                        except KeyError:
                            pass
                        assert len(val) == 2, "Edge coding failed"
                        pointer[v2] = val
            pointers.append((pointer, 1))
        # Meld all per-bib vectors into a single vector for the cluster.
        c1.out_edges = reduce(meld_edges, pointers)[0]

    update_status_final("Converting the cluster set done.")
    gc.enable()
def gen_graphs(only_synthetic=False):
    '''
    Renders the collected statistics as plot files under /tmp/graphs/.

    @param only_synthetic: when True, produce only the aggregate
        coefficients graph and skip the per-name graphs.
    '''
    update_status(0, 'Generating coefficients graph...')
    _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
    if only_synthetic:
        return
    names = cluster_stats.keys()
    total = float(len(names))
    for pos, name in enumerate(names):
        update_status(pos / total, 'Generating name graphs... %s' % str(name))
        _gen_plot(cluster_stats[name], '/tmp/graphs/CS-%s.png' % str(name))
def gen_graphs(only_synthetic=False):
    '''
    Writes the statistics plots to /tmp/graphs/: the aggregate coefficient
    graph always, and one graph per last-name cluster unless only the
    synthetic summary was requested.

    @param only_synthetic: skip the per-name graphs when True.
    '''
    update_status(0, 'Generating coefficients graph...')
    _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
    if not only_synthetic:
        keys = cluster_stats.keys()
        denom = float(len(keys))
        pos = 0
        for key in keys:
            update_status(pos / denom, 'Generating name graphs... %s' % str(key))
            _gen_plot(cluster_stats[key], '/tmp/graphs/CS-%s.png' % str(key))
            pos += 1
def recalculate(self, cluster_set):
    '''
    Constructs probability matrix. If use_cache is true, it will try
    to load old computations from the database. If save cache is true
    it will save the current results into the database.
    @param cluster_set: A cluster set object, used to initialize the
    matrix.
    '''
    last_cleaned = 0
    # Keep a handle on the previous matrix so cached values can be reused.
    old_matrix = self._bib_matrix
    cached_bibs = self.__get_up_to_date_bibs()
    have_cached_bibs = bool(cached_bibs)
    self._bib_matrix = Bib_matrix(cluster_set)
    ncl = cluster_set.num_all_bibs
    # Number of unordered bib pairs; floored to 1 to avoid division by zero.
    expected = ((ncl * (ncl - 1)) / 2)
    if expected == 0:
        expected = 1
    cur_calc, opti, prints_counter = 0, 0, 0
    for cl1 in cluster_set.clusters:
        # Throttled progress reporting: only after 100000 new comparisons.
        if cur_calc + opti - prints_counter > 100000:
            update_status((float(opti) + cur_calc) / expected, "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
            prints_counter = cur_calc + opti
        #clean caches
        if cur_calc - last_cleaned > 2000000:
            clear_comparison_caches()
            last_cleaned = cur_calc
        for cl2 in cluster_set.clusters:
            # id() ordering visits each unordered cluster pair once;
            # mutually-hating clusters are skipped entirely.
            if id(cl1) < id(cl2) and not cl1.hates(cl2):
                for bib1 in cl1.bibs:
                    for bib2 in cl2.bibs:
                        if have_cached_bibs and bib1 in cached_bibs and bib2 in cached_bibs:
                            val = old_matrix[bib1, bib2]
                            # Falsy cache entry means "not computed": recompute.
                            if not val:
                                cur_calc += 1
                                val = compare_bibrefrecs(bib1, bib2)
                            else:
                                opti += 1
                                if bconfig.DEBUG_CHECKS:
                                    assert _debug_is_eq_v(val, compare_bibrefrecs(bib1, bib2))
                        else:
                            cur_calc += 1
                            val = compare_bibrefrecs(bib1, bib2)
                        self._bib_matrix[bib1, bib2] = val
    clear_comparison_caches()
    update_status_final("Matrix done. %d calc, %d opt." % (cur_calc, opti))
def recalculate(self, cluster_set):
    '''
    Constructs probability matrix. If use_cache is true, it will try
    to load old computations from the database. If save cache is true
    it will save the current results into the database.
    @param cluster_set: A cluster set object, used to initialize the
    matrix.
    '''
    last_cleaned = 0
    # The outgoing matrix is kept around as a read-only cache of old values.
    old_matrix = self._bib_matrix
    cached_bibs = self.__get_up_to_date_bibs()
    have_cached_bibs = bool(cached_bibs)
    self._bib_matrix = Bib_matrix(cluster_set)
    ncl = cluster_set.num_all_bibs
    # Total unordered pair count; clamp to 1 so the progress ratio is safe.
    expected = ((ncl * (ncl - 1)) / 2)
    if expected == 0:
        expected = 1
    cur_calc, opti, prints_counter = 0, 0, 0
    for cl1 in cluster_set.clusters:
        # Print progress at most once per 100000 processed comparisons.
        if cur_calc+opti - prints_counter > 100000:
            update_status((float(opti) + cur_calc) / expected, "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
            prints_counter = cur_calc+opti
        #clean caches
        if cur_calc - last_cleaned > 2000000:
            clear_comparison_caches()
            last_cleaned = cur_calc
        for cl2 in cluster_set.clusters:
            # Each unordered cluster pair is handled once (id ordering);
            # pairs of mutually exclusive clusters are skipped.
            if id(cl1) < id(cl2) and not cl1.hates(cl2):
                for bib1 in cl1.bibs:
                    for bib2 in cl2.bibs:
                        if have_cached_bibs and bib1 in cached_bibs and bib2 in cached_bibs:
                            val = old_matrix[bib1, bib2]
                            # A falsy cached value is treated as missing.
                            if not val:
                                cur_calc += 1
                                val = compare_bibrefrecs(bib1, bib2)
                            else:
                                opti += 1
                                if bconfig.DEBUG_CHECKS:
                                    assert _debug_is_eq_v(val, compare_bibrefrecs(bib1, bib2))
                        else:
                            cur_calc += 1
                            val = compare_bibrefrecs(bib1, bib2)
                        self._bib_matrix[bib1, bib2] = val
    clear_comparison_caches()
    update_status_final("Matrix done. %d calc, %d opt." % (cur_calc, opti))
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set acoarding to be values in the
    probability_matrix. The deep debug option will produce a lot of
    output. Avoid using it with more than 20 bibs in the cluster set.

    Pipeline: a child process writes sorted edge files to disk
    (positive / negative / valued plus a pickled length triple); this
    function then replays them in three passes: join obviously-equal
    clusters, separate obviously-different ones, and finally run the
    wedge decision on the remaining valued edges.
    '''
    # Maps each bib to the cluster currently containing it.
    bib_map = create_bib_2_cluster_dict(cluster_set)
    original_process_id = PID()
    #remember to close the files!
    #plus_edges_fp, len_plus, minus_edges_fp, len_minus, edges_fp, len_edges = group_sort_edges(cluster_set)
    # Edge grouping/sorting runs in a separate process; results are
    # exchanged through files named after this process id.
    p = Process(target=group_sort_edges, args=(cluster_set, original_process_id))
    p.start()
    p.join()
    plus_edges_fp = open(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' + str(original_process_id), 'r')
    minus_edges_fp = open(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' + str(original_process_id), 'r')
    edges_fp = open(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' + str(original_process_id), 'r')
    data_fp = open(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' + str(original_process_id), 'r')
    # The data file holds only the three list lengths (for progress display).
    len_plus, len_minus, len_edges = cPickle.load(data_fp)
    data_fp.close()

    # Pass 1: join clusters connected by a '+' edge.
    interval = 1000
    for i, s in enumerate(plus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_plus, "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            # Re-point every bib of the absorbed cluster.
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    # Pass 2: make clusters connected by a '-' edge hate each other.
    interval = 1000
    for i, s in enumerate(minus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_minus, "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    # Pass 3: decide join/quarrel for each remaining valued edge.
    interval = 50000
    wedge_print("Wedge: New wedge, %d edges." % len_edges)
    current = -1
    for s in edges_fp:
        v1, v2, unused = _unpack_vals(s)
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len_edges, "Wedge...")
        # Signed edges must have been consumed by the first two passes.
        assert unused != '+' and unused != '-', PID() + "Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        #try using object ids instead of index to boost performances
        #idcl1 = cluster_set.clusters.index(cl1)
        #idcl2 = cluster_set.clusters.index(cl2)
        idcl1 = id(cl1)
        idcl2 = id(cl2)
        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1
        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1]))
        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused))
            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s" % (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " % (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))
    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)

    plus_edges_fp.close()
    minus_edges_fp.close()
    edges_fp.close()
    data_fp.close()
    # Best-effort cleanup of the four temp files; failures are ignored.
    # NOTE(review): the bare except also hides unexpected errors — consider
    # narrowing to OSError.
    try:
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' + str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' + str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' + str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' + str(original_process_id))
    except:
        pass
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None):
    '''
    Synchronises the personid tables with the given records: matches new
    signatures against existing ones, moves/creates person assignments and
    refreshes canonical names and external ids of the touched persons.

    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    if bconfig.RABBIT_USE_CACHED_PID:
        # In-memory name -> set(pid) cache; the three helpers below keep it
        # coherent with the underlying _add_signature writes.
        PID_NAMES_CACHE = get_name_string_to_pid_dictionary()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_new_personid()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        # Uncached variants hit the database directly.
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    # Memoised (symmetric) name comparison.
    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()
        if not bibrecs:
            bibrecs = all_bibrecs
        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    # For large batches the grouped-record lookups are served from a
    # pre-populated MARC cache.
    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
            len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        update_status(float(idx) / len(bibrecs), "%d/%d current: %d" % (idx, len(bibrecs), rec))

        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        # Signatures currently on the record: (tag, bibref) pairs for
        # first authors (100) and coauthors (700).
        markrefs = frozenset(chain(izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))),
                                   izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        # Signatures already known to the personid tables for this record.
        personid_rows = [map(int, row[:3]) + [row[4]] for row in get_signatures_from_rec(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibrecref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                   for old in old_signatures] for new in new_signatures]

        # [(new_signatures, old_signatures)]
        # Optimal assignment of new to old signatures above the threshold.
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix) if score > threshold]
        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        # Stale signatures are dropped regardless of matching.
        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

        for sig in not_matched:
            name = new_signatures_names[sig]
            matched_pids = []
            if USE_EXT_IDS:
                if USE_INSPIREID:
                    inspire_id = get_inspire_id(sig + (rec,))
                    if inspire_id:
                        matched_pids = list(get_person_with_extid(inspire_id[0]))
                if matched_pids:
                    # External-id match wins immediately.
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    continue

            matched_pids = find_pids_by_exact_name(name)
            # Never attach two signatures of the same paper to one person.
            matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

            if not matched_pids:
                new_pid = new_person_from_signature(list(sig) + [rec], name)
                used_pids.add(new_pid)
                updated_pids.add(new_pid)
            else:
                add_signature(list(sig) + [rec], name, matched_pids[0][0])
                used_pids.add(matched_pids[0][0])
                updated_pids.add(matched_pids[0][0])

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids
    if updated_pids:
        # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)
        update_personID_external_ids(updated_pids, limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()
def convert_cluster_set(cs, prob_matr):
    '''
    Converts a normal cluster set to a wedge cluster set, storing the
    melded per-cluster edge vectors in an HDF5 file instead of keeping
    them in memory.

    @param cs: a cluster set to be converted
    @param prob_matr: probability matrix exposing getitem_numeric
    @return: None; bib numbering is stored in cs.new2old and edge
        vectors are written to datasets keyed by id(cluster)
    '''
    #gc.disable()

    # step 1:
    #    + Assign a number to each bibrefrec.
    #    + Replace the arrays of bibrefrecs with arrays of numbers.
    #    + Store the result and prepare it to be returned.
    result_mapping = list()
    for clus in cs.clusters:
        start = len(result_mapping)
        result_mapping += list(clus.bibs)
        end = len(result_mapping)
        # The cluster's bibs become a contiguous integer range.
        clus.bibs = range(start, end)

    assert len(result_mapping) == len(set(result_mapping)), PID()+"Cluster set conversion failed"
    assert len(result_mapping) == cs.num_all_bibs, PID()+"Cluster set conversion failed"
    cs.new2old = result_mapping

    # step 2:
    #    + Using the prob matrix create a vector values to all other bibs.
    #    + Meld those vectors into one for each cluster.
    special_symbols = Bib_matrix.special_symbols #locality optimization
    pb_getitem_numeric = prob_matr.getitem_numeric
    interval = 100
    # Aggressive GC tuning to keep the memory footprint low while the
    # large per-bib vectors are allocated and discarded.
    gc.set_threshold(100,100,100)
    current = -1
    real_pointer = None
    try:
        for c1 in cs.clusters:
            gc.collect()
            current += 1
            if (current % interval) == 0:
                update_status(float(current) / len(cs.clusters), "Converting the cluster set...")

            assert len(c1.bibs) > 0, PID()+"Empty cluster send to wedge"
            pointers = list()

            for v1 in c1.bibs:
                # Sparse build: collect values and their target indices,
                # then scatter into a full-length array in one shot.
                pointer = list()
                index = list()
                rm = result_mapping[v1] #locality optimization
                for c2 in cs.clusters:
                    if c1 != c2 and not c1.hates(c2):
                        pointer += [pb_getitem_numeric((rm, result_mapping[v2])) for v2 in c2.bibs]
                        index += c2.bibs
                if index and pointer:
                    real_pointer = numpy.ndarray(shape=(len(result_mapping), 2), dtype=float, order='C')
                    real_pointer.fill(special_symbols[None])
                    # Fancy-indexed scatter of the collected values.
                    real_pointer[index] = pointer
                    pointers.append((real_pointer, 1))
            if pointers:
                out_edges = reduce(meld_edges, pointers)[0]
                # h5file is presumably a module-level h5py File — TODO confirm.
                h5file.create_dataset(str(id(c1)), (len(out_edges), 2), 'f')
                dset = h5file[str(id(c1))]
                dset[:] = out_edges
            else:
                # Cluster with no usable edges still gets an (empty) dataset.
                h5file.create_dataset(str(id(c1)), (len(cs.clusters), 2), 'f')
    except Exception, e:
        # NOTE(review): this handler can itself raise — v1/pointer/index are
        # unbound if the failure happens before the first iteration, and
        # len(real_pointer) fails while real_pointer is still None.
        raise Exception("""Error happened in convert_cluster_set with v1: %s, real_pointer: %s, pointer: %s, pointers: %s, result_mapping: %s, index: %s, len(real_pointer): %s, len(pointer): %s, len(pointers): %s, original_exception: %s """%(str(v1),str(real_pointer),str(pointer), str(pointers), str(result_mapping), str(index), str(len(real_pointer)), str(len(pointer)), str(len(pointers)), str(e)) )
def recalculate(self, cluster_set):
    '''
    Constructs probability matrix. If use_cache is true, it will try
    to load old computations from the database. If save cache is true
    it will save the current results into the database.
    @param cluster_set: A cluster set object, used to initialize the
    matrix.
    '''
    last_cleaned = 0

    # Persist the current matrix, duplicate it under a 'copy' name and use
    # the copy as a read-only cache of previously computed values.
    self._bib_matrix.store()
    try:
        old_matrix = Bib_matrix(self._bib_matrix.name + 'copy')
        old_matrix.duplicate_existing(self._bib_matrix.name, self._bib_matrix.name + 'copy')
        old_matrix.load()
        cached_bibs = self.__get_up_to_date_bibs(old_matrix)
        have_cached_bibs = bool(cached_bibs)
    except IOError:
        # No previous matrix on disk: run without a cache.
        # NOTE(review): if the Bib_matrix constructor itself raised,
        # old_matrix is unbound here and destroy() raises NameError.
        old_matrix.destroy()
        cached_bibs = None
        have_cached_bibs = False

    self._bib_matrix.destroy()
    self._bib_matrix = Bib_matrix(cluster_set.last_name, cluster_set=cluster_set)

    ncl = cluster_set.num_all_bibs
    # Number of unordered bib pairs; clamped to 1 for the progress ratio.
    expected = ((ncl * (ncl - 1)) / 2)
    if expected == 0:
        expected = 1

    try:
        cur_calc, opti, prints_counter = 0, 0, 0
        for cl1 in cluster_set.clusters:
            # Progress at most once per 100000 comparisons (and at start).
            if cur_calc + opti - prints_counter > 100000 or cur_calc == 0:
                update_status((float(opti) + cur_calc) / expected, "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
                prints_counter = cur_calc + opti

            # #clean caches
            if cur_calc - last_cleaned > 20000000:
                gc.collect()
                # clear_comparison_caches()
                last_cleaned = cur_calc

            for cl2 in cluster_set.clusters:
                # Each unordered cluster pair once; hating pairs skipped.
                if id(cl1) < id(cl2) and not cl1.hates(cl2):
                    for bib1 in cl1.bibs:
                        for bib2 in cl2.bibs:
                            if have_cached_bibs:
                                try:
                                    val = old_matrix[bib1, bib2]
                                    opti += 1
                                    if bconfig.DEBUG_CHECKS:
                                        assert _debug_is_eq_v(val, compare_bibrefrecs(bib1, bib2))
                                except KeyError:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                # A falsy cached value also forces a recompute.
                                if not val:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                            else:
                                cur_calc += 1
                                val = compare_bibrefrecs(bib1, bib2)
                            self._bib_matrix[bib1, bib2] = val
    except Exception, e:
        # NOTE(review): val is unbound here if the failure happens before
        # the first comparison, making this handler raise NameError.
        raise Exception("""Error happened in prob_matrix.recalculate with val:%s original_exception: %s """ % (str(val), str(e)))
def load(self, load_map=True, load_matrix=True):
    '''
    Loads the probability matrix from its backing store, with progress
    reporting.

    @param load_map: accepted for interface compatibility; not forwarded
        to the underlying load() here.
    @param load_matrix: accepted for interface compatibility; not
        forwarded to the underlying load() here.
    '''
    update_status(0., "Loading probability matrix...")
    self._bib_matrix.load()
    update_status_final("Probability matrix loaded.")
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    '''
    Aggregates the per-cluster status reports found under /tmp/baistats/
    into running statistics per coefficient and per (cluster, coefficient),
    optionally rendering graphs and pickling the result.

    @param pickle_output: path to dump the aggregated stats to, or None
    @param generate_graphs: render graphs periodically and at the end
    '''
    override_stdout_config(stdout=True)

    files = ['/tmp/baistats/' + x for x in os.listdir('/tmp/baistats/') if x.startswith('cluster_status_report_pid')]
    fnum = float(len(files))
    # NOTE(review): fnum is zero when no report files exist, so this
    # division raises ZeroDivisionError — consider guarding.
    quanta = .1 / fnum

    total_stats = 0
    used_coeffs = set()
    used_clusters = set()

    #av_counter, avg, min, max, nclus, normalized_avg
    cluster_stats = defaultdict(lambda: defaultdict(lambda: [0., 0., 0., 0., 0., 0.]))
    coeff_stats = defaultdict(lambda: [0., 0., 0., 0., 0., 0.])

    def gen_graphs(only_synthetic=False):
        # Render the aggregate graph, and per-name graphs unless asked not to.
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i, c in enumerate(cn):
                update_status(i / l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i, fi in enumerate(files):
        if generate_graphs:
            # Refresh the synthetic graph every 1000 processed files.
            if i % 1000 == 0:
                gen_graphs(True)

        f = open(fi, 'r')
        status = i / fnum
        update_status(status, 'Loading ' + fi[fi.find('lastname') + 9:])
        # Report layout: [coefficient, cluster name, rows, maxlen].
        contents = SER.load(f)
        f.close()

        cur_coef = contents[0]
        cur_clust = contents[1]
        cur_maxlen = float(contents[3])

        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)

            update_status(status + 0.2 * quanta, ' Computing averages...')

            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))

            assert cur_clustnumber > 0 and cur_clustnumber < cur_maxlen, "Error, found log with strange clustnumber! %s %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_maxlen), str(cur_clustnumber))

            if cur_coeffs:
                assert len(cur_coeffs) == cur_clen and cur_coeffs, "Error, there is a cluster witohut stuff? %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all([x >= 0 and x <= 1 for x in cur_coeffs]), "Error, a coefficient is wrong here! Check me! %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_coeffs))

                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs) / cur_clen

                update_status(status + 0.4 * quanta, ' comulative per coeff...')

                # Incremental (running) mean/min/max update per coefficient.
                avi = coeff_stats[cur_coef][0]
                #number of points
                coeff_stats[cur_coef][0] = avi + 1
                #average of coefficients
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1] * avi + cur_avg) / (avi + 1)
                #min coeff
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2], cur_min)
                #max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3], cur_max)
                #avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4] * avi + cur_clustnumber) / (avi + 1)
                #normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5] * avi + cur_clustnumber / cur_maxlen) / (avi + 1)

                update_status(status + 0.6 * quanta, ' comulative per cluster per coeff...')

                # Same running update, per (cluster, coefficient) pair.
                avi = cluster_stats[cur_clust][cur_coef][0]
                cluster_stats[cur_clust][cur_coef][0] = avi + 1
                cluster_stats[cur_clust][cur_coef][1] = (cluster_stats[cur_clust][cur_coef][1] * avi + cur_avg) / (avi + 1)
                cluster_stats[cur_clust][cur_coef][2] = min(cluster_stats[cur_clust][cur_coef][2], cur_min)
                cluster_stats[cur_clust][cur_coef][3] = max(cluster_stats[cur_clust][cur_coef][3], cur_max)
                cluster_stats[cur_clust][cur_coef][4] = (cluster_stats[cur_clust][cur_coef][4] * avi + cur_clustnumber) / (avi + 1)
                cluster_stats[cur_clust][cur_coef][5] = (cluster_stats[cur_clust][cur_coef][5] * avi + cur_clustnumber / cur_maxlen) / (avi + 1)

    update_status_final('Done!')

    if generate_graphs:
        gen_graphs()

    if pickle_output:
        update_status(0, 'Dumping to file...')
        f = open(pickle_output, 'w')
        SER.dump({'cluster_stats': dict((x, dict(cluster_stats[x])) for x in cluster_stats.iterkeys()), 'coeff_stats': dict((coeff_stats))}, f)
        f.close()
def merge_dynamic():
    '''
    This function merges aidPERSONIDPAPERS with aidRESULTS.
    Use it after tortoise.
    This function is dynamic: it allows aid* tables to be changed while
    it is still running, hence the claiming faciity for example can stay
    online during the merge. This comfort however is paid off in term of
    speed.
    '''
    last_names = frozenset(name[0].split('.')[0] for name in get_existing_result_clusters())

    def get_free_pids():
        # Endless supply of brand-new person ids.
        while True:
            yield get_new_personid()

    free_pids = get_free_pids()

    def try_move_signature(sig, target_pid):
        """
        Moves sig onto target_pid unless claims/rejections forbid it,
        allocating fresh pids to displaced or rejected signatures.
        """
        paps = get_signature_info(sig)
        # Partition the paper rows by their claim flag.
        rejected = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p: -2 < p[1] and p[1] < 2, paps)
        claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        # Nothing to do if already claimed to the target, unassigned,
        # or already sitting on the target pid.
        if claimed or not assigned or assigned[0] == target_pid:
            return

        assert len(assigned) == 1

        if rejected:
            # The signature was rejected somewhere: park it on a fresh pid.
            move_signature(sig, free_pids.next())
        else:
            conflicts = find_conflicts(sig, target_pid)
            if not conflicts:
                move_signature(sig, target_pid)
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    # The conflicting signature is claimed: step aside.
                    move_signature(sig, free_pids.next())
                else:
                    # Displace the unclaimed conflict, then take its place.
                    move_signature(conflicts[0][:3], free_pids.next())
                    move_signature(sig, target_pid)

    for idx, last in enumerate(last_names):
        update_status(float(idx) / len(last_names), "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4]) for row in get_lastname_results(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d)) for k, d in groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            claim = []
            for d in ds:
                pid_flag = personid_from_signature(d)
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)
                    if flag > 1:
                        claim.append((d, pid))
            # Count how many of this cluster's sigs sit on each old pid.
            matr.append(dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        # Optimal cluster -> old pid assignment by overlap count.
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids] for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx]) for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(imap(itemgetter(0), best_match))
        # Unmatched clusters each get a fresh pid from the generator.
        not_matched_clusters = izip((results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                try_move_signature(sig, pid)

    update_status_final()
    delete_empty_persons()
    update_personID_canonical_names()
def load(self, lname, load_map=True, load_matrix=True):
    '''
    Loads the probability matrix named lname from its backing store,
    with progress reporting.

    @param lname: name of the matrix to load (forwarded to the matrix)
    @param load_map: forwarded to the underlying load()
    @param load_matrix: forwarded to the underlying load()
    '''
    update_status(0., "Loading probability matrix...")
    self._bib_matrix.load(lname, load_map, load_matrix)
    update_status_final("Probability matrix loaded.")
def main():
    """
    Reads import file and verifies the md5 hash.
    For each line in the import file:
        find new record from bibcode,
        find new ref from name on record
        find old row in personid tables
        copy row with new authorref (tab:bibref,rec) to temp table
    overwrite personid tables w/ temp table
    """
    ## create temporary tables...
    print "Creating temporary tables..."
    create_temp_pid_sql_table()
    create_temp_piddata_sql_table()
    create_temp_user_input_log_sql_table()

    ## fill temp tables w/ static values...
    print "Filling temporary tables with static, unchanged content"
    copy_unaltered_piddata_rows_to_temp()
    copy_unaltered_user_input_log_table_rows_to_temp()

    ## compile regexp for line break removal
    nlr = re.compile('[\n\r]+')

    #verify file integrity
    print ("Verifying file integrity of %s with" " MD5 checksum from %s" % (IMPORT_FILE_NAME, IMPORT_MD5_FILE_NAME))
    fp = open(IMPORT_FILE_NAME, "rb")
    fmd5 = md5_for_file(fp)
    fp.close()
    fp = open(IMPORT_MD5_FILE_NAME, "r")
    vmd5 = fp.read()
    fp.close()

    if not fmd5 == vmd5:
        print "WARNING: Detected a disturbance in the file. Will exit here."
        return

    total_lines = file_len()
    fp = open(IMPORT_FILE_NAME, "r")
    print "Processing file %s..." % IMPORT_FILE_NAME

    for index, line in enumerate(fp.readlines()):
        # if index == 100:
        #     break
        if index % 5000 == 0:
            percent = float(index) / float(total_lines)
            update_status(percent, "%s of %s lines processed in %s" % (index, total_lines, IMPORT_FILE_NAME))

        new_ref = None
        # Space-separated line layout: table, old ref, old rec, table
        # (repeated), base64 name, bibcode.
        tab1, old_ref, old_rec, tab2, enname, bibcode = line.split(" ")
        assert tab1 == tab2

        # Skip the header line.
        if tab1 == "table":
            continue

        name = base64.b64decode(enname)
        # name = nq.sub("", name)
        bibcode = nlr.sub("", bibcode)
        new_rec = get_bibrec_from_bibcode(bibcode)

        # Find the author ref on the new record with the exact same name.
        for ref in get_authorrefs_and_names_from_bibrec(new_rec):
            # refname = create_normalized_name(split_name_parts(ref[2]))
            refname = ref[2]
            if refname == name and str(ref[0]) == tab1:
                #MySQL equivalent: col_name COLLATE utf8_bin = 'Case SenSitive name'
                new_ref = ref[1]

        if not new_ref:
            print "WARN: Authorref not found for name %s on new record %s?!" % (name, new_rec)
            continue

        # get personid, flag, lcul and last_updated from old aidPERSONIDPAPERS
        old_data = find_old_pidtable_row(tab1, old_ref, old_rec)

        if old_data:
            ## prepare data in temporary tables...
            pid, flag, lcul, lupdate = old_data
            old_authorref = "%s:%s,%s" % (tab1, old_ref, old_rec)
            new_authorref = "%s:%s,%s" % (tab1, new_ref, new_rec)
            ## Transform the name into a more consistent form
            inname = create_normalized_name(split_name_parts(name))
            ## Insert transformed data into temp tables...
            insert_into_temp_table(pid, tab1, new_ref, new_rec, inname, flag, lcul, lupdate)
            update_temp_piddata_table(old_authorref, new_authorref)
            update_temp_user_input_log_table(old_authorref, new_authorref)
        else:
            print "WARN: %s does not exist in db!" % ([tab1, old_ref, old_rec])

        # The following is true only if applied on the same data set
        # Commented out by default. For testing/debug uses only
        try:
            if RUN_IN_TEST_MODE:
                assert str(old_rec) == str(new_rec)
                assert str(old_ref) == str(new_ref)
            pass
        except AssertionError, e:
            print "ERROR: ", e
            print "%s:%s,%s vs. %s:%s,%s on %s:%s" % (tab1, old_ref, old_rec, tab1, new_ref, new_rec, bibcode, name)
def check_table_integrity(table):
    """
    Check integrity of result table vs. original table.
    Only works when original data and result data are identical!
    @param table: the table to check: aidPIDTEMP, aidPIDUILTEMP or aidPIDDATATEMP
    @type table: str
    """
    if not RUN_IN_TEST_MODE:
        print "Integrity checks only run in TEST_MODE!"
        return

    check_passed = True
    odata = None
    rdata = None

    print "Checking table: %s" % table
    print " |-- Getting original data..."

    # Original (source) table rows.
    if table == "aidPIDTEMP":
        odata = run_sql("select personid, bibref_table, bibref_value, bibrec, name, flag, " "lcul, last_updated from aidPERSONIDPAPERS")
    elif table == "aidPIDUILTEMP":
        odata = run_sql("select id, transactionid, timestamp, userinfo, personid, " "action, tag, value, comment from aidUSERINPUTLOG")
    elif table == "aidPIDDATATEMP":
        odata = run_sql("select personid, tag, data, opt1, opt2, opt3 " "from aidPERSONIDDATA")
    else:
        print "No table specified for integrity check. Skipped."
        return

    print " |-- Getting result data..."

    # Result (temp) table rows, same column order as the original query.
    if table == "aidPIDTEMP":
        rdata = run_sql("select personid, bibref_table, bibref_value, bibrec, name, flag, " "lcul, last_updated from aidPIDTEMP")
    elif table == "aidPIDUILTEMP":
        rdata = run_sql("select id, transactionid, timestamp, userinfo, personid, " "action, tag, value, comment from aidPIDUILTEMP")
    elif table == "aidPIDDATATEMP":
        rdata = run_sql("select personid, tag, data, opt1, opt2, opt3 " "from aidPIDDATATEMP")

    print " |-- Checking..."
    rownum = float(len(rdata))
    odata = set(odata)

    # Fail fast when exactly one of the two result sets is empty.
    if bool(len(odata)) ^ bool(len(rdata)):
        check_passed = False

    for index, res in enumerate(rdata):
        # The top-of-loop break only fires for the empty-xor case above:
        # the membership failure below breaks before the next iteration.
        if not check_passed:
            print "odata xor rdata: %s" % (bool(len(odata)) ^ bool(len(rdata)))
            break

        if index % 1000 == 0:
            percent = float(index) / rownum
            update_status(percent, "%s of %s rows processed" % (index, rownum))

        # Every result row must be present in the original set.
        if not (set([res]) & odata):
            check_passed = False
            print "Test failed for the following pair:\n res: %s and-op: %s" % (str(res), str((set(res) & odata)))
            break

    update_status(1., "Done checking %s\n" % table)

    if check_passed:
        print " |-- OK"
    else:
        print " |-- Data Integrity check failed!"
update_temp_user_input_log_table(old_authorref, new_authorref) else: print "WARN: %s does not exist in db!" % ([tab1, old_ref, old_rec]) # The following is true only if applied on the same data set # Commented out by default. For testing/debug uses only try: if RUN_IN_TEST_MODE: assert str(old_rec) == str(new_rec) assert str(old_ref) == str(new_ref) pass except AssertionError, e: print "ERROR: ", e print "%s:%s,%s vs. %s:%s,%s on %s:%s" % (tab1, old_ref, old_rec, tab1, new_ref, new_rec, bibcode, name) update_status(1., "Done importing from %s\n" % IMPORT_FILE_NAME) if RUN_IN_TEST_MODE: perform_integrity_checks() else: print "Copying NEW data from temp tables to original tables (destroying previous content!...)" print "Personid Papers table..." copy_temp_to_pid_table() print "Personid Data table..." copy_temp_piddata_table() print "User Input log table..." copy_temp_user_input_table_table() fp.close()
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    """
    Aggregate the per-run wedge-coefficient reports found under /tmp/baistats/
    (files named 'cluster_status_report_pid...') into per-coefficient and
    per-cluster-per-coefficient running statistics, optionally plotting them.

    Each statistics slot is a 6-vector:
        [counter, avg_coeff, min_coeff, max_coeff, avg_nclusters, normalized_avg_nclusters]

    @param pickle_output: if given, path where the aggregated statistics are
        serialized via SER.dump.
    @param generate_graphs: when True, write plots under /tmp/graphs/.
    """
    import matplotlib.pyplot as plt
    plt.ioff()
    def _gen_plot(data, filename):
        # data: {x_value -> 6-vector as described above}; draws all six
        # series (some rescaled to [0, 1]) into one legend-topped figure.
        plt.clf()
        ax = plt.subplot(111)
        ax.grid(visible=True)
        x = sorted(data.keys())
        w = [data[k][0] for k in x]
        try:
            wscf = max(w)
        except:
            wscf = 0
        w = [float(i)/wscf for i in w]
        y = [data[k][1] for k in x]
        maxi = [data[k][3] for k in x]
        mini = [data[k][2] for k in x]
        lengs = [data[k][4] for k in x]
        try:
            ml = float(max(lengs))
        except:
            ml = 1
        lengs = [k/ml for k in lengs]
        normalengs = [data[k][5] for k in x]
        ax.plot(x,y,'-o',label='avg')
        ax.plot(x,maxi,'-o', label='max')
        ax.plot(x,mini,'-o', label='min')
        ax.plot(x,w, '-x', label='norm %s' % str(wscf))
        ax.plot(x,lengs,'-o',label='acl %s' % str(int(ml)))
        ax.plot(x,normalengs, '-o', label='ncl')
        plt.ylim(ymax = 1., ymin = -0.01)
        plt.xlim(xmax = 1., xmin = -0.01)
        ax.legend(bbox_to_anchor=(0., 1.02, 1., .102), loc=3,ncol=6, mode="expand", borderaxespad=0.)
        plt.savefig(filename)
    override_stdout_config(stdout=True)
    files = ['/tmp/baistats/'+x for x in os.listdir('/tmp/baistats/') if x.startswith('cluster_status_report_pid')]
    fnum = float(len(files))
    # NOTE(review): raises ZeroDivisionError when no report files exist.
    quanta = .1/fnum
    total_stats = 0
    used_coeffs = set()
    used_clusters = set()
    #av_counter, avg, min, max, nclus, normalized_avg
    cluster_stats = defaultdict(lambda : defaultdict(lambda : [0.,0.,0.,0.,0.,0.]))
    coeff_stats = defaultdict(lambda : [0.,0.,0.,0.,0.,0.])

    def gen_graphs(only_synthetic=False):
        # Plot the aggregate coefficient graph; unless only_synthetic,
        # also one graph per cluster (surname) seen so far.
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i,c in enumerate(cn):
                update_status(i/l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i,fi in enumerate(files):
        if generate_graphs:
            # Periodically refresh the synthetic graph while loading.
            if i%1000 ==0:
                gen_graphs(True)
        f = filehandler.open(fi,'r')
        status = i/fnum
        update_status(status, 'Loading '+ fi[fi.find('lastname')+9:])
        # Report layout (inferred from usage): contents[0] = coefficient,
        # contents[1] = cluster (surname), contents[2] = rows of
        # (cluster_id, ..., coeff), contents[3] = max cluster length.
        contents = SER.load(f)
        f.close()
        cur_coef = contents[0]
        cur_clust = contents[1]
        cur_maxlen = float(contents[3])
        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)
            update_status(status+0.2*quanta, ' Computing averages...')
            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))
            assert cur_clustnumber > 0 and cur_clustnumber < cur_maxlen, "Error, found log with strange clustnumber! %s %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_maxlen), str(cur_clustnumber))
            if cur_coeffs:
                assert len(cur_coeffs) == cur_clen and cur_coeffs, "Error, there is a cluster witohut stuff? %s %s %s"% (str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all([x >= 0 and x <= 1 for x in cur_coeffs]), "Error, a coefficient is wrong here! Check me! %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_coeffs))
                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs)/cur_clen
                update_status(status+0.4*quanta, ' comulative per coeff...')
                # Running (incremental) mean/min/max update keyed by coefficient.
                avi = coeff_stats[cur_coef][0]
                #number of points
                coeff_stats[cur_coef][0] = avi+1
                #average of coefficients
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1]*avi + cur_avg)/(avi+1)
                #min coeff
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2], cur_min)
                #max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3], cur_max)
                #avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                #normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)
                update_status(status+0.6*quanta, ' comulative per cluster per coeff...')
                # Same running update, keyed by (cluster, coefficient).
                avi = cluster_stats[cur_clust][cur_coef][0]
                cluster_stats[cur_clust][cur_coef][0] = avi+1
                cluster_stats[cur_clust][cur_coef][1] = (cluster_stats[cur_clust][cur_coef][1]*avi + cur_avg)/(avi+1)
                cluster_stats[cur_clust][cur_coef][2] = min(cluster_stats[cur_clust][cur_coef][2], cur_min)
                cluster_stats[cur_clust][cur_coef][3] = max(cluster_stats[cur_clust][cur_coef][3], cur_max)
                cluster_stats[cur_clust][cur_coef][4] = (cluster_stats[cur_clust][cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                cluster_stats[cur_clust][cur_coef][5] = (cluster_stats[cur_clust][cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)
    update_status_final('Done!')
    if generate_graphs:
        gen_graphs()
    if pickle_output:
        update_status(0,'Dumping to file...')
        f = open(pickle_output,'w')
        SER.dump({'cluster_stats':dict((x,dict(cluster_stats[x])) for x in cluster_stats.iterkeys()), 'coeff_stats':dict((coeff_stats))}, f)
        f.close()
def main():
    """
    Main Function. Acquires author-reference data and constructs the output
    format (four-space delimited):
        <MARC Table> <ID in Table> <record ID> <MARC Table> <Name (base64)> <bibcode>
    Stores the output to EXPORT_FILE_NAME plus an .md5 companion file with
    the hash of the export.
    """
    pidrefs = {}
    records = None
    output = []
    output.append("table bibref bibrec table name bibcode")
    if LIMIT_TO_RECORDS_IN_PERSONID:
        print "Finding record IDs from PersonID table..."
        pidrefs_sqldata = run_sql("select bibref_table, bibref_value, bibrec "
                                  "from aidPERSONIDPAPERS")
        # Group the (table, ref) pairs by record id.
        for data in pidrefs_sqldata:
            tab, ref, rec = data
            if rec in pidrefs:
                pidrefs[rec].append((tab, ref))
            else:
                pidrefs[rec] = [(tab, ref)]
        records = pidrefs.keys()
    else:
        print "Finding ALL record IDs from bibrec table..."
        records = [p[0] for p in run_sql("select id from bibrec")]
    print "Collecting data for %s records..." % len(records)
    for index, bibrec in enumerate(records):
        if index % 1000 == 0:
            status = "%s of all %s records done." % (index, len(records))
            if index % 10000 == 0:
                # Periodic checkpoint: flush everything collected so far.
                fp = open(EXPORT_FILE_NAME, "w")
                fp.write("\n".join(output))
                fp.close()
                status = "Saving to %s..." % EXPORT_FILE_NAME
            percent = float(index) / len(records)
            update_status(percent, status)
        bibcode = get_bibcode_from_bibrec(bibrec)
        refs = None
        if LIMIT_TO_RECORDS_IN_PERSONID:
            try:
                temp_refs = pidrefs[bibrec]
                refs = []
                for tab, ref in temp_refs:
                    name = get_db_name_from_ref(tab, ref)
                    refs.append((tab, ref, name))
            except KeyError:
                # NOTE(review): on KeyError `refs` stays None and the loop
                # below would raise TypeError. Unreachable in practice here,
                # since `records` is built from pidrefs' own keys — confirm.
                print "No key %s in pidref!" % bibrec
        else:
            refs = get_authorrefs_and_names_from_bibrec(bibrec)
        for ref in refs:
            tab, tid, name = ref
            # b64 encode name to avoid data inconsistencies, occuring with spaces and
            # special characters in names (e.g. \n \t \r \0 etc.)
            enname = base64.b64encode(name)
            #out = "%s:%s,%s;;;%s:\"%s\",%s" % (tab, tid, bibrec, tab, name, bibcode)
            #out = """%s %s %s \"\"\"%s\"\"\" %s %s""" % (tab, tid, bibrec, tab, name, bibcode)
            out = """%s %s %s %s %s %s""" % (tab, tid, bibrec, tab, enname, bibcode)
            output.append(out)
    # For testing, just write out the first 1000 lines...
    # if len(output) > 1000:
    # break
    # write to export file...
    fp = open(EXPORT_FILE_NAME, "w")
    fp.write("\n".join(output))
    fp.close()
    # md5 export file...
    fp = open(EXPORT_FILE_NAME, "rb")
    fmd5 = md5_for_file(fp)
    fp.close()
    # write to md5 file
    fp = open(EXPORT_FILE_NAME + ".md5", "w")
    fp.write(fmd5)
    fp.close()
    update_status(1., "Export OK.\n")
    print "Final Export written to %s" % EXPORT_FILE_NAME
    print "MD5 hash of export file saved to %s.md5" % EXPORT_FILE_NAME
    print "All Done."
def merge_static():
    '''
    This function merges aidPERSONIDPAPERS with aidRESULTS.
    Use it after tortoise.
    This function is static: if aid* tables are changed while it's running,
    probably everything will crash and a black hole will open, eating all
    your data.
    '''
    last_names = frozenset(name[0].split('.')[0] for name in get_existing_result_clusters())

    def get_free_pids():
        # Endless stream of fresh person ids.
        while True:
            yield get_new_personid()

    free_pids = get_free_pids()

    # bibrefrec -> [(pid, flag), ...]; kept in sync with the db by the
    # helpers below so repeated lookups stay consistent during the merge.
    current_mapping = get_bibrefrec_to_pid_flag_mapping()

    def move_sig_and_update_mapping(sig, old_pid_flag, new_pid_flag):
        # Move in the database first, then mirror the change in the cache.
        move_signature(sig, new_pid_flag[0])
        current_mapping[sig].remove(old_pid_flag)
        current_mapping[sig].append(new_pid_flag)

    def try_move_signature(sig, target_pid):
        """
        Attach signature `sig` to `target_pid`, unless it is claimed,
        unassigned, or the move would conflict with a claimed signature.
        Flag semantics (from the filters): <= -2 rejected, (-2, 2) assigned,
        >= 2 claimed.
        """
        paps = current_mapping[sig]
        rejected = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p: -2 < p[1] and p[1] < 2, paps)
        claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        # NOTE(review): `assigned[0]` is a (pid, flag) tuple, so comparing it
        # to `target_pid` is always False — the "already on target" early
        # return never fires. Probably meant assigned[0][0] — confirm.
        if claimed or not assigned or assigned[0] == target_pid:
            return

        assert len(assigned) == 1

        if rejected:
            newpid = free_pids.next()
            move_sig_and_update_mapping(sig, assigned[0], (newpid, assigned[0][1]))
        else:
            conflicts = find_conflicts(sig, target_pid)
            if not conflicts:
                move_sig_and_update_mapping(sig, assigned[0], (target_pid, assigned[0][1]))
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    # Conflicting signature is claimed: leave it, move us to a
                    # fresh person instead.
                    newpid = free_pids.next()
                    move_sig_and_update_mapping(sig, assigned[0], (newpid, assigned[0][1]))
                else:
                    # Push the conflicting signature aside, then take its place.
                    newpid = free_pids.next()
                    csig = tuple(conflicts[0][:3])
                    move_sig_and_update_mapping(csig, (target_pid, conflicts[0][3]), (newpid, conflicts[0][3]))
                    move_sig_and_update_mapping(sig, assigned[0], (target_pid, assigned[0][1]))

    for idx, last in enumerate(last_names):
        update_status(
            float(idx) / len(last_names),
            "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4])
                   for row in get_lastname_results(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d)) for k, d in
                   groupby(sorted(results, key=itemgetter(0)), key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            claim = []
            for d in ds:
                pid_flag = current_mapping.get(d, [])
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)
                    if flag > 1:
                        claim.append((d, pid))
            # Histogram: how many sigs of this result cluster sit on each pid.
            matr.append(
                dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids]
                                        for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx])
                            for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(
            imap(itemgetter(0), best_match))
        # Unmatched result clusters get brand-new person ids.
        not_matched_clusters = izip(
            (results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                if sig in current_mapping:
                    if not pid in map(
                            itemgetter(0),
                            filter(lambda x: x[1] > -2, current_mapping[sig])):
                        try_move_signature(sig, pid)

    update_status_final()
    delete_empty_persons()
    update_personID_canonical_names()
def convert_cluster_set(cs, prob_matr): ''' Convertes a normal cluster set to a wedge cluster set. @param cs: a cluster set to be converted @param type: cluster set @return: a mapping from a number to a bibrefrec. ''' #gc.disable() # step 1: # + Assign a number to each bibrefrec. # + Replace the arrays of bibrefrecs with arrays of numbers. # + Store the result and prepare it to be returned. result_mapping = list() for clus in cs.clusters: start = len(result_mapping) result_mapping += list(clus.bibs) end = len(result_mapping) clus.bibs = range(start, end) assert len(result_mapping) == len( set(result_mapping)), PID() + "Cluster set conversion failed" assert len(result_mapping ) == cs.num_all_bibs, PID() + "Cluster set conversion failed" cs.new2old = result_mapping # step 2: # + Using the prob matrix create a vector values to all other bibs. # + Meld those vectors into one for each cluster. special_symbols = Bib_matrix.special_symbols #locality optimization pb_getitem_numeric = prob_matr.getitem_numeric interval = 100 gc.set_threshold(100, 100, 100) current = -1 real_pointer = None try: for c1 in cs.clusters: gc.collect() current += 1 if (current % interval) == 0: update_status( float(current) / len(cs.clusters), "Converting the cluster set...") assert len(c1.bibs) > 0, PID() + "Empty cluster send to wedge" pointers = list() for v1 in c1.bibs: pointer = list() index = list() rm = result_mapping[v1] #locality optimization for c2 in cs.clusters: if c1 != c2 and not c1.hates(c2): pointer += [ pb_getitem_numeric((rm, result_mapping[v2])) for v2 in c2.bibs ] index += c2.bibs if index and pointer: real_pointer = numpy.ndarray(shape=(len(result_mapping), 2), dtype=float, order='C') real_pointer.fill(special_symbols[None]) real_pointer[index] = pointer pointers.append((real_pointer, 1)) if pointers: out_edges = reduce(meld_edges, pointers)[0] h5file.create_dataset(str(id(c1)), (len(out_edges), 2), 'f') dset = h5file[str(id(c1))] dset[:] = out_edges else: 
h5file.create_dataset(str(id(c1)), (len(cs.clusters), 2), 'f') except Exception, e: raise Exception( """Error happened in convert_cluster_set with v1: %s, real_pointer: %s, pointer: %s, pointers: %s, result_mapping: %s, index: %s, len(real_pointer): %s, len(pointer): %s, len(pointers): %s, original_exception: %s """ % (str(v1), str(real_pointer), str(pointer), str(pointers), str(result_mapping), str(index), str(len(real_pointer)), str(len(pointer)), str(len(pointers)), str(e)))
def merge_static_classy():
    '''
    This function merges aidPERSONIDPAPERS with aidRESULTS.
    Use it after tortoise.
    This function is static: if aid* tables are changed while it's running,
    probably everything will crash and a black hole will open, eating all
    your data.
    NOTE: this is more elegant that merge_static but much slower. Will have
    to be improved before it can replace it.
    '''
    class Sig(object):
        # Wraps one signature (bibrefrec) with its (pid, flag) memberships,
        # split by status: rejected (flag <= -2), assigned (-2 < flag < 2),
        # claimed (flag >= 2).
        def __init__(self, bibrefrec, pid_flag):
            self.rejected = dict(filter(lambda p: p[1] <= -2, pid_flag))
            self.assigned = filter(lambda p: -2 < p[1] and p[1] < 2, pid_flag)
            self.claimed = filter(lambda p: 2 <= p[1], pid_flag)
            self.bibrefrec = bibrefrec

            assert self.invariant()

        def invariant(self):
            # A signature belongs to at most one person.
            return len(self.assigned) + len(self.claimed) <= 1

        def empty(self):
            # NOTE(review): `isclaimed`/`isassigned` are bound methods and
            # always truthy, so this always returns False; probably meant
            # `not self.isclaimed() and not self.isassigned()` — confirm.
            return not self.isclaimed and not self.isassigned

        def isclaimed(self):
            return len(self.claimed) == 1

        def get_claimed(self):
            return self.claimed[0][0]

        def get_assigned(self):
            return self.assigned[0][0]

        def isassigned(self):
            return len(self.assigned) == 1

        def isrejected(self, pid):
            return pid in self.rejected

        def change_pid(self, pid):
            # Reassign this signature to `pid`, in memory and in the db.
            assert self.invariant()
            assert self.isassigned()
            self.assigned = [(pid, 0)]
            move_signature(self.bibrefrec, pid)

    class Cluster(object):
        # A person (pid) with its signatures keyed by paper (bibrec).
        def __init__(self, pid, sigs):
            self.pid = pid
            self.sigs = dict(
                (sig.bibrefrec[2], sig) for sig in sigs if not sig.empty())

        def send_sig(self, other, sig):
            # Hand `sig` over to cluster `other`, keeping the db in sync.
            paper = sig.bibrefrec[2]
            assert paper in self.sigs and paper not in other.sigs
            del self.sigs[paper]
            other.sigs[paper] = sig
            if sig.isassigned():
                sig.change_pid(other.pid)

    last_names = frozenset(name[0].split('.')[0]
                           for name in get_existing_result_clusters())
    personid = get_bibrefrec_to_pid_flag_mapping()
    free_pids = backinterface_get_free_pids()

    for idx, last in enumerate(last_names):
        update_status(
            float(idx) / len(last_names),
            "Merging, %d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4])
                   for row in get_lastname_results(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d)) for k, d in
                   groupby(sorted(results, key=itemgetter(0)),
                           key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            for d in ds:
                pid_flag = filter(lambda x: x[1] > -2, personid.get(d, []))
                if pid_flag:
                    assert len(pid_flag) == 1
                    pid = pid_flag[0][0]
                    pids.append(pid)
                    old_pids.add(pid)
            # Histogram: how many sigs of this result cluster sit on each pid.
            matr.append(
                dict((k, len(list(d))) for k, d in groupby(sorted(pids))))

        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids]
                                        for row in matr])

        # [[bibrefrecs] -> pid]
        matched_clusters = [(results[new_idx][1], old_pids[old_idx])
                            for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(
            imap(itemgetter(0), best_match))
        not_matched_clusters = izip(
            (results[i][1] for i in not_matched_clusters), free_pids)

        # pid -> Cluster
        clusters = dict(
            (pid, Cluster(pid, [Sig(bib, personid.get(bib, []))
                                for bib in sigs]))
            for sigs, pid in chain(matched_clusters, not_matched_clusters))

        # Worklist: intentionally grows while being iterated.
        todo = clusters.items()
        for pid, clus in todo:
            assert clus.pid == pid

            for paper, sig in clus.sigs.items():
                if sig.isclaimed():
                    if sig.get_claimed() != pid:
                        target_clus = clusters[sig.get_claimed()]

                        if paper in target_clus.sigs:
                            new_clus = Cluster(free_pids.next(), [])
                            # NOTE(review): Cluster defines no __getitem__, so
                            # target_clus[paper] raises TypeError when this
                            # branch runs (likely meant target_clus.sigs[paper]);
                            # also todo holds (pid, clus) tuples, so appending a
                            # bare Cluster breaks the unpacking above — confirm.
                            target_clus.send_sig(new_clus, target_clus[paper])
                            todo.append(new_clus)
                            clusters[new_clus.pid] = new_clus

                        assert paper not in target_clus.sigs
                        clus.send_sig(target_clus, sig)
                elif sig.get_assigned() != pid:
                    if not sig.isrejected(pid):
                        move_signature(sig.bibrefrec, pid)
                    else:
                        move_signature(sig.bibrefrec, free_pids.next())
                else:
                    assert not sig.isrejected(pid)

    update_status_final("Merging done.")
    update_status_final()
    delete_empty_persons()
    update_personID_canonical_names()
def group_sort_edges(cs, original_process_id):
    """
    Partition the edges stored in h5file (one dataset per cluster) into
    plus / minus / value edge cache files for `original_process_id`, then
    sort the value-edge file in place by descending edge value.

    Runs as a helper process spawned by do_wedge; communicates results via
    the wedge_edges_cache_* files and a pickled counts file.
    """
    bibauthor_print("group_sort_edges spowned by %s" % original_process_id)

    plus_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' +
        str(original_process_id), 'w')
    minus_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' +
        str(original_process_id), 'w')
    pairs_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_temp_edges_cache_e_' +
        str(original_process_id), 'w')
    data_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' +
        str(original_process_id), 'w')

    plus_count = 0
    minus_count = 0
    pairs_count = 0
    # Placeholder value written for sign-only (+/-) edges.
    default_val = [0., 0.]

    #gc.disable()
    interval = 1000
    current = -1
    for cl1 in cs.clusters:
        current += 1
        if (current % interval) == 0:
            update_status(
                float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = h5file[str(id(cl1))]
        for bib2 in xrange(len(h5file[str(id(cl1))])):
            val = pointers[bib2]
            #if val[0] not in Bib_matrix.special_numbers:
            #optimization: special numbers are assumed to be negative
            if val[0] >= 0:
                # Value edge; only keep it above the probability cut.
                if val[0] > edge_cut_prob:
                    pairs_count += 1
                    pairs_fp.write(_pack_vals((bib1, bib2, val)))
            elif val[0] == Bib_matrix.special_symbols['+']:
                plus_count += 1
                plus_fp.write(_pack_vals((bib1, bib2, default_val)))
            elif val[0] == Bib_matrix.special_symbols['-']:
                minus_count += 1
                minus_fp.write(_pack_vals((bib1, bib2, default_val)))
            else:
                assert val[0] == Bib_matrix.special_symbols[
                    None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")

    plus_fp.close()
    minus_fp.close()
    pairs_fp.close()

    bibauthor_print(
        "Positive edges: %d, Negative edges: %d, Value edges: %d."
        % (plus_count, minus_count, pairs_count))
    #gc.enable()

    bibauthor_print("Sorting in-file value edges.")
    # Out-of-core sort of the value edges, best edges first.
    sortFileInPlace(bconfig.TORTOISE_FILES_PATH +
                    '/wedge_temp_edges_cache_e_' + str(original_process_id),
                    bconfig.TORTOISE_FILES_PATH +
                    '/wedge_edges_cache_e_' + str(original_process_id),
                    lambda x: _edge_sorting(_unpack_vals(x)), reverse=True)
    os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_temp_edges_cache_e_' +
              str(original_process_id))

    bibauthor_print("Dumping egdes data to file...")
    # Counts let the consumer show progress without re-scanning the files.
    cPickle.dump((plus_count, minus_count, pairs_count), data_fp)
    data_fp.close()
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearranges the cluster_set acoarding to be values in the
    probability_matrix. The deep debug option will produce a lot of output.
    Avoid using it with more than 20 bibs in the cluster set.
    '''
    bib_map = create_bib_2_cluster_dict(cluster_set)

    plus_edges, minus_edges, edges = group_edges(cluster_set)

    # Phase 1: merge clusters connected by certain-positive edges.
    interval = 1000
    for i, (bib1, bib2) in enumerate(plus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(plus_edges),
                          "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            # Re-point all bibs of the absorbed cluster.
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    # Phase 2: mark certain-negative pairs as mutually exclusive.
    interval = 1000
    for i, (bib1, bib2) in enumerate(minus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(minus_edges),
                          "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    # Phase 3: greedy wedge over value edges, best first.
    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=_edge_sorting, reverse=True)

    interval = 500000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, unused) in enumerate(edges):
        if (current % interval) == 0:
            update_status(float(current) / len(edges), "Wedge...")

        assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        idcl1 = cluster_set.clusters.index(cl1)
        idcl2 = cluster_set.clusters.index(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused))

            # _decide weighs the evidence for joining vs. separating.
            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " % (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)
def merge_static():
    '''
    This function merges aidPERSONIDPAPERS with aidRESULTS.
    Use it after tortoise.
    This function is static: if aid* tables are changed while it's running,
    probably everything will crash and a black hole will open, eating all
    your data.
    '''
    last_names = frozenset(name[0].split('.')[0] for name in get_cluster_names())

    def get_free_pids():
        # Endless stream of fresh author ids.
        while True:
            yield get_free_author_id()

    free_pids = get_free_pids()

    # bibrefrec -> [(pid, flag), ...]; kept in sync with the db by the
    # helpers below so repeated lookups stay consistent during the merge.
    current_mapping = get_paper_to_author_and_status_mapping()

    def move_sig_and_update_mapping(sig, old_pid_flag, new_pid_flag):
        # Move in the database first, then mirror the change in the cache.
        move_signature(sig, new_pid_flag[0])
        current_mapping[sig].remove(old_pid_flag)
        current_mapping[sig].append(new_pid_flag)

    def try_move_signature(sig, target_pid):
        """
        Attach signature `sig` to author `target_pid`, unless it is claimed,
        unassigned, already on the target, or the move would conflict with a
        claimed signature. Flag semantics: <= -2 rejected, (-2, 2) assigned,
        >= 2 claimed.
        """
        paps = current_mapping[sig]
        rejected = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p:-2 < p[1] and p[1] < 2, paps)
        claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        # BUGFIX: the original compared the whole (pid, flag) tuple
        # `assigned[0]` with `target_pid` (always False), so signatures
        # already sitting on the target were pointlessly re-moved; compare
        # the pid component instead.
        if claimed or not assigned or assigned[0][0] == target_pid:
            return

        assert len(assigned) == 1

        if rejected:
            newpid = free_pids.next()
            move_sig_and_update_mapping(sig, assigned[0],
                                        (newpid, assigned[0][1]))
        else:
            conflicts = get_signatures_of_paper_and_author(sig, target_pid)
            if not conflicts:
                move_sig_and_update_mapping(sig, assigned[0],
                                            (target_pid, assigned[0][1]))
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    # The conflicting signature is claimed: leave it alone and
                    # move us to a fresh author instead.
                    newpid = free_pids.next()
                    move_sig_and_update_mapping(sig, assigned[0],
                                                (newpid, assigned[0][1]))
                else:
                    # Push the conflicting signature to a fresh author, then
                    # take its place on the target.
                    newpid = free_pids.next()
                    csig = tuple(conflicts[0][:3])
                    move_sig_and_update_mapping(csig,
                                                (target_pid, conflicts[0][3]),
                                                (newpid, conflicts[0][3]))
                    move_sig_and_update_mapping(sig, assigned[0],
                                                (target_pid, assigned[0][1]))

    for idx, last in enumerate(last_names):
        update_status(float(idx) / len(last_names),
                      "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4])
                   for row in get_clusters_by_surname(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d)) for k, d in
                   groupby(sorted(results, key=itemgetter(0)),
                           key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        # (dropped the original's `claim` accumulator: it was filled but
        # never read)
        for k, ds in results:
            pids = []
            for d in ds:
                pid_flag = current_mapping.get(d, [])
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)
            # Histogram: how many sigs of this result cluster sit on each pid.
            matr.append(dict((k, len(list(d)))
                             for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids]
                                        for row in matr])

        matched_clusters = [(results[new_idx][1], old_pids[old_idx])
                            for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(
            imap(itemgetter(0), best_match))
        # Unmatched result clusters get brand-new author ids.
        not_matched_clusters = izip(
            (results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                if sig in current_mapping:
                    if not pid in map(itemgetter(0),
                                      filter(lambda x: x[1] > -2,
                                             current_mapping[sig])):
                        try_move_signature(sig, pid)

    update_status_final()
    remove_empty_authors()
    update_canonical_names_of_authors()
def store(self):
    """Persist the wrapped bib matrix, reporting progress on the console."""
    # Open a status line, delegate the serialization, then close the line.
    update_status(0., "Saving probability matrix...")
    matrix = self._bib_matrix
    matrix.store()
    update_status_final("Probability matrix saved.")
def merge_static_classy():
    '''
    This function merges aidPERSONIDPAPERS with aidRESULTS.
    Use it after tortoise.
    This function is static: if aid* tables are changed while it's running,
    probably everything will crash and a black hole will open, eating all
    your data.
    NOTE: this is more elegant that merge_static but much slower. Will have
    to be improved before it can replace it.
    '''
    class Sig(object):
        # Wraps one signature (bibrefrec) with its (pid, flag) memberships,
        # split by status: rejected (flag <= -2), assigned (-2 < flag < 2),
        # claimed (flag >= 2).
        def __init__(self, bibrefrec, pid_flag):
            self.rejected = dict(filter(lambda p: p[1] <= -2, pid_flag))
            self.assigned = filter(lambda p:-2 < p[1] and p[1] < 2, pid_flag)
            self.claimed = filter(lambda p: 2 <= p[1], pid_flag)
            self.bibrefrec = bibrefrec

            assert self.invariant()

        def invariant(self):
            # A signature belongs to at most one person.
            return len(self.assigned) + len(self.claimed) <= 1

        def empty(self):
            # BUGFIX: the original tested the *bound methods* (always truthy),
            # so empty() was always False and unattached sigs were never
            # filtered out of clusters, later crashing get_assigned(). Call
            # the predicates instead.
            return not self.isclaimed() and not self.isassigned()

        def isclaimed(self):
            return len(self.claimed) == 1

        def get_claimed(self):
            return self.claimed[0][0]

        def get_assigned(self):
            return self.assigned[0][0]

        def isassigned(self):
            return len(self.assigned) == 1

        def isrejected(self, pid):
            return pid in self.rejected

        def change_pid(self, pid):
            # Reassign this signature to `pid`, in memory and in the db.
            assert self.invariant()
            assert self.isassigned()
            self.assigned = [(pid, 0)]
            move_signature(self.bibrefrec, pid)

    class Cluster(object):
        # A person (pid) with its signatures keyed by paper (bibrec).
        def __init__(self, pid, sigs):
            self.pid = pid
            self.sigs = dict((sig.bibrefrec[2], sig)
                             for sig in sigs if not sig.empty())

        def send_sig(self, other, sig):
            # Hand `sig` over to cluster `other`, keeping the db in sync.
            paper = sig.bibrefrec[2]
            assert paper in self.sigs and paper not in other.sigs
            del self.sigs[paper]
            other.sigs[paper] = sig
            if sig.isassigned():
                sig.change_pid(other.pid)

    last_names = frozenset(name[0].split('.')[0]
                           for name in get_cluster_names())
    personid = get_paper_to_author_and_status_mapping()
    free_pids = backinterface_get_free_pids()

    for idx, last in enumerate(last_names):
        update_status(
            float(idx) / len(last_names),
            "Merging, %d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4])
                   for row in get_clusters_by_surname(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d)) for k, d in
                   groupby(sorted(results, key=itemgetter(0)),
                           key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = []
            for d in ds:
                pid_flag = filter(lambda x: x[1] > -2, personid.get(d, []))
                if pid_flag:
                    assert len(pid_flag) == 1
                    pid = pid_flag[0][0]
                    pids.append(pid)
                    old_pids.add(pid)
            # Histogram: how many sigs of this result cluster sit on each pid.
            matr.append(dict((k, len(list(d)))
                             for k, d in groupby(sorted(pids))))

        old_pids = list(old_pids)
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids]
                                        for row in matr])

        # [[bibrefrecs] -> pid]
        matched_clusters = [(results[new_idx][1], old_pids[old_idx])
                            for new_idx, old_idx, _ in best_match]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(
            imap(itemgetter(0), best_match))
        not_matched_clusters = izip(
            (results[i][1] for i in not_matched_clusters), free_pids)

        # pid -> Cluster
        clusters = dict((pid, Cluster(pid, [Sig(bib, personid.get(bib, []))
                                            for bib in sigs]))
                        for sigs, pid in chain(matched_clusters,
                                               not_matched_clusters))

        # Worklist of (pid, cluster); intentionally grows while iterated.
        todo = clusters.items()
        for pid, clus in todo:
            assert clus.pid == pid

            for paper, sig in clus.sigs.items():
                if sig.isclaimed():
                    if sig.get_claimed() != pid:
                        target_clus = clusters[sig.get_claimed()]

                        if paper in target_clus.sigs:
                            new_clus = Cluster(free_pids.next(), [])
                            # BUGFIX: Cluster has no __getitem__, so the
                            # original `target_clus[paper]` raised TypeError;
                            # look the sig up in the .sigs dict.
                            target_clus.send_sig(new_clus,
                                                 target_clus.sigs[paper])
                            # BUGFIX: todo holds (pid, cluster) pairs; the
                            # original appended a bare Cluster, which broke
                            # the tuple unpacking of this very loop.
                            todo.append((new_clus.pid, new_clus))
                            clusters[new_clus.pid] = new_clus

                        assert paper not in target_clus.sigs
                        clus.send_sig(target_clus, sig)
                elif sig.get_assigned() != pid:
                    if not sig.isrejected(pid):
                        move_signature(sig.bibrefrec, pid)
                    else:
                        move_signature(sig.bibrefrec, free_pids.next())
                else:
                    assert not sig.isrejected(pid)

    update_status_final("Merging done.")
    update_status_final()
    remove_empty_authors()
    update_canonical_names_of_authors()
#free = get_free_memory() initial = get_total_memory() free = initial output_killer = open(os.devnull, 'w') ret_status = [None] * len(jobs) bibs = sizs sizs = map(estimator, sizs) free_idxs = range(len(jobs)) assert len(jobs) == len(sizs) == len(ret_status) == len(bibs) == len(free_idxs) done = 0. total = sum(sizs) biggest = max(sizs) update_status(0., "0 / %d" % len(jobs)) too_big = [idx for idx in free_idxs if sizs[idx] > free] for idx in too_big: pid = os.fork() if pid == 0: # child run_job(idx) else: # parent done += sizs[idx] del free_idxs[idx] cpid, status = os.wait() update_status(done / total, "%d / %d" % (len(jobs) - len(free_idxs), len(jobs))) ret_status[idx] = status assert cpid == pid while free_idxs or pid_2_idx: while len(pid_2_idx) < max_workers:
def store(self, name):
    """Persist the wrapped bib matrix under `name`, reporting progress."""
    # Open a status line, delegate the serialization, then close the line.
    update_status(0., "Saving probability matrix...")
    matrix = self._bib_matrix
    matrix.store(name)
    update_status_final("Probability matrix saved.")
def merge_dynamic():
    '''
    This function merges aidPERSONIDPAPERS with aidRESULTS.
    Use it after tortoise.
    This function is dynamic: it allows aid* tables to be changed while it
    is still running, hence the claiming faciity for example can stay online
    during the merge. This comfort however is paid off in term of speed.
    '''
    last_names = frozenset(name[0].split('.')[0]
                           for name in get_cluster_names())

    def get_free_pids():
        # Endless stream of fresh author ids.
        while True:
            yield get_free_author_id()

    free_pids = get_free_pids()

    def try_move_signature(sig, target_pid):
        """
        Attach signature `sig` to author `target_pid`, unless it is claimed,
        unassigned, rejected from the target, or the move would conflict
        with a claimed signature. Re-reads the db each call (dynamic mode).
        Flag semantics: <= -2 rejected, (-2, 2) assigned, >= 2 claimed.
        """
        paps = get_ordered_author_and_status_of_signature(sig)
        rejected = filter(lambda p: p[1] <= -2, paps)
        assigned = filter(lambda p:-2 < p[1] and p[1] < 2, paps)
        claimed = filter(lambda p: 2 <= p[1] and p[0] == target_pid, paps)

        # NOTE(review): `assigned[0]` is a (pid, flag) tuple, so comparing it
        # to `target_pid` is always False — the "already on target" early
        # return never fires. Probably meant assigned[0][0] — confirm.
        if claimed or not assigned or assigned[0] == target_pid:
            return

        assert len(assigned) == 1

        if int(target_pid) in [int(x[0]) for x in rejected]:
            # Target has rejected this signature: park it on a new author.
            move_signature(sig, free_pids.next())
        else:
            conflicts = get_signatures_of_paper_and_author(sig, target_pid)
            if not conflicts:
                move_signature(sig, target_pid)
            else:
                assert len(conflicts) == 1
                if conflicts[0][3] == 2:
                    # Conflicting signature is claimed: move us away instead.
                    move_signature(sig, free_pids.next())
                else:
                    # Push the conflicting signature aside, take its place.
                    move_signature(conflicts[0][:3], free_pids.next())
                    move_signature(sig, target_pid)

    for idx, last in enumerate(last_names):
        update_status(float(idx) / len(last_names),
                      "%d/%d current: %s" % (idx, len(last_names), last))

        results = ((int(row[0].split(".")[1]), row[1:4])
                   for row in get_clusters_by_surname(last))

        # [(last name number, [bibrefrecs])]
        results = [(k, map(itemgetter(1), d)) for k, d in
                   groupby(sorted(results, key=itemgetter(0)),
                           key=itemgetter(0))]

        # List of dictionaries.
        # [{new_pid -> N}]
        matr = []

        # Set of all old pids.
        old_pids = set()

        for k, ds in results:
            pids = list()
            for d in ds:
                pid_flag = get_author_and_status_of_confirmed_paper(d)
                if pid_flag:
                    pid, flag = pid_flag[0]
                    pids.append(pid)
                    old_pids.add(pid)
            # Histogram: how many sigs of this result cluster sit on each pid.
            matr.append(dict((k, len(list(d)))
                             for k, d in groupby(sorted(pids))))

        # We cast it to list in order to ensure the order persistence.
        old_pids = list(old_pids)
        #best_match = cluster,pid_idx,n
        best_match = maximized_mapping([[row.get(old, 0) for old in old_pids]
                                        for row in matr])

        # Only matches with a positive score count; the rest get new ids.
        matched_clusters = [(results[new_idx][1], old_pids[old_idx])
                            for new_idx, old_idx, score in best_match
                            if score > 0]
        not_matched_clusters = frozenset(xrange(len(results))) - frozenset(
            imap(itemgetter(0), [x for x in best_match if x[2] > 0]))
        not_matched_clusters = izip(
            (results[i][1] for i in not_matched_clusters), free_pids)

        for sigs, pid in chain(matched_clusters, not_matched_clusters):
            for sig in sigs:
                try_move_signature(sig, pid)

    update_status_final()
    remove_empty_authors()
    update_canonical_names_of_authors()
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearrange the cluster_set according to the values in the probability
    matrix.

    The edges are grouped and sorted by a child process which exchanges
    its results through PID-keyed cache files; this function then applies
    them in three passes: obvious joins ('+' edges), obvious separations
    ('-' edges), and the value-sorted wedge loop.

    The deep_debug option will produce a lot of output. Avoid using it
    with more than 20 bibs in the cluster set.

    @param cluster_set: the cluster set to rearrange in place
    @param deep_debug: dump a .dot graph snapshot per processed edge
    '''
    bib_map = create_bib_2_cluster_dict(cluster_set)
    original_process_id = PID()

    def cache_path(kind):
        # All cache files share the same prefix; build it in one place
        # instead of concatenating the path eight times.
        return (bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_' +
                kind + '_' + str(original_process_id))

    # Group/sort the edges in a child process; files carry the results.
    p = Process(target=group_sort_edges, args=(cluster_set, original_process_id))
    p.start()
    p.join()

    plus_edges_fp = open(cache_path('p'), 'r')
    minus_edges_fp = open(cache_path('m'), 'r')
    edges_fp = open(cache_path('e'), 'r')
    data_fp = open(cache_path('data'), 'r')
    len_plus, len_minus, len_edges = cPickle.load(data_fp)
    data_fp.close()  # closed here once; was previously closed a second time below

    # Pass 1: '+' edges — merge clusters that obviously belong together.
    interval = 1000
    for i, s in enumerate(plus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_plus, "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    # Pass 2: '-' edges — record mutual exclusions.
    interval = 1000
    for i, s in enumerate(minus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_minus, "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    # Pass 3: value edges, already sorted best-first by the child process.
    interval = 50000
    wedge_print("Wedge: New wedge, %d edges." % len_edges)
    current = -1
    for s in edges_fp:
        v1, v2, unused = _unpack_vals(s)
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len_edges, "Wedge...")
        assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        # Use object ids instead of list indices to avoid O(n) lookups.
        idcl1 = id(cl1)
        idcl2 = id(cl2)
        # Keep the ids ordered (low first) so joins are deterministic.
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1
        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1]))
        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused))
            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " % (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)

    plus_edges_fp.close()
    minus_edges_fp.close()
    edges_fp.close()
    # Fix: the original closed data_fp a second time here (it was already
    # closed right after cPickle.load above).
    try:
        for kind in ('p', 'm', 'e', 'data'):
            os.remove(cache_path(kind))
    except OSError:
        # Best-effort cleanup, but only swallow file-system errors instead
        # of a bare except that would also hide programming errors.
        pass
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None):
    '''
    Match each paper's author references against the stored signatures,
    moving/creating authors as needed.

    NOTE(review): this module defines `rabbit` more than once; a later
    definition overrides this one at import time — confirm which version
    is intended to survive.

    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    if bconfig.RABBIT_USE_CACHED_PID:
        # In-memory name -> pid cache, kept consistent by the wrappers below.
        PID_NAMES_CACHE = get_name_string_to_pid_dictionary()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            # Keep the cache in sync before persisting the signature.
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_new_personid()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    # Memoized symmetric name comparison.
    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_bibrecs()
        if not bibrecs:
            bibrecs = all_bibrecs
        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    # Use the MARC caches only when the batch is big enough to amortize
    # the cache-population cost.
    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
            len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        task_sleep_now_if_required(True)
        update_status(
            float(idx) / len(bibrecs),
            "%d/%d current: %d" % (idx, len(bibrecs), rec))

        if rec in deleted:
            delete_paper_from_personid(rec)
            continue

        # (MARC tag, bibref) pairs currently on the record: 100 = first
        # author field, 700 = additional authors.
        markrefs = frozenset(
            chain(
                izip(cycle([100]), imap(itemgetter(0), get_authors_from_paper(rec))),
                izip(cycle([700]), imap(itemgetter(0), get_coauthors_from_paper(rec)))))

        # Rows: [pid, tag, bibref, name] for signatures already stored.
        personid_rows = [
            map(int, row[:3]) + [row[4]] for row in get_signatures_from_rec(rec)
        ]
        personidrefs_names = dict(
            ((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        # Signatures on the record but not yet stored / stored but gone.
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict(
            (new, create_normalized_name(
                split_name_parts(get_name_by_bibrecref(new))))
            for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[
            compare_names(new_signatures_names[new], personidrefs_names[old])
            for old in old_signatures
        ] for new in new_signatures]

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]
        for new, old in best_match:
            modify_signature(old, rec, new, new_signatures_names[new])

        # Everything left in old_signatures was not rescued by the matching.
        remove_sigs(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(
            map(itemgetter(0), best_match))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

            for sig in not_matched:
                name = new_signatures_names[sig]
                matched_pids = []
                if USE_EXT_IDS:
                    # Try external-id (INSPIRE) based matching first.
                    if USE_INSPIREID:
                        inspire_id = get_inspire_id(sig + (rec, ))
                        if inspire_id:
                            matched_pids = list(
                                get_person_with_extid(inspire_id[0]))
                    if matched_pids:
                        add_signature(list(sig) + [rec], name, matched_pids[0][0])
                        updated_pids.add(matched_pids[0][0])
                        continue

                # Fall back to exact-name matching, excluding pids already
                # used on this record.
                matched_pids = find_pids_by_exact_name(name)
                matched_pids = [
                    p for p in matched_pids if int(p[0]) not in used_pids
                ]

                if not matched_pids:
                    new_pid = new_person_from_signature(list(sig) + [rec], name)
                    used_pids.add(new_pid)
                    updated_pids.add(new_pid)
                else:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    used_pids.add(matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids

    if updated_pids:  # an empty set will update all canonical_names
        update_personID_canonical_names(updated_pids)
        update_personID_external_ids(
            updated_pids,
            limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()
def group_sort_edges(cs, original_process_id):
    '''
    Partition all edges of the cluster set into '+', '-' and value edges,
    writing each group to a PID-keyed cache file, then sort the value
    edges best-first into their final cache file. Edge counts are pickled
    into the data cache file so the parent process (do_wedge) can read
    them back.

    Intended to run in a child process spawned by do_wedge.

    @param cs: the cluster set whose edges are grouped
    @param original_process_id: PID of the parent, used to key the files
    '''
    # Fix: log message typo "spowned" -> "spawned".
    bibauthor_print("group_sort_edges spawned by %s" % original_process_id)

    def cache_path(prefix, kind):
        # Single place for the cache-file naming scheme.
        return (bconfig.TORTOISE_FILES_PATH + '/' + prefix + kind +
                '_' + str(original_process_id))

    plus_fp = open(cache_path('wedge_edges_cache_', 'p'), 'w')
    minus_fp = open(cache_path('wedge_edges_cache_', 'm'), 'w')
    pairs_fp = open(cache_path('wedge_temp_edges_cache_', 'e'), 'w')
    data_fp = open(cache_path('wedge_edges_cache_', 'data'), 'w')

    plus_count = 0
    minus_count = 0
    pairs_count = 0
    # Placeholder value written for signed edges (real value irrelevant).
    default_val = [0., 0.]

    interval = 1000
    for current, cl1 in enumerate(cs.clusters):
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Grouping all edges...")

        bib1 = tuple(cl1.bibs)[0]
        pointers = h5file[str(id(cl1))]
        # Fix: reuse the dataset fetched above instead of performing a
        # second h5file lookup just to get its length.
        for bib2 in xrange(len(pointers)):
            val = pointers[bib2]
            # Optimization: special numbers are assumed to be negative,
            # so a plain sign test replaces the membership check against
            # Bib_matrix.special_numbers.
            if val[0] >= 0:
                if val[0] > edge_cut_prob:
                    pairs_count += 1
                    pairs_fp.write(_pack_vals((bib1, bib2, val)))
            elif val[0] == Bib_matrix.special_symbols['+']:
                plus_count += 1
                plus_fp.write(_pack_vals((bib1, bib2, default_val)))
            elif val[0] == Bib_matrix.special_symbols['-']:
                minus_count += 1
                minus_fp.write(_pack_vals((bib1, bib2, default_val)))
            else:
                assert val[0] == Bib_matrix.special_symbols[None], "Invalid Edge"

    update_status_final("Finished with the edge grouping.")
    plus_fp.close()
    minus_fp.close()
    pairs_fp.close()
    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d."
                    % (plus_count, minus_count, pairs_count))

    bibauthor_print("Sorting in-file value edges.")
    sortFileInPlace(cache_path('wedge_temp_edges_cache_', 'e'),
                    cache_path('wedge_edges_cache_', 'e'),
                    lambda x: _edge_sorting(_unpack_vals(x)),
                    reverse=True)
    os.remove(cache_path('wedge_temp_edges_cache_', 'e'))

    # Fix: log message typo "egdes" -> "edges".
    bibauthor_print("Dumping edges data to file...")
    cPickle.dump((plus_count, minus_count, pairs_count), data_fp)
    data_fp.close()
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None, verbose=False):
    '''
    Match each paper's author references against the stored signatures,
    moving/creating authors as needed.

    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    # NOTE(review): logfile is opened and written once here but is never
    # closed and never used by logwrite below (which goes through
    # write_message) — looks like leftover debugging; confirm and remove.
    logfile = open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w')
    logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs)))

    def logwrite(msg, is_error):
        # Errors (or verbose mode) are logged at high priority (1),
        # everything else at low priority (9).
        verb = 9
        if is_error or verbose:
            verb = 1
        write_message(msg, verbose=verb)

    if bconfig.RABBIT_USE_CACHED_PID:
        # In-memory name -> pids cache, kept consistent by the wrappers below.
        PID_NAMES_CACHE = get_name_to_authors_mapping()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            # Keep the cache in sync before persisting the signature.
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_free_author_id()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    # Memoized symmetric name comparison.
    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_papers()
        if not bibrecs:
            bibrecs = all_bibrecs
        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    # Use the MARC caches only when the batch is big enough to amortize
    # the cache-population cost.
    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
            len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        logwrite("\nConsidering %s" % str(rec), False)

        # Throttle housekeeping/progress reporting to every 200 records.
        if idx % 200 == 0:
            task_sleep_now_if_required(True)
            update_status(float(idx) / len(bibrecs),
                          "%d/%d current: %d" % (idx, len(bibrecs), rec))
            task_update_progress("%d/%d current: %d" % (idx, len(bibrecs), rec))

        if rec in deleted:
            logwrite(" - Record was deleted, removing from pid and continuing with next record", True)
            remove_papers([rec])
            continue

        # (MARC tag, bibref) pairs currently on the record: 100 = first
        # author field, 700 = additional authors.
        markrefs = frozenset(chain(izip(cycle([100]),
                                        imap(itemgetter(0), get_author_refs_of_paper(rec))),
                                   izip(cycle([700]),
                                        imap(itemgetter(0), get_coauthor_refs_of_paper(rec)))))

        # Rows: [pid, tag, bibref, name] for signatures already stored.
        personid_rows = [map(int, row[:3]) + [row[4]]
                         for row in get_signatures_of_paper(rec)]
        personidrefs_names = dict(((row[1], row[2]), row[3])
                                  for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        # Signatures on the record but not yet stored / stored but gone.
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict((new, create_normalized_name(split_name_parts(get_name_by_bibref(new))))
                                    for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[compare_names(new_signatures_names[new], personidrefs_names[old])
                   for old in old_signatures] for new in new_signatures]

        logwrite(" - Old signatures: %s" % str(old_signatures), bool(old_signatures))
        logwrite(" - New signatures: %s" % str(new_signatures), bool(new_signatures))
        logwrite(" - Matrix: %s" % str(matrix), bool(matrix))

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]

        logwrite(" - Best match: %s " % str(best_match), bool(best_match))

        for new, old in best_match:
            logwrite(" - - Moving signature: %s on %s to %s as %s" % (old, rec, new, new_signatures_names[new]), True)
            modify_signature(old, rec, new, new_signatures_names[new])

        # Everything left in old_signatures was not rescued by the matching.
        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(map(itemgetter(0), best_match))

        # Pids that already own a signature on this record — a pid must
        # not appear twice on the same paper.
        pids_having_rec = set([int(row[0]) for row in get_signatures_of_paper(rec)])
        logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

            for sig in not_matched:
                name = new_signatures_names[sig]
                matched_pids = list()
                if USE_EXT_IDS:
                    # Try external-id (INSPIRE) based matching first.
                    if USE_INSPIREID:
                        inspire_id = get_inspire_id_of_signature(sig + (rec,))
                        if inspire_id:
                            matched_pids = list(get_author_by_external_id(inspire_id[0]))
                            # Reject the ext-id match if that pid is
                            # already on this paper.
                            if matched_pids and int(matched_pids[0][0]) in pids_having_rec:
                                matched_pids = list()
                    if matched_pids:
                        add_signature(list(sig) + [rec], name, matched_pids[0][0])
                        updated_pids.add(matched_pids[0][0])
                        pids_having_rec.add(matched_pids[0][0])
                        continue

                # Fall back to exact-name matching, excluding pids already
                # used on this record.
                matched_pids = find_pids_by_exact_name(name)
                matched_pids = [p for p in matched_pids if int(p[0]) not in used_pids]

                if not matched_pids or int(matched_pids[0][0]) in pids_having_rec:
                    new_pid = new_person_from_signature(list(sig) + [rec], name)
                    used_pids.add(new_pid)
                    updated_pids.add(new_pid)
                else:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    used_pids.add(matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])

        logwrite('Finished with %s' % str(rec), False)

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids

    if updated_pids:  # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(updated_pids,
                                       limit_to_claimed_papers=bconfig.LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()

    remove_empty_authors()
def tortoise_coefficient_statistics(pickle_output=None, generate_graphs=True):
    '''
    Aggregate the per-run cluster status reports found under /tmp/baistats/
    into per-coefficient and per-cluster statistics, optionally plotting
    them and/or pickling the aggregated result.

    @param pickle_output: path to dump the aggregated stats to (optional)
    @param generate_graphs: also render SVG/PNG plots via _gen_plot
    '''
    override_stdout_config(stdout=True)

    files = ['/tmp/baistats/'+x for x in os.listdir('/tmp/baistats/') if x.startswith('cluster_status_report_pid')]
    fnum = float(len(files))
    # Fraction of the progress bar allotted to each file's sub-steps.
    # NOTE(review): quanta = .1/fnum raises ZeroDivisionError when the
    # directory holds no reports — confirm whether that case can occur.
    quanta = .1/fnum

    total_stats = 0
    used_coeffs = set()
    used_clusters = set()

    # av_counter, avg, min, max, nclus, normalized_avg
    cluster_stats = defaultdict(lambda : defaultdict(lambda : [0.,0.,0.,0.,0.,0.]))
    coeff_stats = defaultdict(lambda : [0.,0.,0.,0.,0.,0.])

    def gen_graphs(only_synthetic=False):
        # Plot the global coefficient graph; with only_synthetic=False
        # also plot one graph per cluster (surname).
        update_status(0, 'Generating coefficients graph...')
        _gen_plot(coeff_stats, '/tmp/graphs/AAAAA-coefficients.svg')
        if not only_synthetic:
            cn = cluster_stats.keys()
            l = float(len(cn))
            for i,c in enumerate(cn):
                update_status(i/l, 'Generating name graphs... %s' % str(c))
                _gen_plot(cluster_stats[c], '/tmp/graphs/CS-%s.png' % str(c))

    for i,fi in enumerate(files):
        if generate_graphs:
            # Refresh the synthetic graph every 1000 files (including at
            # i == 0, before any data is accumulated).
            if i%1000 ==0:
                gen_graphs(True)

        # NOTE(review): plain open/close without try/finally — a failing
        # SER.load leaks the handle; consider a `with` block.
        f = open(fi,'r')
        status = i/fnum
        update_status(status, 'Loading '+ fi[fi.find('lastname')+9:])
        contents = SER.load(f)
        f.close()

        # Report layout (as used below): [coef, cluster-name, rows, maxlen]
        # where rows are (cluster-id, ?, coefficient) triples.
        cur_coef = contents[0]
        cur_clust = contents[1]
        cur_maxlen = float(contents[3])

        if cur_coef:
            total_stats += 1
            used_coeffs.add(cur_coef)
            used_clusters.add(cur_clust)

            update_status(status+0.2*quanta, ' Computing averages...')

            cur_clen = len(contents[2])
            cur_coeffs = [x[2] for x in contents[2]]
            cur_clustnumber = float(len(set([x[0] for x in contents[2]])))

            assert cur_clustnumber > 0 and cur_clustnumber < cur_maxlen, "Error, found log with strange clustnumber! %s %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_maxlen), str(cur_clustnumber))

            if cur_coeffs:
                assert len(cur_coeffs) == cur_clen and cur_coeffs, "Error, there is a cluster witohut stuff? %s %s %s"% (str(cur_clust), str(cur_coef), str(cur_coeffs))
                assert all([x >= 0 and x <= 1 for x in cur_coeffs]), "Error, a coefficient is wrong here! Check me! %s %s %s" % (str(cur_clust), str(cur_coef), str(cur_coeffs))

                cur_min = min(cur_coeffs)
                cur_max = max(cur_coeffs)
                cur_avg = sum(cur_coeffs)/cur_clen

                # Fold this file into the running per-coefficient stats
                # (incremental mean update: new = (old*n + x)/(n+1)).
                update_status(status+0.4*quanta, ' comulative per coeff...')
                avi = coeff_stats[cur_coef][0]
                #number of points
                coeff_stats[cur_coef][0] = avi+1
                #average of coefficients
                coeff_stats[cur_coef][1] = (coeff_stats[cur_coef][1]*avi + cur_avg)/(avi+1)
                #min coeff
                coeff_stats[cur_coef][2] = min(coeff_stats[cur_coef][2], cur_min)
                #max coeff
                coeff_stats[cur_coef][3] = max(coeff_stats[cur_coef][3], cur_max)
                #avg number of clusters
                coeff_stats[cur_coef][4] = (coeff_stats[cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                #normalized avg number of clusters
                coeff_stats[cur_coef][5] = (coeff_stats[cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)

                # Same running aggregation, but per (cluster, coefficient).
                update_status(status+0.6*quanta, ' comulative per cluster per coeff...')
                avi = cluster_stats[cur_clust][cur_coef][0]
                cluster_stats[cur_clust][cur_coef][0] = avi+1
                cluster_stats[cur_clust][cur_coef][1] = (cluster_stats[cur_clust][cur_coef][1]*avi + cur_avg)/(avi+1)
                cluster_stats[cur_clust][cur_coef][2] = min(cluster_stats[cur_clust][cur_coef][2], cur_min)
                cluster_stats[cur_clust][cur_coef][3] = max(cluster_stats[cur_clust][cur_coef][3], cur_max)
                cluster_stats[cur_clust][cur_coef][4] = (cluster_stats[cur_clust][cur_coef][4]*avi + cur_clustnumber)/(avi+1)
                cluster_stats[cur_clust][cur_coef][5] = (cluster_stats[cur_clust][cur_coef][5]*avi + cur_clustnumber/cur_maxlen)/(avi+1)

    update_status_final('Done!')

    if generate_graphs:
        gen_graphs()

    if pickle_output:
        update_status(0,'Dumping to file...')
        f = open(pickle_output,'w')
        # Convert the nested defaultdicts to plain dicts before pickling.
        SER.dump({'cluster_stats':dict((x,dict(cluster_stats[x])) for x in cluster_stats.iterkeys()), 'coeff_stats':dict((coeff_stats))}, f)
        f.close()
initial = get_total_memory() free = initial output_killer = open(os.devnull, 'w') ret_status = [None] * len(jobs) bibs = sizs sizs = map(estimator, sizs) free_idxs = range(len(jobs)) assert len(jobs) == len(sizs) == len(ret_status) == len(bibs) == len( free_idxs) done = 0. total = sum(sizs) biggest = max(sizs) update_status(0., "0 / %d" % len(jobs)) too_big = [idx for idx in free_idxs if sizs[idx] > free] for idx in too_big: pid = os.fork() if pid == 0: # child run_job(idx) else: # parent done += sizs[idx] del free_idxs[idx] cpid, status = os.wait() update_status(done / total, "%d / %d" % (len(jobs) - len(free_idxs), len(jobs))) ret_status[idx] = status assert cpid == pid while free_idxs or pid_2_idx:
def rabbit(bibrecs, check_invalid_papers=False, personids_to_update_extids=None, verbose=False):
    '''
    Match each paper's author references against the stored signatures,
    moving/creating authors as needed.

    NOTE(review): this appears to be a reformatted duplicate of an earlier
    `rabbit` definition in this module; the later definition wins at
    import time — confirm the duplication is intentional.

    @param bibrecs: an iterable full of bibrecs
    @type bibrecs: an iterable of ints
    @return: none
    '''
    # NOTE(review): logfile is opened and written once here but is never
    # closed and never used by logwrite below (which goes through
    # write_message) — looks like leftover debugging; confirm and remove.
    logfile = open('/tmp/RABBITLOG-%s' % str(now()).replace(" ", "_"), 'w')
    logfile.write("RABBIT %s running on %s \n" % (str(now()), str(bibrecs)))

    def logwrite(msg, is_error):
        # Errors (or verbose mode) are logged at high priority (1),
        # everything else at low priority (9).
        verb = 9
        if is_error or verbose:
            verb = 1
        write_message(msg, verbose=verb)

    if bconfig.RABBIT_USE_CACHED_PID:
        # In-memory name -> pids cache, kept consistent by the wrappers below.
        PID_NAMES_CACHE = get_name_to_authors_mapping()

        def find_pids_by_exact_names_cache(name):
            try:
                return zip(PID_NAMES_CACHE[name])
            except KeyError:
                return []

        def add_signature_using_names_cache(sig, name, pid):
            # Keep the cache in sync before persisting the signature.
            try:
                PID_NAMES_CACHE[name].add(pid)
            except KeyError:
                PID_NAMES_CACHE[name] = set([pid])
            _add_signature(sig, name, pid)

        def new_person_from_signature_using_names_cache(sig, name):
            pid = get_free_author_id()
            add_signature_using_names_cache(sig, name, pid)
            return pid

        add_signature = add_signature_using_names_cache
        new_person_from_signature = new_person_from_signature_using_names_cache
        find_pids_by_exact_name = find_pids_by_exact_names_cache
    else:
        add_signature = _add_signature
        new_person_from_signature = _new_person_from_signature
        find_pids_by_exact_name = _find_pids_by_exact_name

    # Memoized symmetric name comparison.
    compare_names = cached_sym(lambda x: x)(comp_names)
    # fast assign threshold
    threshold = 0.80

    if not bibrecs or check_invalid_papers:
        all_bibrecs = get_all_valid_papers()
        if not bibrecs:
            bibrecs = all_bibrecs
        if check_invalid_papers:
            filter_bibrecs_outside(all_bibrecs)

    # Use the MARC caches only when the batch is big enough to amortize
    # the cache-population cost.
    if (bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS and
            len(bibrecs) > bconfig.RABBIT_USE_CACHED_GET_GROUPED_RECORDS_THRESHOLD):
        populate_partial_marc_caches()
        SWAPPED_GET_GROUPED_RECORDS = True
    else:
        SWAPPED_GET_GROUPED_RECORDS = False

    updated_pids = set()
    deleted = frozenset(p[0] for p in get_deleted_papers())

    for idx, rec in enumerate(bibrecs):
        logwrite("\nConsidering %s" % str(rec), False)

        # Throttle housekeeping/progress reporting to every 200 records.
        if idx % 200 == 0:
            task_sleep_now_if_required(True)
            update_status(
                float(idx) / len(bibrecs),
                "%d/%d current: %d" % (idx, len(bibrecs), rec))
            task_update_progress("%d/%d current: %d" %
                                 (idx, len(bibrecs), rec))

        if rec in deleted:
            logwrite(
                " - Record was deleted, removing from pid and continuing with next record",
                True)
            remove_papers([rec])
            continue

        # (MARC tag, bibref) pairs currently on the record: 100 = first
        # author field, 700 = additional authors.
        markrefs = frozenset(
            chain(
                izip(cycle([100]),
                     imap(itemgetter(0), get_author_refs_of_paper(rec))),
                izip(cycle([700]),
                     imap(itemgetter(0), get_coauthor_refs_of_paper(rec)))))

        # Rows: [pid, tag, bibref, name] for signatures already stored.
        personid_rows = [
            map(int, row[:3]) + [row[4]]
            for row in get_signatures_of_paper(rec)
        ]
        personidrefs_names = dict(
            ((row[1], row[2]), row[3]) for row in personid_rows)

        personidrefs = frozenset(personidrefs_names.keys())
        # Signatures on the record but not yet stored / stored but gone.
        new_signatures = list(markrefs - personidrefs)
        old_signatures = list(personidrefs - markrefs)

        new_signatures_names = dict(
            (new,
             create_normalized_name(split_name_parts(get_name_by_bibref(new))))
            for new in new_signatures)

        # matrix |new_signatures| X |old_signatures|
        matrix = [[
            compare_names(new_signatures_names[new], personidrefs_names[old])
            for old in old_signatures
        ] for new in new_signatures]

        logwrite(" - Old signatures: %s" % str(old_signatures),
                 bool(old_signatures))
        logwrite(" - New signatures: %s" % str(new_signatures),
                 bool(new_signatures))
        logwrite(" - Matrix: %s" % str(matrix), bool(matrix))

        # [(new_signatures, old_signatures)]
        best_match = [(new_signatures[new], old_signatures[old])
                      for new, old, score in maximized_mapping(matrix)
                      if score > threshold]

        logwrite(" - Best match: %s " % str(best_match), bool(best_match))

        for new, old in best_match:
            logwrite(
                " - - Moving signature: %s on %s to %s as %s" %
                (old, rec, new, new_signatures_names[new]), True)
            modify_signature(old, rec, new, new_signatures_names[new])

        # Everything left in old_signatures was not rescued by the matching.
        remove_signatures(tuple(list(old) + [rec]) for old in old_signatures)

        not_matched = frozenset(new_signatures) - frozenset(
            map(itemgetter(0), best_match))

        # Pids that already own a signature on this record — a pid must
        # not appear twice on the same paper.
        pids_having_rec = set(
            [int(row[0]) for row in get_signatures_of_paper(rec)])
        logwrite(" - Not matched: %s" % str(not_matched), bool(not_matched))

        if not_matched:
            used_pids = set(r[0] for r in personid_rows)

            for sig in not_matched:
                name = new_signatures_names[sig]
                matched_pids = list()
                if USE_EXT_IDS:
                    # Try external-id (INSPIRE) based matching first.
                    if USE_INSPIREID:
                        inspire_id = get_inspire_id_of_signature(sig + (rec, ))
                        if inspire_id:
                            matched_pids = list(
                                get_author_by_external_id(inspire_id[0]))
                            # Reject the ext-id match if that pid is
                            # already on this paper.
                            if matched_pids and int(
                                    matched_pids[0][0]) in pids_having_rec:
                                matched_pids = list()
                    if matched_pids:
                        add_signature(list(sig) + [rec], name,
                                      matched_pids[0][0])
                        updated_pids.add(matched_pids[0][0])
                        pids_having_rec.add(matched_pids[0][0])
                        continue

                # Fall back to exact-name matching, excluding pids already
                # used on this record.
                matched_pids = find_pids_by_exact_name(name)
                matched_pids = [
                    p for p in matched_pids if int(p[0]) not in used_pids
                ]

                if not matched_pids or int(matched_pids[0][0]) in pids_having_rec:
                    new_pid = new_person_from_signature(list(sig) + [rec],
                                                        name)
                    used_pids.add(new_pid)
                    updated_pids.add(new_pid)
                else:
                    add_signature(list(sig) + [rec], name, matched_pids[0][0])
                    used_pids.add(matched_pids[0][0])
                    updated_pids.add(matched_pids[0][0])
                    pids_having_rec.add(matched_pids[0][0])

        logwrite('Finished with %s' % str(rec), False)

    update_status_final()

    if personids_to_update_extids:
        updated_pids |= personids_to_update_extids

    if updated_pids:  # an empty set will update all canonical_names
        update_canonical_names_of_authors(updated_pids)
        update_external_ids_of_authors(
            updated_pids,
            limit_to_claimed_papers=bconfig.
            LIMIT_EXTERNAL_IDS_COLLECTION_TO_CLAIMED_PAPERS)

    if SWAPPED_GET_GROUPED_RECORDS:
        destroy_partial_marc_caches()

    remove_empty_authors()
def recalculate(self, cluster_set):
    '''
    Constructs probability matrix. If use_cache is true, it will
    try to load old computations from the database. If save cache
    is true it will save the current results into the database.

    @param cluster_set: A cluster set object, used to initialize the matrix.
    '''
    last_cleaned = 0

    # Persist the current matrix, then try to duplicate it as a read-only
    # "old" copy so previously computed values can be reused below.
    self._bib_matrix.store()
    try:
        old_matrix = Bib_matrix(self._bib_matrix.name+'copy')
        old_matrix.duplicate_existing(self._bib_matrix.name, self._bib_matrix.name+'copy')
        old_matrix.load()
        cached_bibs = self.__get_up_to_date_bibs(old_matrix)
        have_cached_bibs = bool(cached_bibs)
    except IOError:
        # No usable cached copy: fall back to full recomputation.
        old_matrix.destroy()
        cached_bibs = None
        have_cached_bibs = False

    # Start from a fresh matrix sized for the current cluster set.
    self._bib_matrix.destroy()
    self._bib_matrix = Bib_matrix(cluster_set.last_name, cluster_set=cluster_set)

    ncl = cluster_set.num_all_bibs
    # Expected number of pairwise comparisons: n*(n-1)/2 (min 1 to avoid
    # division by zero in the progress computation).
    expected = ((ncl * (ncl - 1)) / 2)
    if expected == 0:
        expected = 1

    try:
        # cur_calc: freshly computed entries; opti: entries reused from
        # the cached matrix; prints_counter: throttle for status output.
        cur_calc, opti, prints_counter = 0, 0, 0
        for cl1 in cluster_set.clusters:
            if cur_calc+opti - prints_counter > 100000 or cur_calc == 0:
                update_status((float(opti) + cur_calc) / expected, "Prob matrix: calc %d, opti %d." % (cur_calc, opti))
                prints_counter = cur_calc+opti

            # #clean caches
            if cur_calc - last_cleaned > 20000000:
                gc.collect()
                # clear_comparison_caches()
                last_cleaned = cur_calc

            for cl2 in cluster_set.clusters:
                # id(cl1) < id(cl2) visits each unordered cluster pair once;
                # hating clusters are known-distinct and skipped.
                if id(cl1) < id(cl2) and not cl1.hates(cl2):
                    for bib1 in cl1.bibs:
                        for bib2 in cl2.bibs:
                            if have_cached_bibs:
                                try:
                                    val = old_matrix[bib1, bib2]
                                    opti += 1
                                    if bconfig.DEBUG_CHECKS:
                                        assert _debug_is_eq_v(val, compare_bibrefrecs(bib1, bib2))
                                except KeyError:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                                # A falsy cached/computed value is not
                                # trusted and triggers a recomputation.
                                # NOTE(review): when the KeyError branch
                                # ran and produced a falsy value, this
                                # recomputes the same pair a second time —
                                # presumably redundant; confirm intent.
                                if not val:
                                    cur_calc += 1
                                    val = compare_bibrefrecs(bib1, bib2)
                            else:
                                cur_calc += 1
                                val = compare_bibrefrecs(bib1, bib2)

                            self._bib_matrix[bib1, bib2] = val
    except Exception, e:
        # Re-raise with the offending value for post-mortem debugging.
        # NOTE(review): `val` is unbound if the failure happens before the
        # first comparison — this handler would then raise NameError.
        raise Exception("""Error happened in prob_matrix.recalculate with val:%s original_exception: %s """%(str(val),str(e)))