def group_edges(cs):
    """Partition all out-edges of the cluster set into three groups.

    Returns (plus, minus, pairs):
      plus  - (bib1, bib2) pairs marked with the '+' special symbol,
      minus - (bib1, bib2) pairs marked with the '-' special symbol,
      pairs - (bib1, bib2, val) value edges whose probability exceeds
              the module-level edge_cut_prob threshold.
    """
    plus = []
    minus = []
    pairs = []
    gc.disable()  # avoid GC pauses while appending to large edge lists
    interval = 1000
    for current, cl1 in enumerate(cs.clusters):
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Grouping all edges...")
        # one representative bib per cluster is enough to identify the row
        bib1 = tuple(cl1.bibs)[0]
        pointers = cl1.out_edges
        for bib2 in xrange(len(cl1.out_edges)):
            val = pointers[bib2]
            if val[0] not in Bib_matrix.special_numbers:
                # plain probability edge; keep only those above the cut
                if val[0] > edge_cut_prob:
                    pairs.append((bib1, bib2, val))
            elif val[0] == Bib_matrix.special_symbols['+']:
                plus.append((bib1, bib2))
            elif val[0] == Bib_matrix.special_symbols['-']:
                minus.append((bib1, bib2))
            else:
                # only the None sentinel is allowed beyond '+'/'-'
                assert val[0] == Bib_matrix.special_symbols[None], "Invalid Edge"
    update_status_final("Finished with the edge grouping.")
    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d." %
                    (len(plus), len(minus), len(pairs)))
    gc.enable()
    return plus, minus, pairs
def delayed_cluster_sets_from_marktables(limit_to_surnames=False):
    """Build delayed cluster sets from the bib10x/bib70x mark tables.

    limit_to_surnames: optional iterable of names; when given, only
    signatures whose last-name cluster string matches are kept.

    Returns a 3-tuple (delayed cluster-set constructors, surnames,
    signature counts), sorted by ascending signature count.
    """
    # { name -> [(table, bibref)] }
    bibauthor_print('Delayed_cluster_set_from_marktables limited to %s' % str(limit_to_surnames))
    name_buket = {}
    if limit_to_surnames:
        limit_to_surnames = set(generate_last_name_cluster_str(s)
                                for s in limit_to_surnames)
    for tab, ref, name in chain(izip(cycle((100,)), *izip(*get_bib10x())),
                                izip(cycle((700,)), *izip(*get_bib70x()))):
        name = generate_last_name_cluster_str(name)
        if limit_to_surnames and name not in limit_to_surnames:
            continue
        # FIX: append in place instead of rebuilding the bucket list every
        # time — the old `get(name, []) + [(tab, ref)]` pattern copied the
        # whole list per signature (O(n^2) per surname).
        name_buket.setdefault(name, []).append((tab, ref))
    bibauthor_print('Delayed_cluster_set_from_marktables going to get %s signatures....'
                    % str(len(name_buket)))
    all_refs = ((name, refs, len(list(get_signatures_from_bibrefs(refs))))
                for name, refs in name_buket.items())
    all_refs = sorted(all_refs, key=itemgetter(2))
    return ([delayed_create_from_mark(set(refs), name) for name, refs, _ in all_refs],
            map(itemgetter(0), all_refs),
            map(itemgetter(2), all_refs))
def create_matrix(cluster_set, force):
    """Log the size of the upcoming matrix computation and delegate the
    actual work to prepare_matirx."""
    n_bibs = cluster_set.num_all_bibs
    # n choose 2: upper bound on the pairwise comparisons to perform
    max_comparisons = n_bibs * (n_bibs - 1) / 2
    bibauthor_print("Start building matrix for %s. Total number of bibs: %d, maximum number of comparisons: %d"
                    % (cluster_set.last_name, n_bibs, max_comparisons))
    return prepare_matirx(cluster_set, force)
def main():
    """Entry point: start the bibauthorid daemon when it is importable."""
    try:
        from invenio import bibauthorid_daemon as daemon
    except ImportError:
        bibauthor_print("Hmm...No Daemon process running.")
        return
    daemon.bibauthorid_daemon()
def wedge_and_store(cluster_set):
    """Run the wedge algorithm on cluster_set, drop stale results for its
    surname and persist the new clustering. Always returns True."""
    total = cluster_set.num_all_bibs
    comparisons = total * (total - 1) / 2  # worst-case pair count
    bibauthor_print("Start working on %s. Total number of bibs: %d, maximum number of comparisons: %d"
                    % (cluster_set.last_name, total, comparisons))
    wedge(cluster_set)
    remove_result_cluster(cluster_set.last_name)
    cluster_set.store()
    return True
def tortoise_tweak_coefficient(lastnames, min_coef, max_coef, stepping, build_matrix=True):
    """Sweep the wedge coefficient over [min_coef, max_coef) in steps of
    `stepping` and collect statistics for every (surname, coefficient)
    combination."""
    bibauthor_print('Coefficient tweaking!')
    bibauthor_print('Cluster sets from mark...')
    surnames = set(generate_last_name_cluster_str(n) for n in lastnames)
    # work in integer hundredths so range() can step through the floats
    coefficients = [step / 100. for step in
                    range(int(min_coef * 100), int(max_coef * 100), int(stepping * 100))]
    if build_matrix:
        schedule_workers(_create_matrix, surnames)
    jobs = ((surname, coeff) for surname in surnames for coeff in coefficients)
    schedule_workers(_collect_statistics_lname_coeff, jobs)
def tortoise(pure=False, force_matrix_creation=False, skip_matrix_creation=False, last_run=None):
    """Full disambiguation pipeline: (optionally) rebuild all probability
    matrices, then wedge-and-store every cluster set from personid."""
    assert not force_matrix_creation or not skip_matrix_creation
    # pure results always require freshly computed matrices
    force_matrix_creation = force_matrix_creation or pure
    if not skip_matrix_creation:
        bibauthor_print("Preparing cluster sets.")
        clusters, _lnames, sizes = delayed_cluster_sets_from_personid(pure, last_run)
        bibauthor_print("Building all matrices.")
        statuses = schedule_create_matrix(clusters, sizes, force=force_matrix_creation)
        assert len(statuses) == len(clusters)
        assert all(code == os.EX_OK for code in statuses)
    bibauthor_print("Preparing cluster sets.")
    clusters, _lnames, sizes = delayed_cluster_sets_from_personid(pure, last_run)
    bibauthor_print("Starting disambiguation.")
    statuses = schedule_wedge_and_store(clusters, sizes)
    assert len(statuses) == len(clusters)
    assert all(code == os.EX_OK for code in statuses)
def tortoise(pure=False, force_matrix_creation=False, skip_matrix_creation=False, last_run=None):
    """Full disambiguation pipeline driven by personid cluster sets:
    build matrices (unless skipped), then wedge and store."""
    assert not force_matrix_creation or not skip_matrix_creation
    # The computation must be forced when pure results are requested.
    force_matrix_creation = force_matrix_creation or pure

    def _assert_all_ok(statuses, expected_len):
        # every scheduled worker must have exited cleanly
        assert len(statuses) == expected_len
        assert all(status == os.EX_OK for status in statuses)

    if not skip_matrix_creation:
        bibauthor_print("Preparing cluster sets.")
        clusters, _lnames, sizes = delayed_cluster_sets_from_personid(
            pure, last_run)
        bibauthor_print("Building all matrices.")
        _assert_all_ok(
            schedule_create_matrix(clusters, sizes, force=force_matrix_creation),
            len(clusters))
    bibauthor_print("Preparing cluster sets.")
    clusters, _lnames, sizes = delayed_cluster_sets_from_personid(
        pure, last_run)
    bibauthor_print("Starting disambiguation.")
    _assert_all_ok(schedule_wedge_and_store(clusters, sizes), len(clusters))
def prepare_matirx(cluster_set, force):
    """Recompute and store the probability matrix for cluster_set.

    Returns True when the matrix was (re)computed, False when the stored
    matrix is already current and force is not set.
    """
    if bconfig.DEBUG_CHECKS:
        assert cluster_set._debug_test_hate_relation()
        assert cluster_set._debug_duplicated_recs()
    matrix = ProbabilityMatrix(cluster_set.last_name)
    # load only the map first — enough to check freshness cheaply
    matrix.load(load_map=True, load_matrix=False)
    if not force and matrix.is_up_to_date(cluster_set):
        bibauthor_print("Cluster %s is up-to-date and therefore will not be computed." % cluster_set.last_name)
        return False
    matrix.load(load_map=False, load_matrix=True)
    matrix.recalculate(cluster_set)
    matrix.store()
    return True
def _create_matrix(lname):
    """Materialize the cluster set for one surname, force-build its
    probability matrix and persist the set."""
    clusters, lnames, sizes = delayed_cluster_sets_from_marktables([lname])
    where = lnames.index(lname)
    bibauthor_print("Found, %s. Total number of bibs: %d." % (lname, sizes[where]))
    cluster_set = clusters[where]()
    create_matrix(cluster_set, True)
    n_bibs = cluster_set.num_all_bibs
    n_pairs = n_bibs * (n_bibs - 1) / 2
    bibauthor_print("Start working on %s. Total number of bibs: %d, maximum number of comparisons: %d"
                    % (cluster_set.last_name, n_bibs, n_pairs))
    cluster_set.store()
def prepare_matirx(cluster_set, force):
    """Load, refresh and store the probability matrix of a cluster set.

    Skips the expensive recomputation (returning False) when the stored
    matrix is up to date and force is not set; returns True otherwise.
    """
    if bconfig.DEBUG_CHECKS:
        assert cluster_set._debug_test_hate_relation()
        assert cluster_set._debug_duplicated_recs()
    prob_matrix = ProbabilityMatrix(cluster_set.last_name)
    prob_matrix.load(load_map=True, load_matrix=False)  # cheap freshness probe
    if not force and prob_matrix.is_up_to_date(cluster_set):
        bibauthor_print(
            "Cluster %s is up-to-date and therefore will not be computed."
            % cluster_set.last_name)
        return False
    prob_matrix.load(load_map=False, load_matrix=True)
    prob_matrix.recalculate(cluster_set)
    prob_matrix.store()
    return True
def tortoise_from_scratch():
    """Rebuild every matrix from the mark tables, wipe old tortoise
    results and re-run the wedge on all cluster sets."""
    bibauthor_print("Preparing cluster sets.")
    cluster_sets, _lnames, sizes = delayed_cluster_sets_from_marktables()
    bibauthor_print("Building all matrices.")

    def _force_build(delayed_set):
        # always force: this is a from-scratch rebuild
        return force_create_matrix(delayed_set, force=True)

    schedule_workers(_force_build, cluster_sets)
    empty_tortoise_results_table()
    bibauthor_print("Preparing cluster sets.")
    cluster_sets, _lnames, sizes = delayed_cluster_sets_from_marktables()
    bibauthor_print("Starting disambiguation.")
    schedule_workers(wedge, cluster_sets)
def _collect_statistics_lname_coeff(params):
    """Worker: run the wedge with a given coefficient on one surname.

    params is a (lastname, coefficient) pair.
    """
    lname, coeff = params
    clusters, lnames, sizes = delayed_cluster_sets_from_marktables([lname])
    position = lnames.index(lname)
    bibauthor_print("Found, %s. Total number of bibs: %d." % (lname, sizes[position]))
    cluster_set = clusters[position]()
    create_matrix(cluster_set, False)
    n_bibs = cluster_set.num_all_bibs
    n_pairs = n_bibs * (n_bibs - 1) / 2
    bibauthor_print("Start working on %s. Total number of bibs: %d, maximum number of comparisons: %d"
                    % (cluster_set.last_name, n_bibs, n_pairs))
    wedge(cluster_set, True, coeff)
    remove_result_cluster(cluster_set.last_name)
def tortoise_last_name(name, from_mark=False, pure=False):
    """Disambiguate the cluster of a single last name.

    name: the (full or last) name to process; from_mark selects the mark
    tables as the source, otherwise personid is used (optionally pure).
    """
    assert not (from_mark and pure)
    lname = generate_last_name_cluster_str(name)
    if from_mark:
        clusters, lnames, sizes = delayed_cluster_sets_from_marktables()
    else:
        clusters, lnames, sizes = delayed_cluster_sets_from_personid(pure)
    try:
        idx = lnames.index(lname)  # raises ValueError when lname is absent
        cluster = clusters[idx]
        size = sizes[idx]
        bibauthor_print("Found, %s(%s). Total number of bibs: %d." % (name, lname, size))
        cluster_set = cluster()
        create_matrix(cluster_set, True)
        wedge_and_store(cluster_set)
    # BUG FIX: list.index raises ValueError (not IndexError) for a missing
    # name, so the original `except IndexError` never caught the common
    # "not found" case and the error escaped. Catch both, matching the
    # sibling _create_matrix.
    except (IndexError, ValueError):
        bibauthor_print("Sorry, %s(%s) not found in the last name clusters" % (name, lname))
def delayed_cluster_sets_from_marktables(limit_to_surnames=False):
    """Build delayed cluster sets from the bib10x/bib70x mark tables.

    limit_to_surnames: optional iterable of names; when given, only
    signatures matching one of those last-name cluster strings are kept.

    Returns (delayed constructors, surnames, signature counts), sorted by
    ascending signature count.
    """
    # { name -> [(table, bibref)] }
    bibauthor_print('Delayed_cluster_set_from_marktables limited to %s' % str(limit_to_surnames))
    name_buket = {}
    if limit_to_surnames:
        limit_to_surnames = set(generate_last_name_cluster_str(s)
                                for s in limit_to_surnames)
    for tab, ref, name in chain(izip(cycle((100,)), *izip(*get_bib10x())),
                                izip(cycle((700,)), *izip(*get_bib70x()))):
        name = generate_last_name_cluster_str(name)
        if limit_to_surnames and name not in limit_to_surnames:
            continue
        # FIX: the old `get(name, []) + [(tab, ref)]` rebuilt (copied) the
        # bucket list on every signature — quadratic per surname. Append
        # in place instead; the resulting buckets are identical.
        name_buket.setdefault(name, []).append((tab, ref))
    bibauthor_print('Delayed_cluster_set_from_marktables going to get %s signatures....'
                    % str(len(name_buket)))
    all_refs = ((name, refs, len(list(get_signatures_from_bibrefs(refs))))
                for name, refs in name_buket.items())
    all_refs = sorted(all_refs, key=itemgetter(2))
    return ([delayed_create_from_mark(set(refs), name) for name, refs, _ in all_refs],
            map(itemgetter(0), all_refs),
            map(itemgetter(2), all_refs))
def tortoise_from_scratch():
    """Force-rebuild all matrices from the mark tables, empty the results
    table, then wedge-and-store every cluster set, checking worker exits."""
    bibauthor_print("Preparing cluster sets.")
    cluster_sets, _lnames, sizes = delayed_cluster_sets_from_marktables()
    bibauthor_print("Building all matrices.")
    statuses = schedule_create_matrix(cluster_sets, sizes, force=True)
    assert len(statuses) == len(cluster_sets)
    assert all(code == os.EX_OK for code in statuses)
    empty_results_table()
    bibauthor_print("Preparing cluster sets.")
    cluster_sets, _lnames, sizes = delayed_cluster_sets_from_marktables()
    bibauthor_print("Starting disambiguation.")
    statuses = schedule_wedge_and_store(cluster_sets, sizes)
    assert len(statuses) == len(cluster_sets)
    assert all(code == os.EX_OK for code in statuses)
def tortoise(pure=False, force_matrix_creation=False, skip_matrix_creation=False, last_run=None):
    """Pipeline over personid cluster sets using schedule_workers:
    optionally build matrices, then wedge-and-store everything."""
    assert not force_matrix_creation or not skip_matrix_creation
    # pure runs must recompute their matrices
    force_matrix_creation = force_matrix_creation or pure
    if not skip_matrix_creation:
        bibauthor_print("Preparing cluster sets.")
        clusters, _lnames, sizes = delayed_cluster_sets_from_personid(pure, last_run)
        bibauthor_print("Building all matrices.")

        def _build(delayed_set):
            return force_create_matrix(delayed_set, force=force_matrix_creation)

        schedule_workers(_build, clusters)
    bibauthor_print("Preparing cluster sets.")
    clusters, _lnames, sizes = delayed_cluster_sets_from_personid(pure, last_run)
    bibauthor_print("Starting disambiguation.")
    schedule_workers(wedge_and_store, clusters)
def _create_matrix(lname):
    """Build (without forcing) the matrix for one surname's cluster set
    and persist the set; report when the surname is unknown."""
    clusters, lnames, sizes = delayed_cluster_sets_from_marktables([lname])
    try:
        where = lnames.index(lname)
        bibauthor_print("Found, %s. Total number of bibs: %d." % (lname, sizes[where]))
        cluster_set = clusters[where]()
        create_matrix(cluster_set, False)
        n_bibs = cluster_set.num_all_bibs
        n_pairs = n_bibs * (n_bibs - 1) / 2
        bibauthor_print("Start working on %s. Total number of bibs: %d, maximum number of comparisons: %d"
                        % (cluster_set.last_name, n_bibs, n_pairs))
        cluster_set.store()
    except (IndexError, ValueError):
        bibauthor_print("Sorry, %s not found in the last name clusters, not creating matrix" % (lname))
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearrange the cluster_set according to the values in the
    probability matrix.

    The deep_debug option will produce a lot of output. Avoid using it
    with more than 20 bibs in the cluster set.
    '''
    # map each bib to the cluster currently containing it; kept in sync
    # with every join below
    bib_map = create_bib_2_cluster_dict(cluster_set)

    plus_edges, minus_edges, edges = group_edges(cluster_set)

    # Phase 1: merge clusters connected by a '+' (certain match) edge.
    interval = 1000
    for i, (bib1, bib2) in enumerate(plus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(plus_edges), "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    # Phase 2: record mutual dislike for '-' (certain non-match) edges.
    interval = 1000
    for i, (bib1, bib2) in enumerate(minus_edges):
        if (i % interval) == 0:
            update_status(float(i) / len(minus_edges), "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    # Phase 3: process the value edges in order of decreasing confidence.
    bibauthor_print("Sorting the value edges.")
    edges = sorted(edges, key=_edge_sorting, reverse=True)

    interval = 500000
    wedge_print("Wedge: New wedge, %d edges." % len(edges))
    for current, (v1, v2, unused) in enumerate(edges):
        if (current % interval) == 0:
            update_status(float(current) / len(edges), "Wedge...")
        # signed edges must have been filtered out by group_edges
        assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        idcl1 = cluster_set.clusters.index(cl1)
        idcl2 = cluster_set.clusters.index(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused))
            # _decide weighs the edge against both clusters' contents
            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " % (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)
def force_create_matrix(cluster_set, force):
    """Materialize a delayed cluster set, then build its matrix."""
    bibauthor_print("Building a cluster set.")
    materialized = cluster_set()
    return create_matrix(materialized, force)
def tortoise_last_name(name, from_mark=False, pure=False):
    """Disambiguate one surname cluster, sourced either from the mark
    tables (from_mark) or from personid (optionally pure)."""
    bibauthor_print('Start working on %s' % name)
    assert not (from_mark and pure)
    lname = generate_last_name_cluster_str(name)
    if from_mark:
        bibauthor_print(' ... from mark!')
        clusters, lnames, sizes = delayed_cluster_sets_from_marktables([lname])
        bibauthor_print(' ... delayed done')
    else:
        bibauthor_print(' ... from pid, pure')
        clusters, lnames, sizes = delayed_cluster_sets_from_personid(pure)
        bibauthor_print(' ... delayed pure done!')
    position = lnames.index(lname)
    cluster_set = clusters[position]()
    bibauthor_print("Found, %s(%s). Total number of bibs: %d." % (name, lname, sizes[position]))
    create_matrix(cluster_set, True)
    wedge_and_store(cluster_set)
def tortoise_last_name(name, from_mark=False, pure=False):
    """Run matrix creation plus wedge-and-store for a single last name,
    taking the cluster set from the mark tables or from personid."""
    bibauthor_print('Start working on %s' % name)
    assert not (from_mark and pure)
    lname = generate_last_name_cluster_str(name)
    if from_mark:
        bibauthor_print(' ... from mark!')
        source = delayed_cluster_sets_from_marktables([lname])
        bibauthor_print(' ... delayed done')
    else:
        bibauthor_print(' ... from pid, pure')
        source = delayed_cluster_sets_from_personid(pure)
        bibauthor_print(' ... delayed pure done!')
    clusters, lnames, sizes = source
    i = lnames.index(lname)
    cluster_set = clusters[i]()
    bibauthor_print("Found, %s(%s). Total number of bibs: %d." % (name, lname, sizes[i]))
    create_matrix(cluster_set, True)
    wedge_and_store(cluster_set)
def group_sort_edges(cs, original_process_id):
    """Stream-group all edges of the cluster set into on-disk caches.

    Writes three packed-edge files (positive, negative, value edges) plus a
    pickled counts file, then sorts the value-edge file on disk by edge
    value, descending. Meant to run in a subprocess; original_process_id
    names the cache files so the parent can find them.
    """
    bibauthor_print("group_sort_edges spowned by %s" % original_process_id)
    plus_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_p_'+str(original_process_id),'w')
    minus_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_m_'+str(original_process_id),'w')
    pairs_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_temp_edges_cache_e_'+str(original_process_id),'w')
    data_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_data_'+str(original_process_id),'w')

    plus_count = 0
    minus_count = 0
    pairs_count = 0
    default_val = [0.,0.]  # placeholder payload written for signed edges

    #gc.disable()
    interval = 1000
    current = -1
    for cl1 in cs.clusters:
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len(cs.clusters), "Grouping all edges...")
        # one representative bib identifies the cluster's edge row
        bib1 = tuple(cl1.bibs)[0]
        pointers = h5file[str(id(cl1))]
        for bib2 in xrange(len(h5file[str(id(cl1))])):
            val = pointers[bib2]
            #if val[0] not in Bib_matrix.special_numbers:
            #optimization: special numbers are assumed to be negative
            if val[0] >= 0:
                # plain probability edge; keep only those above the cut
                if val[0] > edge_cut_prob:
                    pairs_count += 1
                    pairs_fp.write(_pack_vals((bib1, bib2, val)))
            elif val[0] == Bib_matrix.special_symbols['+']:
                plus_count += 1
                plus_fp.write(_pack_vals((bib1, bib2, default_val)))
            elif val[0] == Bib_matrix.special_symbols['-']:
                minus_count += 1
                minus_fp.write(_pack_vals((bib1, bib2, default_val)))
            else:
                assert val[0] == Bib_matrix.special_symbols[None], "Invalid Edge"
    update_status_final("Finished with the edge grouping.")

    plus_fp.close()
    minus_fp.close()
    pairs_fp.close()

    bibauthor_print("Positive edges: %d, Negative edges: %d, Value edges: %d."
                    % (plus_count, minus_count, pairs_count))
    #gc.enable()
    # sort the value edges on disk (they may not fit in memory), writing
    # the final cache file and discarding the temporary one
    bibauthor_print("Sorting in-file value edges.")
    sortFileInPlace(bconfig.TORTOISE_FILES_PATH+'/wedge_temp_edges_cache_e_'+str(original_process_id),
                    bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_e_'+str(original_process_id),
                    lambda x: _edge_sorting(_unpack_vals(x)), reverse=True)
    os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_temp_edges_cache_e_'+str(original_process_id))
    bibauthor_print("Dumping egdes data to file...")
    cPickle.dump((plus_count, minus_count, pairs_count), data_fp)
    data_fp.close()
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearrange the cluster_set according to the values in the
    probability matrix.

    The deep_debug option will produce a lot of output. Avoid using it
    with more than 20 bibs in the cluster set.
    '''
    # map each bib to the cluster currently containing it; kept in sync
    # with every join below
    bib_map = create_bib_2_cluster_dict(cluster_set)

    original_process_id = PID()
    #remember to close the files!
    #plus_edges_fp, len_plus, minus_edges_fp, len_minus, edges_fp, len_edges = group_sort_edges(cluster_set)
    # group/sort the edges in a subprocess; results come back via the
    # on-disk cache files named after this process id
    p = Process(target=group_sort_edges, args=(cluster_set,original_process_id))
    p.start()
    p.join()

    plus_edges_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_p_'+str(original_process_id),'r')
    minus_edges_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_m_'+str(original_process_id),'r')
    edges_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_e_'+str(original_process_id),'r')
    data_fp = open(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_data_'+str(original_process_id),'r')

    len_plus,len_minus,len_edges = cPickle.load(data_fp)
    data_fp.close()

    # Phase 1: merge clusters connected by a '+' (certain match) edge.
    interval = 1000
    for i, s in enumerate(plus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_plus, "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    # Phase 2: record mutual dislike for '-' (certain non-match) edges.
    interval = 1000
    for i, s in enumerate(minus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_minus, "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    # Phase 3: walk the pre-sorted value edges, highest value first.
    interval = 50000
    wedge_print("Wedge: New wedge, %d edges." % len_edges)
    current = -1
    for s in edges_fp:
        v1, v2, unused = _unpack_vals(s)
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len_edges, "Wedge...")
        # signed edges must have been filtered out by group_sort_edges
        assert unused != '+' and unused != '-', PID()+"Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        #try using object ids instead of index to boost performances
        #idcl1 = cluster_set.clusters.index(cl1)
        #idcl2 = cluster_set.clusters.index(cl2)
        idcl1 = id(cl1)
        idcl2 = id(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print("Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)" % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(cluster_set, "/tmp/%s%d.dot" % (cluster_set.last_name, current), bib_map, (v1, v2, unused))
            # _decide weighs the edge against both clusters' contents
            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s"% (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " % (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" % (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" % (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name, bib_map)

    plus_edges_fp.close()
    minus_edges_fp.close()
    edges_fp.close()
    data_fp.close()
    # best-effort cleanup of the cache files left by the subprocess
    try:
        os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_p_'+str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_m_'+str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_e_'+str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH+'/wedge_edges_cache_data_'+str(original_process_id))
    except:
        pass
def group_sort_edges(cs, original_process_id):
    """Stream-group all edges of the cluster set into on-disk caches.

    Writes three packed-edge files (positive, negative, value edges) plus
    a pickled counts file, then sorts the value-edge file on disk by edge
    value, descending. Meant to run in a subprocess; original_process_id
    names the cache files so the parent can find them.
    """
    bibauthor_print("group_sort_edges spowned by %s" % original_process_id)
    plus_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' +
        str(original_process_id), 'w')
    minus_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' +
        str(original_process_id), 'w')
    pairs_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_temp_edges_cache_e_' +
        str(original_process_id), 'w')
    data_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' +
        str(original_process_id), 'w')

    plus_count = 0
    minus_count = 0
    pairs_count = 0
    default_val = [0., 0.]  # placeholder payload written for signed edges

    #gc.disable()
    interval = 1000
    current = -1
    for cl1 in cs.clusters:
        current += 1
        if (current % interval) == 0:
            update_status(
                float(current) / len(cs.clusters), "Grouping all edges...")
        # one representative bib identifies the cluster's edge row
        bib1 = tuple(cl1.bibs)[0]
        pointers = h5file[str(id(cl1))]
        for bib2 in xrange(len(h5file[str(id(cl1))])):
            val = pointers[bib2]
            #if val[0] not in Bib_matrix.special_numbers:
            #optimization: special numbers are assumed to be negative
            if val[0] >= 0:
                # plain probability edge; keep only those above the cut
                if val[0] > edge_cut_prob:
                    pairs_count += 1
                    pairs_fp.write(_pack_vals((bib1, bib2, val)))
            elif val[0] == Bib_matrix.special_symbols['+']:
                plus_count += 1
                plus_fp.write(_pack_vals((bib1, bib2, default_val)))
            elif val[0] == Bib_matrix.special_symbols['-']:
                minus_count += 1
                minus_fp.write(_pack_vals((bib1, bib2, default_val)))
            else:
                assert val[0] == Bib_matrix.special_symbols[
                    None], "Invalid Edge"
    update_status_final("Finished with the edge grouping.")

    plus_fp.close()
    minus_fp.close()
    pairs_fp.close()

    bibauthor_print(
        "Positive edges: %d, Negative edges: %d, Value edges: %d."
        % (plus_count, minus_count, pairs_count))
    #gc.enable()
    # sort the value edges on disk (they may not fit in memory), writing
    # the final cache file and discarding the temporary one
    bibauthor_print("Sorting in-file value edges.")
    sortFileInPlace(bconfig.TORTOISE_FILES_PATH + '/wedge_temp_edges_cache_e_' +
                    str(original_process_id),
                    bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' +
                    str(original_process_id),
                    lambda x: _edge_sorting(_unpack_vals(x)),
                    reverse=True)
    os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_temp_edges_cache_e_' +
              str(original_process_id))
    bibauthor_print("Dumping egdes data to file...")
    cPickle.dump((plus_count, minus_count, pairs_count), data_fp)
    data_fp.close()
def do_wedge(cluster_set, deep_debug=False):
    '''
    Rearrange the cluster_set according to the values in the
    probability matrix.

    The deep_debug option will produce a lot of output. Avoid using it
    with more than 20 bibs in the cluster set.
    '''
    # map each bib to the cluster currently containing it; kept in sync
    # with every join below
    bib_map = create_bib_2_cluster_dict(cluster_set)

    original_process_id = PID()
    #remember to close the files!
    #plus_edges_fp, len_plus, minus_edges_fp, len_minus, edges_fp, len_edges = group_sort_edges(cluster_set)
    # group/sort the edges in a subprocess; results come back via the
    # on-disk cache files named after this process id
    p = Process(target=group_sort_edges, args=(cluster_set, original_process_id))
    p.start()
    p.join()

    plus_edges_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' +
        str(original_process_id), 'r')
    minus_edges_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' +
        str(original_process_id), 'r')
    edges_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' +
        str(original_process_id), 'r')
    data_fp = open(
        bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' +
        str(original_process_id), 'r')

    len_plus, len_minus, len_edges = cPickle.load(data_fp)
    data_fp.close()

    # Phase 1: merge clusters connected by a '+' (certain match) edge.
    interval = 1000
    for i, s in enumerate(plus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(
                float(i) / len_plus, "Agglomerating obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            join(cl1, cl2)
            cluster_set.clusters.remove(cl2)
            for v in cl2.bibs:
                bib_map[v] = cl1
    update_status_final("Agglomerating obvious clusters done.")

    # Phase 2: record mutual dislike for '-' (certain non-match) edges.
    interval = 1000
    for i, s in enumerate(minus_edges_fp):
        bib1, bib2, unused = _unpack_vals(s)
        if (i % interval) == 0:
            update_status(float(i) / len_minus, "Dividing obvious clusters...")
        cl1 = bib_map[bib1]
        cl2 = bib_map[bib2]
        if cl1 != cl2 and not cl1.hates(cl2):
            cl1.quarrel(cl2)
    update_status_final("Dividing obvious clusters done.")

    # Phase 3: walk the pre-sorted value edges, highest value first.
    interval = 50000
    wedge_print("Wedge: New wedge, %d edges." % len_edges)
    current = -1
    for s in edges_fp:
        v1, v2, unused = _unpack_vals(s)
        current += 1
        if (current % interval) == 0:
            update_status(float(current) / len_edges, "Wedge...")
        # signed edges must have been filtered out by group_sort_edges
        assert unused != '+' and unused != '-', PID(
        ) + "Signed edge after filter!"
        cl1 = bib_map[v1]
        cl2 = bib_map[v2]
        #try using object ids instead of index to boost performances
        #idcl1 = cluster_set.clusters.index(cl1)
        #idcl2 = cluster_set.clusters.index(cl2)
        idcl1 = id(cl1)
        idcl2 = id(cl2)

        #keep the ids low!
        if idcl1 > idcl2:
            idcl1, idcl2 = idcl2, idcl1
            cl1, cl2 = cl2, cl1

        wedge_print(
            "Wedge: popped new edge: Verts = (%s,%s) from (%s, %s) Value = (%f, %f)"
            % (idcl1, idcl2, v1, v2, unused[0], unused[1]))

        if cl1 != cl2 and not cl1.hates(cl2):
            if deep_debug:
                export_to_dot(
                    cluster_set,
                    "/tmp/%s%d.dot" % (cluster_set.last_name, current),
                    bib_map, (v1, v2, unused))
            # _decide weighs the edge against both clusters' contents
            decision, value = _decide(cl1, cl2)
            if decision:
                wedge_print("Wedge: Joined %s to %s with %s" %
                            (idcl1, idcl2, value))
                join(cl1, cl2)
                cluster_set.clusters.remove(cl2)
                for v in cl2.bibs:
                    bib_map[v] = cl1
            else:
                wedge_print("Wedge: Quarreled %s from %s with %s " %
                            (idcl1, idcl2, value))
                cl1.quarrel(cl2)
        elif cl1 == cl2:
            wedge_print("Wedge: Clusters already joined! (%s,%s)" %
                        (idcl1, idcl2))
        else:
            wedge_print("Wedge: Clusters hate each other! (%s,%s)" %
                        (idcl1, idcl2))

    update_status_final("Wedge done.")
    bibauthor_print("")

    if deep_debug:
        export_to_dot(cluster_set, "/tmp/%sfinal.dot" % cluster_set.last_name,
                      bib_map)

    plus_edges_fp.close()
    minus_edges_fp.close()
    edges_fp.close()
    data_fp.close()
    # best-effort cleanup of the cache files left by the subprocess
    try:
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_p_' +
                  str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_m_' +
                  str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_e_' +
                  str(original_process_id))
        os.remove(bconfig.TORTOISE_FILES_PATH + '/wedge_edges_cache_data_' +
                  str(original_process_id))
    except:
        pass
def force_wedge_and_store(cluster_set):
    """Materialize a delayed cluster set and run wedge_and_store on it."""
    bibauthor_print("Building a cluster set.")
    materialized = cluster_set()
    return wedge_and_store(materialized)