def transform2(self, X):
    self.g = helperigraph.load_matrix(X)
    self.g['level'] = 0
    while self.g['level'] != self.max_levels:
        # matching must be mutable: the matching method fills it in place
        matching = list(range(self.g.vcount()))
        levels = self.g['level']
        levels += 1
        self.g['similarity'] = getattr(Similarity(self.g, self.g['adjlist']), self.similarity)
        # The second layer spans vertex ids [vertices[0], vertices[0] + vertices[1])
        start = sum(self.g['vertices'][0:1])
        end = sum(self.g['vertices'][0:2])
        vertices = range(start, end)
        param = dict(reduction_factor=self.reduction_factor)
        if self.matching in ['gmb', 'rgmb']:
            param['vertices'] = vertices
        if self.matching in ['hem', 'lem', 'rm']:
            one_mode_graph = self.g.weighted_one_mode_projection(vertices)
            matching_method = getattr(one_mode_graph, self.matching)
        else:
            matching_method = getattr(self.g, self.matching)
        matching_method(matching, **param)
        coarse = self.g.contract(matching)
        coarse['level'] = levels
        self.g = coarse
    return helperigraph.biajcent_matrix(self.g)
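
# Usage sketch for transform2 (hypothetical: the enclosing class name and its
# constructor arguments are assumptions, not shown in this snippet). The method
# coarsens the bipartite graph built from X until max_levels is reached and
# returns the reduced biadjacency matrix.
#
#   reducer = CoarseningTransformer(max_levels=3, reduction_factor=0.5,
#                                   matching='gmb', similarity='common_neighbors')
#   X_reduced = reducer.transform2(X)   # X: a (sparse) biadjacency matrix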
def find_similarities(text_1, text_2):
    model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")
    sequences, distances = get_distances(model, window_tokenizer, text_1, text_2)
    similarities = []
    scores = []
    for i, qi in enumerate(sequences[0]):
        for j, qj in enumerate(sequences[1]):
            distance_normalized = 1 - distances[i][j]
            if distance_normalized > BIAS:
                similarities.append(Similarity(distance_normalized, qi, qj))
                scores.append(distance_normalized)
    return similarities, scores
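
# Usage sketch for find_similarities (hypothetical inputs; BIAS, get_distances,
# window_tokenizer and Similarity are assumed to be defined at module level).
# Each returned Similarity pairs a window of text_1 with a window of text_2
# whose normalized similarity score exceeds the BIAS threshold.
#
#   similarities, scores = find_similarities("first document ...",
#                                            "second document ...")
#   if scores:
#       print(max(scores))   # strongest cross-document match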
def weighted_one_mode_projection(self, vertices, similarity='common_neighbors'):
    """
    Application of a one-mode projection to a bipartite network generates
    two unipartite networks, one for each layer, so that vertices with
    common neighbors are connected by edges in their respective projection.
    """
    graph = MGraph()
    graph.add_vertices(vertices)
    graph['source_vertices'] = self.vcount()
    graph['source_edges'] = self.ecount()
    graph.vs['name'] = self.vs[vertices]['name']
    name_to_id = dict(zip(vertices, range(graph.vcount())))
    dict_edges = dict()
    visited = [0] * self.vcount()
    for vertex in vertices:
        # Two-hop neighbors of a vertex lie in the same layer of the bipartite graph
        neighborhood = self.neighborhood(vertices=vertex, order=2)
        twohops = neighborhood[(len(self['adjlist'][vertex]) + 1):]
        for twohop in twohops:
            if visited[twohop] == 1:
                continue
            dict_edges[(name_to_id[vertex], name_to_id[twohop])] = self['projection'](vertex, twohop)
        visited[vertex] = 1
    if len(dict_edges) > 0:
        edges, weights = list(zip(*dict_edges.items()))
        graph.add_edges(edges)
        graph.es['weight'] = weights
    graph['adjlist'] = list(map(set, graph.get_adjlist()))
    graph['similarity'] = getattr(Similarity(graph, graph['adjlist']), similarity)
    return graph
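
# A minimal, self-contained sketch of the same idea: a one-mode projection by
# common-neighbor counts over a toy bipartite edge list. It uses only plain
# Python (no MGraph/igraph) and is for illustration, not the method above.
from collections import defaultdict
from itertools import combinations

def toy_one_mode_projection(edges, left_vertices):
    """edges: iterable of (left, right) pairs; left_vertices: the layer to project."""
    neighbors = defaultdict(set)
    for u, v in edges:
        neighbors[u].add(v)
    projection = {}
    for a, b in combinations(sorted(left_vertices), 2):
        weight = len(neighbors[a] & neighbors[b])   # number of shared right-vertices
        if weight > 0:
            projection[(a, b)] = weight
    return projection

# Vertices 0 and 1 share 'x', vertices 1 and 2 share 'y'; 0 and 2 share nothing
print(toy_one_mode_projection([(0, 'x'), (1, 'x'), (1, 'y'), (2, 'y')], [0, 1, 2]))
# -> {(0, 1): 1, (1, 2): 1}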
def run(self):
    graph = self.source_graph.copy()
    while True:
        level = graph['level']
        contract = False
        args = []
        for layer in range(graph['layers']):
            do_matching = True
            if self.global_min_vertices[layer] is None and level[layer] >= self.max_levels[layer]:
                do_matching = False
            elif self.global_min_vertices[layer] and graph['vertices'][layer] <= self.global_min_vertices[layer]:
                do_matching = False
            if do_matching:
                contract = True
                level[layer] += 1
                graph['similarity'] = getattr(Similarity(graph, graph['adjlist']), self.similarity[layer])
                kwargs = dict(reduction_factor=self.reduction_factor[layer])
                if self.matching[layer] in ['mlpb', 'gmb', 'rgmb']:
                    kwargs['vertices'] = graph['vertices_by_type'][layer]
                    kwargs['reverse'] = self.reverse[layer]
                if self.matching[layer] in ['mlpb', 'rgmb']:
                    kwargs['seed_priority'] = self.seed_priority[layer]
                if self.matching[layer] in ['mlpb']:
                    kwargs['upper_bound'] = self.upper_bound[layer]
                    kwargs['n'] = self.source_graph['vertices'][layer]
                    kwargs['global_min_vertices'] = self.global_min_vertices[layer]
                    kwargs['tolerance'] = self.tolerance[layer]
                    kwargs['itr'] = self.itr[layer]
                if self.matching[layer] in ['hem', 'lem', 'rm']:
                    one_mode_graph = graph.weighted_one_mode_projection(graph['vertices_by_type'][layer])
                    matching_function = getattr(one_mode_graph, self.matching[layer])
                else:
                    matching_function = getattr(graph, self.matching[layer])
                # Build the argument list consumed by the multiprocessing pool
                args.append([(matching_function, kwargs)])
        if not contract:
            break
        # Run one matching task per layer in parallel
        pool = mp.Pool(processes=self.threads)
        processes = []
        for arg in args:
            processes.append(pool.starmap_async(modified_starmap_async, arg))
        # Merge the per-layer partial matchings (-1 marks untouched vertices)
        matching = numpy.arange(graph.vcount())
        for process in processes:
            result = process.get()[0]
            vertices = numpy.where(result > -1)[0]
            matching[vertices] = result[vertices]
        pool.close()
        pool.join()
        coarsened_graph = graph.contract(matching)
        coarsened_graph['level'] = level
        if coarsened_graph.vcount() == graph.vcount():
            break
        self.hierarchy_graphs.append(coarsened_graph)
        self.hierarchy_levels.append(level[:])
        graph = coarsened_graph
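
# A minimal sketch of the merge step above: each layer returns a partial
# matching with -1 for untouched vertices, and the partial results are folded
# into one global matching array (the values here are illustrative only).
import numpy

matching = numpy.arange(6)                               # identity: all unmatched
layer_results = [numpy.array([1, 0, -1, -1, -1, -1]),    # layer 0 matched (0, 1)
                 numpy.array([-1, -1, -1, 4, 3, -1])]    # layer 1 matched (3, 4)
for result in layer_results:
    touched = numpy.where(result > -1)[0]
    matching[touched] = result[touched]
print(matching)   # [1 0 2 4 3 5]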
def __init__(self, cfg):
    super(EventBert, self).__init__()
    self.similarity = Similarity(cfg)
    self.bert = BERTSearch(cfg)
    self.fnn = nn.Linear(2, 1)
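
# A hedged sketch of how the nn.Linear(2, 1) head could fuse the two signals
# (the real forward pass is not shown in this snippet; the names and tensor
# shapes below are assumptions):
#
#   bert_score = self.bert(query, doc)        # assumed shape: (batch, 1)
#   sim_score = self.similarity(query, doc)   # assumed shape: (batch, 1)
#   fused = self.fnn(torch.cat([bert_score, sim_score], dim=-1))   # (batch, 1)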
def main():
    """
    Main entry point for the application when run from the command line.
    """
    # Timing instantiation
    timing = Timing(['Snippet', 'Time [m]', 'Time [s]'])

    with timing.timeit_context_add('Pre-processing'):
        # Set up command-line option parsing
        current_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        parser = args.setup_parser(current_path + '/args/mdr.json')
        options = parser.parse_args()
        args.update_json(options)
        args.check_output(options)

        # Log instantiation
        log = helper.initialize_logger(dir='log', output='log')

        if options.input and options.vertices is None:
            log.warning('Vertices are required when input is given.')
            sys.exit(1)

        # Create default values for optional parameters
        if options.reduction_factor is None:
            options.reduction_factor = 0.5
        if options.max_levels is None:
            options.max_levels = 3
        if options.matching is None:
            options.matching = 'greedy_seed_twohops'
        if options.similarity is None:
            options.similarity = 'weighted_common_neighbors'

        # Validation of matching method
        valid_matching = ['gmb', 'rgmb', 'hem', 'lem', 'rm']
        if options.matching.lower() not in valid_matching:
            log.warning('Matching method is invalid.')
            sys.exit(1)

        # Validation of input extension
        valid_input = ['.arff', '.dat']
        if options.extension not in valid_input:
            log.warning('Input extension is invalid.')
            sys.exit(1)

        # Validation of similarity measure
        valid_similarity = ['common_neighbors', 'weighted_common_neighbors',
                            'salton', 'preferential_attachment', 'jaccard',
                            'adamic_adar', 'resource_allocation', 'sorensen',
                            'hub_promoted', 'hub_depressed',
                            'leicht_holme_newman', 'weighted_jaccard']
        if options.similarity.lower() not in valid_similarity:
            log.warning('Similarity measure is invalid.')
            sys.exit(1)

        # map() is lazy in Python 3, so materialize the list before indexing it later
        options.vertices = list(map(int, options.vertices))
        options.max_levels = int(options.max_levels)
        options.reduction_factor = float(options.reduction_factor)

    # Load bipartite graph
    with timing.timeit_context_add('Load'):
        if options.extension == '.arff':
            graph = helperigraph.load_csr(options.input)
        elif options.extension == '.dat':
            graph = helperigraph.load_dat(options.input, skip_last_column=options.skip_last_column, skip_rows=options.skip_rows)
        graph['level'] = 0

    # Coarsening
    with timing.timeit_context_add('Coarsening'):
        hierarchy_graphs = []
        hierarchy_levels = []
        while graph['level'] != options.max_levels:
            # matching must be mutable: the matching method fills it in place
            matching = list(range(graph.vcount()))
            levels = graph['level']
            levels += 1
            graph['similarity'] = getattr(Similarity(graph, graph['adjlist']), options.similarity)
            start = sum(graph['vertices'][0:1])
            end = sum(graph['vertices'][0:2])
            vertices = range(start, end)
            if options.matching in ['hem', 'lem', 'rm']:
                one_mode_graph = graph.weighted_one_mode_projection(vertices)
                matching_method = getattr(one_mode_graph, options.matching)
                matching_method(matching, reduction_factor=options.reduction_factor)
            else:
                matching_method = getattr(graph, options.matching)
                matching_method(vertices, matching, reduction_factor=options.reduction_factor)
            coarse = graph.contract(matching)
            coarse['level'] = levels
            graph = coarse
            if options.save_hierarchy or (graph['level'] == options.max_levels):
                hierarchy_graphs.append(graph)
                hierarchy_levels.append(levels)

    # Save
    with timing.timeit_context_add('Save'):
        output = options.output
        # zip() returns a one-shot iterator in Python 3; materialize it before reversing
        for index, obj in enumerate(reversed(list(zip(hierarchy_levels, hierarchy_graphs)))):
            levels, graph = obj
            if options.save_conf:
                with open(output + '-' + str(index) + '.conf', 'w+') as f:
                    d = {}
                    d['source_filename'] = options.input
                    d['source_v0'] = options.vertices[0]
                    d['source_v1'] = options.vertices[1]
                    d['source_vertices'] = options.vertices[0] + options.vertices[1]
                    d['edges'] = graph.ecount()
                    d['vertices'] = graph.vcount()
                    d['reduction_factor'] = options.reduction_factor
                    d['max_levels'] = options.max_levels
                    d['similarity'] = options.similarity
                    d['matching'] = options.matching
                    d['levels'] = levels
                    for layer in range(graph['layers']):
                        vcount = str(len(graph.vs.select(type=layer)))
                        attr = 'v' + str(layer)
                        d[attr] = vcount
                    json.dump(d, f, indent=4)
            if options.save_ncol:
                graph.write(output + '-' + str(index) + '.ncol', format='ncol')
            if options.save_source:
                with open(output + '-' + str(index) + '.source', 'w+') as f:
                    for v in graph.vs():
                        f.write(' '.join(map(str, v['source'])) + '\n')
            if options.save_predecessor:
                with open(output + '-' + str(index) + '.predecessor', 'w+') as f:
                    for v in graph.vs():
                        f.write(' '.join(map(str, v['predecessor'])) + '\n')
            if options.save_successor:
                numpy.savetxt(output + '-' + str(index) + '.successor', graph.vs['successor'], fmt='%d')
            if options.save_weight:
                numpy.savetxt(output + '-' + str(index) + '.weight', graph.vs['weight'], fmt='%d')
            if options.save_adjacency:
                numpy.savetxt(output + '-' + str(index) + '.dat', helperigraph.biajcent_matrix(graph), fmt='%.2f')
            if options.save_gml:
                del graph['adjlist']
                del graph['similarity']
                graph['layers'] = str(graph['layers'])
                graph['vertices'] = ','.join(map(str, graph['vertices']))
                graph['level'] = str(graph['level'])
                # igraph attribute assignment needs a sequence, not a lazy map object
                graph.vs['name'] = list(map(str, range(0, graph.vcount())))
                graph.vs['type'] = list(map(str, graph.vs['type']))
                graph.vs['weight'] = list(map(str, graph.vs['weight']))
                graph.vs['successor'] = list(map(str, graph.vs['successor']))
                for v in graph.vs():
                    v['source'] = ','.join(map(str, v['source']))
                    v['predecessor'] = ','.join(map(str, v['predecessor']))
                graph.write(output + '-' + str(index) + '.gml', format='gml')
            if not options.save_hierarchy:
                break

    if options.show_timing:
        timing.print_tabular()
    if options.save_timing_csv:
        timing.save_csv(output + '-timing.csv')
    if options.save_timing_json:
        timing.save_json(output + '-timing.json')
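
# Hypothetical invocation of main() from the shell (the actual flag names are
# defined in args/mdr.json, which is not shown here, so these are assumptions):
#
#   $ python mdr.py --input data.dat --vertices 100 200 \
#         --matching gmb --similarity common_neighbors \
#         --max-levels 3 --reduction-factor 0.5 --save-gml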
def run(self):
    graph = self.source_graph.copy()
    while True:
        level = graph['level']
        debug_print("------------------------------------------------------")
        debug_print("level:")
        debug_print(level)
        debug_print(graph)
        debug_print("------------------------------------------------------")
        contract = False
        args = []
        spark_args = []
        broadcast_kwargs = []
        current_layer = 0
        for layer in range(graph['layers']):
            current_layer = current_layer + 1
            do_matching = True
            if self.global_min_vertices[layer] is None and level[layer] >= self.max_levels[layer]:
                debug_print("max levels reached for layer")
                do_matching = False
            elif self.global_min_vertices[layer] and graph['vertices'][layer] <= self.global_min_vertices[layer]:
                debug_print("global min vertices reached for layer")
                do_matching = False
            if do_matching:
                contract = True
                level[layer] += 1
                graph['similarity'] = getattr(Similarity(graph, graph['adjlist']), self.similarity[layer])
                kwargs = dict(reduction_factor=self.reduction_factor[layer])
                if self.matching[layer] in ['mlpb', 'gmb', 'rgmb']:
                    kwargs['vertices'] = graph['vertices_by_type'][layer]
                    kwargs['reverse'] = self.reverse[layer]
                if self.matching[layer] in ['mlpb', 'rgmb']:
                    kwargs['seed_priority'] = self.seed_priority[layer]
                if self.matching[layer] in ['mlpb']:
                    kwargs['upper_bound'] = self.upper_bound[layer]
                    kwargs['n'] = self.source_graph['vertices'][layer]
                    kwargs['global_min_vertices'] = self.global_min_vertices[layer]
                    kwargs['tolerance'] = self.tolerance[layer]
                    kwargs['itr'] = self.itr[layer]
                if self.matching[layer] in ['hem', 'lem', 'rm']:
                    one_mode_graph = graph.weighted_one_mode_projection(graph['vertices_by_type'][layer])
                    matching_function = getattr(one_mode_graph, self.matching[layer])
                else:
                    matching_function = getattr(graph, self.matching[layer])
                # TODO: this could be simplified, since gmb_pure is hardcoded on the Spark path
                matching_function_spark = getattr(graph, 'pure_gmb' if self.spark is True and self.matching[layer] == 'gmb' else self.matching[layer])
                # Build args for the multiprocessing pool and the Spark job
                args.append([(matching_function, kwargs)])
                spark_args.append([(matching_function_spark, kwargs, current_layer)])
                broadcast_kwargs.append(kwargs)
        graph_similarity = self.sparkContext.broadcast(graph['similarity'])

        def flat_map(arrays, function) -> list:
            mapped_array = []
            for array in arrays:
                for item in function(array):
                    mapped_array.append(item)
            return mapped_array

        if not contract:
            debug_print("break: no layer left to contract")
            break

        vertices = flat_map(broadcast_kwargs, lambda arg: arg["vertices"])
        final_matching = []
        broadcast_graph = self.sparkContext.broadcast(graph)
        if self.spark:
            sorted_edges_by_layer = self.sparkContext.parallelize(spark_args) \
                .flatMap(lambda arg: gmb_pure_flat_map_two_layers_into_one_list_with_neighborhood(arg, broadcast_graph)) \
                .flatMap(lambda arg: gmb_pure_compute_neigh_list_with_similarity(arg, graph_similarity)) \
                .reduceByKey(lambda a, b: gmb_pure_map_neight_with_great_similarity(a, b)) \
                .map(gmb_pure_map_by_layer_reduced) \
                .sortBy(sort_by_similarity) \
                .groupByKey() \
                .collect()
            final_matching = gmb_matching_pure_spark(graph, sorted_edges_by_layer, broadcast_kwargs)
        coarsened_graph = contract_pure(input_graph=graph, matching=final_matching)
        coarsened_graph['level'] = level
        if coarsened_graph.vcount() == graph.vcount():
            debug_print("break: vertex count unchanged")
            break
        self.hierarchy_graphs.append(coarsened_graph)
        self.hierarchy_levels.append(level[:])
        graph = coarsened_graph
        debug_print(graph)
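
# A plain-Python sketch of the pattern the Spark pipeline above expresses: for
# each vertex keep the candidate neighbor with the greatest similarity
# (reduceByKey), then sort the surviving edges by similarity (sortBy). The
# tuple layout and values are illustrative, not the real RDD contents.
candidate_edges = [(0, 3, 0.9), (0, 4, 0.4), (1, 3, 0.7)]   # (u, v, similarity)
best = {}
for u, v, sim in candidate_edges:
    if u not in best or sim > best[u][1]:
        best[u] = (v, sim)                                  # keep greatest similarity
sorted_edges = sorted(best.items(), key=lambda kv: -kv[1][1])
print(sorted_edges)   # [(0, (3, 0.9)), (1, (3, 0.7))]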
def transform(self, X):
    self.g = helperigraph.load_matrix(X)
    n = self.g['vertices'][1]
    self.g['level'] = 0

    # Remap edge weights to the interval [0.1, 10] and dump them in .ncol format
    new_min = 0.1
    new_max = 10
    old_min = min(self.g.es['weight'])
    old_max = max(self.g.es['weight'])
    with open("../bnoc-src/output/cbrson.ncol", "w+") as f:
        for e in self.g.es():
            e['weight'] = helper.remap(e['weight'], old_min, old_max, new_min, new_max)
            f.write(str(e.tuple[0]) + ' ' + str(e.tuple[1]) + ' ' + str(e['weight']) + '\n')

    running = True
    while running:
        running = False
        # membership must be mutable: the matching method fills it in place
        membership = list(range(self.g.vcount()))
        levels = self.g['level']
        contract = False
        matching_layer = True
        # Stop either by level (no global minimum given) or by vertex count
        if self.global_min_vertices is None:
            if levels >= self.max_levels:
                matching_layer = False
        elif int(self.g['vertices'][1]) <= int(self.global_min_vertices):
            matching_layer = False
        if matching_layer:
            contract = True
            running = True
            levels += 1
            self.g['similarity'] = getattr(Similarity(self.g, self.g['adjlist']), self.similarity)
            start = sum(self.g['vertices'][0:1])
            end = sum(self.g['vertices'][0:2])
            vertices = range(start, end)
            param = dict(reduction_factor=self.reduction_factor)
            if self.matching in ['mlpb', 'nmlpb', 'nmb']:
                param['upper_bound'] = self.upper_bound
                param['n'] = n
                param['global_min_vertices'] = self.global_min_vertices
            if self.matching in ['mlpb', 'nmlpb', 'gmb', 'rgmb']:
                param['vertices'] = vertices
            if self.matching in ['mlpb']:
                param['tolerance'] = self.tolerance
                param['itr'] = self.itr
            if self.matching in ['hem', 'lem', 'rm']:
                one_mode_graph = self.g.weighted_one_mode_projection(vertices)
                matching_method = getattr(one_mode_graph, self.matching)
            else:
                matching_method = getattr(self.g, self.matching)
            matching_method(membership, **param)
        if contract:
            coarse = self.g.contract(membership)
            coarse['level'] = levels
            if coarse.vcount() == self.g.vcount():
                break
            self.g = coarse
    return helperigraph.biajcent_matrix(self.g)
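
# A minimal sketch of the linear rescaling helper.remap is assumed to perform
# above (mapping a value from [old_min, old_max] onto [new_min, new_max]); the
# real helper may differ.
def remap_sketch(value, old_min, old_max, new_min, new_max):
    # Position of value inside the old interval, projected onto the new interval
    ratio = (value - old_min) / (old_max - old_min)
    return new_min + ratio * (new_max - new_min)

print(remap_sketch(5.0, 0.0, 10.0, 0.1, 10.0))   # 5.05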