def read_graph(filename, direction):
    """Load a space-delimited edge list into a graph-tool graph.

    Vertex ids in the file are arbitrary strings (``hashed=True``), and
    ``direction`` decides whether the result is a directed graph.
    """
    return gt.load_graph_from_csv(
        filename,
        directed=direction,
        hashed=True,
        string_vals=True,
        csv_options={'delimiter': ' '},
    )
def run_motif_significance(graph, directed=True, data_loc="../data/", motif_size=3, n_shuffles=16, s_model='uncorrelated'):
    """Run z-score computation for all `motif_size` subgraph on the given `graph`.

    By default, graph is loaded as a directed graph_tool instance.

    Parameters
    ==========
    graph: name of the graph file.
    """
    edges_path = data_loc + graph + ".edges"
    g = load_graph_from_csv(
        edges_path,
        directed,
        csv_options={'quotechar': '"', 'delimiter': ' '},
    )
    m, z = motif_significance(g, motif_size, n_shuffles, shuffle_model=s_model)
    # Directed runs are tagged e.g. "3m", undirected ones "3um".
    suffix = 'm' if directed else 'um'
    motif_annotation = str(motif_size) + suffix
    output_name = "{}{}_{}.{}".format(data_loc, graph, motif_annotation,
                                      "motifslog")
    return write_motifs_results(output_name, m, z, n_shuffles, s_model)
def load_graph(args: argparse.Namespace) -> Tuple[Graph, np.ndarray]:
    """Loads the graph and the truth partition.

    Parameters
    ----------
    args : argparse.Namespace
        the command-line arguments passed to the program

    Returns
    -------
    graph : Graph
        the loaded graph
    assignment : np.ndarray[int]
        the true vertex-to-community membership array
    """
    input_filename = build_filepath(args)
    # Either parse the TSV directly with graph-tool, or fall back to the
    # project's own loader.
    if args.gtload:
        graph = load_graph_from_csv(input_filename + ".tsv",
                                    not args.undirected,
                                    csv_options={'delimiter': args.delimiter})
    else:
        graph = _load_graph(input_filename)
    print(graph)
    true_membership = load_true_membership(input_filename,
                                           graph.num_vertices())
    if args.verbose:
        print(f'Number of vertices: {graph.num_vertices()}')
        print(f'Number of edges: {graph.num_edges()}')
    if args.degrees:
        save_degree_distribution(args, graph)
    return graph, true_membership
def run(filename, header_bool, sub, obj, props, directed, output):
    """Load a KGTK TSV edge file into graph-tool and optionally save it.

    Parameters: `filename` - TSV edge file; `header_bool` - skip the first
    (header) row when True; `sub`/`obj` - column indices of the edge
    endpoints; `props` - comma-separated names for the remaining columns,
    stored as edge properties; `directed` - build a directed graph;
    `output` - if truthy, path to save the loaded graph to.

    Raises KGTKException (carrying the original error message) on failure.
    """
    # import modules locally
    import socket
    from graph_tool import load_graph_from_csv
    from kgtk.exceptions import KGTKException
    try:
        eprop_names = props.split(',')
        print('loading the TSV graph now ...')
        G2 = load_graph_from_csv(filename,
                                 skip_first=header_bool,
                                 directed=directed,
                                 hashed=True,
                                 ecols=[sub, obj],
                                 eprop_names=eprop_names,
                                 csv_options={'delimiter': '\t'})
        print('graph loaded! It has %d nodes and %d edges' %
              (G2.num_vertices(), G2.num_edges()))
        if output:
            print('now saving the graph to %s' % output)
            G2.save(output)
    except Exception as e:
        # Preserve the underlying error message instead of swallowing it
        # with a bare `except:` (also avoids masking SystemExit et al.).
        raise KGTKException('Error: ' + str(e))
def datafile_to_graph(filename):
    """Parse a space-separated edge list into a directed graph whose two
    extra integer columns become the `time_start_key` / `time_end_key`
    edge properties."""
    csv_opts = {"delimiter": " "}
    return graph_tool.load_graph_from_csv(filename,
                                          directed=True,
                                          string_vals=False,
                                          eprop_types=['int', 'int'],
                                          eprop_names=[time_start_key,
                                                       time_end_key],
                                          csv_options=csv_opts)
def run(filename, directed, log_file, output):
    """Load a KGTK TSV edge file, write relation-frequency statistics to
    `log_file`, and optionally save the loaded graph to `output`."""
    from kgtk.exceptions import KGTKException

    def infer_index(h, options=[]):
        # Return the position in header `h` of the first matching option,
        # or -1 when none is present.
        for o in options:
            if o in h:
                return h.index(o)
        return -1

    def infer_predicate(h, options=[]):
        # Return the first option that appears in header `h`, else ''.
        for o in options:
            if o in h:
                return o
        return ''

    try:
        # import modules locally
        import socket
        from graph_tool import load_graph_from_csv
        from graph_tool import centrality
        import kgtk.gt.analysis_utils as gtanalysis
        import sys
        # Sniff the header to locate subject/object columns; all other
        # columns become edge properties.
        with open(filename, 'r') as f:
            header = next(f).split('\t')
            subj_index = infer_index(header, options=['node1', 'subject'])
            obj_index = infer_index(header,
                                    options=['node2', 'object', 'value'])
            predicate = infer_predicate(
                header, options=['property', 'predicate', 'label'])
            p = []
            for i, header_col in enumerate(header):
                if i in [subj_index, obj_index]:
                    continue
                p.append(header_col)
        with open(log_file, 'w') as writer:
            writer.write('loading the TSV graph now ...\n')
            G2 = load_graph_from_csv(filename,
                                     skip_first=True,
                                     directed=directed,
                                     hashed=True,
                                     ecols=[subj_index, obj_index],
                                     eprop_names=p,
                                     csv_options={'delimiter': '\t'})
            writer.write('graph loaded! It has %d nodes and %d edges\n' %
                         (G2.num_vertices(), G2.num_edges()))
            writer.write('\n###Top relations:\n')
            for rel, freq in gtanalysis.get_topN_relations(
                    G2, pred_property=predicate):
                writer.write('%s\t%d\n' % (rel, freq))
            if output:
                writer.write('now saving the graph to %s\n' % output)
                G2.save(output)
    except Exception as e:
        raise KGTKException('Error: ' + str(e))
def run(filename, output, header_bool, sub, obj, pred, props, undirected, strong):
    """Label connected components of a KGTK TSV edge file and emit
    `node1  connected_component  <component-id>` rows to `output` (or
    stdout when `output` is falsy).

    `props`, when given, is a comma-separated list of predicate values;
    the graph is first restricted to edges carrying one of them.
    `strong` selects strongly-connected components.
    """
    # import modules locally
    import csv
    import sys
    from graph_tool import load_graph_from_csv
    from graph_tool.util import find_edge
    from graph_tool.topology import label_components
    from kgtk.exceptions import KGTKException
    from kgtk.cli_argparse import KGTKArgumentParser

    def find_pred_position(sub, pred, obj):
        # graph-tool names the non-endpoint columns c0, c1, ...; work out
        # the number it assigned to the predicate column.
        if pred < sub and pred < obj:
            return pred
        elif (pred > sub and pred < obj) or (pred < sub and pred > obj):
            return pred - 1
        else:
            return pred - 2

    try:
        header = ['node1', 'label', 'node2']
        label = 'c' + str(find_pred_position(sub, pred, obj))
        g = load_graph_from_csv(filename,
                                not undirected,
                                skip_first=not header_bool,
                                hashed=True,
                                csv_options={'delimiter': '\t'},
                                ecols=(sub, obj))
        if props:
            # Keep only edges whose predicate is in the requested list.
            es = []
            for e in props.split(','):
                es += find_edge(g, g.edge_properties[label], e)
            g.clear_edges()
            g.add_edge_list(list(set(es)))
        comp, hist = label_components(g, directed=strong)
        if output:
            # `with` guarantees the file is closed even if a write fails
            # (the original leaked the handle on error).
            with open(output, 'w') as f:
                wr = csv.writer(f,
                                quoting=csv.QUOTE_NONE,
                                delimiter="\t",
                                escapechar="\n",
                                quotechar='')
                wr.writerow(header)
                for v, c in enumerate(comp):
                    wr.writerow([g.vertex_properties['name'][v],
                                 'connected_component', c])
        else:
            sys.stdout.write('%s\t%s\t%s\n' % ('node1', 'label', 'node2'))
            for v, c in enumerate(comp):
                sys.stdout.write('%s\t%s\t%s\n' %
                                 (g.vertex_properties['name'][v],
                                  'connected_component', str(c)))
    except Exception as e:
        # Surface the underlying error instead of a message-less exception.
        raise KGTKException('Error: ' + str(e))
def load_graph_from_edgelist(dataset, options=None):
    """Load a directed graph for `dataset`, preferring its binary .gt file.

    Falls back to parsing the edgelist when no .gt file exists (or when
    `options['reconstruct_graph']` forces it); returns None when neither
    file is present. `options['dict_hashed']` means the edgelist already
    uses integer vertex indices, so no hashing is needed.
    """
    # Fresh dict per call: a mutable default argument would be shared
    # across calls (classic Python pitfall).
    if options is None:
        options = {}
    edgelist, graph_gt = dataset['path_edgelist'], dataset['path_graph_gt']
    D = None
    # prefer graph_gt file
    if not options.get('reconstruct_graph') and \
            graph_gt and os.path.isfile(graph_gt):
        log.info('Constructing DiGraph from gt.xz')
        D = load_graph(graph_gt)
    elif edgelist and os.path.isfile(edgelist):
        log.info('Constructing DiGraph from edgelist')
        # hashed=False only when the ids are already integer indices.
        hashed = not options.get('dict_hashed')
        D = load_graph_from_csv(edgelist,
                                directed=True,
                                hashed=hashed,
                                skip_first=False,
                                csv_options={
                                    'delimiter': ' ',
                                    'quotechar': '"'
                                })
        # check if graph should be dumped
        dump_graph(D, edgelist, options)
    else:
        log.error(
            'edgelist or graph_gt file to read graph from does not exist')
        return None
    return D
def load(dbsession, graph_id, graph_cache_dir):
    """Construct a JackDawDomainGraphGrapthTools instance and populate its
    graph from the cached CSV edge file.

    # NOTE(review): graph_cache_dir is presumably a pathlib.Path
    # (joinpath is used) -- confirm against callers.
    """
    graph_file = graph_cache_dir.joinpath(
        JackDawDomainGraphGrapthTools.graph_file_name)
    g = JackDawDomainGraphGrapthTools(dbsession, graph_id)
    # hashed=False / string_vals=False: the CSV's vertex ids are treated
    # as integer vertex indices by graph-tool.
    g.graph = graph_tool.load_graph_from_csv(str(graph_file),
                                             directed=True,
                                             string_vals=False,
                                             hashed=False)
    g.setup()
    logger.debug('Graph loaded to memory')
    return g
def load_graph(tsv_fname, directed=True, skip_first=True, sep='\t'):
    """Load graph from a TSV edgelist file into graph-tool

    This will take a long time on large graphs (~2 hours for WoS 2018)

    :tsv_fname: path to edgelist file (TSV with header)
    :directed: if True, the graph has directed edges
    :skip_first: skip the first line of the TSV file (i.e., there is a header)
    :sep: delimiter for the TSV file (default: tab)
    :returns: graph_tool object
    """
    csv_options = {'delimiter': sep}
    return graph_tool.load_graph_from_csv(
        tsv_fname,
        directed=directed,
        skip_first=skip_first,
        csv_options=csv_options,
    )
def process(self):
    """Read the input KGTK file, label its connected components, and write
    one `node1  connected_component  <id>` row per vertex via KgtkWriter."""
    input_kr: KgtkReader = KgtkReader.open(
        self.input_file_path,
        error_file=self.error_file,
        who="input",
        options=self.input_reader_options,
        value_options=self.value_options,
        verbose=self.verbose,
        very_verbose=self.very_verbose,
    )
    # Key columns: [subject, predicate, object] positions in the input.
    input_key_columns: typing.List[int] = self.get_key_columns(
        input_kr, "input")
    label_col_idx = input_key_columns[1]
    # graph-tool names non-endpoint columns c0, c1, ...; this is the name
    # it gives the predicate column.
    label = '{}{}'.format('c', label_col_idx)
    g = load_graph_from_csv(str(input_kr.file_path),
                            not (self.undirected),
                            skip_first=not (self.no_header),
                            hashed=True,
                            csv_options={'delimiter': '\t'},
                            ecols=(input_key_columns[0],
                                   input_key_columns[2]))
    es = []
    header = ['node1', 'label', 'node2']
    if self.properties:
        # Restrict the graph to edges whose predicate is in the
        # comma-separated self.properties list.
        properties = self.properties.split(',')
        for e in properties:
            es += (find_edge(g, g.edge_properties[label], e))
        g.clear_edges()
        g.add_edge_list(list(set(es)))
    # self.strong selects strongly-connected components.
    comp, hist = label_components(g, directed=self.strong)
    ew: KgtkWriter = KgtkWriter.open(header,
                                     self.output_file_path,
                                     mode=input_kr.mode,
                                     require_all_columns=False,
                                     prohibit_extra_columns=True,
                                     fill_missing_columns=True,
                                     gzip_in_parallel=False,
                                     verbose=self.verbose,
                                     very_verbose=self.very_verbose)
    for v, c in enumerate(comp):
        ew.write([
            g.vertex_properties['name'][v], 'connected_component', str(c)
        ])
def to_graph_tool(data):
    """
    convert the dataset to a graph-tool graph. (TBD) graph_tool support weights?

    :param data: :py:class:`gct.Dataset`
    :rtype: graph-tool graph
    """
    import graph_tool
    edge_file = data.file_edges
    # Materialize the edge list on disk first if it is not there yet.
    if not utils.file_exists(edge_file):
        data.to_edgelist()
    return graph_tool.load_graph_from_csv(edge_file,
                                          directed=data.is_directed(),
                                          string_vals=False,
                                          skip_first=False,
                                          csv_options={"delimiter": " "})
def run(input_file: KGTKFiles, directed, max_hops, source_nodes, target_nodes):
    """Enumerate all paths (up to `max_hops`) between each source/target
    node pair and emit every edge with a `graph` column listing the
    path ids it participates in."""
    def infer_index(h, options=[]):
        # Position in header `h` of the first matching option, else -1.
        for o in options:
            if o in h:
                return h.index(o)
        return -1

    def infer_predicate(h, options=[]):
        # First option present in header `h`, else ''.
        for o in options:
            if o in h:
                return o
        return ''

    try:
        # import modules locally
        from kgtk.exceptions import KGTKException
        import socket
        from graph_tool import load_graph_from_csv
        from graph_tool import centrality
        from graph_tool.all import find_vertex
        from graph_tool.topology import all_paths
        import sys
        from collections import defaultdict
        id_col = 'name'
        graph_edge = 'graph'
        filename: Path = KGTKArgumentParser.get_input_file(input_file)
        filename = str(filename)
        # Sniff the header: subject/object become endpoints, everything
        # else an edge property; an 'id' column is mandatory.
        with open(filename, 'r') as f:
            header = next(f).split('\t')
            subj_index = infer_index(header, options=['node1', 'subject'])
            obj_index = infer_index(header,
                                    options=['node2', 'object', 'value'])
            predicate = infer_predicate(
                header, options=['property', 'predicate', 'label'])
            p = []
            for i, header_col in enumerate(header):
                if i in [subj_index, obj_index]:
                    continue
                p.append(header_col)
            if 'id' not in p:
                raise KGTKException('Error: no id column found')
        G = load_graph_from_csv(filename,
                                skip_first=True,
                                directed=directed,
                                hashed=True,
                                ecols=[subj_index, obj_index],
                                eprop_names=p,
                                csv_options={'delimiter': '\t'})
        graph_id = 1
        # paths: edge id -> set of path ids that traverse the edge.
        paths = defaultdict(set)
        for source_node in source_nodes:
            source_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=source_node)
            # Only unambiguous matches (exactly one vertex) are used.
            if len(source_ids) == 1:
                source_id = source_ids[0]
                for target_node in target_nodes:
                    target_ids = find_vertex(G,
                                             prop=G.properties[('v', id_col)],
                                             match=target_node)
                    if len(target_ids) == 1:
                        target_id = target_ids[0]
                        for path in all_paths(G,
                                              source_id,
                                              target_id,
                                              cutoff=max_hops,
                                              edges=True):
                            for an_edge in path:
                                edge_id = G.properties[('e', 'id')][an_edge]
                                paths[edge_id].add(str(graph_id))
                            graph_id += 1
        sys.stdout.write('node1\tlabel\tnode2\tid\t%s\n' % graph_edge)
        for e in G.edges():
            sid, oid = e
            edge_id = G.properties[('e', 'id')][e]
            lbl = G.ep[predicate][e]
            # '|'-joined list of path ids covering this edge (may be '').
            graph_id = '|'.join(list(paths[edge_id]))
            sys.stdout.write(
                '%s\t%s\t%s\t%s\t%s\n' %
                (G.vp[id_col][sid], lbl, G.vp[id_col][oid], edge_id,
                 graph_id))
    except Exception as e:
        raise KGTKException('Error: ' + str(e))
def run(filename, root, rootfile, rootfilecolumn, root_header_bool, output, header_bool, sub, obj, pred, props, undirected):
    """Emit `node1  reachable  node2` rows for every node reachable (DFS)
    from the given root nodes, optionally restricted to edges whose
    predicate is in the comma-separated `props` list.

    Roots come from column `rootfilecolumn` of `rootfile` and/or the
    comma-separated `root` string. Output goes to `output` when set,
    otherwise to stdout.
    """
    import sys
    import csv
    import time
    from graph_tool.search import dfs_iterator
    from graph_tool import load_graph_from_csv
    from graph_tool.util import find_edge
    from kgtk.exceptions import KGTKException
    from kgtk.cli_argparse import KGTKArgumentParser

    def find_pred_position(sub, pred, obj):
        # Graph-tool names columns that are not subject or object c0, c1...
        # This finds the number graph-tool assigned to the predicate column.
        if pred < sub and pred < obj:
            return pred
        elif (pred > sub and pred < obj) or (pred < sub and pred > obj):
            return pred - 1
        else:
            return pred - 2

    def get_edges_by_edge_prop(g, p, v):
        return find_edge(g, prop=g.properties[('e', p)], match=v)

    label = 'c' + str(find_pred_position(sub, pred, obj))
    header = ['node1', 'label', 'node2']

    # Collect root node names from the root file and/or the --root list.
    root_set = set()
    if rootfile:
        # `with` closes the file even if a row is malformed (the original
        # leaked the handle on error).
        with open(rootfile) as tsv_file:
            read_tsv = csv.reader(tsv_file, delimiter="\t")
            first_row = True
            for row in read_tsv:
                # NOTE(review): first row is skipped when root_header_bool
                # is False -- looks inverted but preserved as-is; confirm
                # the flag's intended meaning.
                if first_row and not root_header_bool:
                    first_row = False
                    continue
                root_set.add(row[rootfilecolumn])
    if root:
        for r in root.split(','):
            root_set.add(r)

    G = load_graph_from_csv(filename,
                            not undirected,
                            skip_first=not header_bool,
                            hashed=True,
                            csv_options={'delimiter': '\t'},
                            ecols=(sub, obj))
    name = G.vp["name"]
    index_list = [v for v in G.vertices() if name[v] in root_set]

    if props:
        # Restrict the graph to edges carrying one of the requested
        # predicate values.
        edge_filter_set = set()
        for prop in props.split(','):
            edge_filter_set.update(get_edges_by_edge_prop(G, label, prop))
        G.clear_edges()
        G.add_edge_list(list(edge_filter_set))

    if output:
        # `with` guarantees the output file is closed on any exit path.
        with open(output, 'w') as f:
            if not index_list:
                print("No root nodes found in the graph")
            else:
                tsv_writer = csv.writer(f,
                                        quoting=csv.QUOTE_NONE,
                                        delimiter="\t",
                                        escapechar="\n",
                                        quotechar='')
                tsv_writer.writerow(header)
                for index in index_list:
                    for e in dfs_iterator(G, G.vertex(index)):
                        tsv_writer.writerow(
                            [name[index], 'reachable', name[e.target()]])
    else:
        if not index_list:
            print("No root nodes found in the graph")
        else:
            sys.stdout.write('%s\t%s\t%s\n' % ('node1', 'label', 'node2'))
            for index in index_list:
                for e in dfs_iterator(G, G.vertex(index)):
                    sys.stdout.write(
                        '%s\t%s\t%s\n' %
                        (name[index], 'reachable', name[e.target()]))
import sys
import graph_tool as gt
import graph_tool.centrality as centr


def print_top_v(g, vprops):
    """Print vertex names ordered by decreasing `vprops` score."""
    names = g.vertex_properties['name']
    ranked = sorted(g.vertices(), key=lambda v: vprops[v], reverse=True)
    for v in ranked:
        print(names[v])


def pageRankBiDi(g):
    """Return, per vertex, the product of forward and reverse PageRank."""
    forward = centr.pagerank(g)
    g.set_reversed(True)
    backward = centr.pagerank(g)
    g.set_reversed(False)
    # Fold the reverse scores into the forward property map in place.
    for v in g.vertices():
        forward[v] = forward[v] * backward[v]
    return forward


g = gt.load_graph_from_csv(sys.argv[1], csv_options={'delimiter': "\t"})
pr = pageRankBiDi(g) if g.num_vertices() > 0 else []
print_top_v(g, pr)
# Parse CLI args (parser is defined earlier in the file) and load the
# edge list; optionally attach edge weights read from column 3.
args = parser.parse_args()
fname = args.f
dirname = '/'.join(fname.split('/')[:-1])
node_f_name = dirname + '/node_feature.csv'
edge_f_name = dirname + '/edge_feature.csv'
sep = args.d
directed = bool(args.directed)
print('Loaded file name: {},\tis_directed: {},\tis_weighted: {}\n'.format(
    fname, directed, bool(args.w)))
g = load_graph_from_csv(fname,
                        directed=directed,
                        csv_options={
                            'delimiter': sep,
                            'quotechar': '"'
                        })
weight = None
if bool(args.w):
    # NOTE(review): assumes every line has a numeric weight in the third
    # whitespace-separated field, no header row, and that graph-tool's
    # edge order matches file order -- confirm before relying on this.
    edge_weights = []
    with open(fname) as f:
        for l in f:
            edge_weight = float(l.split()[2])
            edge_weights.append(edge_weight)
    # create property for edge weights
    weight = g.new_edge_property('float')
    weight.a = edge_weights
def main(args):
    """Sample 1000 vertices from the TSV edge list `args.edges`, time a
    shortest-distance computation from each sampled source to the whole
    sample, and write per-source distance files plus a calc_times CSV
    into `args.outdir`."""
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        logger.debug("creating output directory: {}".format(outdir))
        os.mkdir(outdir)
    else:
        logger.debug("using output directory: {}".format(outdir))
    start = timer()
    logger.debug("loading graph from {}. This will take a while...".format(
        args.edges))
    g = graph_tool.load_graph_from_csv(args.edges,
                                       directed=True,
                                       skip_first=True,
                                       csv_options={'delimiter': '\t'})
    logger.debug("done loading graph. Took {}".format(
        format_timespan(timer() - start)))
    start = timer()
    logger.debug("creating dictionary of name to vertices...")
    name_to_v = {g.vp.name[v]: v for v in g.vertices()}
    logger.debug("done loading dictionary. Took {}".format(
        format_timespan(timer() - start)))
    start = timer()
    n_samples = 1000
    random_seed = 999
    logger.debug("getting a sample of {} vertices (random seed: {})".format(
        n_samples, random_seed))
    random_state = np.random.RandomState(random_seed)
    # NOTE(review): randint samples with replacement, so duplicate
    # vertices are possible -- confirm that is acceptable.
    vertices_sample_indexes = random_state.randint(low=0,
                                                   high=len(name_to_v),
                                                   size=n_samples)
    vertices_sample = [g.vertex(x) for x in vertices_sample_indexes]
    # vertices_sample is a list of graph-tools Vertex objects
    logger.debug("done getting random sample. took {}".format(
        format_timespan(timer() - start)))
    # get a unique filename
    i = 0
    while True:
        fname_calc_times = os.path.join(outdir,
                                        'calc_times_{:03}.csv'.format(i))
        if not os.path.exists(fname_calc_times):
            break
        i += 1
    # buffering=1: line-buffered so progress survives a crash mid-run.
    f_calc_times = open(fname_calc_times, 'w', buffering=1)
    sep = ','
    logger.debug("writing header to {}".format(fname_calc_times))
    f_calc_times.write(
        "source_index{sep}source_name{sep}calc_time{sep}distance_fname\n".
        format(sep=sep))
    start = timer()
    logger.debug("starting shortest path calculations...")
    if args.undirected is True:
        logger.debug(
            "treating graph as undirected for shortest distance calculations")
        directed = False
    else:
        # None lets shortest_distance use the graph's own directedness.
        directed = None
    for i, source in enumerate(vertices_sample):
        this_start = timer()
        source_name = g.vp.name[source]
        source_index = vertices_sample_indexes[i]
        outfname = "{:012d}.csv".format(
            i)  # filename corresponds to row number of calc_time.csv file
        outfname = os.path.join(outdir, outfname)
        # Existing files are skipped, so an interrupted run can resume.
        if os.path.exists(outfname):
            logger.debug(
                "filename {} already exists. skipping.".format(outfname))
            continue
        logger.debug(
            "calculating shortest distance for vertex: index: {} | name: {}".
            format(source_index, source_name))
        dist = shortest_distance(g,
                                 source=source,
                                 target=vertices_sample,
                                 directed=directed)
        this_time = timer() - this_start
        with open(outfname, 'w') as outf:
            for x in dist:
                outf.write("{}\n".format(x))
        f_calc_times.write(
            "{source_index}{sep}{source_name}{sep}{calc_time}{sep}{distance_fname}\n"
            .format(sep=sep,
                    source_index=source_index,
                    source_name=source_name,
                    calc_time=this_time,
                    distance_fname=outfname))
    logger.debug("finished shortest path calculations. Took {}".format(
        format_timespan(timer() - start)))
    f_calc_times.close()
from datetime import datetime
from os import listdir
import csv

import graph_tool as gt
from graph_tool.all import *
from graph_tool import *

stage_dir = "/media/johannes/D45CF5375CF514C8/Users/johannes/mlhd/0-15/stage/"
stage_files = listdir(stage_dir)

for stage_file in stage_files:
    print(stage_file)
    gx = gt.load_graph_from_csv(stage_dir + stage_file,
                                directed=True,
                                string_vals=True,
                                csv_options={'delimiter': '\t'})
    # need to set limits - should be rather low: want to focus on new ones
    # genres are not equals
    rel_mbids = find_vertex_range(gx, 'in', (100, 10**19))
    # Strip the 4-char extension when naming the per-file mbid dump.
    mbid_file = stage_dir + "mbids/" + stage_file[:-4] + ".csv"
    with open(mbid_file, 'w') as fo:
        wr = csv.writer(fo)
        # Plain loop instead of a side-effect comprehension; also avoids
        # shadowing the outer loop variable.
        for v in rel_mbids:
            wr.writerow([gx.vp.name[v]])
# TODO: add saving of graph so that we don't have to read them in all the time
parser.add_argument("edgelistfilename", help="the edgelist to be parsed")
parser.add_argument("-p", "--pngfilename", type=str,
                    help="the output png name", default="test.png")
args = parser.parse_args()

# Load the undirected, space-delimited edge list; the first line is a
# header and is skipped.
g = graph_tool.load_graph_from_csv(args.edgelistfilename,
                                   directed=False,
                                   skip_first=True,
                                   csv_options={"delimiter": " "})

# label_components returns (per-vertex label map, component-size
# histogram); comps[1] -- the histogram -- is written to disk.
comps = graph_tool.topology.label_components(g)
numpy.savetxt("comps.txt", comps[1])
def run(input_file: KGTKFiles, path_file, output_stats, directed, max_hops):
    """For each (source, target) pair in `path_file`, enumerate all paths
    up to `max_hops` and print one row per path edge; unless
    `output_stats` is set, the input edges themselves are echoed first."""
    def infer_index(h, options=[]):
        # Position in header `h` of the first matching option, else -1.
        for o in options:
            if o in h:
                return h.index(o)
        return -1

    def infer_predicate(h, options=[]):
        # First option present in header `h`, else ''.
        for o in options:
            if o in h:
                return o
        return ''

    try:
        # import modules locally
        from kgtk.exceptions import KGTKException
        import socket
        from graph_tool import load_graph_from_csv
        from graph_tool import centrality
        from graph_tool.all import find_vertex
        from graph_tool.topology import all_paths
        import sys
        import csv
        from collections import defaultdict
        # Very long cell values would otherwise make the csv module bail.
        csv.field_size_limit(sys.maxsize)
        id_col = 'name'
        # Read the (source, target) pairs; first line is a header.
        pairs = []
        with open(path_file, 'r') as f:
            header = next(f)
            for line in f:
                src, tgt = line.strip().split('\t')
                pairs.append((src, tgt))
        filename: Path = KGTKArgumentParser.get_input_file(input_file)
        # Sniff the edge-file header: subject/object become endpoints,
        # everything else an edge property; an 'id' column is mandatory.
        with open(filename, 'r') as f:
            header = next(f).strip().split('\t')
            subj_index = infer_index(header, options=['node1', 'subject'])
            obj_index = infer_index(header,
                                    options=['node2', 'object', 'value'])
            predicate = infer_predicate(
                header, options=['property', 'predicate', 'label'])
            p = []
            for i, header_col in enumerate(header):
                if i in [subj_index, obj_index]:
                    continue
                p.append(header_col)
            if 'id' not in p:
                raise KGTKException('Error: no id column found')
        G = load_graph_from_csv(str(filename),
                                skip_first=True,
                                directed=directed,
                                hashed=True,
                                ecols=[subj_index, obj_index],
                                eprop_names=p,
                                csv_options={'delimiter': '\t'})
        sys.stdout.write('node1\tlabel\tnode2\tid\n')
        id_count = 0
        if not output_stats:
            # Echo each input edge with a synthesized id.
            for e in G.edges():
                sid, oid = e
                lbl = G.ep[predicate][e]
                sys.stdout.write(
                    '%s\t%s\t%s\t%s\n' %
                    (G.vp[id_col][sid], lbl, G.vp[id_col][oid],
                     '{}-{}-{}'.format(G.vp[id_col][sid], lbl, id_count)))
                id_count += 1
        id_count = 0
        path_id = 0
        paths = defaultdict(set)
        for pair in pairs:
            source_node, target_node = pair
            source_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=source_node)
            target_ids = find_vertex(G,
                                     prop=G.properties[('v', id_col)],
                                     match=target_node)
            # Only unambiguous matches (exactly one vertex each) are used.
            if len(source_ids) == 1 and len(target_ids) == 1:
                source_id = source_ids[0]
                target_id = target_ids[0]
                for path in all_paths(G,
                                      source_id,
                                      target_id,
                                      cutoff=max_hops,
                                      edges=True):
                    # One output row per edge in the path: pN, position,
                    # edge id, synthesized row id.
                    for edge_num, an_edge in enumerate(path):
                        edge_id = G.properties[('e', 'id')][an_edge]
                        node1 = 'p%d' % path_id
                        sys.stdout.write(
                            '%s\t%d\t%s\t%s\n' %
                            (node1, edge_num, edge_id, '{}-{}-{}'.format(
                                node1, edge_num, id_count)))
                        id_count += 1
                    path_id += 1
    except Exception as e:
        raise KGTKException('Error: ' + str(e))
# Dump the already-collected synthetic-graph properties.
with open('synthetic_graphs_examined.csv', mode='w') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=Fieldnames)
    writer.writeheader()
    for prop in props:
        writer.writerow(prop._asdict())

# Examine real world graphs
print("=====Examining Real World Graphs=====")
# Write the header once; each graph's row is appended below so partial
# results survive a crash mid-loop.
with open('real_graphs_examined.csv', mode='w') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=Fieldnames)
    writer.writeheader()

for graphname in REAL:
    print("graphname = ", graphname)
    filename = "../../data/{0}/unkOverlap_unkBlockSizeVar/{0}_unkOverlap_unkBlockSizeVar_-1_nodes".format(
        graphname)
    # Single load call -- only the directedness flag differs per graph
    # (the original duplicated the whole call in both branches).
    directed = graphname not in REAL_UNDIRECTED
    graph = load_graph_from_csv(filename + ".tsv",
                                directed,
                                csv_options={'delimiter': ' '})
    print("done loading graph")
    prop = examine_graph(graph, "real", graphname, True, directed)
    print("done examining graph")
    with open('real_graphs_examined.csv', mode='a') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=Fieldnames)
        writer.writerow(prop._asdict())
# use awk first # awk '$1 >= 1239200000 && $1 <= 1259400000' *.txt >> read_in.txt # requires filename as output # awk '$1 > 5 && $1 < 20' *.txt t3 = time.time() proc_str = 'cd ' + daet_dir + ' && ./sorter.sh' os.system(proc_str) g2 = gt.load_graph_from_csv('/media/johannes/D45CF5375CF514C8/Users/johannes/mlhd/0-15/01/read_in.xxx', directed=True, string_vals=True, csv_options={'delimiter':'\t'}) t4=time.time() subs=find_vertex_range(g2, 'in', (500, 1000000)) g2.vp.name[subs] [print(g2.vp.name[i]) for i in subs] ################################## # add multiple dirs rel_dirs = ['00','01', '02', '03', '04', '05'] t1 = 1250000000 t2 = 1260000000
def run(input_file: KGTKFiles, directed, compute_degrees, compute_pagerank, compute_hits, log_file, output_stats, vertex_in_degree, vertex_out_degree, vertex_pagerank, vertex_auth, vertex_hubs):
    """Compute graph statistics (degrees, PageRank, HITS) for a KGTK TSV
    file, logging summaries to `log_file` and printing per-edge /
    per-vertex KGTK rows to stdout."""
    from kgtk.exceptions import KGTKException

    def infer_index(h, options=[]):
        # Position in header `h` of the first matching option, else -1.
        for o in options:
            if o in h:
                return h.index(o)
        return -1

    def infer_predicate(h, options=[]):
        # First option present in header `h`, else ''.
        for o in options:
            if o in h:
                return o
        return ''

    # Maps internal vertex-property names to the user-facing labels used
    # in the stdout rows.
    v_prop_dict = {
        'vertex_pagerank': vertex_pagerank,
        'vertex_hubs': vertex_hubs,
        'vertex_auth': vertex_auth
    }
    try:
        # import modules locally
        import socket
        from graph_tool import load_graph_from_csv
        from graph_tool import centrality
        import kgtk.gt.analysis_utils as gtanalysis
        from pathlib import Path
        import sys
        import csv
        # Very long cell values would otherwise make the csv module bail.
        csv.field_size_limit(sys.maxsize)
        filename: Path = KGTKArgumentParser.get_input_file(input_file)
        # hardcoded values useful for the script. Perhaps some of them should be exposed as arguments later
        directions = ['in', 'out', 'total']
        id_col = 'name'
        # Sniff the header: subject/object become endpoints, everything
        # else an edge property.
        with open(filename, 'r') as f:
            header = next(f).split('\t')
            header = [h.strip() for h in header]
            subj_index = infer_index(header, options=['node1', 'subject'])
            obj_index = infer_index(header,
                                    options=['node2', 'object', 'value'])
            predicate = infer_predicate(
                header,
                options=['label', 'predicate', 'relation', 'relationship'])
            p = []
            for i, header_col in enumerate(header):
                if i in [subj_index, obj_index]:
                    continue
                p.append(header_col)
        with open(log_file, 'w') as writer:
            writer.write('loading the TSV graph now ...\n')
            G2 = load_graph_from_csv(str(filename),
                                     skip_first=True,
                                     directed=directed,
                                     hashed=True,
                                     ecols=[subj_index, obj_index],
                                     eprop_names=p,
                                     csv_options={'delimiter': '\t'})
            writer.write('graph loaded! It has %d nodes and %d edges\n' %
                         (G2.num_vertices(), G2.num_edges()))
            writer.write('\n###Top relations:\n')
            for rel, freq in gtanalysis.get_topN_relations(
                    G2, pred_property=predicate):
                writer.write('%s\t%d\n' % (rel, freq))
            if compute_degrees:
                writer.write('\n###Degrees:\n')
                for direction in directions:
                    degree_data = gtanalysis.compute_node_degree_hist(
                        G2, direction)
                    max_degree = len(degree_data) - 1
                    mean_degree, std_degree = gtanalysis.compute_avg_node_degree(
                        G2, direction)
                    writer.write(
                        '%s degree stats: mean=%f, std=%f, max=%d\n' %
                        (direction, mean_degree, std_degree, max_degree))
            if compute_pagerank:
                writer.write('\n###PageRank\n')
                v_pr = G2.new_vertex_property('float')
                centrality.pagerank(G2, prop=v_pr)
                # Store the scores on the graph so get_topn_indices and
                # the stdout dump below can read them back.
                G2.properties[('v', 'vertex_pagerank')] = v_pr
                writer.write('Max pageranks\n')
                result = gtanalysis.get_topn_indices(G2, 'vertex_pagerank',
                                                     5, id_col)
                for n_id, n_label, pr in result:
                    writer.write('%s\t%s\t%f\n' % (n_id, n_label, pr))
            if compute_hits:
                writer.write('\n###HITS\n')
                hits_eig, G2.vp['vertex_hubs'], G2.vp[
                    'vertex_auth'] = gtanalysis.compute_hits(G2)
                writer.write('HITS hubs\n')
                main_hubs = gtanalysis.get_topn_indices(
                    G2, 'vertex_hubs', 5, id_col)
                for n_id, n_label, hubness in main_hubs:
                    writer.write('%s\t%s\t%f\n' % (n_id, n_label, hubness))
                writer.write('HITS auth\n')
                main_auth = gtanalysis.get_topn_indices(
                    G2, 'vertex_auth', 5, id_col)
                for n_id, n_label, authority in main_auth:
                    writer.write('%s\t%s\t%f\n' % (n_id, n_label, authority))
        sys.stdout.write('node1\tlabel\tnode2\tid\n')
        id_count = 0
        if not output_stats:
            # Echo each input edge with a synthesized id.
            for e in G2.edges():
                sid, oid = e
                lbl = G2.ep[predicate][e]
                sys.stdout.write(
                    '%s\t%s\t%s\t%s\n' %
                    (G2.vp[id_col][sid], lbl, G2.vp[id_col][oid],
                     '{}-{}-{}'.format(G2.vp[id_col][sid], lbl, id_count)))
                id_count += 1
        id_count = 0
        # Per-vertex rows: in/out degree plus any computed vertex
        # properties (pagerank, hubs, auth).
        for v in G2.vertices():
            v_id = G2.vp[id_col][v]
            sys.stdout.write('{}\t{}\t{}\t{}\n'.format(
                v_id, vertex_in_degree, v.in_degree(),
                '{}-{}-{}'.format(v_id, vertex_in_degree, id_count)))
            id_count += 1
            sys.stdout.write('{}\t{}\t{}\t{}\n'.format(
                v_id, vertex_out_degree, v.out_degree(),
                '{}-{}-{}'.format(v_id, vertex_out_degree, id_count)))
            id_count += 1
            for vprop in G2.vertex_properties.keys():
                if vprop == id_col:
                    continue
                sys.stdout.write(
                    '%s\t%s\t%s\t%s\n' %
                    (v_id, v_prop_dict[vprop], G2.vp[vprop][v],
                     '{}-{}-{}'.format(v_id, v_prop_dict[vprop], id_count)))
                id_count += 1
    except Exception as e:
        raise KGTKException('Error: ' + str(e))
import graph_tool
from kgtk.gt import analysis_utils, topology_utils

input_file = "first100k_P279.csv"
direction = 'total'


def _report(graph):
    """Print node count, edge count, and degree stats for `graph`."""
    print(analysis_utils.get_num_nodes(graph))
    print(analysis_utils.get_num_edges(graph))
    print(analysis_utils.compute_stats(graph, direction))


# Columns 0 and 2 of the comma-separated file are the edge endpoints.
G = graph_tool.load_graph_from_csv(input_file,
                                   directed=True,
                                   skip_first=True,
                                   ecols=(0, 2),
                                   csv_options={
                                       'delimiter': ',',
                                       'quotechar': '"'
                                   })
_report(G)
print('now computing transitive closure')
G2 = topology_utils.compute_transitive_closure(G)
print('transitive closure computed')
_report(G2)