def preprocess_train(self, node_ids):
    """
    Preprocess the training set and return a batch generator over it.

    Args:
        node_ids (list): IDs of the training nodes.

    Returns:
        NodeSequenceWrapper around a shuffled GraphSAGE node flow that yields
        the training nodes together with their encoded targets.

    Raises:
        ValueError: if ``self.ids_exist(node_ids)`` is falsy.
        Exception: if stellargraph cannot be imported or is older than 0.8.

    Side effects:
        Stores the fitted target encoder on ``self.y_encoding`` and the
        StellarGraph built from ``self.G`` on ``self.G_sg``.
    """
    if not self.ids_exist(node_ids):
        raise ValueError('node_ids must exist in self.df')

    # subset df for training nodes
    df_tr = self.df[self.df.index.isin(node_ids)]

    # one-hot-encode target
    self.y_encoding = sklearn.feature_extraction.DictVectorizer(sparse=False)
    train_targets = self.y_encoding.fit_transform(df_tr[["target"]].to_dict('records'))

    # stellargraph is imported lazily; any failure is reported with SG_ERRMSG.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
    # propagate, and `from err` keeps the original cause in the traceback.
    try:
        import stellargraph as sg
        from stellargraph.mapper import GraphSAGENodeGenerator
    except Exception as err:
        raise Exception(SG_ERRMSG) from err
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    # build the graph over ALL nodes in self.df, then flow only the training ones
    G_sg = sg.StellarGraph(self.G, node_features=self.df[self.feature_names])
    self.G_sg = G_sg
    generator = GraphSAGENodeGenerator(G_sg, U.DEFAULT_BS, [self.sampsize, self.sampsize])
    train_gen = generator.flow(df_tr.index, train_targets, shuffle=True)
    from .node_generator import NodeSequenceWrapper
    return NodeSequenceWrapper(train_gen)
def build_sgc_features(self, g, feature_dict):
    """
    Assemble per-node features and labels, then build the graph artefacts.

    Args:
        g: graph whose ``nodes()`` are iterated and whose node attributes
            hold a ``'label'`` entry.
        feature_dict: mapping from node to an indexable feature sequence;
            the width is taken from the first value in the mapping.

    Side effects:
        Sets ``self.df_features`` (after ``self.reduce_dimensions``),
        ``self.df_targets`` (labels as strings), ``self.SG``,
        ``self.generator`` and ``self.target_encoding``.
    """
    width = len(list(feature_dict.values())[0])
    feature_names = ["w_{}".format(pos) for pos in range(width)]

    # One column per feature position, plus the label column last.
    node_ids = list(g.nodes())
    table = {
        name: [feature_dict[node][pos] for node in node_ids]
        for pos, name in enumerate(feature_names)
    }
    table['label'] = [g.nodes()[node]['label'] for node in node_ids]
    frame = pd.DataFrame(data=table, index=g.nodes())

    self.df_features = frame[feature_names]
    self.df_targets = frame[['label']].astype(str)
    self.df_features = self.reduce_dimensions(self.df_features)

    self.SG = sg.StellarGraph(
        g, node_features=self.df_features, node_type_name='tag')
    self.generator = self.build_generator()

    vectorizer = feature_extraction.DictVectorizer(sparse=False)
    self.target_encoding = vectorizer.fit(self.df_targets.to_dict("records"))
def preprocess_valid(self, node_ids):
    """
    Preprocess validation nodes (transductive inference).

    Args:
        node_ids (list): list of node IDs that the generator will yield.

    Returns:
        NodeSequenceWrapper around an unshuffled GraphSAGE node flow over the
        validation nodes and their encoded targets.

    Raises:
        ValueError: if ``self.ids_exist(node_ids)`` is falsy.
        Exception: if ``self.y_encoding`` is None, or if stellargraph cannot
            be imported or is older than 0.8.

    Side effects:
        Builds and caches ``self.G_sg`` when it is still None.
    """
    if not self.ids_exist(node_ids):
        raise ValueError('node_ids must exist in self.df')
    if self.y_encoding is None:
        raise Exception('Unset parameters. Are you sure you called preprocess_train first?')

    # subset df for validation nodes
    df_val = self.df[self.df.index.isin(node_ids)]

    # one-hot-encode target with the already-fitted encoder
    val_targets = self.y_encoding.transform(df_val[["target"]].to_dict('records'))

    # stellargraph is imported lazily; any failure is reported with SG_ERRMSG.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
    # propagate, and `from err` keeps the original cause in the traceback.
    try:
        import stellargraph as sg
        from stellargraph.mapper import GraphSAGENodeGenerator
    except Exception as err:
        raise Exception(SG_ERRMSG) from err
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    # reuse the cached StellarGraph when one exists
    if self.G_sg is None:
        self.G_sg = sg.StellarGraph(self.G, node_features=self.df[self.feature_names])
    generator = GraphSAGENodeGenerator(self.G_sg, U.DEFAULT_BS, [self.sampsize, self.sampsize])
    val_gen = generator.flow(df_val.index, val_targets, shuffle=False)
    from .node_generator import NodeSequenceWrapper
    return NodeSequenceWrapper(val_gen)
def get_graph():
    """
    Build the StellarGraph from the edge list and node features.

    Returns:
        tuple: ``(node_data, G)`` where ``node_data`` is the first value
        returned by ``read_node_features()`` and ``G`` is the StellarGraph
        built from the edge-list graph and the node features.
    """
    edgelist = read_edgelist()
    nx_graph = create_graph_from_edgelist(edgelist)
    node_data, node_features = read_node_features()
    return node_data, sg.StellarGraph(nx_graph, node_features=node_features)
def preprocess_train(self, G, edge_ids, edge_labels, mode='train'):
    """
    ```
    preprocess training set
    Args:
      G (networkx graph): networkx graph
      edge_ids(list): list of tuples representing edge ids
      edge_labels(list): edge labels (1 or 0 to indicate whether it is a true
                         edge in the original graph or not)
      mode(str): the generated sequence is shuffled only when mode == 'train'
    Returns:
      LinkSequenceWrapper around the GraphSAGE link flow
    Raises:
      Exception: if stellargraph cannot be imported or is older than 0.8
    ```
    """
    # stellargraph is imported lazily; any failure is reported with SG_ERRMSG.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
    # propagate, and `from err` keeps the original cause in the traceback.
    try:
        import stellargraph as sg
        from stellargraph.mapper import GraphSAGELinkGenerator
    except Exception as err:
        raise Exception(SG_ERRMSG) from err
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    # node features are read from the "feature" attribute of each node in G
    G_sg = sg.StellarGraph(G, node_features="feature")
    shuffle = mode == 'train'
    link_seq = GraphSAGELinkGenerator(
        G_sg, U.DEFAULT_BS, self.sample_sizes
    ).flow(edge_ids, edge_labels, shuffle=shuffle)
    from .sg_wrappers import LinkSequenceWrapper
    return LinkSequenceWrapper(link_seq)
def test(edgelist, node_data, model_file, batch_size, target_name="subject"):
    """
    Load the serialized model and evaluate it on all nodes in the graph.

    Args:
        edgelist: pandas edge list passed to ``nx.from_pandas_edgelist``
        node_data: DataFrame indexed by node ID holding the feature columns
            and the ``target_name`` column
        model_file: Location of Keras model to load; the pickled target
            encoder path is derived from it by string replacement
        batch_size: Size of batch for inference
        target_name: Name of the target column in ``node_data``
    """
    # Feature vectors that the Keras model takes as input.
    # NOTE: `feature_names` is read from module scope, not from the arguments.
    features = node_data[feature_names]

    # Graph structure from the edge list
    nx_graph = nx.from_pandas_edgelist(edgelist)

    # The target encoding saved next to the model converts labels the same
    # way as at training time.
    # NOTE(security): pickle.load executes arbitrary code; only load encoder
    # files from a trusted source.
    encoder_file = model_file.replace("cora_example_model", "cora_example_encoding")
    encoder_file = encoder_file.replace(".h5", ".pkl")
    with open(encoder_file, "rb") as f:
        target_encoding = pickle.load(f)[0]

    # Encode targets with the pre-trained encoder
    target_records = node_data[[target_name]].to_dict("records")
    node_targets = target_encoding.transform(target_records)
    node_ids = node_data.index

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(nx_graph, node_features=features)

    # Load Keras model
    model = keras.models.load_model(
        model_file, custom_objects={"MeanAggregator": MeanAggregator}
    )
    print("Loaded model:")
    model.summary()

    # Recover the per-layer sample counts from ratios of consecutive input widths
    # TODO: Can we move this to the library?
    shapes = model.input_shape
    num_samples = [
        int(wider[1] / narrower[1])
        for narrower, wider in zip(shapes, shapes[1:])
    ]

    # Create mappers for GraphSAGE that input data from the graph to the model
    all_gen = GraphSAGENodeGenerator(
        G, batch_size, num_samples, seed=42
    ).flow(node_ids, node_targets)

    # Evaluate and print metrics
    all_metrics = model.evaluate_generator(all_gen)
    print("\nAll-node Evaluation:")
    for metric_name, metric_value in zip(model.metrics_names, all_metrics):
        print("\t{}: {:0.4f}".format(metric_name, metric_value))
def form_graph(edges_path, meta_path, ids_path, meta_received):
    """
    Extend the stored paper graph with one new paper and return a StellarGraph.

    The new paper gets the hard-coded ID 111180, the features in
    ``meta_received``, and one outgoing edge to every id listed in
    ``data/cutted_edges_to.csv`` plus an edge to itself.

    Args:
        edges_path: CSV of existing edges with columns "0" (source) and
            "1" (target); the first CSV column is used as the index.
        meta_path: CSV of existing node features; first column is the index.
        ids_path: CSV of existing paper ids; its first data row is skipped.
        meta_received: sequence of 10 feature values for the new paper.

    Returns:
        sg.StellarGraph with node type "paper" and edge type "paper-cites".
    """
    # NOTE(review): the new paper's ID and the target-id CSV path are
    # hard-coded; confirm whether they should become parameters.
    ID = 111180

    edges = pd.read_csv(edges_path, sep=",", index_col=0)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    idss = pd.concat(
        [
            pd.read_csv(ids_path, index_col=0, names=["paper_id"]).iloc[1:],
            pd.DataFrame([ID], columns=["paper_id"]),
        ],
        ignore_index=True,
    )

    meta = pd.read_csv(meta_path, index_col=0)
    new_meta = pd.DataFrame(
        [meta_received],
        columns=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"])
    meta_final = pd.concat([meta, new_meta], ignore_index=True)
    # Rows are aligned positionally: both frames carry a fresh 0..n-1 index.
    meta_finall = meta_final.join(idss).set_index("paper_id")

    ids = pd.concat(
        [
            pd.read_csv("data/cutted_edges_to.csv", index_col=0).iloc[1:],
            pd.DataFrame([ID], columns=["0"]),
        ],
        ignore_index=True,
    )

    # Every new edge starts at the new paper.
    column_from = pd.DataFrame({"from": [ID] * len(ids)})
    edges_final = column_from.join(ids)
    # Rename to the column names used by the existing edge table.
    edges_final.rename(columns={'from': '0', '0': '1'}, inplace=True)
    edges_final = pd.concat([edges, edges_final], ignore_index=True)

    edge_data = pd.DataFrame(
        {
            "source": list(edges_final["0"].astype(int)),
            "target": list(edges_final["1"].astype(int)),
        })

    G = sg.StellarGraph(
        {"paper": meta_finall},
        {"paper-cites": edge_data}
    )
    print(G.info())
    return G
def initialize(self, **hyper_params):
    """
    Build the graph, the unsupervised GraphSAGE model and its data flows.

    Keyword Args:
        batch_size (int): default 16.
        num_samples (list): neighbours sampled per layer, default [25, 10].
        layer_sizes (list): hidden units per layer, default [256, 256].
        bias (bool): default True.
        dropout (float): default 0.0.
        lr (float): Adam learning rate, default 1e-3.
        num_walks (int): random walks per node, default 1.
        length (int): random-walk length, default 5.

    Returns:
        The initial weights of ``self.model`` (``get_weights()``).

    Side effects:
        Sets ``self.graph``, ``self.nodes``, ``self.train_flow``,
        ``self.model``, ``self.embedding_model`` and ``self.node_gen``;
        deletes ``self.nodes_df`` and ``self.edges_df``.
    """
    # Each value falls back to its default only when the key is absent.
    # Previously a supplied key left the local unbound, and the
    # layer_sizes/num_samples checks were cross-wired.
    batch_size = hyper_params.get("batch_size", 16)
    num_samples = hyper_params.get("num_samples", [25, 10])
    layer_sizes = hyper_params.get("layer_sizes", [256, 256])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.0)
    lr = hyper_params.get("lr", 1e-3)
    num_walks = hyper_params.get("num_walks", 1)
    length = hyper_params.get("length", 5)

    self.graph = sg.StellarGraph(nodes=self.nodes_df, edges=self.edges_df)
    self.nodes = list(self.graph.nodes())

    # The raw frames are dropped once the graph holds the data.
    del self.nodes_df
    del self.edges_df

    unsupervised_samples = UnsupervisedSampler(
        self.graph, nodes=self.nodes, length=length, number_of_walks=num_walks
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(self.graph, batch_size, num_samples)
    self.train_flow = train_gen.flow(unsupervised_samples)

    # Model defining - Keras functional API + Stellargraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=train_gen, bias=bias,
        dropout=dropout, normalize="l2"
    )

    x_inp, x_out = graphsage.in_out_tensors()

    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    # The embedding model uses every second input tensor and the first
    # output tensor of the link model.
    x_inp_src = x_inp[0::2]
    x_out_src = x_out[0]
    self.embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

    self.node_gen = GraphSAGENodeGenerator(
        self.graph, batch_size, num_samples).flow(self.nodes)

    return self.model.get_weights()
def pgframe_to_stellargraph(pgframe, directed=True, include_type=False,
                            feature_vector_prop=None, feature_props=None,
                            edge_weight=None):
    """Convert a PGFrame to a StellarGraph object.

    Args:
        pgframe: source property-graph frame.
        directed: build ``sg.StellarDiGraph`` when True, else
            ``sg.StellarGraph``.
        include_type: when True, nodes are grouped per node type and the
            ``"@type"`` edge column is used as the edge type.
        feature_vector_prop: name of a node property holding a ready-made
            feature vector; takes precedence over ``feature_props``.
        feature_props: list of node property names used as feature columns
            when ``feature_vector_prop`` is None. ``None`` or an empty list
            yields a zero-width feature matrix.
        edge_weight: name of the edge property used as edge weight.

    Returns:
        The constructed StellarGraph / StellarDiGraph.
    """
    if feature_props is None:
        feature_props = []

    def _node_features(type_kwargs):
        """Return the feature matrix for the nodes selected by type_kwargs."""
        if feature_vector_prop is not None:
            return np.array(
                pgframe.get_node_property_values(
                    feature_vector_prop, **type_kwargs).to_list())
        # The original guard here was `len("feature_props") > 0`, a test on a
        # string literal that is always true, so this branch is the actual
        # fallback. An empty feature_props selects no columns.
        return pgframe.nodes(
            raw_frame=True, **type_kwargs)[feature_props].to_numpy()

    if include_type:
        nodes = {}
        for t in pgframe.node_types():
            index = pgframe.nodes(typed_by=t)
            nodes[t] = sg.IndexedArray(
                _node_features({"typed_by": t}), index=index)
    else:
        nodes = sg.IndexedArray(_node_features({}), index=pgframe.nodes())

    if pgframe.number_of_edges() > 0:
        # Keep only the edge type column (if requested) and the weight column.
        edges = pgframe.edges(
            raw_frame=True,
            include_index=True,
            filter_props=lambda x: (
                (include_type and x == "@type") or x == edge_weight),
            rename_cols={'@source_id': 'source', "@target_id": "target"})
    else:
        edges = pd.DataFrame(columns=["source", "target"])

    graph_cls = sg.StellarDiGraph if directed else sg.StellarGraph
    return graph_cls(
        nodes=nodes,
        edges=edges,
        edge_weight_column=edge_weight,
        edge_type_column="@type" if include_type else None)
def initialize(self, **hyper_params):
    """
    Split the graph's edges, build the data flows and compile the link model.

    Keyword Args:
        batch_size (int): default 20.
        num_samples (list): neighbours sampled per layer, default [20, 10].
        layer_sizes (list): hidden units per layer, default [10, 10].
        bias (bool): default True.
        dropout (float): default 0.1.
        lr (float): Adam learning rate, default 1e-2.

    Returns:
        tuple: (number of training examples, number of testing examples).

    Side effects:
        Sets ``self.graph_test``, ``self.graph_train``, ``self.train_flow``,
        ``self.test_flow`` and ``self.model``.
    """
    # Each value falls back to its default only when the key is absent.
    # Previously a supplied key left the local unbound, and the
    # layer_sizes/num_samples checks were cross-wired.
    batch_size = hyper_params.get("batch_size", 20)
    num_samples = hyper_params.get("num_samples", [20, 10])
    layer_sizes = hyper_params.get("layer_sizes", [10, 10])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.1)
    lr = hyper_params.get("lr", 1e-2)

    graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

    # Test split
    edge_splitter_test = EdgeSplitter(graph)
    self.graph_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1, method="global", keep_connected=True, seed=42
    )

    # Train split (taken from the already-reduced test graph)
    edge_splitter_train = EdgeSplitter(self.graph_test)
    self.graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=0.1, method="global", keep_connected=True, seed=42
    )

    # Train iterators
    train_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
    self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

    # Test iterators
    # NOTE(review): the test generator samples neighbourhoods from
    # graph_train rather than graph_test - confirm this is intended.
    test_gen = GraphSAGELinkGenerator(self.graph_train, batch_size, num_samples, seed=42)
    self.test_flow = test_gen.flow(edge_ids_test, edge_labels_test, shuffle=True)

    # Model defining - Keras functional API + Stellargraph layers
    graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=train_gen, bias=bias, dropout=dropout
    )

    x_inp, x_out = graphsage.in_out_tensors()

    prediction = link_classification(
        output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
    )(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=[
            keras.metrics.BinaryAccuracy(),
            keras.metrics.Recall(),
            keras.metrics.AUC(),
            keras.metrics.Precision(),
        ],
    )

    # return number of training and testing examples
    return edge_ids_train.shape[0], edge_ids_test.shape[0]
def main(graph_loc, layer_sizes, activations, dropout, learning_rate):
    """
    Train a GCN on the CORA files in ``graph_loc``, save it and evaluate it.

    Args:
        graph_loc: directory containing ``cora.cites`` and ``cora.content``
        layer_sizes: hidden sizes, forwarded to ``train`` and used in the
            saved file names
        activations: forwarded to ``train``
        dropout: forwarded to ``train`` and used in the saved file names
        learning_rate: forwarded to ``train`` and used in the saved file names

    Side effects:
        Writes the model (``.h5``) and the pickled target encoder (``.pkl``)
        to the current working directory.
    """
    # Citation edges
    cites_path = os.path.join(graph_loc, 'cora.cites')
    edgelist = pd.read_table(cites_path, header=None, names=['source', 'target'])

    # Node table: binary keyword indicators 'w_x' (1433 keywords) followed by
    # a "subject" column.
    feature_names = ['w_{}'.format(ii) for ii in range(1433)]
    content_path = os.path.join(graph_loc, 'cora.content')
    node_data = pd.read_table(
        content_path, header=None, names=feature_names + ['subject']
    )

    # Encode the subject column
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    subject_records = node_data[['subject']].to_dict("records")
    node_targets = target_encoding.fit_transform(subject_records)
    node_ids = node_data.index

    # Convert to StellarGraph and prepare for ML
    node_features = node_data[feature_names]
    G = sg.StellarGraph(
        nx.from_pandas_edgelist(edgelist),
        node_type_name="label",
        node_features=node_features,
    )

    # Stratified split: 140 training nodes, everything else held out.
    train_nodes, held_out_nodes, train_targets, held_out_targets = (
        model_selection.train_test_split(
            node_ids, node_targets,
            train_size=140, test_size=None,
            stratify=node_targets, random_state=55232
        )
    )

    # Held-out nodes are split again: 300 for validation, the rest for testing.
    val_nodes, test_nodes, val_targets, test_targets = (
        model_selection.train_test_split(
            held_out_nodes, held_out_targets,
            train_size=300, test_size=None, random_state=523214
        )
    )

    generator = FullBatchNodeGenerator(G, func_opt=GCN_Aadj_feats_op, filter='localpool')

    model = train(train_nodes, train_targets, val_nodes, val_targets,
                  generator, dropout, layer_sizes, learning_rate, activations)

    # File-name suffix describing the run configuration
    save_str = "_h{}_l{}_d{}_r{}".format(
        "gcn", ''.join(map(str, layer_sizes)), str(dropout), str(learning_rate)
    )
    model_path = "cora_gcn_model" + save_str + ".h5"
    encoding_path = "cora_gcn_encoding" + save_str + ".pkl"

    # Save the trained model
    model.save(model_path)

    # The target encoding is saved too, to convert model predictions later
    with open(encoding_path, "wb") as f:
        pickle.dump([target_encoding], f)

    test(test_nodes, test_targets, generator, model_path)
def preprocess_test(self, df_te, G_te):
    """
    ```
    preprocess for inductive inference
    Args:
      df_te (DataFrame): pandas dataframe containing new node attributes
      G_te (Graph):  a networkx Graph containing new nodes
    Returns:
      NodeSequenceWrapper around an unshuffled GraphSAGE node flow over the
      nodes in df_te. When df_te has no "target" column every target is -1.
    Raises:
      ImportError: if networkx is not installed
      ValueError: if the nodes of self.G are not a subset of G_te's nodes
      Exception: if self.y_encoding is None, or if stellargraph cannot be
                 imported or is older than 0.8
    ```
    """
    try:
        import networkx as nx
    except ImportError as err:
        raise ImportError("Please install networkx: pip install networkx") from err

    if self.y_encoding is None:
        raise Exception(
            "Unset parameters. Are you sure you called preprocess_train first?"
        )

    # get aggregated df (new rows first, then the known rows)
    df_agg = pd.concat([df_te, self.df])

    # get aggregated graph
    is_subset = set(self.G.nodes()) <= set(G_te.nodes())
    if not is_subset:
        raise ValueError("Nodes in self.G must be subset of G_te")
    G_agg = nx.compose(self.G, G_te)

    # one-hot-encode target
    if "target" in df_te.columns:
        test_targets = self.y_encoding.transform(
            df_te[["target"]].to_dict("records"))
    else:
        # One placeholder per row. The previous len(df_te.shape[0]) called
        # len() on an int and raised TypeError.
        test_targets = [-1] * df_te.shape[0]

    # stellargraph is imported lazily; any failure is reported with SG_ERRMSG.
    # `except Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
    # propagate, and `from err` keeps the original cause in the traceback.
    try:
        import stellargraph as sg
        from stellargraph.mapper import GraphSAGENodeGenerator
    except Exception as err:
        raise Exception(SG_ERRMSG) from err
    if version.parse(sg.__version__) < version.parse("0.8"):
        raise Exception(SG_ERRMSG)

    # return generator
    G_sg = sg.StellarGraph(G_agg, node_features=df_agg[self.feature_names])
    generator = GraphSAGENodeGenerator(G_sg, U.DEFAULT_BS,
                                       [self.sampsize, self.sampsize])
    test_gen = generator.flow(df_te.index, test_targets, shuffle=False)
    from .sg_wrappers import NodeSequenceWrapper
    return NodeSequenceWrapper(test_gen)
def initialize(self, **hyper_params):
    """
    Build the graph, sample training edges and compile the link model.

    Keyword Args:
        batch_size (int): default 20.
        num_samples (list): neighbours sampled per layer, default [20, 10].
        layer_sizes (list): hidden units per layer, default [20, 20].
        bias (bool): default True.
        dropout (float): default 0.3.
        lr (float): Adam learning rate, default 1e-3.
        train_split (float): fraction ``p`` passed to the edge splitter,
            default 0.2.

    Returns:
        The initial weights of ``self.model`` (``get_weights()``).

    Side effects:
        Sets ``self.graph``, ``self.train_flow`` and ``self.model``.
    """
    # Each value falls back to its default only when the key is absent.
    # Previously a supplied key left the local unbound, and the
    # layer_sizes/num_samples checks were cross-wired.
    batch_size = hyper_params.get("batch_size", 20)
    num_samples = hyper_params.get("num_samples", [20, 10])
    layer_sizes = hyper_params.get("layer_sizes", [20, 20])
    bias = hyper_params.get("bias", True)
    dropout = hyper_params.get("dropout", 0.3)
    lr = hyper_params.get("lr", 1e-3)
    train_split = hyper_params.get("train_split", 0.2)

    self.graph = sg.StellarGraph(nodes=self.nodes, edges=self.edges)

    # Train split
    edge_splitter_train = EdgeSplitter(self.graph)
    graph_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=train_split, method="global", keep_connected=True)

    # Train iterators
    train_gen = GraphSAGELinkGenerator(graph_train, batch_size, num_samples)
    self.train_flow = train_gen.flow(edge_ids_train, edge_labels_train,
                                     shuffle=True)

    # Model defining - Keras functional API + Stellargraph layers
    graphsage = GraphSAGE(layer_sizes=layer_sizes,
                          generator=train_gen,
                          bias=bias,
                          dropout=dropout)

    x_inp, x_out = graphsage.in_out_tensors()

    # NOTE(review): a "relu" output is paired with binary cross-entropy
    # below; confirm that "sigmoid" was not intended.
    prediction = link_classification(output_dim=1,
                                     output_act="relu",
                                     edge_embedding_method="ip")(x_out)

    self.model = keras.Model(inputs=x_inp, outputs=prediction)
    self.model.compile(
        optimizer=keras.optimizers.Adam(lr=lr),
        loss=keras.losses.binary_crossentropy,
        metrics=["acc"],
    )

    return self.model.get_weights()
def CreateGraph(filename, seperator, *args):
    """
    Build a two-node-type StellarGraph from an edge CSV.

    Args:
        filename: path of the CSV file holding the edges
        seperator: column separator passed to ``pd.read_csv``
        *args: ``args[0]`` is the source column name (its unique values
            become the "drug" nodes) and ``args[1]`` is the target column
            name (its unique values become the "gene" nodes)

    Returns:
        sg.StellarGraph with node types "drug" and "gene".
    """
    edges = pd.read_csv(filename, sep=seperator)

    # Nodes carry no features: each frame is just an index of unique ids.
    source_col = args[0]
    drugs = pd.DataFrame(index=pd.unique(edges[source_col]))
    target_col = args[1]
    genes = pd.DataFrame(index=pd.unique(edges[target_col]))

    node_frames = {"drug": drugs, "gene": genes}
    return sg.StellarGraph(
        node_frames,
        edges,
        source_column=source_col,
        target_column=target_col,
    )
def test(G, model_file: AnyStr, batch_size: int = 100):
    """
    Load the serialized model and evaluate on a random balanced subset of all links in the graph.
    The evaluation links may overlap with the links the model was trained on.
    To avoid this, give the edge splitter the same seed that was used for
    link splitting in train().

    Args:
        G: NetworkX graph file
        model_file: Location of Keras model to load
        batch_size: Size of batch for inference
    """
    print("Loading model from ", model_file)
    model = keras.models.load_model(
        model_file, custom_objects={"MeanAggregator": MeanAggregator})

    # Recover the sample counts from ratios of input widths, visiting every
    # second input starting at index 1.
    shapes = model.input_shape
    num_samples = []
    for ii in range(1, len(shapes) - 1, 2):
        num_samples.append(int(shapes[ii + 1][1] / shapes[ii][1]))

    # Sample a fraction p=0.1 of all positive links and the same number of
    # negative links from G; the reduced graph has the sampled links removed.
    # NOTE: the sampling method and probabilities come from the module-level
    # `args` object.
    splitter = EdgeSplitter(G)
    reduced_graph, edge_ids_test, edge_labels_test = splitter.train_test_split(
        p=0.1,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs)

    # Convert the reduced graph to a StellarGraph object (undirected, as required by GraphSAGE)
    G_test = sg.StellarGraph(reduced_graph, node_features="feature")

    # Generator feeds data from (source, target) sampled subgraphs to GraphSAGE model
    link_generator = GraphSAGELinkGenerator(
        G_test,
        batch_size,
        num_samples,
        name="test",
    )
    test_gen = link_generator.flow(edge_ids_test, edge_labels_test)

    # Evaluate and print metrics
    test_metrics = model.evaluate_generator(test_gen)

    print("\nTest Set Evaluation:")
    for metric_name, metric_value in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(metric_name, metric_value))
def load_graph(node_file, edge_file):
    """
    Read node and edge CSV files and build a StellarGraph.

    Args:
        node_file: header-less CSV; column 0 is the node id, column 1 the
            node type (passed through ``encode_node_type``)
        edge_file: header-less CSV; column 0 is the source id, column 1 the
            target id

    Returns:
        tuple: ``(graph, nodes_and_types_list, output_layer)`` where
        ``nodes_and_types_list`` holds ``[node_id, node_type_id]`` pairs and
        ``output_layer`` holds ``[node_id, <9-way one-hot>]`` rows. Nodes
        whose type is not recognised get type id 0 and no ``output_layer``
        row.
    """
    # Type tag -> numeric id; the one-hot position is id - 1.
    type_ids = {
        "[go]": 1, "[co]": 2, "[ss]": 3,
        "[ta]": 4, "[ti]": 5, "[pa]": 6,
        "[di]": 7, "[dr]": 8, "[se]": 9,
    }

    nodes_df = pd.read_csv(node_file, sep=",", header=None, encoding='utf-8')
    edges_df = pd.read_csv(edge_file, sep=",", header=None, encoding='utf-8')

    node_ids = []
    node_type_ids = []
    output_layer = []
    nodes_and_types_list = []
    for _, row in nodes_df.iterrows():
        node_id = int(str(row[0]).rstrip())
        node_type = encode_node_type(str(row[1]).rstrip())
        node_ids.append(node_id)

        type_id = type_ids.get(node_type, 0)
        if type_id:
            one_hot = [0] * 9
            one_hot[type_id - 1] = 1
            output_layer.append([node_id] + one_hot)

        node_type_ids.append(type_id)
        nodes_and_types_list.append([node_id, type_id])

    sources = []
    targets = []
    for _, row in edges_df.iterrows():
        source = int(str(row[0]).rstrip())
        target = int(str(row[1]).rstrip())
        sources.append(source)
        targets.append(target)

    node_frame = pd.DataFrame({"type": node_type_ids}, index=node_ids)
    edge_frame = pd.DataFrame({"source": sources, "target": targets})

    graph = sg.StellarGraph(node_frame, edge_frame)
    return graph, nodes_and_types_list, output_layer
def load_data(start_month_id=220, end_month_id=264):
    """
    Build one StellarGraph per month, with its node labels and node features.

    For each month the industry adjacency matrix and the factor table are
    computed once and cached under ``data/`` (``adj-<id>.npz`` and
    ``factor-<id>.csv``); later runs read the cached files.

    Args:
        start_month_id: id of the first monthly file to load.
        end_month_id: loop upper bound; see the review note on the loop.

    Returns:
        tuple: ``(G_list, nodes_subjects_list, nodes_features_list)``, three
        parallel lists with one entry per month: the StellarGraph, the
        ``norm_return`` label series, and the factor feature frame.
    """
    from itertools import combinations_with_replacement

    if not os.path.exists('data'):
        os.makedirs('data')

    # G is the stellargraph object holding the graph information;
    # nodes_subjects are the node labels (norm_return);
    # nodes_features are the node features (factor values).
    G_list = []
    nodes_subjects_list = []
    nodes_features_list = []

    # The industry spreadsheet is only needed when a month is not cached, and
    # it is the same for every month, so it is read at most once.
    ori_df = None
    stock_code_list = None
    stock_code_id_dict = None

    # month_id lags the file id by one: month_id 219 processes 220.csv.
    # NOTE(review): with the default arguments the last iteration
    # (month_id=264) reads 265.csv, while the original comment says only
    # 220.csv..264.csv exist - confirm the intended upper bound.
    for month_id in range(start_month_id - 1, end_month_id + 1):
        file_id = str(month_id + 1)
        factor_path = "data/factor-" + file_id + ".csv"
        adj_path = "data/adj-" + file_id + ".npz"

        if not (os.path.exists(factor_path) and os.path.exists(adj_path)):
            # First run for this month: compute adjacency and factor matrices.
            if ori_df is None:
                # Rows are stocks, columns are months, values are industry codes.
                ori_df = pd.read_excel('monthly_indus.xlsx',
                                       index_col=0,
                                       header=None)
                stock_code_list = list(ori_df.index)
                # stock code -> row number in the spreadsheet
                stock_code_id_dict = {
                    code: pos for pos, code in enumerate(stock_code_list)
                }
            # Matrix size follows the spreadsheet instead of a hard-coded
            # 3945, which only worked when the row count matched exactly.
            n_stocks = len(stock_code_list)
            sub_ori_df = ori_df.iloc[:, month_id]

            # Coordinate lists for the sparse adjacency matrix.
            adj_row = []
            adj_col = []
            # Industry codes run from 0 to 30 inclusive.
            for indus_code in range(31):
                members = sub_ori_df[sub_ori_df == indus_code].index
                if len(members) > 1:
                    member_ids = [stock_code_id_dict[code] for code in members]
                    # Pairs (i, j) with i <= j, so the diagonal is included.
                    for stock_1_id, stock_2_id in combinations_with_replacement(
                            member_ids, 2):
                        adj_row.append(stock_1_id)
                        adj_col.append(stock_2_id)
            adj_data = [1] * len(adj_row)

            # Building the matrix sparse is much faster, but it must go back
            # to a DataFrame to be intersected with the factor table by
            # stock code.
            adj_sparse = sp.coo_matrix((adj_data, (adj_row, adj_col)),
                                       shape=(n_stocks, n_stocks))
            adj_df = pd.DataFrame(adj_sparse.toarray(),
                                  index=stock_code_list,
                                  columns=stock_code_list)
            # adj_df and factor_df now contain the same stocks.
            adj_df, factor_df = create_mat(
                adj_df, 'csv_demo_con/' + file_id + '.csv')
            adj_sparse = sp.coo_matrix(adj_df.values)

            # Cache both so they do not need to be recomputed.
            sp.save_npz(adj_path, adj_sparse)
            factor_df.to_csv(factor_path)
        else:
            adj_sparse = sp.load_npz(adj_path)
            factor_df = pd.read_csv(factor_path, index_col=0)

        # Example: row = [1, 3, 4, 6, 8] and col = [3, 5, 7, 8, 10] means
        # node 1 links to node 3, node 3 to node 5, and so on. Every edge
        # has weight 1. Each node corresponds to a different stock.
        row = adj_sparse.row
        col = adj_sparse.col
        edges = pd.DataFrame({
            "source": row,
            "target": col,
            "weight": [1] * len(row)
        })

        # Node features are positional: row k holds the features of node k.
        nodes = factor_df.reset_index().loc[:, 'return_1m':'return_12m']
        nodes_features_list.append(nodes)

        # Graph object for this month
        G = sg.StellarGraph(nodes, edges)
        G_list.append(G)

        # Per-node labels (norm_return)
        node_subjects = factor_df.reset_index()['norm_return']
        nodes_subjects_list.append(node_subjects)

        print(month_id + 1, "has finished")

    return G_list, nodes_subjects_list, nodes_features_list
triads = create_triads(X, Y, Lx, Ly, W, nx, ny, order) ############### X = triads[0] Y1 = triads[1] Y2 = triads[2] W = triads[3] Lx = triads[4] Ly1 = triads[5] Ly2 = triads[6] #input_graphs = get_graphs() # Create the graph network. # convert the raw data into StellarGraph's graph format for faster operations graph = sg.StellarGraph(nodes, edges) generator = sg.mapper.FullBatchNodeGenerator(graph, method="gcn") # two layers of GCN, each with hidden dimension 16 gcn = sg.layer.GCN(layer_sizes=[16, 16], generator=generator) x_inp, x_out = gcn.in_out_tensors( ) # create the input and output TensorFlow tensors # use TensorFlow Keras to add a layer to compute the (one-hot) predictions predictions = tf.keras.layers.Dense(units=len(ground_truth_targets.columns), activation="softmax")(x_out) # use the input and output tensors to create a TensorFlow Keras model model = tf.keras.Model(inputs=x_inp, outputs=predictions) '''
def train(
    self,
    layer_size,
    num_samples,
    train_size=0.7,
    batch_size: int = 200,
    num_epochs: int = 20,
    learning_rate=5e-3,
    dropout=0.0,
    use_bias=True,
):
    """
    Build and train the HinSAGE model for link attribute prediction on the specified graph G
    with given parameters.

    Args:
        layer_size: a list of number of hidden nodes in each layer
        num_samples: number of neighbours to sample at each layer
        train_size: ``train_size`` passed to ``train_test_split`` when
            splitting the edges into train and test sets
        batch_size: size of mini batch
        num_epochs: number of epochs to train the model (epoch = all training batches are streamed through the model once)
        learning_rate: initial learning rate
        dropout: dropout probability in the range [0, 1)
        use_bias: tells whether to use a bias terms in HinSAGE model

    Returns:
        None. Test metrics are printed.

    Raises:
        ValueError: if ``layer_size`` and ``num_samples`` differ in length.

    Side effects:
        Replaces ``self.g`` with a StellarGraph built from it.
    """
    # Validate before anything is mutated or built. This used to be an
    # `assert` that ran only after self.g had been replaced and the generator
    # created, and that disappears under `python -O`.
    if len(layer_size) != len(num_samples):
        raise ValueError(
            "layer_size and num_samples must be of the same length! Stopping.")

    # Training and test edges
    edges = list(self.g.edges(data=True))
    edges_train, edges_test = model_selection.train_test_split(
        edges, train_size=train_size)

    # Edgelists:
    edgelist_train = [(e[0], e[1]) for e in edges_train]
    edgelist_test = [(e[0], e[1]) for e in edges_test]

    # The regression target is the "score" attribute of each edge.
    labels_train = [e[2]["score"] for e in edges_train]
    labels_test = [e[2]["score"] for e in edges_test]

    # Learning user-movie ratings is framed as supervised Link Attribute
    # Inference: train on the ratings in edges_train, evaluate on the ratings
    # in edges_test. The model also needs the user-movie graph structure, so
    # a StellarGraph is created from the ingested graph.
    # When sampling the GraphSAGE subgraphs, user-movie links are treated as undirected.
    # NOTE(review): self.g is overwritten here, so a second call would run
    # edges(data=True) on the StellarGraph - confirm that is supported.
    self.g = sg.StellarGraph(self.g, node_features="feature")

    # Link generators prepare and stream training and testing data to the
    # model: they sample k-hop subgraphs around the head nodes and yield
    # minibatches of those samples for the HinSAGE input layer.
    generator = HinSAGELinkGenerator(self.g,
                                     batch_size,
                                     num_samples,
                                     head_node_types=["user", "movie"])
    train_gen = generator.flow(edgelist_train, labels_train)
    test_gen = generator.flow(edgelist_test, labels_test)

    # Build the model by stacking a HinSAGE model and a link regression layer on top.
    hinsage = HinSAGE(layer_sizes=layer_size,
                      generator=generator,
                      bias=use_bias,
                      dropout=dropout)

    # Define input and output sockets of hinsage:
    x_inp, x_out = hinsage.build()

    # Final estimator layer
    # NOTE: the edge embedding method comes from the module-level `args` object.
    score_prediction = link_regression(
        edge_embedding_method=args.edge_embedding_method)(x_out)

    # Create Keras model for training
    model = Model(inputs=x_inp, outputs=score_prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss=losses.mean_squared_error,
        metrics=[root_mean_square_error, metrics.mae],
    )

    # Train model
    print("Training the model for {} epochs with initial learning rate {}".
          format(num_epochs, learning_rate))
    model.fit_generator(
        train_gen,
        validation_data=test_gen,
        epochs=num_epochs,
        verbose=2,
        shuffle=True,
        use_multiprocessing=True,
        workers=multiprocessing.cpu_count() // 2,
    )

    # Evaluate and print metrics
    test_metrics = model.evaluate_generator(test_gen)

    print("Test Evaluation:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(security): pickle executes arbitrary code on load; these are
    # local cache files written by this script.
    with open(path, 'rb') as f:
        return pickle.load(f)


def _dump_pickle(obj, path):
    """Write ``obj`` to ``path`` as a pickle, replacing any existing file."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


def main():
    """
    Train an attri2vec link model on a sampled subgraph and save it.

    Reads ``training.txt`` / ``testing.txt``, the per-node text files and the
    cached pickles under ``pickles/``, builds the full graph and a subgraph
    from a 5% sample of the training rows, then trains attri2vec on
    unsupervised random-walk samples of the subgraph. Walk count, walk
    length, epochs, multiprocessing and worker count are read interactively
    from stdin.

    Returns:
        The trained Keras model, also saved as
        ``model_walks<walks>len<length>e<epochs>.h5``.

    Raises:
        FileNotFoundError: if the stemmed-corpus pickle or the LDA matrix
            pickle is missing (neither can be rebuilt here).
    """
    with open(r"training.txt", "r") as f:
        training = list(csv.reader(f))
    # in order of training examples
    training = [element[0].split(" ") for element in training]
    training = pd.DataFrame(training, columns=['Node1', 'Node2', 'Link'])
    print("Training examples shape: {}".format(training.shape))

    with open(r"testing.txt", "r") as f:
        testing = list(csv.reader(f))
    # in order of testing examples
    testing = [element[0].split(" ") for element in testing]
    testing = pd.DataFrame(testing, columns=['Node1', 'Node2'])
    print("Testing examples shape: {}".format(testing.shape))

    # Plain corpus: one lower-cased, whitespace-normalised string per node
    # file. In future a stemmer / multi-language handling could go here.
    NODE_INFO_DIRECTORY = r"node_information/text/"

    corpus_path = r"pickles/simple_corpus.PICKLE"
    ids_path = r"pickles/ids.PICKLE"
    if os.path.exists(corpus_path):
        corpus = _load_pickle(corpus_path)
        ids = _load_pickle(ids_path)
    else:
        corpus = []
        ids = []
        for filename in tqdm(os.listdir(NODE_INFO_DIRECTORY),
                             position=0,
                             leave=True):
            with open(NODE_INFO_DIRECTORY + filename,
                      'r',
                      encoding='UTF-8',
                      errors='ignore') as f:
                doc_tokens = [
                    token.strip()
                    for line in f
                    for token in line.lower().strip().split(" ")
                    if token != ""
                ]
            corpus.append(' '.join(doc_tokens))
            # drop the 4-character extension to get the node id
            ids.append(filename[:-4])
        _dump_pickle(corpus, corpus_path)
        _dump_pickle(ids, ids_path)

    stemmed_corpus_path = r"pickles/stemmed_corpus.PICKLE"
    if not os.path.exists(stemmed_corpus_path):
        # Previously this only printed a message and then failed further
        # down with a NameError on `stemmed_corpus`.
        raise FileNotFoundError(
            'Stemmed corpus unavailable: ' + stemmed_corpus_path)
    stemmed_corpus = _load_pickle(stemmed_corpus_path)

    # in order of alphabetical text information i.e. 0, 1, 10, 100
    node_info = pd.DataFrame({
        'id': ids,
        'corpus': corpus,
        'stemmed': stemmed_corpus
    })
    print("Training node info shape: {}".format(node_info.shape))

    # The 5% sample of training rows is cached so reruns use the same split.
    train_graph_split_path = 'pickles/train_graph_split.PICKLE'
    if os.path.exists(train_graph_split_path):
        keep_indices = _load_pickle(train_graph_split_path)
    else:
        keep_indices = random.sample(range(len(training)),
                                     k=int(len(training) * 0.05))
        _dump_pickle(keep_indices, train_graph_split_path)

    data_train_val = training.iloc[keep_indices]

    # Full graph: every training pair labelled as linked.
    linked_nodes = training.loc[training['Link'] == '1']
    linked_nodes = linked_nodes[['Node1', 'Node2']]
    edgelist = linked_nodes.rename(columns={
        "Node1": "source",
        "Node2": "target"
    })

    lda_path = r"pickles/stemmed_lda_matrix.PICKLE"
    if not os.path.exists(lda_path):
        # Previously a missing file surfaced later as a NameError on `lda`.
        raise FileNotFoundError('LDA matrix unavailable: ' + lda_path)
    lda = _load_pickle(lda_path)

    # Node features: the 10 LDA columns, indexed by string node id.
    feature_names = node_column_names = ["w_{}".format(ii) for ii in range(10)]
    node_data = pd.DataFrame(lda, columns=node_column_names)
    node_data.index = [str(i) for i in node_data.index]

    G_all_nx = nx.from_pandas_edgelist(edgelist)
    all_node_features = node_data[feature_names]
    G_all = sg.StellarGraph(G_all_nx, node_features=all_node_features)
    print(G_all.info())
    # Result is discarded; kept from the original as a feature lookup check.
    G_all.get_feature_for_nodes(['0'])

    # Subgraph: linked pairs among the sampled training rows only.
    sub_linked_nodes = data_train_val.loc[data_train_val['Link'] == '1']
    sub_linked_nodes = sub_linked_nodes[['Node1', 'Node2']]
    subgraph_edgelist = sub_linked_nodes.rename(columns={
        "Node1": "source",
        "Node2": "target"
    })
    G_sub_nx = nx.from_pandas_edgelist(subgraph_edgelist)
    subgraph_node_ids = sorted(list(G_sub_nx.nodes))
    subgraph_node_features = node_data[feature_names].reindex(
        subgraph_node_ids)
    G_sub = sg.StellarGraph(G_sub_nx, node_features=subgraph_node_features)
    print(G_sub.info())

    # Train attri2vec on the subgraph
    nodes = list(G_sub.nodes())
    number_of_walks = int(input('Number of Walks: '))
    length = int(input('Walk length: '))
    unsupervised_samples = UnsupervisedSampler(G_sub,
                                               nodes=nodes,
                                               length=length,
                                               number_of_walks=number_of_walks)
    batch_size = 50
    epochs = int(input('Enter number of epochs: '))
    generator = Attri2VecLinkGenerator(G_sub, batch_size)
    layer_sizes = [128]
    attri2vec = Attri2Vec(layer_sizes=layer_sizes,
                          generator=generator.flow(unsupervised_samples),
                          bias=False,
                          normalize=None)

    # Build the model and expose input and output sockets of attri2vec, for node pair inputs:
    x_inp, x_out = attri2vec.build()
    prediction = link_classification(output_dim=1,
                                     output_act="sigmoid",
                                     edge_embedding_method='ip')(x_out)
    model = keras.Model(inputs=x_inp, outputs=prediction)

    model.compile(
        optimizer=keras.optimizers.Adam(lr=1e-2),
        loss=keras.losses.binary_crossentropy,
        metrics=[keras.metrics.binary_accuracy],
    )

    # Prompts stay in their original order: multiprocessing, then workers.
    use_multiprocessing = bool(int(input('Multiprocessing? 1/0: ')))
    workers = int(input('Number of workers: '))
    history = model.fit_generator(
        generator.flow(unsupervised_samples),
        epochs=epochs,
        verbose=1,
        use_multiprocessing=use_multiprocessing,
        workers=workers,
        shuffle=True,
    )
    print(history)
    model.save('model_walks{}len{}e{}.h5'.format(number_of_walks, length,
                                                 epochs))
    return model
def train(
    G,
    layer_size: List[int],
    num_samples: List[int],
    batch_size: int = 100,
    num_epochs: int = 10,
    learning_rate: float = 0.001,
    dropout: float = 0.0,
):
    """
    Fit a GraphSAGE link-prediction model on G, print its metrics and save it.

    Two nested edge splits are made with EdgeSplitter (p=0.1 each): the first
    holds out test links from G, the second holds out training links from the
    already-reduced test graph. The model is evaluated on both link sets
    before and after training and finally written to
    "graphsage_link_pred_n<num_samples>_l<layer_size>_d<dropout>_r<learning_rate>.h5".

    In addition to its parameters this function reads the module-level `args`
    object: `edge_sampling_method`, `edge_sampling_probs` and
    `edge_embedding_method`.

    Args:
        G: Graph handed to EdgeSplitter (the original docs call it a NetworkX graph)
        layer_size: Number of hidden units in each GraphSAGE layer
        num_samples: Number of neighbours to sample at each GraphSAGE layer
        batch_size: Batch size given to the link generators
        num_epochs: Number of epochs to train the model
        learning_rate: Initial learning rate of the Adam optimizer
        dropout: The dropout (0->1)
    """

    def hold_out_links(splitter):
        # Both splits share the same sampling settings; only the graph differs.
        return splitter.train_test_split(
            p=0.1,
            keep_connected=True,
            method=args.edge_sampling_method,
            probs=args.edge_sampling_probs,
        )

    def report(heading, metric_names, metric_values):
        # One heading line followed by one tab-indented line per metric.
        print(heading)
        for metric_name, metric_value in zip(metric_names, metric_values):
            print("\t{}: {:0.4f}".format(metric_name, metric_value))

    print("Using '{}' method to sample negative links".format(
        args.edge_sampling_method))

    # Test split: positive links sampled from G plus negative samples; the
    # returned graph is G with the sampled links removed.
    nx_test, edge_ids_test, edge_labels_test = hold_out_links(EdgeSplitter(G))

    # Train split: the same procedure applied to the reduced test graph, with G
    # passed as the second EdgeSplitter argument.
    nx_train, edge_ids_train, edge_labels_train = hold_out_links(
        EdgeSplitter(nx_test, G))

    # Wrap both reduced graphs for the generators; node_features="feature" is
    # forwarded unchanged to StellarGraph.
    sg_train = sg.StellarGraph(nx_train, node_features="feature")
    sg_test = sg.StellarGraph(nx_test, node_features="feature")

    # One link generator per graph; only the training flow is shuffled.
    train_gen = GraphSAGELinkGenerator(sg_train, batch_size, num_samples)
    train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)
    test_gen = GraphSAGELinkGenerator(sg_test, batch_size, num_samples)
    test_flow = test_gen.flow(edge_ids_test, edge_labels_test)

    # GraphSAGE encoder followed by a sigmoid link-classification head.
    encoder = GraphSAGE(
        layer_sizes=layer_size,
        generator=train_gen,
        bias=True,
        dropout=dropout,
    )
    x_inp, x_out = encoder.build()
    link_head = link_classification(
        output_dim=1,
        output_act="sigmoid",
        edge_embedding_method=args.edge_embedding_method,
    )
    model = keras.Model(inputs=x_inp, outputs=link_head(x_out))
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss=losses.binary_crossentropy,
        metrics=[metrics.binary_accuracy],
    )

    # Baseline: metrics of the untrained model on both link sets.
    init_train_metrics = model.evaluate_generator(train_flow)
    init_test_metrics = model.evaluate_generator(test_flow)
    report(
        "\nTrain Set Metrics of the initial (untrained) model:",
        model.metrics_names,
        init_train_metrics,
    )
    report(
        "\nTest Set Metrics of the initial (untrained) model:",
        model.metrics_names,
        init_test_metrics,
    )

    print("\nTraining the model for {} epochs...".format(num_epochs))
    model.fit_generator(
        train_flow,
        epochs=num_epochs,
        validation_data=test_flow,
        verbose=2,
        shuffle=False,
    )

    # Metrics of the trained model on the same two link sets.
    train_metrics = model.evaluate_generator(train_flow)
    test_metrics = model.evaluate_generator(test_flow)
    report(
        "\nTrain Set Metrics of the trained model:",
        model.metrics_names,
        train_metrics,
    )
    report(
        "\nTest Set Metrics of the trained model:",
        model.metrics_names,
        test_metrics,
    )

    # The file name encodes the hyper-parameters used for this run.
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join(map(str, num_samples)),
        "_".join(map(str, layer_size)),
        dropout,
        learning_rate,
    )
    model.save("graphsage_link_pred" + save_str + ".h5")
def train(
    edgelist,
    node_data,
    attn_heads,
    layer_sizes,
    num_epochs=10,
    learning_rate=0.005,
    es_patience=100,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GAT model on the specified graph with given parameters, evaluate it,
    and save the model.

    Besides its parameters the function reads two module-level names:
    `feature_names` (the columns of node_data used as node features) and
    `args.interface` ("fit" trains with model.fit() on the single full batch,
    anything else uses model.fit_generator()).

    Side effects: creates the "logs" directory, writes the best weights to
    "logs/best_model.h5", and saves "cora_gat_model<params>.h5" together with
    "cora_gat_encoding<params>.pkl".

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        attn_heads: Number of attention heads in GAT layers
        layer_sizes: A list of number of hidden nodes in each layer
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        es_patience: Patience (in epochs) of the early-stopping callback
        dropout: The dropout (0->1)
        target_name: Column of node_data holding the class label
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records"))
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras
    # model will use as input.
    node_features = node_data[feature_names]

    # Create graph from edgelist, then convert to StellarGraph for ML
    Gnx = nx.from_pandas_edgelist(edgelist)
    G = sg.StellarGraph(Gnx, node_type_name="label", node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=55232,
    )

    # Further split test set into validation and test
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes,
        test_targets,
        train_size=500,
        test_size=1000,
        random_state=523214,
    )

    # Full-batch generator feeding the whole graph to the GAT model
    generator = FullBatchNodeGenerator(G)
    train_gen = generator.flow(train_nodes, train_targets)
    val_gen = generator.flow(val_nodes, val_targets)

    # GAT model
    gat = GAT(
        layer_sizes=layer_sizes,
        attn_heads=attn_heads,
        generator=generator,
        bias=True,
        in_dropout=dropout,
        attn_dropout=dropout,
        activations=["elu", "elu"],
        normalize=None,
    )

    # Expose the input and output tensors of the GAT model for nodes:
    x_inp, x_out = gat.node_model(add_self_loops=True)

    # Snap the final estimator layer to x_out
    x_out = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=x_out)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        weighted_metrics=["acc"],
    )
    print(model.summary())

    # Callbacks; the checkpoint callback writes into "logs"
    os.makedirs("logs", exist_ok=True)
    N = len(node_ids)
    es_callback = EarlyStopping(monitor="val_weighted_acc", patience=es_patience)
    tb_callback = TensorBoard(batch_size=N)
    mc_callback = ModelCheckpoint(
        "logs/best_model.h5",
        monitor="val_weighted_acc",
        save_best_only=True,
        save_weights_only=True,
    )
    callbacks = [es_callback, tb_callback, mc_callback]

    use_fit = args.interface == "fit"

    # Train model
    if use_fit:
        print("\nUsing model.fit() to train the model\n")
        # The first (only) batch of the training flow holds the inputs [X, A],
        # the training targets and the per-node mask used as sample weights.
        [X, A], y_train, node_mask_train = train_gen[0]
        N = A.shape[0]
        # Validation shares the same inputs; only targets and mask differ.
        [_, _], y_val, node_mask_val = val_gen[0]

        model.fit(
            x=[X, A],
            y=y_train,
            sample_weight=node_mask_train,
            batch_size=N,
            # must be False, since shuffling data means shuffling the whole graph
            shuffle=False,
            epochs=num_epochs,
            verbose=2,
            validation_data=([X, A], y_val, node_mask_val),
            callbacks=callbacks,
        )
    else:
        print("\nUsing model.fit_generator() to train the model\n")
        model.fit_generator(
            train_gen,
            epochs=num_epochs,
            validation_data=val_gen,
            verbose=2,
            shuffle=False,
            callbacks=callbacks,
        )

    # Load best model
    model.load_weights("logs/best_model.h5")

    # Evaluate on validation set and print metrics
    if use_fit:
        val_metrics = model.evaluate(
            x=[X, A], y=y_val, sample_weight=node_mask_val, batch_size=N)
    else:
        val_metrics = model.evaluate_generator(val_gen)

    print("\nBest model's Validation Set Metrics:")
    for name, val in zip(model.metrics_names, val_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Evaluate on test set and print metrics
    test_gen = generator.flow(test_nodes, test_targets)
    if use_fit:
        [_, _], y_test, node_mask_test = test_gen[0]
        test_metrics = model.evaluate(
            x=[X, A], y=y_test, sample_weight=node_mask_test, batch_size=N)
    else:
        test_metrics = model.evaluate_generator(test_gen)

    print("\nBest model's Test Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    # Note that the `predict` or `predict_generator` function now operates differently to the `GraphSAGE` or `HinSAGE` models
    # in that if you give it less than the complete set of nodes, it will still return all predictions and in a fixed order
    # defined by the order of nodes in X and A (which is defined by the order of G.nodes()).
    if use_fit:
        all_predictions = model.predict(x=[X, A], batch_size=N)
    else:
        all_predictions = model.predict_generator(generator.flow(node_ids))

    # Turn predictions back into the original categories
    graph_nodes = list(G.nodes())
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions), index=graph_nodes)

    # DictVectorizer names one-hot columns "<target_name>=<value>", so the
    # ground truth must be read from (and prefixed with) target_name rather
    # than a hard-coded "subject".
    label_prefix = target_name + "="
    accuracy = np.mean([
        label_prefix + gt_label == predicted
        for gt_label, predicted in zip(
            node_data[target_name][graph_nodes],
            node_predictions.idxmax(axis=1))
    ])
    print("\nAll-node accuracy: {:0.4f}".format(accuracy))

    # Save the trained model
    save_str = "_h{}_l{}_d{}_r{}".format(
        attn_heads, "_".join([str(x) for x in layer_sizes]), dropout, learning_rate)
    model.save("cora_gat_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_gat_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
def train(
    edgelist,
    node_data,
    layer_size,
    num_samples,
    batch_size=100,
    num_epochs=10,
    learning_rate=0.005,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GraphSAGE model on the specified graph with given parameters,
    evaluate it, and save the model.

    Reads the module-level `feature_names` list to select the feature columns
    of node_data. Saves "cora_example_model<params>.h5" and the fitted target
    encoder as "cora_example_encoding<params>.pkl".

    Args:
        edgelist: Graph edgelist; must carry a "label" column, which is used
            as the edge attribute
        node_data: Feature and target data for nodes
        layer_size: A list of number of hidden nodes in each layer
        num_samples: Number of neighbours to sample at each layer
        batch_size: Batch size given to the node generator
        num_epochs: Number of epochs to train the model
        learning_rate: Initial Learning rate
        dropout: The dropout (0->1)
        target_name: Column of node_data holding the class label
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records"))
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras
    # model will use as input.
    node_features = node_data[feature_names]

    # Create graph from edgelist and set the "label" attribute of every node
    # to "paper"
    Gnx = nx.from_pandas_edgelist(edgelist, edge_attr="label")
    nx.set_node_attributes(Gnx, "paper", "label")

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx, node_type_name="label", node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=5232,
    )

    # Split test set into test and validation
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes,
        test_targets,
        train_size=500,
        test_size=None,
        random_state=5214,
    )

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples, seed=5312)
    train_gen = generator.flow(train_nodes, train_targets, shuffle=True)
    val_gen = generator.flow(val_nodes, val_targets)

    # GraphSAGE model (kept under its own name so the Keras model below does
    # not shadow it)
    graphsage = GraphSAGE(
        layer_sizes=layer_size,
        generator=train_gen,
        bias=True,
        dropout=dropout,
        aggregator=MeanAggregator,
    )
    # Expose the input and output sockets of the model:
    x_inp, x_out = graphsage.build()

    # Snap the final estimator layer to x_out
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        metrics=[metrics.categorical_accuracy],
    )
    print(model.summary())

    # Train model
    model.fit_generator(
        train_gen,
        epochs=num_epochs,
        validation_data=val_gen,
        verbose=2,
        shuffle=False,
    )

    # Evaluate on test set and print metrics
    test_metrics = model.evaluate_generator(
        generator.flow(test_nodes, test_targets))
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    all_predictions = model.predict_generator(generator.flow(node_ids))

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions), index=node_ids)

    # DictVectorizer names one-hot columns "<target_name>=<value>", so the
    # ground truth must be read from (and prefixed with) target_name rather
    # than a hard-coded "subject".
    label_prefix = target_name + "="
    accuracy = np.mean([
        label_prefix + gt_label == predicted
        for gt_label, predicted in zip(
            node_data[target_name], node_predictions.idxmax(axis=1))
    ])
    # "{:3f}" only set a minimum width; use the same precision as the other
    # metric lines printed above.
    print("All-node accuracy: {:0.4f}".format(accuracy))

    # TODO: extract the GraphSAGE embeddings from x_out, and save/plot them

    # Save the trained model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("cora_example_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_example_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
def train_model(Gnx, train_data, test_data, all_features):
    """
    Train a GraphSAGE node classifier on the "subject" column and score it.

    The model is fitted on train_data (validated on test_data) for the number
    of epochs given by the module-level `EPOCH`, using balanced class weights.
    Per-label precision/recall/F-score are computed on test_data.

    Args:
        Gnx: Graph passed to StellarGraph
        train_data: DataFrame indexed by node id with a "subject" column
        test_data: DataFrame indexed by node id with a "subject" column
        all_features: Node features passed to StellarGraph

    Returns:
        Tuple (generator, model, x_inp, x_out, history, target_encoding,
        output_results); output_results is a dict with the dataset sizes,
        per-subject counts, graph info, test metrics and per-label scores.
    """
    from collections import Counter

    output_results = {}

    # Dataset sizes and the count of each subject in both blocks
    print(len(train_data), len(test_data))
    output_results['train_size'] = len(train_data)
    output_results['test_size'] = len(test_data)
    output_results['subject_groups_train'] = Counter(train_data['subject'])
    output_results['subject_groups_test'] = Counter(test_data['subject'])

    G = sg.StellarGraph(Gnx, node_features=all_features)
    print(G.info())
    # NOTE: the message is still printed although writing graph.dot is disabled.
    print("writing graph.dot")
    output_results['graph_info'] = G.info()

    print("building the graph generator...")
    batch_size = 50
    num_samples = [10, 5]
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples)

    # One-hot encode the targets; the encoder is fitted on the training set only
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    train_targets = target_encoding.fit_transform(
        train_data[["subject"]].to_dict('records'))

    train_subjects = train_data["subject"].to_list()
    classes = np.unique(train_subjects)
    print(classes)
    # Keyword arguments: newer scikit-learn releases reject the positional form.
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=classes, y=train_subjects)
    print('class_weights', class_weights)
    # Keras expects a {class index: weight} mapping; position i of the weight
    # array belongs to classes[i].
    class_weights = dict(enumerate(class_weights))

    test_targets = target_encoding.transform(
        test_data[["subject"]].to_dict('records'))

    train_gen = generator.flow(train_data.index, train_targets, shuffle=True)

    graphsage_model = GraphSAGE(
        layer_sizes=[80, 80],
        generator=generator,
        bias=True,
        dropout=0.5,
    )
    print("building model...")
    x_inp, x_out = graphsage_model.build()
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)
    model = Model(inputs=x_inp, outputs=prediction)

    print("compiling model...")
    model.compile(
        optimizer=optimizers.Adam(lr=0.005),
        loss=losses.categorical_crossentropy,
        metrics=["acc", metrics.categorical_accuracy],
    )

    print("testing the model...")
    test_gen = generator.flow(test_data.index, test_targets)
    history = model.fit_generator(
        train_gen,
        epochs=EPOCH,
        validation_data=test_gen,
        verbose=2,
        shuffle=True,
        class_weight=class_weights,
    )

    # save test metrics
    test_metrics = model.evaluate_generator(test_gen)
    print("\nTest Set Metrics:")
    output_results['test_metrics'] = []
    for name, val in zip(model.metrics_names, test_metrics):
        # NOTE: the 'val:' key (with the trailing colon) is kept as-is because
        # consumers of output_results may already read it.
        output_results['test_metrics'].append({'name': name, 'val:': val})
        print("\t{}: {:0.4f}".format(name, val))

    # Predict a label for every test node
    test_nodes = test_data.index
    test_mapper = generator.flow(test_nodes)
    test_predictions = model.predict_generator(test_mapper)
    node_predictions = target_encoding.inverse_transform(test_predictions)
    results = pd.DataFrame(node_predictions, index=test_nodes).idxmax(axis=1)
    df = pd.DataFrame({
        "Predicted": results,
        "True": test_data['subject']
    })
    # Predicted column names look like "subject=<label>"; drop the prefix so
    # they are comparable with the true labels.
    clean_result_labels = df["Predicted"].map(
        lambda x: x.replace('subject=', ''))

    # Scores are reported only for labels that were actually predicted
    pred_labels = np.unique(clean_result_labels.values)
    precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
        df['True'].values,
        clean_result_labels.values,
        average=None,
        labels=pred_labels)

    output_results['classifier'] = []
    for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
        output_results['classifier'].append({
            'label': lbl,
            'precision': prec,
            'recall': rec,
            'fscore': fm
        })
    print(output_results['classifier'])
    print(pred_labels)
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(f1))

    return generator, model, x_inp, x_out, history, target_encoding, output_results
def main(graph_loc, layer_sizes, activations, dropout, learning_rate, num_epochs):
    """
    Load the Cora files under graph_loc, train a GCN node classifier, then
    save and test it.

    Reads "cora.cites" and "cora.content" from graph_loc, trains through the
    `train` function and evaluates through the `test` function (both defined
    outside this block), and writes "cora_gcn_model<params>.h5" plus
    "cora_gcn_encoding<params>.pkl".

    Args:
        graph_loc: Directory containing the two Cora files
        layer_sizes: Layer sizes forwarded to `train`; also part of the file name
        activations: Activations forwarded to `train`
        dropout: Dropout forwarded to `train`; also part of the file name
        learning_rate: Learning rate forwarded to `train`; also part of the file name
        num_epochs: Number of epochs forwarded to `train`
    """
    # Edges are stored in the order 'cited-paper' <- 'citing-paper'
    cites_file = os.path.join(graph_loc, "cora.cites")
    edgelist = pd.read_csv(
        cites_file, sep="\t", header=None, names=["target", "source"])

    # 1433 binary keyword-presence attributes 'w_x', followed by the
    # "subject" column
    feature_names = list(map("w_{}".format, range(1433)))
    content_file = os.path.join(graph_loc, "cora.content")
    node_data = pd.read_csv(
        content_file,
        sep="\t",
        header=None,
        names=feature_names + ["subject"],
    )

    # One-hot encode the subject of every node
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    subject_records = node_data[["subject"]].to_dict("records")
    node_targets = target_encoding.fit_transform(subject_records)

    # Build the StellarGraph with the keyword columns as node features
    G = sg.StellarGraph(
        nx.from_pandas_edgelist(edgelist),
        node_features=node_data[feature_names],
    )

    # Stratified split: 140 training nodes, everything else held out
    train_nodes, held_out_nodes, train_targets, held_out_targets = (
        model_selection.train_test_split(
            node_data.index,
            node_targets,
            train_size=140,
            test_size=None,
            stratify=node_targets,
            random_state=55232,
        )
    )

    # Held-out nodes: 300 for validation, the remainder for testing
    val_nodes, test_nodes, val_targets, test_targets = (
        model_selection.train_test_split(
            held_out_nodes,
            held_out_targets,
            train_size=300,
            test_size=None,
            random_state=523214,
        )
    )

    # method="gcn" selects the pre-processing required by the GCN algorithm
    generator = FullBatchNodeGenerator(G, method="gcn")

    model = train(
        train_nodes,
        train_targets,
        val_nodes,
        val_targets,
        generator,
        dropout,
        layer_sizes,
        learning_rate,
        activations,
        num_epochs,
    )

    # NOTE(review): layer sizes are joined without a separator, so e.g.
    # [16, 7] and [1, 67] give the same file name - confirm this is intended.
    save_str = "_h{}_l{}_d{}_r{}".format(
        "gcn",
        "".join(map(str, layer_sizes)),
        str(dropout),
        str(learning_rate),
    )
    model_file = "cora_gcn_model" + save_str + ".h5"
    encoding_file = "cora_gcn_encoding" + save_str + ".pkl"

    # Save the trained model
    model.save(model_file)

    # The target encoding is needed to convert model predictions back
    with open(encoding_file, "wb") as f:
        pickle.dump([target_encoding], f)

    test(test_nodes, test_targets, generator, model_file)
else: assert testtype == 'edges' gtrain_edgelist, edges_test_true, edges_test_false = Main.Dataset.make_edges_test_set( to_julia_edgelist(Gnx), testprop) edges_test_true = edges_test_true - 1 edges_test_false = edges_test_false - 1 Gtrain_nx = from_julia_edgelist(gtrain_edgelist) # Recover nodes that are now isolated in Gtrain_nx, not seen through the edgelist for n in Gnx.nodes(): if n not in Gtrain_nx.nodes(): Gtrain_nx.add_node(n) nx.set_node_attributes(Gtrain_nx, "paper", "label") ## Train the embedding # mo"number of epochs to train for"l G = sg.StellarGraph(Gnx, node_features=node_features) Gtrain = sg.StellarGraph(Gtrain_nx, node_features=node_features) # The graph G # together wi"number of parallel workers to use" the unsupervised sampler will be used to generate samples. actual_nodes_train = list(Gtrain.nodes()) if testtype == 'nodes': assert set(nodes_train).issuperset(actual_nodes_train) unsupervised_samples = UnsupervisedSampler(Gtrain, nodes=actual_nodes_train, length=length_of_walks, number_of_walks=number_of_walks) train_gen = GraphSAGELinkGenerator(Gtrain, batch_size, num_samples).flow(unsupervised_samples) # Build the model
def get_stellargraph(self):
    """
    Build a new StellarGraph from `self.g_nx` and `self.node_features`.

    The graph is created with node_type_name="ntype"; a fresh object is
    constructed on every call.
    """
    stellar_graph = sg.StellarGraph(
        self.g_nx,
        node_features=self.node_features,
        node_type_name="ntype",
    )
    return stellar_graph
# Input locations. The commented lines are the command-line alternative to the
# hard-coded defaults below.
# path_weights = sys.argv[1]
# path_node_partition = sys.argv[2]
# path_edge_partition = sys.argv[3]
path_weights = "./weights/weights.npy"
path_node_partition = "./data/4_attributes_0"
path_edge_partition = "./data/4_0"

# Constructing the graph
# Node file: tab-separated, no header. Keep the columns labelled 0..1433 and
# use column 0 as the index, leaving columns 1..1433 as node features.
nodes = pd.read_csv(
    path_node_partition, sep='\t', lineterminator='\n', header=None
).loc[:, 0:1433]
nodes.set_index(0, inplace=True)

# Edge file: whitespace-separated pairs. The separator is a raw string because
# '\s' is not a valid escape sequence in a normal string literal.
edges = pd.read_csv(
    path_edge_partition, sep=r'\s+', lineterminator='\n', header=None
)
edges.columns = ["source", "target"]

G = sg.StellarGraph(nodes=nodes, edges=edges)

# Train split: sample links from G with p=0.2 using the "global" method while
# keeping the graph connected.
edge_splitter_train = EdgeSplitter(G)
G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
    p=0.2, method="global", keep_connected=True
)

# Hyperparams
batch_size = 20
epochs = 20
num_samples = [20, 10]
layer_sizes = [20, 20]

# Train iterators
train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
def _train_model(self, gnx, train_data, test_data, all_features, target_feature_name):
    """
    Train a GraphSAGE node classifier on `target_feature_name` and score it.

    The model is fitted on train_data (validated on test_data) for
    `self.num_epochs` epochs with batches of `self.batch_size`, using balanced
    class weights. Per-label precision/recall/F-score are computed on
    test_data.

    Args:
        gnx: Graph passed to StellarGraph
        train_data: DataFrame indexed by node id containing the target column
        test_data: DataFrame indexed by node id containing the target column
        all_features: Node features passed to StellarGraph
        target_feature_name: Name of the column holding the class label

    Returns:
        Tuple (generator, model, x_inp, x_out, history, target_encoding,
        output_results); output_results is a dict with the dataset sizes,
        per-class counts, graph info, test metrics, per-label scores and the
        training history.
    """
    subject_groups_train = Counter(train_data[target_feature_name])
    subject_groups_test = Counter(test_data[target_feature_name])

    graph = sg.StellarGraph(gnx, node_features=all_features)

    output_results = {
        'train_size': len(train_data),
        'test_size': len(test_data),
        'subject_groups_train': subject_groups_train,
        'subject_groups_test': subject_groups_test,
        'graph_info': graph.info()
    }

    num_samples = [10, 5]
    generator = GraphSAGENodeGenerator(graph, self.batch_size, num_samples)

    # One-hot encode the targets; the encoder is fitted on the training set only
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    train_targets = target_encoding.fit_transform(
        train_data[[target_feature_name]].to_dict('records'))

    # Balanced class weights as a {class index: weight} mapping; position i of
    # the weight array belongs to the i-th entry of np.unique(labels).
    train_labels = train_data[target_feature_name].to_list()
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels),
        y=train_labels)
    class_weights = dict(enumerate(class_weights))

    test_targets = target_encoding.transform(
        test_data[[target_feature_name]].to_dict('records'))

    train_gen = generator.flow(train_data.index, train_targets, shuffle=True)

    graph_sage_model = GraphSAGE(
        layer_sizes=[80, 80],
        generator=generator,
        bias=True,
        dropout=0.5,
    )
    print('building model...')
    x_inp, x_out = graph_sage_model.build()
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)
    model = Model(inputs=x_inp, outputs=prediction)

    print('compiling model...')
    model.compile(
        optimizer=optimizers.Adam(learning_rate=0.005),
        loss=losses.categorical_crossentropy,
        metrics=['acc', metrics.categorical_accuracy],
    )

    print('testing the model...')
    test_gen = generator.flow(test_data.index, test_targets)
    history = model.fit(
        train_gen,
        epochs=self.num_epochs,
        validation_data=test_gen,
        verbose=2,
        shuffle=True,
        class_weight=class_weights,
    )

    # save test metrics
    test_metrics = model.evaluate(test_gen)
    print('Test Set Metrics:')
    output_results['test_metrics'] = []
    for name, val in zip(model.metrics_names, test_metrics):
        # NOTE: the 'val:' key (with the trailing colon) is kept as-is because
        # consumers of output_results may already read it.
        output_results['test_metrics'].append({'name': name, 'val:': val})
        print("\t{}: {:0.4f}".format(name, val))

    # Predict a label for every test node
    test_nodes = test_data.index
    test_mapper = generator.flow(test_nodes)
    test_predictions = model.predict(test_mapper)
    node_predictions = target_encoding.inverse_transform(test_predictions)
    results = pd.DataFrame(node_predictions, index=test_nodes).idxmax(axis=1)
    df = pd.DataFrame({
        'Predicted': results,
        'True': test_data[target_feature_name]
    })

    # DictVectorizer names one-hot columns "<target_feature_name>=<label>".
    # Strip that prefix (not a hard-coded 'subject=') so predictions can be
    # compared with the true labels for any target column.
    label_prefix = '{}='.format(target_feature_name)
    clean_result_labels = df['Predicted'].map(
        lambda label: label[len(label_prefix):]
        if label.startswith(label_prefix) else label)

    # Scores are reported only for labels that were actually predicted
    pred_labels = np.unique(clean_result_labels.values)
    precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
        df['True'].values,
        clean_result_labels.values,
        average=None,
        labels=pred_labels)

    output_results['classifier'] = []
    for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
        output_results['classifier'].append({
            'label': lbl,
            'precision': prec,
            'recall': rec,
            'fscore': fm
        })
    print(output_results['classifier'])
    print(pred_labels)
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(f1))

    output_results['history'] = {
        'epochs': history.epoch,
        'training_log': history.history,
        'training_params': history.params
    }

    return generator, model, x_inp, x_out, history, target_encoding, output_results