def predict():
    string = str('test')
    hist_pred_n = string + "hist_pred.jpeg"

    # Loading from .pkl files
    pkl_hnd = store(app.config['static_path'], app.root_path)
    clf = pkl_hnd.load('model')
    n_labels = pkl_hnd.load('n_labels')
    enc = pkl_hnd.load('enc')

    # Feature extraction
    data = utils.file_parser_test(
        os.path.join(app.config['upload_path'], "test.txt"))
    features = utils.feature_extractor(data['text'], 5000)

    # Preprocessing features
    data_x = utils.preprocess_features(features, 2500)

    # Predicting
    pr = predict_model(data_x)
    pred_enc = pr.predict_model(clf)

    # Decoding the encoded prediction
    pred = utils.label_encoder(pred_enc, True, enc)
    pkl_hnd.save_pred(data_x, pred)  # Saving predicted value and data into .csv file

    # Plotting histogram of prediction
    pkl_hnd.plot_hist(pred, hist_pred_n)

    return render_template(
        "predict_result.html",
        img_hist_pred=url_for(app.config['static_path'],
                              filename=hist_pred_n),
    )
def process_data(self):
    data = load_data('cora')
    adj, feas = data[:2]
    self.adj = adj.todense()
    self.normed_adj = preprocess_adj(adj)
    self.feas = preprocess_features(feas, False)
    self.y_train, self.y_val, self.y_test = data[2:5]
    self.train_mask, self.val_mask, self.test_mask = data[5:]
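# Note: preprocess_adj in the snippets above and below is typically the
# "renormalization trick" from Kipf & Welling's GCN reference code: add
# self-loops, then symmetrically normalize, D^-1/2 (A + I) D^-1/2. A minimal
# sketch under that assumption (the actual helper may also convert the result
# to a (coords, values, shape) tuple):
import numpy as np
import scipy.sparse as sp

def preprocess_adj_sketch(adj):
    """Symmetrically normalize an adjacency matrix with self-loops added."""
    adj = sp.coo_matrix(adj) + sp.eye(adj.shape[0])   # A + I
    rowsum = np.array(adj.sum(1)).flatten()           # degrees
    d_inv_sqrt = np.power(rowsum, -0.5)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0
    d_mat = sp.diags(d_inv_sqrt)                      # D^-1/2
    return adj.dot(d_mat).transpose().dot(d_mat).tocoo()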
def run(args):
    (adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask,
     train_size, test_size) = load_corpus(args.select_data)

    train_mask = train_mask + val_mask
    y_train = y_train + y_val

    adj_dense = preprocess_adj(adj).toarray().astype(np.float32)
    features_dense = preprocess_features(features).toarray().astype(np.float32)
    y_train = y_train.astype(np.float32)
    y_test = y_test.astype(np.float32)
    train_mask = train_mask.astype(np.float32)
    test_mask = test_mask.astype(np.float32)

    gcn_model = GCN(
        tf.convert_to_tensor(adj_dense),
        layers=args.layers,
        hidden_size=args.hidden_size,
        dropout=args.dropout,
    )
    loss_fn = masked_softmax_cross_entropy
    # acc_fn = masked_accuracy
    optimizer = Adam(learning_rate=args.lr)
    # print("Model Layers: ", gcn_model.trainable_variables)

    model_textGCN = TextGCN(model=gcn_model, loss=loss_fn,
                            optimizer=optimizer, args=args)
    model_textGCN.train(features_dense, y_train, train_mask)

    sns.distplot(model_textGCN.train_accuracy)
    plt.savefig("train_acc.png")
    plt.clf()
    sns.distplot(model_textGCN.train_losses)
    plt.savefig("train_losses.png")

    eval_result = model_textGCN.evaluate(features_dense, y_test, test_mask)
    print(f"Final Evaluation Result: {eval_result}")
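# Note: masked_softmax_cross_entropy above is, in the usual GCN/TextGCN
# reference code, a softmax cross-entropy averaged only over the labeled
# (masked) nodes. A minimal TF2 sketch under that assumption; the project's
# version may differ in signature:
import tensorflow as tf

def masked_softmax_cross_entropy_sketch(preds, labels, mask):
    """Softmax cross-entropy, rescaled so only masked nodes contribute."""
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=preds, labels=labels)
    mask = tf.cast(mask, dtype=tf.float32)
    mask /= tf.reduce_mean(mask)        # unbiased mean over the masked subset
    return tf.reduce_mean(loss * mask)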
def load(dataset):
    datadir = os.path.join('data', dataset)

    if not os.path.exists(datadir):
        os.makedirs(datadir)
        ds = download(dataset)
        adj = nx.to_numpy_array(ds.graph)
        diff = compute_ppr(ds.graph, 0.2)
        feat = ds.features[:]
        labels = ds.labels[:]

        idx_train = np.argwhere(ds.train_mask == 1).reshape(-1)
        idx_val = np.argwhere(ds.val_mask == 1).reshape(-1)
        idx_test = np.argwhere(ds.test_mask == 1).reshape(-1)

        np.save(f'{datadir}/adj.npy', adj)
        np.save(f'{datadir}/diff.npy', diff)
        np.save(f'{datadir}/feat.npy', feat)
        np.save(f'{datadir}/labels.npy', labels)
        np.save(f'{datadir}/idx_train.npy', idx_train)
        np.save(f'{datadir}/idx_val.npy', idx_val)
        np.save(f'{datadir}/idx_test.npy', idx_test)
    else:
        adj = np.load(f'{datadir}/adj.npy')
        diff = np.load(f'{datadir}/diff.npy')
        feat = np.load(f'{datadir}/feat.npy')
        labels = np.load(f'{datadir}/labels.npy')
        idx_train = np.load(f'{datadir}/idx_train.npy')
        idx_val = np.load(f'{datadir}/idx_val.npy')
        idx_test = np.load(f'{datadir}/idx_test.npy')

    if dataset == 'citeseer':
        feat = preprocess_features(feat)

        epsilons = [1e-5, 1e-4, 1e-3, 1e-2]
        avg_degree = np.sum(adj) / adj.shape[0]
        epsilon = epsilons[np.argmin([
            abs(avg_degree - np.argwhere(diff >= e).shape[0] / diff.shape[0])
            for e in epsilons
        ])]

        diff[diff < epsilon] = 0.0
        scaler = MinMaxScaler()
        scaler.fit(diff)
        diff = scaler.transform(diff)

    ori_adj = copy.deepcopy(adj)
    # print(ori_adj)
    adj = normalize_adj(adj + sp.eye(adj.shape[0])).todense()

    return ori_adj, adj, diff, feat, labels, idx_train, idx_val, idx_test
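# Note: compute_ppr above builds a Personalized PageRank diffusion matrix used
# as a second view of the graph: alpha * (I - (1 - alpha) * A_hat)^-1, with
# A_hat the symmetrically normalized adjacency. A minimal sketch under that
# assumption (self-loop handling in the actual helper may differ):
import networkx as nx
import numpy as np
from scipy.linalg import fractional_matrix_power, inv

def compute_ppr_sketch(graph: nx.Graph, alpha: float = 0.2):
    a = nx.to_numpy_array(graph) + np.eye(graph.number_of_nodes())  # A + I
    d = np.diag(a.sum(1))                                           # degree matrix D
    dinv = fractional_matrix_power(d, -0.5)                         # D^-1/2
    a_hat = dinv @ a @ dinv                                         # normalized adjacency
    return alpha * inv(np.eye(a.shape[0]) - (1 - alpha) * a_hat)    # closed-form PPR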
def main(args):
    save_dir = args.save_dir
    log_dir = args.log_dir
    train_dir = args.data_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = utils.load_data(
        args.data_type)
    features = utils.preprocess_features(features)
    support = [utils.preprocess_adj(adj)]
    args.num_supports = 1
    args.input_size, args.features_size = features[2][1], features[2]
    args.output_size = y_train.shape[1]

    config_proto = utils.get_config_proto()
    sess = tf.Session(config=config_proto)
    model = GCN(args, sess, name="gcn")
    summary_writer = tf.summary.FileWriter(log_dir)

    for epoch in range(1, args.nb_epoch + 1):
        epoch_start_time = time.time()
        feed_dict = utils.construct_feed_dict(model, features, support,
                                              y_train, train_mask)
        _, train_loss, train_acc, summaries = model.train(feed_dict)

        if epoch % args.summary_epoch == 0:
            summary_writer.add_summary(summaries, epoch)

        if epoch % args.print_epoch == 0:
            feed_dict_val = utils.construct_feed_dict(model, features, support,
                                                      y_val, val_mask)
            val_loss, val_acc = model.evaluate(feed_dict_val)
            print("epoch %d, train_loss %f, train_acc %f, val_loss %f, val_acc %f, time %.5fs" %
                  (epoch, train_loss, train_acc, val_loss, val_acc,
                   time.time() - epoch_start_time))

        if args.anneal and epoch >= args.anneal_start:
            sess.run(model.lr_decay_op)

    model.saver.save(sess, os.path.join(save_dir, "model.ckpt"))
    print("Model stored....")
def get_data(dataset):
    # Load data
    (adj, y_train, y_val, y_test, train_mask, val_mask, test_mask,
     train_size, test_size) = utils.load_data(dataset)
    features = sparse.identity(adj.shape[1])

    # Some preprocessing
    features = utils.preprocess_features(features)
    support = [utils.preprocess_adj(adj)]

    # Convert to torch tensors
    t_features = torch.from_numpy(features)
    t_y_train = torch.from_numpy(y_train)
    t_y_val = torch.from_numpy(y_val)
    t_y_test = torch.from_numpy(y_test)
    t_train_mask = torch.from_numpy(train_mask.astype(np.float32))
    t_support = []
    for i in range(len(support)):
        t_support.append(torch.Tensor(support[i]))

    return (t_features, t_y_train, t_y_val, t_y_test, t_train_mask,
            t_support, val_mask, test_mask, train_size, test_size)
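# Note: preprocess_features in these snippets usually follows the same GCN
# reference code: row-normalize the feature matrix so each row sums to 1
# (all-zero rows are left as zeros), then optionally convert it to a
# (coords, values, shape) tuple. A minimal sketch of the normalization step,
# assuming a scipy sparse input:
import numpy as np
import scipy.sparse as sp

def preprocess_features_sketch(features):
    """Row-normalize a sparse feature matrix."""
    rowsum = np.array(features.sum(1)).flatten()
    r_inv = np.power(rowsum, -1.0)
    r_inv[np.isinf(r_inv)] = 0.0        # rows summing to zero stay zero
    return sp.diags(r_inv).dot(features)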
def load_data(dataset_name, splits_file_path=None, train_percentage=None,
              val_percentage=None, embedding_mode=None, embedding_method=None,
              embedding_method_graph=None, embedding_method_space=None):
    if dataset_name in {'cora', 'citeseer', 'pubmed'}:
        adj, features, labels, _, _, _ = utils.load_data(dataset_name)
        labels = np.argmax(labels, axis=-1)
        features = features.todense()
        G = nx.DiGraph(adj)
    else:
        graph_adjacency_list_file_path = os.path.join(
            'new_data', dataset_name, 'out1_graph_edges.txt')
        graph_node_features_and_labels_file_path = os.path.join(
            'new_data', dataset_name, 'out1_node_feature_label.txt')

        G = nx.DiGraph()
        graph_node_features_dict = {}
        graph_labels_dict = {}

        if dataset_name == 'film':
            with open(graph_node_features_and_labels_file_path) as graph_node_features_and_labels_file:
                graph_node_features_and_labels_file.readline()
                for line in graph_node_features_and_labels_file:
                    line = line.rstrip().split('\t')
                    assert len(line) == 3
                    assert (int(line[0]) not in graph_node_features_dict
                            and int(line[0]) not in graph_labels_dict)
                    feature_blank = np.zeros(932, dtype=np.uint8)
                    feature_blank[np.array(line[1].split(','), dtype=np.uint16)] = 1
                    graph_node_features_dict[int(line[0])] = feature_blank
                    graph_labels_dict[int(line[0])] = int(line[2])
        else:
            with open(graph_node_features_and_labels_file_path) as graph_node_features_and_labels_file:
                graph_node_features_and_labels_file.readline()
                for line in graph_node_features_and_labels_file:
                    line = line.rstrip().split('\t')
                    assert len(line) == 3
                    assert (int(line[0]) not in graph_node_features_dict
                            and int(line[0]) not in graph_labels_dict)
                    graph_node_features_dict[int(line[0])] = np.array(
                        line[1].split(','), dtype=np.uint8)
                    graph_labels_dict[int(line[0])] = int(line[2])

        with open(graph_adjacency_list_file_path) as graph_adjacency_list_file:
            graph_adjacency_list_file.readline()
            for line in graph_adjacency_list_file:
                line = line.rstrip().split('\t')
                assert len(line) == 2
                if int(line[0]) not in G:
                    G.add_node(int(line[0]),
                               features=graph_node_features_dict[int(line[0])],
                               label=graph_labels_dict[int(line[0])])
                if int(line[1]) not in G:
                    G.add_node(int(line[1]),
                               features=graph_node_features_dict[int(line[1])],
                               label=graph_labels_dict[int(line[1])])
                G.add_edge(int(line[0]), int(line[1]))

        adj = nx.adjacency_matrix(G, sorted(G.nodes()))
        features = np.array([
            features for _, features in
            sorted(G.nodes(data='features'), key=lambda x: x[0])
        ])
        labels = np.array([
            label for _, label in
            sorted(G.nodes(data='label'), key=lambda x: x[0])
        ])

    features = utils.preprocess_features(features)

    if not embedding_mode:
        g = DGLGraph(adj + sp.eye(adj.shape[0]))
    else:
        if embedding_mode == 'ExperimentTwoAll':
            embedding_file_path = os.path.join(
                'embedding_method_combinations_all',
                f'outf_nodes_relation_{dataset_name}all_embedding_methods.txt')
        elif embedding_mode == 'ExperimentTwoPairs':
            embedding_file_path = os.path.join(
                'embedding_method_combinations_in_pairs',
                f'outf_nodes_relation_{dataset_name}_graph_{embedding_method_graph}_space_{embedding_method_space}.txt')
        else:
            embedding_file_path = os.path.join(
                'structural_neighborhood',
                f'outf_nodes_space_relation_{dataset_name}_{embedding_method}.txt')

        space_and_relation_type_to_idx_dict = {}
        with open(embedding_file_path) as embedding_file:
            for line in embedding_file:
                if line.rstrip() == 'node1,node2 space relation_type':
                    continue
                line = re.split(r'[\t,]', line.rstrip())
                assert len(line) == 4
                assert int(line[0]) in G and int(line[1]) in G
                if (line[2], int(line[3])) not in space_and_relation_type_to_idx_dict:
                    space_and_relation_type_to_idx_dict[(line[2], int(line[3]))] = \
                        len(space_and_relation_type_to_idx_dict)
                if G.has_edge(int(line[0]), int(line[1])):
                    G.remove_edge(int(line[0]), int(line[1]))
                G.add_edge(int(line[0]), int(line[1]),
                           subgraph_idx=space_and_relation_type_to_idx_dict[
                               (line[2], int(line[3]))])

        space_and_relation_type_to_idx_dict['self_loop'] = len(
            space_and_relation_type_to_idx_dict)
        for node in sorted(G.nodes()):
            if G.has_edge(node, node):
                G.remove_edge(node, node)
            G.add_edge(node, node,
                       subgraph_idx=space_and_relation_type_to_idx_dict['self_loop'])

        adj = nx.adjacency_matrix(G, sorted(G.nodes()))
        g = DGLGraph(adj)

        for u, v, feature in G.edges(data='subgraph_idx'):
            g.edges[g.edge_id(u, v)].data['subgraph_idx'] = th.tensor([feature])

    if splits_file_path:
        with np.load(splits_file_path) as splits_file:
            train_mask = splits_file['train_mask']
            val_mask = splits_file['val_mask']
            test_mask = splits_file['test_mask']
    else:
        assert train_percentage is not None and val_percentage is not None
        assert (train_percentage < 1.0 and val_percentage < 1.0
                and train_percentage + val_percentage < 1.0)

        if dataset_name in {'cora', 'citeseer'}:
            disconnected_node_file_path = os.path.join(
                'unconnected_nodes', f'{dataset_name}_unconnected_nodes.txt')
            with open(disconnected_node_file_path) as disconnected_node_file:
                disconnected_node_file.readline()
                disconnected_nodes = []
                for line in disconnected_node_file:
                    line = line.rstrip()
                    disconnected_nodes.append(int(line))

            disconnected_nodes = np.array(disconnected_nodes)
            connected_nodes = np.setdiff1d(np.arange(features.shape[0]),
                                           disconnected_nodes)
            connected_labels = labels[connected_nodes]

            train_and_val_index, test_index = next(
                ShuffleSplit(n_splits=1,
                             train_size=train_percentage + val_percentage).split(
                    np.empty_like(connected_labels), connected_labels))
            train_index, val_index = next(
                ShuffleSplit(n_splits=1, train_size=train_percentage).split(
                    np.empty_like(connected_labels[train_and_val_index]),
                    connected_labels[train_and_val_index]))
            train_index = train_and_val_index[train_index]
            val_index = train_and_val_index[val_index]

            train_mask = np.zeros_like(labels)
            train_mask[connected_nodes[train_index]] = 1
            val_mask = np.zeros_like(labels)
            val_mask[connected_nodes[val_index]] = 1
            test_mask = np.zeros_like(labels)
            test_mask[connected_nodes[test_index]] = 1
        else:
            train_and_val_index, test_index = next(
                ShuffleSplit(n_splits=1,
                             train_size=train_percentage + val_percentage).split(
                    np.empty_like(labels), labels))
            train_index, val_index = next(
                ShuffleSplit(n_splits=1, train_size=train_percentage).split(
                    np.empty_like(labels[train_and_val_index]),
                    labels[train_and_val_index]))
            train_index = train_and_val_index[train_index]
            val_index = train_and_val_index[val_index]

            train_mask = np.zeros_like(labels)
            train_mask[train_index] = 1
            val_mask = np.zeros_like(labels)
            val_mask[val_index] = 1
            test_mask = np.zeros_like(labels)
            test_mask[test_index] = 1

    num_features = features.shape[1]
    num_labels = len(np.unique(labels))
    assert np.array_equal(np.unique(labels), np.arange(len(np.unique(labels))))

    features = th.FloatTensor(features)
    labels = th.LongTensor(labels)
    train_mask = th.BoolTensor(train_mask)
    val_mask = th.BoolTensor(val_mask)
    test_mask = th.BoolTensor(test_mask)

    # Adapted from https://docs.dgl.ai/tutorials/models/1_gnn/1_gcn.html
    degs = g.in_degrees().float()
    norm = th.pow(degs, -0.5).cuda()
    norm[th.isinf(norm)] = 0
    g.ndata['norm'] = norm.unsqueeze(1)

    return g, features, labels, train_mask, val_mask, test_mask, num_features, num_labels
A = A + np.eye(A.shape[0])  # Add self-loops
# print('A = A+np.eye(A.shape[0])')
# print(type(A))
# print(A)

# ========================= X =========================
# ===================== Normalization =====================
data_pre = []
data_train = []
data_test = []

# Preprocessing operations
A_list = [A for i in range(5)]
for i in range(5):
    data_pre.append(preprocess_features(data[i]))
for i in range(2):
    data_train.append(data_pre[i])
for i in range(2, 4):
    data_test.append(data_pre[i])

# Earlier variant that paired each split with A; it raised:
# ValueError: could not broadcast input array from shape (12,7) into shape (12)
# for i in range(5):
#     data_pre.append(preprocess_features(data[i]))
# for i in range(3):
#     data_train.append([data_pre[i], A])
# for i in range(2, 4):
#     data_test.append([data_pre[i], A])
import torch
import numpy as np
import pickle

from utils import load_data, preprocess_features, preprocess_adj, tuple_to_torchSparseTensor
from gcn_model import GCN

adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data("cora")
adj_hat = preprocess_adj(adj)
features = preprocess_features(features)
# features[0].shape == (49216, 2)   coordinates of the non-zero entries
# features[1].shape == (49216,)     their values
# features[2] == (2708, 1433)       dense shape

# Convert to torch tensors
sparse_adj_hat = tuple_to_torchSparseTensor(adj_hat)
sparse_features = tuple_to_torchSparseTensor(features)
y_train = torch.FloatTensor(y_train)  # dtype = torch.float32
y_val = torch.FloatTensor(y_val)
y_test = torch.FloatTensor(y_test)
train_mask = torch.from_numpy(train_mask)  # dtype = torch.bool
val_mask = torch.from_numpy(val_mask)
test_mask = torch.from_numpy(test_mask)

model_file = 'training_dir/gcn_model.pkl'
model = torch.load(model_file)
output = model(sparse_adj_hat, sparse_features)
test_loss = model.loss(output, y_test, test_mask)
test_acc = model.accuracy(output, y_test, test_mask)
print("model_file={}, test_loss={}, test_acc={}".format(
    model_file, test_loss.item(), test_acc.item()))
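# Note: tuple_to_torchSparseTensor converts the (coords, values, shape) tuples
# returned by preprocess_adj / preprocess_features into torch sparse tensors.
# A minimal sketch of what such a converter could look like, assuming that
# tuple layout (the project's own helper may differ):
import numpy as np
import torch

def tuple_to_torch_sparse_tensor_sketch(t):
    coords, values, shape = t
    indices = torch.from_numpy(coords.astype(np.int64)).t()   # 2 x nnz
    vals = torch.from_numpy(values.astype(np.float32))        # nnz
    return torch.sparse_coo_tensor(indices, vals, torch.Size(shape))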
from models.gat import create_gat_model
from models.sgc import create_sgc_model
from models.gfnn import create_gfnn_model
#from models.graphsage import create_graphsage_model
from models.masked_gcn import create_masked_gcn_model
from train import run
from utils import preprocess_features
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--dataset', type=str, default='small')
    parser.add_argument('--model', type=str, default='sgc')
    parser.add_argument('--niter', type=int, default=10)
    args = parser.parse_args()

    data = Data.load(args.dataset)  #load_data(args.dataset)
    data.update_mask(0)
    data.features = preprocess_features(data.features)

    if args.model == 'sgc':
        model, optimizer = create_sgc_model(data, lr=0.2, K=2)
    elif args.model == 'gcn':
        # NOTE: Data and create_gcn_model are referenced but not imported in
        # this snippet; they are presumably provided elsewhere in the project.
        model, optimizer = create_gcn_model(data)
    else:
        raise ValueError(args.model)

    run(data, model, optimizer, verbose=False, niter=args.niter, patience=10)
def exp(dataset, data_seed, init_seed):
    '''
    dataset   - name of the dataset
    data_seed - seed that determines the train/dev/test split
    init_seed - seed for initializing NN weights
    '''
    print('running {} on {}'.format(FLAGS.model, dataset))
    tf.reset_default_graph()

    adj, subgraphs, features, labels, train_mask, val_mask, test_mask = load_data(
        dataset, data_seed)
    features = preprocess_features(features)

    # If early stopping is not used, fold the validation data into the test data
    if FLAGS.early_stop == 0:
        mask = np.logical_or(val_mask, test_mask)
        test_mask = mask
        val_mask = mask

    config = tf.ConfigProto()
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    #config.log_device_placement = True
    config.gpu_options.allow_growth = True

    train_loss = []
    train_acc = []
    valid_loss = []
    valid_acc = []

    with tf.Graph().as_default():
        random.seed(init_seed)
        np.random.seed(init_seed)
        tf.set_random_seed(init_seed)

        with tf.Session(config=config) as sess:
            model, support, placeholders = build_model(adj, features,
                                                       labels.shape[1], subgraphs)
            sess.run(tf.global_variables_initializer())

            def evaluate(labels_mask, noise=0., dropout=0.):
                feed_dict_val = construct_feed_dict(features, support, labels,
                                                    labels_mask, placeholders,
                                                    noise, dropout)
                outs_val = sess.run([model.loss, model.accuracy],
                                    feed_dict=feed_dict_val)
                return outs_val[0], outs_val[1]

            start_t = time.time()
            for epoch in range(FLAGS.epochs):
                feed_dict = construct_feed_dict(features, support, labels,
                                                train_mask, placeholders,
                                                FLAGS.fisher_noise, FLAGS.dropout)
                feed_dict.update({tf.keras.backend.learning_phase(): 1})
                outs = sess.run([model.opt_op, model.loss, model.accuracy],
                                feed_dict=feed_dict)
                train_loss.append(outs[1])
                train_acc.append(outs[2])

                # Validation
                outs = evaluate(val_mask)
                valid_loss.append(outs[0])
                valid_acc.append(outs[1])

                if (epoch + 1) % 10 == 0:
                    print("Epoch:", '%04d' % (epoch + 1),
                          "train_loss=", "{:.5f}".format(train_loss[-1]),
                          "train_acc=", "{:.5f}".format(train_acc[-1]),
                          "val_loss=", "{:.5f}".format(valid_loss[-1]),
                          "val_acc=", "{:.5f}".format(valid_acc[-1]))
                    #print('perturbation radius:', sess.run(pradius))

                if FLAGS.early_stop == 0:
                    if epoch > 10 and (train_loss[-1] > 1.5 * train_loss[0]
                                       or np.isnan(train_loss[-1])):
                        print("Early stopping at epoch {}...".format(epoch))
                        break
                elif FLAGS.early_stop == 1:  # simple early stopping
                    if epoch > 20 and valid_loss[-1] > np.mean(valid_loss[-10:]) \
                            and valid_acc[-1] < np.mean(valid_acc[-10:]):
                        print("Early stopping at epoch {}...".format(epoch))
                        break
                elif FLAGS.early_stop == 2:  # stricter conditions
                    if epoch > 100 \
                            and np.mean(valid_loss[-10:]) > np.mean(valid_loss[-100:]) \
                            and np.mean(valid_acc[-10:]) < np.mean(valid_acc[-100:]):
                        print("Early stopping at epoch {}...".format(epoch))
                        break
                else:
                    print('unknown early stopping strategy:', FLAGS.early_stop)
                    sys.exit(0)

            test_loss, test_acc = evaluate(test_mask)
            sec_per_epoch = (time.time() - start_t) / epoch
            print("Test set results:",
                  "loss=", "{:.5f}".format(test_loss),
                  "accuracy=", "{:.5f}".format(test_acc),
                  "epoch_secs=", "{:.2f}".format(sec_per_epoch))

    tf.reset_default_graph()

    return {
        'train_loss': train_loss,
        'train_acc': train_acc,
        'valid_loss': valid_loss,
        'valid_acc': valid_acc,
        'test_loss': test_loss,
        'test_acc': test_acc,
    }
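# Note: construct_feed_dict above follows the common GCN pattern of mapping the
# numpy inputs onto graph placeholders; this variant also threads noise and
# dropout through. A rough sketch with assumed placeholder names ('noise' in
# particular is a guess; the project's helper may differ):
def construct_feed_dict_sketch(features, support, labels, labels_mask,
                               placeholders, noise=0., dropout=0.):
    feed_dict = {
        placeholders['features']: features,
        placeholders['labels']: labels,
        placeholders['labels_mask']: labels_mask,
        placeholders['noise']: noise,
        placeholders['dropout']: dropout,
        placeholders['num_features_nonzero']: features[1].shape,
    }
    feed_dict.update({placeholders['support'][i]: support[i]
                      for i in range(len(support))})
    return feed_dict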
A, X, Y_train, Y_val, Y_test, idx_train, idx_val, idx_test = load_data('cora')

# Parameters
N = X.shape[0]                  # Number of nodes in the graph
F = X.shape[1]                  # Original feature dimension
n_classes = Y_train.shape[1]    # Number of classes
F_ = 8                          # Output size of first GraphAttention layer
n_attn_heads = 8                # Number of attention heads in first GAT layer
dropout_rate = 0.6              # Dropout rate (between and inside GAT layers)
l2_reg = 5e-4 / 2               # Factor for l2 regularization
learning_rate = 5e-3            # Learning rate for Adam
epochs = 10000                  # Number of training epochs
es_patience = 100               # Patience for early stopping

# Preprocessing operations
X = preprocess_features(X)
A = A + np.eye(A.shape[0])  # Add self-loops

# Model definition (as per Section 3.3 of the paper)
X_in = Input(shape=(F, ))
A_in = Input(shape=(N, ))

dropout1 = Dropout(dropout_rate)(X_in)
graph_attention_1 = GraphAttention(
    F_,
    attn_heads=n_attn_heads,
    attn_heads_reduction='concat',
    dropout_rate=dropout_rate,
    activation='elu',
    kernel_regularizer=l2(l2_reg),
    attn_kernel_regularizer=l2(l2_reg))([dropout1, A_in])
    elif 'bias' in name:
        torch.nn.init.constant_(w, 0)
    else:
        pass


if __name__ == '__main__':
    adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(
        'cora')
    print('adj:', adj.shape)
    print('features:', features.shape)
    print('y:', y_train.shape, y_val.shape, y_test.shape)
    print('mask:', train_mask.shape, val_mask.shape, test_mask.shape)

    features = preprocess_features(features)  # [49216, 2], [49216], [2708, 1433]
    supports = preprocess_adj(adj)

    train_label = torch.from_numpy(y_train).long().to(device)
    num_classes = train_label.shape[1]
    train_label = train_label.argmax(dim=1)
    # np.int / np.float aliases are removed in modern NumPy; use explicit dtypes
    train_mask = torch.from_numpy(train_mask.astype(np.int64)).to(device)
    val_label = torch.from_numpy(y_val).long().to(device)
    val_label = val_label.argmax(dim=1)
    val_mask = torch.from_numpy(val_mask.astype(np.int64)).to(device)
    test_label = torch.from_numpy(y_test).long().to(device)
    test_label = test_label.argmax(dim=1)
    test_mask = torch.from_numpy(test_mask.astype(np.int64)).to(device)

    i = torch.from_numpy(features[0].astype(np.int64)).to(device)  # coordinates
    v = torch.from_numpy(features[1]).to(device)                   # values
flags.DEFINE_float('weight_decay', 5e-4,
                   'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 10,
                     'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.')
flags.DEFINE_string('gpu', '1', 'GPU selection.')
flags.DEFINE_string('method', args.method, 'Adversarial attack method')

os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

# Load data
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(
    FLAGS.dataset_dir, FLAGS.dataset)

# Some preprocessing
features_dense, features = preprocess_features(features)
if FLAGS.model == 'gcn':
    support = [preprocess_adj(adj)]
    num_supports = 1
    model_func = GCN
elif FLAGS.model == 'gcn_cheby':
    support = chebyshev_polynomials(adj, FLAGS.max_degree)
    num_supports = 1 + FLAGS.max_degree
    model_func = GCN
elif FLAGS.model == 'dense':
    support = [preprocess_adj(adj)]  # Not used
    num_supports = 1
    model_func = MLP
else:
    raise ValueError('Invalid argument for model: ' + str(FLAGS.model))
print("Ignored fold {}, since store_data_dicts already contains {} folds".format(fold_i, len(store_data_dicts))) continue data["train_ind"] = train_ind data["val_ind"] = val_ind data["fold_i"] = fold_i # Get the CNN embedding in case of a parisot model, it is dependent on fold_i. if args.model == "parisot_tf" or args.model == "parisot_py": data['cnn_embedding'] = get_data_raw_cnn(args, fold_i) if type(data['cnn_embedding']) == type(None): print("No cnn embedding loaded, this fold is therefore ignored.") continue # Preprocess the features for the parisot models. data['cnn_embedding'] = preprocess_features(data['cnn_embedding']) # Rewriting the name since the general name will be input_feauteres, independend of the actual data source. data['input_features'] = data.pop('cnn_embedding') # Create adjacency matrix based on vae in the case of a Parisot model. # This should happen in this point since it can be based on input_features for the sparisity part. data['vae'] = get_data_raw_amc_vae(data['id']) data['adj_raw'] = get_adjacency_matrix_vae(args, data['vae'], data) data['adj_support'] = chebyshev_polynomials(data['adj_raw'], args.polynomial_degree) store_data_dict = train_single_fold(args, data) store_data_dicts.append(store_data_dict) save_datadicts(args, store_data_dicts)
def train():
    clf = request.form['train']
    if allowed_classifier(clf):
        string = str('train')
        hist_n = string + "hist.jpeg"
        cnmt_n = string + "cnmt.jpeg"
        pkl_hnd = store(app.config['static_path'], app.root_path)

        # Feature extraction
        data = utils.file_parser(
            os.path.join(app.config['upload_path'], "data.txt"))
        features = utils.feature_extractor(data['text'], 5000).todense()
        sh = data.shape

        # Preprocessing features and labels
        data_x = utils.preprocess_features(features, 2500)
        data_y, enc = utils.label_encoder(data['label'], False, None)
        pkl_hnd.dump(enc, 'enc')  # storing encoder

        # Splitting data into training set and validation set
        train_x, train_y, valid_x, valid_y = utils.train_valid(
            data_x, data_y, 0.2)

        # Balancing data with SMOTE
        text, label = utils.balance_data(train_x, train_y)

        # Selecting model and tuning hyperparameters
        tr = model(clf, text[:sh[0], :], label[:sh[0]], valid_x, valid_y)
        comb_mod = tr.model_selection()

        # Fitting model and predicting
        mod = tr.build_model(comb_mod)
        pkl_hnd.dump(mod, 'model')  # storing the model
        pr = predict_model(valid_x)
        pred = pr.predict_model(mod)

        # Training statistics
        st = stats(pred, valid_y)
        acc, f1 = st.train_stats()

        # Plotting histogram and confusion matrix
        pkl_hnd.plot_hist(data['label'], hist_n)
        n_labels = np.unique(np.asarray(data['label']))
        pkl_hnd.dump(n_labels, 'n_labels')  # storing labels
        cnf_matrix = st.cnf_mtx()
        pkl_hnd.plot_confusion_matrix(
            cnf_matrix,
            n_labels,
            cnmt_n,
            normalize=True,
            title='Confusion matrix',
            cmap=plt.cm.Blues,
        )

        return render_template("train_result.html",
                               accuracy=acc,
                               img_hist=url_for(app.config['static_path'],
                                                filename=hist_n),
                               img_cfmt=url_for(app.config['static_path'],
                                                filename=cnmt_n),
                               f1=f1)
    else:
        flash('Please enter a valid classifier')
        return redirect(url_for('index'))
def train_Model(dataset, data_seed, init_seed):
    print('{} Model on {}'.format(FLAGS.model, dataset))
    tf.reset_default_graph()

    adj, features, labels, train_mask, val_mask, test_mask = load_data(
        dataset, data_seed)

    # Feature selection part
    y_train = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_train = np.argmax(y_train, axis=1)
    # clf = LogisticRegression(random_state=0, solver='lbfgs',
    #                          multi_class='multinomial').fit(features[train_mask], y_train[train_mask])
    # model = SelectFromModel(clf, prefit=True)
    # features = model.transform(features)

    # alfa * A + beta * A' (clique finding)
    graphMain = nx.from_numpy_matrix(adj.todense())
    listClique = list(nx.find_cliques(graphMain))
    tmp = deepcopy(np.matrix(adj.todense()))
    for i in listClique:
        for j in i:
            for k in i:
                if j != k:
                    adj[j, k] = len(i) - 1
                    adj[k, j] = len(i) - 1
    adj = FLAGS.alfa * np.matrix(adj.todense()) + FLAGS.beta * tmp

    features = preprocess_features(features)

    config = tf.ConfigProto()
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    #config.log_device_placement = True
    config.gpu_options.allow_growth = True

    train_loss = []
    train_acc = []
    valid_loss = []
    valid_acc = []

    with tf.Graph().as_default():
        random.seed(init_seed)
        np.random.seed(init_seed)
        tf.set_random_seed(init_seed)

        sess = tf.Session(config=config)
        model, support, placeholders = build_model(adj, features, labels.shape[1])
        sess.run(tf.global_variables_initializer())

        start_t = time.time()
        for epoch in range(FLAGS.epochs):
            feed_dict = construct_feed_dict(features, support, labels,
                                            train_mask, placeholders,
                                            FLAGS.dropout, FLAGS.alfa, FLAGS.beta)
            feed_dict.update({tf.keras.backend.learning_phase(): 1})
            outs = sess.run([model.opt_op, model.loss, model.accuracy],
                            feed_dict=feed_dict)
            train_loss.append(outs[1])
            train_acc.append(outs[2])

            # Validation
            outs = evaluate(sess, model, features, support, labels,
                            placeholders, val_mask)
            valid_loss.append(outs[0])
            valid_acc.append(outs[1])

            if epoch > FLAGS.early_stoping \
                    and np.mean(valid_loss[-10:]) > np.mean(valid_loss[-100:]) \
                    and np.mean(valid_acc[-10:]) < np.mean(valid_acc[-100:]):
                print("Early stopping at epoch {}...".format(epoch))
                break

        test_loss, test_acc = evaluate(sess, model, features, support, labels,
                                       placeholders, test_mask)
        print("Test set results:",
              "loss=", "{:.5f}".format(test_loss),
              "accuracy=", "{:.5f}".format(test_acc))

    tf.reset_default_graph()

    import scipy.io as sio
    sio.savemat('train_lossCoolClique.mat', {'train_loss_GOOLnorm': train_loss})
    sio.savemat('train_accCoolClique.mat', {'train_loss_GOOLnorm': train_acc})
    # Save validation loss and accuracy together in a single dict
    sio.savemat('valid_lossCoolClique.mat',
                {'train_loss_GOOLnorm': valid_loss, 'valid_acc': valid_acc})
    sio.savemat('valid_accgCoolClique.mat', {'train_loss_GOOLnorm': valid_acc})

    return {
        'train_loss': train_loss,
        'train_acc': train_acc,
        'valid_loss': valid_loss,
        'valid_acc': valid_acc,
        'test_loss': test_loss,
        'test_acc': test_acc,
    }
neg_test_path = dataset + '/neg_test.pkl'
changedadj_path = dataset + '/changed_adj.pkl'
linkspath = dataset + '/links.pkl'

# Load data
if dataset == 'nell.0.001':
    features = load_nell(dataset)[1]
else:
    features = load_data(dataset)[1]

with open(changedadj_path, 'rb') as load_cha_adj:
    changed_adj = pickle.load(load_cha_adj)

# Some preprocessing
if FLAGS.features == 0:
    changed_features = preprocess_features(changed_adj + sp.eye(changed_adj.shape[0]))
else:
    changed_features = preprocess_features(features)

support = [preprocess_adj(changed_adj)]
num_supports = 1
model_func = GCN

# Define placeholders
placeholders = {
    'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
    'features': tf.sparse_placeholder(
        tf.float32, shape=tf.constant(changed_features[2], dtype=tf.int64)),
print('Dataset: ' + dataset)
print('----- Opt. hyperparams -----')
print('lr: {}'.format(lr))
print('l2_coef: {}'.format(l2_coef))
print('feed forward dropout: {}'.format(ff_dropout))
print('attention dropout: {}'.format(attn_dropout))
print('patience: {}'.format(patience))
print('----- Archi. hyperparams -----')
print('no. layers: {}'.format(n_layer))
print('no. hidden units: {}'.format(hidden_units))
print('nonlinearity: {}'.format(nonlinearity))
print('model: {}'.format(model))

adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(
    dataset)
features, spars = preprocess_features(features)

n_node = features.shape[0]
ft_size = features.shape[1]
n_class = y_train.shape[1]

adj = adj.todense()

features = torch.from_numpy(features)
y_train = torch.from_numpy(y_train)
y_val = torch.from_numpy(y_val)
y_test = torch.from_numpy(y_test)
train_mask = torch.from_numpy(np.array(train_mask, dtype=np.uint8))
val_mask = torch.from_numpy(np.array(val_mask, dtype=np.uint8))
test_mask = torch.from_numpy(np.array(test_mask, dtype=np.uint8))