def sensitive_3():
    top_50 = []
    f = open("../data/univ_top_50_cs.txt", "r")
    for line in f:
        line = line.strip().lower()
        top_50.append(line)
    f.close()
    fo = open("../result/result_top50_cs_newdata_apr09/sensitivity/all/sensitivity_diff_hits_weighted-inedge1.csv", "w")

    node_list, edge_list = dp.read_data("../data/data_top50_cs_apr09.csv", self_edge=False)
    G = dp.construct_graph(node_list, edge_list)
    hits = algo.weighted_HITS(G, max_iterations=100, min_delta=0.00001)
    result = sorted(hits.iteritems(), key=lambda asd: asd[1], reverse=True)
    G.clear()

    rank = []
    for e in result:
        if e[0] in top_50:
            rank.append(e[0])
    original_r = []
    for e in result:
        if e[0] in top_50:
            original_r.append([e[0]])

    for k in range(len(original_r)):
        # if not original_r[k][0] == "mit":
        node_list, edge_list = dp.read_data("../data/data_top50_cs_apr09.csv", self_edge=False)
        G = dp.construct_graph(node_list, edge_list)
        G = remove_significant_edge(G, original_r[k][0], rank=rank)  # remove a significant edge for <node>
        hits = algo.weighted_HITS(G, max_iterations=100, min_delta=0.00001)
        result = sorted(hits.iteritems(), key=lambda asd: asd[1], reverse=True)
        # result = sorted(hits.iteritems(), key=lambda asd: asd[1], reverse=True)
        G.clear()

        res1 = []
        for e in result:
            if e[0] in top_50:
                res1.append(e[0])
        kr = 0
        for i in range(len(res1)):
            if res1[i] == original_r[k][0]:
                kr = i
        original_r[k].append(k - kr)

    print original_r
    fo.write("univ,diff+mit1\n")
    for r in original_r:
        for i in range(len(r)):
            if i == 0:
                fo.write(str(r[i]))
            else:
                fo.write("," + str(r[i]))
        fo.write("\n")
    fo.close()

def sensitive_add_edge(filename1, filename2, outputfilename, type="hits_weighted", add_node="mit"):
    top_50 = []
    f = open(filename2, "r")
    for line in f:
        line = line.strip().lower()
        top_50.append(line)
    f.close()
    fo = open(outputfilename, "w")

    node_list, edge_list = dp.read_data(filename1, filename2, self_edge=False, extended=True)
    G = dp.construct_graph(node_list, edge_list)
    # r = algo.weighted_HITS(G, max_iterations=100, min_delta=0.00001)
    r = choose_algorithm(G, type=type)
    result = sorted(r.iteritems(), key=lambda asd: asd[1], reverse=True)
    G.clear()

    rank = []
    for e in result:
        if e[0] in top_50:
            rank.append(e[0])
    original_r = []
    for e in result:
        if e[0] in top_50:
            original_r.append([e[0]])

    for k in range(len(original_r)):
        # if not original_r[k][0] == "mit":
        node_list, edge_list = dp.read_data(filename1, filename2, self_edge=False, extended=True)
        G = dp.construct_graph(node_list, edge_list)
        G = add_non_existing_edges(G, original_r[k][0], add_node, weight=1)  # add one edge from add_node (default "mit") to <node>
        # r = algo.weighted_HITS(G, max_iterations=100, min_delta=0.00001)
        r = choose_algorithm(G, type=type)
        result = sorted(r.iteritems(), key=lambda asd: asd[1], reverse=True)
        # result = sorted(hits.iteritems(), key=lambda asd: asd[1], reverse=True)
        G.clear()

        res1 = []
        for e in result:
            if e[0] in top_50:
                res1.append(e[0])
        kr = 0
        for i in range(len(res1)):
            if res1[i] == original_r[k][0]:
                kr = i
        original_r[k].append(k - kr)

    print original_r
    fo.write("univ,diff+%s1\n" % (add_node))
    for r in original_r:
        for i in range(len(r)):
            if i == 0:
                fo.write(str(r[i]))
            else:
                fo.write("," + str(r[i]))
        fo.write("\n")
    fo.close()

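# A hypothetical invocation of sensitive_add_edge, shown as a commented sketch. The file
# paths follow the naming conventions used elsewhere in this repository but are assumptions,
# not documented inputs:
#
# sensitive_add_edge("../data/data_top50_cs_apr09.csv",
#                    "../data/univ_top_50_cs.txt",
#                    "../result/sensitivity_add_edge_from_mit.csv",
#                    type="hits_weighted",
#                    add_node="mit")
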
def main():
    dimension = 32
    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8)

    # average matrix over train data
    avg_matrix = X_train.mean(axis=0)

    # generate random walks
    walk = random_walk(avg_matrix, steps=1000)
    seq = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        seq[i, :] = avg_matrix[pos]
    print(seq.shape)

    skipgram = Skip_Gram(268, dimension, 2, 0.1)
    skipgram.train_from_feature_seq(seq, epochs=200)

    embedded_train_matrix = np.zeros((len(X_train), 268 * dimension))
    for i in range(len(X_train)):
        embedding_train = skipgram.encode(X_train[i])
        embedded_train_matrix[i] = np.ndarray.flatten(embedding_train)

    embedded_test_matrix = np.zeros((len(X_test), 268 * dimension))
    for i in range(len(X_test)):
        embedding_test = skipgram.encode(X_test[i])
        embedded_test_matrix[i] = np.ndarray.flatten(embedding_test)

    lasso = Lasso(100, .01)
    lasso.train_coordinate_descent(embedded_train_matrix, y_train)
    predicted = lasso.predict(embedded_test_matrix)
    print(mean_squared_error(y_test, predicted))

def main():
    TRAIN_CSV_PATH = "./data/train.csv"
    MAX_SEQUENCE_LENGTH = 20

    data = read_data(TRAIN_CSV_PATH)
    clean(data)
    word_to_ix = tokenizer(data)
    label_to_ix = one_hot_encoding(data)

    def trim_zero_padding(x):
        arr = x[:MAX_SEQUENCE_LENGTH]
        arr = arr + [0] * (MAX_SEQUENCE_LENGTH - len(arr))
        return arr

    data['text_token'] = data.loc[:, 'text_token'].apply(trim_zero_padding)

    X = list(data['text_token'])
    y = list(data['label_one_hot'])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

    svm = SVC(kernel="rbf", random_state=1, gamma=0.2, C=1.0)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(acc)

def main():
    X, y = data_processing.read_data('maps_conmat.mat', 'maps_age.mat')
    Xm = X.mean(axis=0)

    EMBEDDING_DIM = 8
    ACTIVATION = 'tanh'
    HEADS = 16

    # Fully-Connected AutoEncoder
    e_x = tf.keras.layers.Input((None, X.shape[-1]))
    e_o = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(EMBEDDING_DIM, activation=ACTIVATION))(e_x)
    e = tf.keras.Model(e_x, e_o)

    d_x = tf.keras.layers.Input((None, EMBEDDING_DIM))
    d_o = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(X.shape[-1], activation='linear'))(d_x)
    d = tf.keras.Model(d_x, d_o)

    model = AutoEncoder(e, d)
    model.train(X, epochs=50, learning_rate=0.001, loss='mse')
    generate_embedding_vis(Xm, model.encode(Xm), embedding_name='Neural Autoencoder')

    # Transformer AutoEncoder
    et_x = tf.keras.layers.Input((X.shape[1], X.shape[2]))
    et_o = Transformer(EMBEDDING_DIM, heads=HEADS, activation=ACTIVATION)(et_x)
    et = tf.keras.Model(et_x, et_o)

    dt_x = tf.keras.layers.Input((X.shape[1], EMBEDDING_DIM))
    dt_o = Transformer(X.shape[2], heads=HEADS, activation='linear')(dt_x)
    dt = tf.keras.Model(dt_x, dt_o)

    modelt = AutoEncoder(et, dt)
    modelt.train(X, epochs=100, learning_rate=0.001, loss='mse')
    generate_embedding_vis(Xm, modelt.encode(Xm), embedding_name='Neural Transformer')

def main():
    X, y = data_processing.read_data('maps_conmat.mat', 'maps_age.mat')
    # X = data_processing.adjacency_matrix(X)
    avg_matrix = X.mean(axis=0)
    print(avg_matrix.shape)

    model = AutoEncoder(X.shape[-1], 64, activation='relu')
    model.train(X, epochs=200, learning_rate=0.001, loss='mse')
    # generate_embedding_vis(avg_matrix, model.encode(avg_matrix), embedding_name='Neural Autoencoder')

    walk = random_walk(avg_matrix, steps=1000)
    seq = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        seq[i, :] = avg_matrix[pos]
    print(seq.shape)

    skipgram = Skip_Gram(268, 64, 2, 0.1)
    skipgram.train_from_feature_seq(seq, epochs=200)
    # generate_embedding_vis(avg_matrix, skipgram.encode(avg_matrix), embedding_name='SkipGram')

    cbow = CBOW(268, 64, 2, 0.1)
    cbow.train_from_feature_seq(seq, epochs=200)
    # generate_embedding_vis(avg_matrix, cbow.encode(avg_matrix), embedding_name='CBOW')

    distances = [[avg_matrix, model.encode(avg_matrix)],
                 [skipgram.encode(avg_matrix), cbow.encode(avg_matrix)]]
    names = [['Original Distances', 'Autoencoder Distances'],
             ['SkipGram Distances', 'CBOW Distances']]
    generate_embedding_vis_array(distances, names)

def run_li():
    # read in the data
    # pos_file_path = '/Users/li/Kunyan/MyRepository/DeepNaturalLanguageProcessing/DeepNLP/data/test3.txt'
    # neg_file_path = '/Users/li/Kunyan/MyRepository/DeepNaturalLanguageProcessing/DeepNLP/data/test2.txt'
    pos_file_path = globe.file_pos
    neg_file_path = globe.file_neg
    tmp = data_processing.read_data(pos_file_path, neg_file_path)
    res = data_processing.data_split(tmp[0], tmp[1])
    train_vecs = res[0]
    test_vecs = res[1]
    label_train = res[2]
    label_test = res[3]

    # train the classifier
    lr = SGDClassifier(loss='log', penalty='l1')
    lr.fit(train_vecs, label_train)
    print('Test Accuracy: %.2f' % lr.score(test_vecs, label_test))

    pred_probas = lr.predict_proba(test_vecs)[:, 1]
    fpr, tpr, _ = roc_curve(label_test, pred_probas)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')
    plt.show()

def word2vec_test():
    # read in the data
    pos_file_path = globe.file_pos
    neg_file_path = globe.file_neg
    tmp = data_processing.read_data(pos_file_path, neg_file_path)
    res = data_processing.data_split(tmp[0], tmp[1])
    x_train = res[0]
    x_train = data_processing.text_clean(x_train)
    for i in x_train:
        for j in i:
            print j,

    n_dim = 200
    min_count = 2
    # model = gensim.models.Word2Vec(x_train, min_count=0, size=200, workers=4)
    model = word2vec_model(x_train, n_dim, min_count)

    # res = w2c_model.most_similar(positive=['纤维', '批次'], negative=['成分'], topn=1)
    # w2c_model.doesnt_match("我 爱 中国".split())
    # var = w2c_model.similarity('纤维', '批次')
    # print var
    # res = w2c_model.most_similar("纤维")
    # for i in res:
    #     print i[0],
    dd = model.most_similar("批次")
    for i in dd:
        print i[0],

def main():
    X, y = data_processing.read_data('maps_conmat.mat', 'maps_age.mat')
    Xm = X.mean(axis=0)

    factorization = MatrixFactorization(Xm, 2)
    factorization.fit(200, 0.00001)
    generate_embedding_vis(Xm, factorization.factor, embedding_name="Matrix Factorization")

def test(args):
    if args.load_var:
        test_utterances, test_labels, word_dict = read_data(load_var=args.load_var, input_=None, mode='test')
    else:
        test_utterances, test_labels, word_dict = read_data(load_var=args.load_var,
                                                            input_=os.path.join(constant.data_path, "entangled_{}.json".format(args.mode)),
                                                            mode='test')

    if args.save_input:
        utils.save_or_read_input(os.path.join(constant.save_input_path, "{}_utterances.pk".format(args.mode)),
                                 rw='w', input_obj=test_utterances)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "{}_labels.pk".format(args.mode)),
                                 rw='w', input_obj=test_labels)

    current_time = re.findall('.*model_(.+?)/.*', args.model_path)[0]
    step_cnt = re.findall('.step_(.+?)\.pkl', args.model_path)[0]

    test_dataloader = TrainDataLoader(test_utterances, test_labels, word_dict, name='test', batch_size=4)

    ensemble_model = EnsembleModel(word_dict, word_emb=None, bidirectional=False)
    if torch.cuda.is_available():
        ensemble_model.cuda()

    supervised_trainer = SupervisedTrainer(args, ensemble_model, current_time=current_time)
    supervised_trainer.test(test_dataloader, args.model_path, step_cnt=step_cnt)

def main():
    X, y = data_processing.read_data('maps_conmat.mat', 'maps_age.mat')
    Xm = X.mean(axis=0)

    walk = random_walk(Xm, steps=1000)
    one_hot = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        one_hot[i, :] = Xm[pos]

    # Skip-Gram
    model = Skip_Gram(268, 64, 2, 0.1)
    model.train_from_feature_seq(one_hot, epochs=200)
    generate_embedding_vis(Xm, model.encode(Xm), embedding_name="Skip-Gram")

    # CBOW
    model = CBOW(268, 64, 2, 0.1)
    model.train_from_feature_seq(one_hot, epochs=200)
    generate_embedding_vis(Xm, model.encode(Xm), embedding_name="CBOW")

def main():
    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')
    Xm = X.mean(axis=0)

    factorization = MatrixFactorization(Xm, 8)
    factorization.fit(200, 0.0001)
    # generate_embedding_vis(Xm, factorization.factor, embedding_name="Matrix Factorization")
    generate_embedding_vis(X, factorization.encode(X), embedding_name="Matrix Factorization")

    factorization = TensorFactorization(X, 8)
    factorization.fit(50)
    # generate_embedding_vis(Xm, factorization.matrix_factor, embedding_name="Tensor Factorization")
    generate_embedding_vis(X, factorization.encode(X), embedding_name='Tensor Factorization')

def main():
    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/task_240.mat', target_variable='mean_rxn')
    # X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat', target_variable='age')
    # target_variable: 'mean_rxn' for the task data, 'age' for the age data

    # drop subjects with missing connectivity values
    indices = ~np.isnan(X).any(axis=(1, 2))
    X, y = X[indices], y[indices]

    permutation = np.random.permutation(len(X))
    X, y = X[permutation], y[permutation]

    # y = (y - y.min()) / (y.max() - y.min())
    y = (y - y.mean()) / y.std()

    node_features = np.eye(268)[np.newaxis, ...]
    node_features = np.repeat(node_features, len(X), axis=0)
    edge_features = X + node_features
    edge_features = edge_features[:, np.newaxis, ...]

    model = graph_nn(268)

    X_train, y_train = [node_features[:200], edge_features[:200]], y[:200]
    X_test, y_test = [node_features[200:], edge_features[200:]], y[200:]

    model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5))
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=4000, batch_size=16)

    predictions = model.predict(X_test)
    print(predictions)
    print(y_test)
    print('MSE:', ((predictions - y_test)**2).mean())
    print('Corr:', np.corrcoef(predictions[:, 0], y_test[:, 0])[0, 1])

def _data_read(pos_file_path, neg_file_path, w2c_model_path):
    """Read data and a word2vec model from the given file paths.

    Args:
        pos_file_path: Positive file path.
        neg_file_path: Negative file path.
        w2c_model_path: word2vec model path.

    Returns:
        Train and test document vectors together with their labels.

    Raises:
        IOError: the word2vec model could not be loaded from w2c_model_path.
    """
    tmp = data_processing.read_data(pos_file_path, neg_file_path)
    res = data_processing.data_split(tmp[0], tmp[1])
    (train_data, test_data, train_labels, test_labels) = (res[0], res[1], res[2], res[3])
    # print train_labels[0]

    train_data = data_processing.text_clean(train_data)
    test_data = data_processing.text_clean(test_data)

    # dimensionality of the word vectors
    n_dim = globe.n_dim

    doc_vecs = []
    try:
        # load word2vec model from model path
        word2vec_model = Word2Vec.load(w2c_model_path)
        doc_vecs = word2vec_gensim_train.text_vecs(train_data, test_data, n_dim, word2vec_model)
    except IOError:
        pass

    # build the document vectors
    train_data_vecs = doc_vecs[0]
    # print train_data_vecs.shape
    test_data_vecs = doc_vecs[1]
    # print test_data_vecs.shape

    return train_data_vecs, train_labels, test_data_vecs, test_labels

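# A minimal usage sketch for _data_read (it mirrors the classifier setup in run_li; the
# globe paths are the same configuration values used elsewhere in this module, and the
# classifier choice is only illustrative):
#
# train_vecs, train_labels, test_vecs, test_labels = _data_read(globe.file_pos,
#                                                               globe.file_neg,
#                                                               globe.model_path)
# clf = SGDClassifier(loss='log', penalty='l1')
# clf.fit(train_vecs, train_labels)
# print('Test Accuracy: %.2f' % clf.score(test_vecs, test_labels))
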
def add_pro_feature(data, flag='train'):
    train_data = None
    if flag == 'train':
        data = data.drop(['conversionTime', 'userID'], axis=1)

        data['clickHour'] = pd.Series([str(x)[2:4] for x in data.clickTime])
        hourDict = data.groupby(['clickHour'])['label'].mean()
        data['clickTimePro'] = 0.0
        for i in hourDict.index:
            data.loc[data.clickHour == i, 'clickTimePro'] = hourDict[i]
        data = data.drop(['clickHour', 'clickTime'], axis=1)
        print('clickTime to clickTimePro finished!')

        # data['conversionHour'] = pd.Series([str(x)[2:4] for x in data.conversionTime if pd.isnull(x) == True])
        # hourDict = data.groupby(['conversionHour'])['label'].mean()
        # data['conversionTimePro'] = 0.0
        # for i in hourDict.index:
        #     data.loc[data.conversionHour == i, 'conversionTimePro'] = hourDict[i]
        # data = data.drop(['conversionHour', 'conversionTime'], axis=1)

        positionIDDict = data.groupby(['positionID'])['label'].mean()
        data['positionIDPro'] = 0.0
        for i in positionIDDict.index:
            data.loc[data.positionID == i, 'positionIDPro'] = positionIDDict[i]
        data = data.drop(['positionID'], axis=1)
        print('positionID to positionIDPro finished!')

        connectionTypeDict = data.groupby(['connectionType'])['label'].mean()
        data['connectionTypePro'] = 0.0
        for i in connectionTypeDict.index:
            data.loc[data.connectionType == i, 'connectionTypePro'] = connectionTypeDict[i]
        data = data.drop(['connectionType'], axis=1)
        print('connectionType to connectionTypePro finished!')

        telecomsOperatorDict = data.groupby(['telecomsOperator'])['label'].mean()
        data['telecomsOperatorPro'] = 0.0
        for i in telecomsOperatorDict.index:
            data.loc[data.telecomsOperator == i, 'telecomsOperatorPro'] = telecomsOperatorDict[i]
        data = data.drop(['telecomsOperator'], axis=1)
        print('telecomsOperator to telecomsOperatorPro finished!')

        creativeIDDict = data.groupby(['creativeID'])['label'].mean()
        data['creativeIDPro'] = 0.0
        for i in creativeIDDict.index:
            data.loc[data.creativeID == i, 'creativeIDPro'] = creativeIDDict[i]
        data = data.drop(['creativeID'], axis=1)
        print('creativeID to creativeIDPro finished!')

        # userIDDict = data.groupby(['userID'])['label'].mean()
        # data['userIDPro'] = 0.0
        # for i in userIDDict.index:
        #     data.loc[data.userID == i, 'userIDPro'] = userIDDict[i]
        # data = data.drop(['userID'], axis=1)
    else:
        train_data = dp.read_data('train.csv')
        data = data.drop(['userID'], axis=1)

        train_data['clickHour'] = pd.Series([str(x)[2:4] for x in train_data.clickTime])
        data['clickHour'] = pd.Series([str(x)[2:4] for x in data.clickTime])
        hourDict = train_data.groupby(['clickHour'])['label'].mean()
        data['clickTimePro'] = 0.0
        for i in hourDict.index:
            data.loc[data.clickHour == i, 'clickTimePro'] = hourDict[i]
        data = data.drop(['clickHour', 'clickTime'], axis=1)
        print('clickTime to clickTimePro finished!')

        # data['conversionHour'] = pd.Series([str(x)[2:4] for x in data.conversionTime if pd.isnull(x) == True])
        # hourDict = data.groupby(['conversionHour'])['label'].mean()
        # data['conversionTimePro'] = 0.0
        # for i in hourDict.index:
        #     data.loc[data.conversionHour == i, 'conversionTimePro'] = hourDict[i]
        # data = data.drop(['conversionHour', 'conversionTime'], axis=1)

        positionIDDict = train_data.groupby(['positionID'])['label'].mean()
        data['positionIDPro'] = 0.0
        for i in positionIDDict.index:
            data.loc[data.positionID == i, 'positionIDPro'] = positionIDDict[i]
        data = data.drop(['positionID'], axis=1)
        print('positionID to positionIDPro finished!')

        connectionTypeDict = train_data.groupby(['connectionType'])['label'].mean()
        data['connectionTypePro'] = 0.0
        for i in connectionTypeDict.index:
            data.loc[data.connectionType == i, 'connectionTypePro'] = connectionTypeDict[i]
        data = data.drop(['connectionType'], axis=1)
        print('connectionType to connectionTypePro finished!')

        telecomsOperatorDict = train_data.groupby(['telecomsOperator'])['label'].mean()
        data['telecomsOperatorPro'] = 0.0
        for i in telecomsOperatorDict.index:
            data.loc[data.telecomsOperator == i, 'telecomsOperatorPro'] = telecomsOperatorDict[i]
        data = data.drop(['telecomsOperator'], axis=1)
        print('telecomsOperator to telecomsOperatorPro finished!')

        creativeIDDict = train_data.groupby(['creativeID'])['label'].mean()
        data['creativeIDPro'] = 0.0
        for i in creativeIDDict.index:
            data.loc[data.creativeID == i, 'creativeIDPro'] = creativeIDDict[i]
        data = data.drop(['creativeID'], axis=1)
        print('creativeID to creativeIDPro finished!')

        # userIDDict = data.groupby(['userID'])['label'].mean()
        # data['userIDPro'] = 0.0
        # for i in userIDDict.index:
        #     data.loc[data.userID == i, 'userIDPro'] = userIDDict[i]
        # data = data.drop(['userID'], axis=1)

    data.to_csv('new_' + flag + '.csv')  # takes ~13 minutes for train, ~1 minute for test
    return data

if __name__ == "__main__":
    word2vec_test()

    pos_file_path = globe.file_pos
    neg_file_path = globe.file_neg
    tmp = data_processing.read_data(pos_file_path, neg_file_path)
    res = data_processing.data_split(tmp[0], tmp[1])
    x_train = res[0]
    x_train = data_processing.text_clean(x_train)

    n_dim = 200
    min_count = 2
    model_path = globe.model_path
    mymodel = word2vec_model(x_train, n_dim, min_count)
    mymodel.save(model_path)

def main():
    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')
    Xm = X.mean(axis=0)

    EMBEDDING_DIM = 16

    # Fully-Connected AutoEncoder
    e_x = tf.keras.layers.Input((None, X.shape[-1]))
    e_o = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(EMBEDDING_DIM, activation='tanh'))(e_x)
    e = tf.keras.Model(e_x, e_o)

    d_x = tf.keras.layers.Input((None, EMBEDDING_DIM))
    d_o = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(X.shape[-1], activation='linear'))(d_x)
    d = tf.keras.Model(d_x, d_o)

    ae_model = AutoEncoder(e, d)
    ae_model.train(X, epochs=50, learning_rate=0.001, loss='mse')

    # Transformer AutoEncoder
    et_x = tf.keras.layers.Input((X.shape[1], X.shape[2]))
    et_o = Transformer(EMBEDDING_DIM, heads=8, activation='tanh')(et_x)
    et = tf.keras.Model(et_x, et_o)

    dt_x = tf.keras.layers.Input((X.shape[1], EMBEDDING_DIM))
    dt_o = Transformer(X.shape[2], heads=8, activation='linear')(dt_x)
    dt = tf.keras.Model(dt_x, dt_o)

    ae_modelt = AutoEncoder(et, dt)
    ae_modelt.train(X, epochs=100, learning_rate=0.001, loss='mse')

    # Matrix Factorization
    mat_factorization = MatrixFactorization(Xm, EMBEDDING_DIM)
    mat_factorization.fit(200, 0.0001)

    # Tensor Factorization
    tens_factorization = TensorFactorization(X, EMBEDDING_DIM)
    tens_factorization.fit(50)

    walk = random_walk(Xm, steps=1000)
    one_hot = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        one_hot[i, :] = Xm[pos]

    # Skip-Gram
    skipgram = Skip_Gram(268, EMBEDDING_DIM, 3, 0.1)
    skipgram.train_from_feature_seq(one_hot, epochs=200)

    # CBOW
    cbow = CBOW(268, EMBEDDING_DIM, 3, 0.1)
    cbow.train_from_feature_seq(one_hot, epochs=200)

    og_distances = calculate_distance_matrix(X.reshape((len(X), -1)))
    models = {
        'AutoEncoder': ae_model,
        'Transformer': ae_modelt,
        'Matrix Factorization': mat_factorization,
        'Tensor Factorization': tens_factorization,
        'Skip-Gram': skipgram,
        'CBOW': cbow
    }
    model_distances = {}
    for key, mod in models.items():
        x_embed = mod.encode(X)
        model_distances[key] = calculate_distance_matrix(x_embed.reshape((len(x_embed), -1)))

    # plot distances
    plt.matshow(og_distances, cmap='Blues', vmin=0)
    plt.title('Original Distances')
    plt.savefig('images/og_distance_matrix.png')

    fig, axes = plt.subplots(2, 3)
    i = 0
    for embedding_name, embedding_distances in model_distances.items():
        r, c = i // 3, i % 3
        axes[r, c].matshow(embedding_distances, cmap='Blues', vmin=0)
        axes[r, c].set_title(embedding_name)
        i += 1
    fig.savefig('images/embedding_distances_matrix.png')

def gradcam_on_dataset(data_conf, model_path, layer_name, custom_objects=None,
                       cache_dir=None, images_dir=None, vectorized_dir=None,
                       output_dir=None, predict_two_output=True):
    """
    Applies GradCAM to a set of images.

    :param data_conf: dict with the featurized data directories ('data_dir_luad', 'data_dir_lusc') and the slide list ('csv_path').
    :param model_path: path to trained model.
    :param layer_name: name of convolutional layer used to compute GradCAM.
    :param custom_objects: used to load the model.
    :param cache_dir: folder to store compressed images temporarily.
    :param images_dir: folder with the whole-slide images (optional, used to crop images and overlay heatmaps).
    :param vectorized_dir: folder with the vectorized image shapes (optional, used together with images_dir).
    :param output_dir: destination folder; defaults to a 'gradcam' folder next to the model.
    :param predict_two_output: if True, compute GradCAM for both output units of the final layer.
    :return: nothing
    """
    # Featurized directories
    data_dir_luad = data_conf['data_dir_luad']
    data_dir_lusc = data_conf['data_dir_lusc']
    csv_test = data_conf['csv_path']

    # Output dir
    output_dir = join(dirname(model_path), 'gradcam') if output_dir is None else output_dir
    if not exists(output_dir):
        os.makedirs(output_dir)
    print('GradCAM in directory: {d} with content {c}'.format(d=output_dir, c=os.system("ls " + output_dir)), flush=True)

    # List features
    data_config = {
        'data_dir_luad': data_dir_luad,
        'data_dir_lusc': data_dir_lusc,
        'csv_path': csv_test
    }
    image_ids, paths, dm_paths, labels, features_ids = read_data(data_config)  # , custom_augmentations=[('none', 0)])

    # Load model and gradient function
    # required to avoid bug "You must feed a value for placeholder tensor
    # 'batch_normalization_1/keras_learning_phase' with dtype bool"
    K.set_learning_phase(0)
    model = keras.models.load_model(model_path, custom_objects=custom_objects)
    gradient_function_0 = grad_cam_fn(model, 0, layer_name)
    if predict_two_output:
        gradient_function_1 = grad_cam_fn(model, 1, layer_name)
    else:
        gradient_function_1 = None

    # Analyze features
    # NOTE: batch_ids is not defined in this snippet; it is assumed to be provided alongside
    # the other lists (e.g. returned by read_data) in the original code base.
    for i, (image_id, path, dm_path, label, features_id, batch_id) in enumerate(
            zip(image_ids, paths, dm_paths, labels, features_ids, batch_ids)):
        try:
            print('Computing GradCAM on {filename} ... {i}/{n}'.format(filename=features_id, i=i + 1, n=len(image_ids)), flush=True)

            output_npy_path0, output_png_path0 = gradcam_on_features(
                features_path=cache_file(path, cache_dir, overwrite=False),
                distance_map_path=cache_file(dm_path, cache_dir, overwrite=False),
                gradient_function=gradient_function_0,
                output_npy_path=join(output_dir, features_id + '_{unit}_{preds}_gradcam.npy'.format(unit=0, preds='{preds:0.3f}')),
                output_png_path=join(output_dir, features_id + '_{unit}_{preds}_gradcam.png'.format(unit=0, preds='{preds:0.3f}')),
            )

            if predict_two_output:
                output_npy_path1, output_png_path1 = gradcam_on_features(
                    features_path=cache_file(path, cache_dir, overwrite=False),
                    distance_map_path=cache_file(dm_path, cache_dir, overwrite=False),
                    gradient_function=gradient_function_1,
                    output_npy_path=join(output_dir, features_id + '_{unit}_{preds}_gradcam.npy'.format(unit=1, preds='{preds:0.3f}')),
                    output_png_path=join(output_dir, features_id + '_{unit}_{preds}_gradcam.png'.format(unit=1, preds='{preds:0.3f}')),
                )

            if (images_dir is not None) and (vectorized_dir is not None):
                image_crop_from_wsi(
                    wsi_path=join(images_dir, batch_id, image_id + '.mrxs'),
                    vectorized_im_shape_path=join(vectorized_dir, image_id + '_im_shape.npy'),
                    distance_map_path=cache_file(dm_path, cache_dir, overwrite=False),
                    output_npy_path=join(output_dir, features_id + '_image.npy'),
                    output_png_path=join(output_dir, features_id + '_image.png'),
                    crop_size=400)

                overlay_gradcam_heatmap(
                    gradcam_npy_path=output_npy_path0,
                    image_npy_path=join(output_dir, features_id + '_image.npy'),
                    output_png_path=join(output_dir, features_id + '_{unit}_heatmap.png'.format(unit=0)))

                if predict_two_output:
                    overlay_gradcam_heatmap(
                        gradcam_npy_path=output_npy_path1,
                        image_npy_path=join(output_dir, features_id + '_image.npy'),
                        output_png_path=join(output_dir, features_id + '_{unit}_heatmap.png'.format(unit=1)))

                    overlay_gradcam_heatmap_bicolor(
                        gradcam_npy_path1=output_npy_path0,
                        gradcam_npy_path2=output_npy_path1,
                        image_npy_path=join(output_dir, features_id + '_image.npy'),
                        output_png_path=join(output_dir, features_id + '_both_heatmap.png'))
        except Exception as e:
            print('Failed to compute GradCAM on {f}. Exception: {e}'.format(f=path, e=e), flush=True)

def train(args):
    utils.make_all_dirs(current_time)
    if args.load_var:
        all_utterances, labels, word_dict = read_data(load_var=args.load_var, input_=None, mode='train')
        dev_utterances, dev_labels, _ = read_data(load_var=args.load_var, input_=None, mode='dev')
    else:
        all_utterances, labels, word_dict = read_data(load_var=args.load_var,
                                                      input_=os.path.join(constant.data_path, "entangled_train.json"),
                                                      mode='train')
        dev_utterances, dev_labels, _ = read_data(load_var=args.load_var,
                                                  input_=os.path.join(constant.data_path, "entangled_dev.json"),
                                                  mode='dev')

    word_emb = build_embedding_matrix(word_dict, glove_loc=args.glove_loc,
                                      emb_loc=os.path.join(constant.save_input_path, "word_emb.pk"),
                                      load_emb=False)

    if args.save_input:
        utils.save_or_read_input(os.path.join(constant.save_input_path, "train_utterances.pk"), rw='w', input_obj=all_utterances)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "train_labels.pk"), rw='w', input_obj=labels)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "word_dict.pk"), rw='w', input_obj=word_dict)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "word_emb.pk"), rw='w', input_obj=word_emb)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "dev_utterances.pk"), rw='w', input_obj=dev_utterances)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "dev_labels.pk"), rw='w', input_obj=dev_labels)

    train_dataloader = TrainDataLoader(all_utterances, labels, word_dict)
    if args.add_noise:
        noise_train_dataloader = TrainDataLoader(all_utterances, labels, word_dict, add_noise=True)
    else:
        noise_train_dataloader = None
    dev_dataloader = TrainDataLoader(dev_utterances, dev_labels, word_dict, name='dev')

    logger_name = os.path.join(constant.log_path, "{}.txt".format(current_time))
    LOG_FORMAT = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    logging.basicConfig(format=LOG_FORMAT, level=logging.INFO, filename=logger_name, filemode='w')
    logger = logging.getLogger()

    global log_head
    log_head = log_head + "Training Model: {}; ".format(args.model)
    if args.add_noise:
        log_head += "Add Noise: True; "
    logger.info(log_head)

    if args.model == 'T':
        ensemble_model_bidirectional = EnsembleModel(word_dict, word_emb=word_emb, bidirectional=True)
    elif args.model == 'TS':
        ensemble_model_bidirectional = EnsembleModel(word_dict, word_emb=None, bidirectional=True)
    else:
        ensemble_model_bidirectional = None
    if args.model == 'TS':
        ensemble_model_bidirectional.load_state_dict(torch.load(args.model_path))

    ensemble_model = EnsembleModel(word_dict, word_emb=word_emb, bidirectional=False)
    if torch.cuda.is_available():
        ensemble_model.cuda()
        if args.model == 'T' or args.model == 'TS':
            ensemble_model_bidirectional.cuda()

    supervised_trainer = SupervisedTrainer(args, ensemble_model, teacher_model=ensemble_model_bidirectional,
                                           logger=logger, current_time=current_time)
    supervised_trainer.train(train_dataloader, noise_train_dataloader, dev_dataloader)

def main():
    X, y = data_processing.read_data('maps_conmat.mat', 'maps_age.mat')
    # X = data_processing.adjacency_matrix(X)
    print(random_walk(X[0], steps=1000))

def main():
    # bucket = {}
    # f = open("../result/result_top50_cs_newdata_apr09/year_statistical_from1995_to2015.csv", "r")
    # f.readline()
    # for line in f:
    #     lines = line.split(",")
    #     try:
    #         bucket.update({lines[0].strip(): int(lines[2].strip())})
    #     except:
    #         pass
    # f.close()
    #
    # node_list, edge_list = dp.read_data_in_range("../data/data_may28_new/data_top50_ee.csv",
    #                                              "../data/data_may28_new/top50_ee_2015.txt",
    #                                              start_year=1992, end_year=2013, self_edge=True)
    node_list, edge_list = dp.read_data("../data/data_may28_new/data_top50_ee.csv",
                                        "../data/data_may28_new/top50_ee_2015.txt",
                                        self_edge=False, extended=False)
    print len(node_list), node_list
    print len(edge_list), edge_list
    exit(0)

    G = dp.construct_graph(node_list, edge_list)

    top_50 = []
    f = open("../data/data_may28_new/top50_ee_2015.txt", "r")
    for line in f:
        line = line.strip().lower()
        top_50.append(line)
    f.close()

    print len(G.edges())
    print len(G.nodes())

    nodes = dp.rank_univ(G, t="in_degree")
    f = open("../result/result_may28/ee/comparison/ee_1951-1991_indegree.csv", "w")
    for node in nodes:
        if node[0] in top_50:
            f.write("%s;%d\n" % (node[0], node[1]))
    f.close()

    weighted_pagerank = algo.weighted_PR_wnorm(G, damping_factor=0.85, max_iterations=100, min_delta=0.00001)
    result = sorted(weighted_pagerank.iteritems(), key=lambda asd: asd[1], reverse=True)
    f = open("../result/result_may28/ee/comparison/ee_1992-2013_weightedPR_w_norm.csv", "w")
    for r in result:
        if r[0] in top_50:
            f.write("%s;%.5f\n" % (r[0], r[1]))
    f.close()

    weighted_pagerank = algo.weighted_PR_wonorm(G, damping_factor=0.85, max_iterations=100, min_delta=0.00001)
    s = sum(weighted_pagerank.values())
    for rank in weighted_pagerank:
        weighted_pagerank[rank] = weighted_pagerank[rank] * 50.0 / s
    result = sorted(weighted_pagerank.iteritems(), key=lambda asd: asd[1], reverse=True)
    f = open("../result/result_may28/ee/comparison/ee_1992-2013_weightedPR_wo_norm.csv", "w")
    for r in result:
        if r[0] in top_50:
            f.write("%s;%.5f\n" % (r[0], r[1]))
    f.close()

    # hits = algo.HITS(G, max_iterations=100, min_delta=0.00001)
    # result = sorted(hits.iteritems(), key=lambda asd: asd[1], reverse=True)
    # f = open("../result/result_may28/me/extendedGwselfedges/cs_hits.csv", "w")
    # for r in result:
    #     if r[0] in top_50:
    #         f.write("%s;%.5f\n" % (r[0], r[1]))
    # f.close()

    hits = algo.weighted_HITS(G, max_iterations=100, min_delta=0.00001)
    result = sorted(hits.iteritems(), key=lambda asd: asd[1], reverse=True)
    f = open("../result/result_may28/ee/comparison/ee_1992-2013_hits_weighted.csv", "w")
    for r in result:
        if r[0] in top_50:
            f.write("%s;%.5f\n" % (r[0], r[1]))
    f.close()

    hubavg = algo.hubavg_HITS(G, max_iterations=100, min_delta=0.00001)
    result = sorted(hubavg.iteritems(), key=lambda asd: asd[1], reverse=True)
    f = open("../result/result_may28/ee/comparison/ee_1992-2013_hits_hubavg.csv", "w")
    for r in result:
        if r[0] in top_50:
            f.write("%s;%.5f\n" % (r[0], r[1]))
    f.close()

    # salsa = algo.SALSA(G)
    # result = sorted(salsa.iteritems(), key=lambda asd: asd[1], reverse=True)
    # f = open("../result/result_top50_cs_newdata_apr09/result_top50_cs/univ_top50_cs_from2000_salsa.csv", "w")
    # for r in result:
    #     f.write("%s;%.5f\n" % (r[0], r[1]))
    # f.close()
    #
    # salsa = algo.modified_SALSA(G)
    # result = sorted(salsa.iteritems(), key=lambda asd: asd[1], reverse=True)
    # f = open("../result/result_top50_cs_extended/entire/univ_top40_me_from1946_to1990_salsa_modified.csv", "w")
    # for r in result:
    #     if r[0] in top_50:
    #         f.write("%s;%.5f\n" % (r[0], r[1]))
    # f.close()
    #
    # credit = algo.CreditPropagation(G, original_rank=hits, cr=0.8, max_iterations=10000, min_delta=0.00001)
    # result = sorted(credit.iteritems(), key=lambda asd: asd[1], reverse=True)
    # f = open("../result/result_top50_cs_newdata_apr09/result_top50_cs_subtracted_woselfedge/univ_top50_cs_wo_selfedges_CreditProp_hits.csv", "w")
    # for r in result:
    #     if r[0] in top_50:
    #         f.write("%s;%.5f\n" % (r[0], r[1]))
    # f.close()

    """
    new experiments on authavg and weightedHITS_normalized @ May 13th
    """

def main():
    # dimensions to test
    DIMENSIONS = [64, 32, 16, 8, 4, 2, 1]

    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8)

    # train embeddings for each dimension
    encoders = list()
    for dimension in DIMENSIONS:
        print(str(dimension) + "-D Embedding Training")
        e_x = tf.keras.layers.Input((None, 268))
        e_o = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(dimension, activation='tanh'))(e_x)
        e = tf.keras.Model(e_x, e_o)

        d_x = tf.keras.layers.Input((None, dimension))
        d_o = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(268, activation='linear'))(d_x)
        d = tf.keras.Model(d_x, d_o)

        model = AutoEncoder(e, d)
        model.train(X_train, epochs=100, learning_rate=0.001, loss='mse')
        encoders.append((model, dimension))

    # encode train and test data using embeddings, then flatten for prediction
    embedded_train_list = list()
    embedded_test_list = list()
    for model, dim in encoders:
        embedded_train_matrix = np.zeros((len(X_train), 268 * dim))
        for i in range(len(X_train)):
            embedding_train = model.encode(X_train[i])
            embedded_train_matrix[i] = np.ndarray.flatten(embedding_train)
        embedded_train_list.append(embedded_train_matrix)

        embedded_test_matrix = np.zeros((len(X_test), 268 * dim))
        for i in range(len(X_test)):
            embedding_test = model.encode(X_test[i])
            embedded_test_matrix[i] = np.ndarray.flatten(embedding_test)
        embedded_test_list.append(embedded_test_matrix)

    # train prediction models on encoded train data, then test on encoded test data
    # and calculate Mean Squared Error
    lr_error_list = list()
    svr_error_list = list()
    mlp_error_list = list()
    lr_error_list_train = list()
    svr_error_list_train = list()
    mlp_error_list_train = list()
    for i in range(len(embedded_train_list)):
        # savemat(f'Data/neural_{DIMENSIONS[i]}.mat', {'train': embedded_train_list[i], 'test': embedded_test_list[i]})
        lr = Ridge(alpha=2).fit(embedded_train_list[i], y_train)
        svr = SVR().fit(embedded_train_list[i], np.reshape(y_train, -1))
        mlp = MLPRegressor(hidden_layer_sizes=(64, 32, 16, 8),
                           learning_rate_init=0.001,
                           max_iter=1000).fit(embedded_train_list[i], np.reshape(y_train, -1))

        predictedLR = lr.predict(embedded_train_list[i])
        predictedSV = svr.predict(embedded_train_list[i])
        predictedMLP = mlp.predict(embedded_train_list[i])
        lr_error = mean_squared_error(predictedLR, y_train)
        svr_error = mean_squared_error(predictedSV, y_train)
        mlp_error = mean_squared_error(predictedMLP, y_train)
        lr_error_list_train.append(lr_error)
        svr_error_list_train.append(svr_error)
        mlp_error_list_train.append(mlp_error)

        predictedLR = lr.predict(embedded_test_list[i])
        predictedSV = svr.predict(embedded_test_list[i])
        predictedMLP = mlp.predict(embedded_test_list[i])
        print(str(embedded_test_list[i].shape[-1] // 268) + "-D Predicted")
        lr_error = mean_squared_error(predictedLR, y_test)
        svr_error = mean_squared_error(predictedSV, y_test)
        mlp_error = mean_squared_error(predictedMLP, y_test)
        lr_error_list.append(lr_error)
        svr_error_list.append(svr_error)
        mlp_error_list.append(mlp_error)

    # plot MSE for different embedding dims and prediction methods
    width = 0.35
    plt.bar(np.arange(len(lr_error_list_train)), lr_error_list_train, width, label="LinReg")
    plt.bar(np.arange(len(svr_error_list_train)) + width, svr_error_list_train, width, label="SVR")
    plt.bar(np.arange(len(mlp_error_list_train)) + 2 * width, mlp_error_list_train, width, label="MLP")
    plt.ylabel("MSE")
    plt.xlabel("Dimensions")
    plt.title("Autoencoder Mean Squared Error by Embedding Dimension - Train")
    plt.xticks(np.arange(len(svr_error_list)) + width, list(DIMENSIONS))
    plt.legend(loc="best")
    plt.savefig('images/autoencoder_train')
    plt.show()

    width = 0.35
    plt.bar(np.arange(len(lr_error_list)), lr_error_list, width, label="LinReg")
    plt.bar(np.arange(len(svr_error_list)) + width, svr_error_list, width, label="SVR")
    plt.bar(np.arange(len(mlp_error_list)) + 2 * width, mlp_error_list, width, label="MLP")
    plt.ylabel("MSE")
    plt.xlabel("Dimensions")
    plt.title("Autoencoder Mean Squared Error by Embedding Dimension - Test")
    plt.xticks(np.arange(len(svr_error_list)) + width, list(DIMENSIONS))
    plt.legend(loc="best")
    plt.savefig('images/autoencoder_test')
    plt.show()

def main():
    # dimensions to test
    DIMENSIONS = [64, 32, 16, 8, 4, 2]

    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8)

    # average matrix over train data
    avg_matrix = X_train.mean(axis=0)

    # generate random walks
    walk = random_walk(avg_matrix, steps=1000)
    seq = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        seq[i, :] = avg_matrix[pos]
    print(seq.shape)

    # train a CBOW embedding for each dimension
    cbows = list()
    for dimension in DIMENSIONS:
        print(str(dimension) + "-D Embedding Training")
        cbow = CBOW(268, dimension, 2, 0.1)
        cbow.train_from_feature_seq(seq, epochs=300)
        cbows.append((cbow, dimension))

    # encode train and test data using embeddings, then flatten for prediction
    embedded_train_list = list()
    embedded_test_list = list()
    for cbow, dim in cbows:
        embedded_train_matrix = np.zeros((len(X_train), 268 * dim))
        for i in range(len(X_train)):
            embedding_train = cbow.encode(X_train[i])
            embedded_train_matrix[i] = np.ndarray.flatten(embedding_train)
        embedded_train_list.append(embedded_train_matrix)

        embedded_test_matrix = np.zeros((len(X_test), 268 * dim))
        for i in range(len(X_test)):
            embedding_test = cbow.encode(X_test[i])
            embedded_test_matrix[i] = np.ndarray.flatten(embedding_test)
        embedded_test_list.append(embedded_test_matrix)

    # train prediction models on encoded train data, then test on encoded test data
    # and calculate Mean Squared Error
    lr_error_list = list()
    svr_error_list = list()
    mlp_error_list = list()
    for i in range(len(embedded_train_list)):
        # savemat(f'Data/cbow_{DIMENSIONS[i]}.mat', {'train': embedded_train_list[i], 'test': embedded_test_list[i]})
        lr = Ridge().fit(embedded_train_list[i], y_train)
        svr = SVR().fit(embedded_train_list[i], np.reshape(y_train, -1))
        mlp = MLPRegressor(hidden_layer_sizes=(100,)).fit(embedded_train_list[i], np.reshape(y_train, -1))
        print(mlp.loss_)

        predictedLR = lr.predict(embedded_test_list[i])
        predictedSV = svr.predict(embedded_test_list[i])
        predictedMLP = mlp.predict(embedded_test_list[i])
        print(str(embedded_test_list[i].shape[-1] // 268) + "-D Predicted")

        lr_error = mean_squared_error(predictedLR, y_test)
        svr_error = mean_squared_error(predictedSV, y_test)
        mlp_error = mean_squared_error(predictedMLP, y_test)
        lr_error_list.append(lr_error)
        svr_error_list.append(svr_error)
        mlp_error_list.append(mlp_error)

    # plot MSE for different embedding dims and prediction methods
    width = 0.35
    plt.bar(np.arange(len(lr_error_list)), lr_error_list, width, label="LinReg")
    plt.bar(np.arange(len(svr_error_list)) + width, svr_error_list, width, label="SVR")
    plt.bar(np.arange(len(mlp_error_list)) + 2 * width, mlp_error_list, width, label="MLP")
    plt.ylabel("MSE")
    plt.xlabel("Dimensions")
    plt.title("CBOW Mean Squared Error by Embedding Dimension")
    plt.xticks(np.arange(len(svr_error_list)) + width, list(DIMENSIONS))
    plt.legend(loc="best")
    plt.show()

"""
@description: this piece of code exclusively examines how the cr parameter impacts
              the result of credit propagation
@author: Bolun
"""
import data_processing as dp
import algorithms as algo
import networkx as nx
import ranking_evaluation as reval

list1 = []
f = open("../data/univ_top_50_cs.csv", "r")
for line in f:
    list1.append(line.strip())
f.close()

node_list, edge_list = dp.read_data("../data/data_top50_cs.csv")
G = dp.construct_graph(node_list, edge_list)

# orank = algo.weighted_PR_wonorm(G, damping_factor=0.85, max_iterations=100, min_delta=0.00001)
# s = sum(orank.values())
# for rank in orank:
#     orank[rank] = orank[rank] * 50.0 / s
# result = sorted(orank.iteritems(), key=lambda asd: asd[1], reverse=True)
orank = algo.HITS(G, max_iterations=100, min_delta=0.00001)
result = sorted(orank.iteritems(), key=lambda asd: asd[1], reverse=True)
print result

f = open("../result/result_top50_cs/CreditPropagation_hits_evaluation.csv", "w")
f.write("cr;dist\n")
i = 0.0
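# The snippet above is truncated after "i = 0.0". A sketch of how the cr sweep presumably
# continues: the CreditPropagation arguments mirror the commented call elsewhere in this
# project, while the step size and the reval distance helper are assumptions, not a
# documented API.
#
# while i <= 1.0:
#     crank = algo.CreditPropagation(G, original_rank=orank, cr=i,
#                                    max_iterations=10000, min_delta=0.00001)
#     cresult = sorted(crank.iteritems(), key=lambda asd: asd[1], reverse=True)
#     dist = reval.distance(result, cresult)  # hypothetical ranking-distance call
#     f.write("%.2f;%.5f\n" % (i, dist))
#     i += 0.05
# f.close()
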
def main():
    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')
    # X = data_processing.adjacency_matrix(X)
    print(random_walk(X[0], steps=1000))