import os
import sys
import pickle as pk

import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.svm import LinearSVC

# GraphGenerator, SSLClassifier, generate_laplacian_score,
# generate_laplacian_score_vector, svm_experiment, lp_experiment, and the
# scope/count/result globals are defined elsewhere in this repo.


def run_knowsim():
    # 20NG
    for i in range(2):
        scope_name = ng20_scope_names[i]
        scope = ng20_scopes[i]
        count = ng20_counts[i]
        with open('data/local/' + scope_name + '.dmp') as f:
            hin = pk.load(f)
        newLabels = GraphGenerator.getNewLabels(hin)
        print scope_name + ' knowsim'
        res = knowsim_experiment(scope, scope_name, NG20TypeList, count, newLabels)
        result[i, 6] = res

    # GCAT
    for i in range(2):
        scope_name = gcat_scope_names[i]
        scope = gcat_scopes[i]
        count = gcat_counts[i]
        with open('data/local/' + scope_name + '.dmp') as f:
            hin = pk.load(f)
        type_list = GCATTypeList[i]
        newLabels = GraphGenerator.getNewLabels(hin)
        print scope_name + ' knowsim'
        result[i + 2, 6] = knowsim_experiment(scope, scope_name, type_list,
                                              count, newLabels)
def run_lp():
    # 20NG
    for i in range(2):
        scope_name = ng20_scope_names[i]
        scope = ng20_scopes[i]
        count = ng20_counts[i]
        with open('data/local/' + scope_name + '.dmp') as f:
            hin = pk.load(f)
        newLabels = GraphGenerator.getNewLabels(hin)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
        graph, newIds = GraphGenerator.generateCosineNeighborGraph(hin, 10, tf_param)
        print scope_name + ' lp+entity'
        result[i, 4] = lp_experiment(scope, scope_name, count, graph,
                                     newLabels, newIds)

    # GCAT
    for i in range(2):
        scope_name = gcat_scope_names[i]
        scope = gcat_scopes[i]
        count = gcat_counts[i]
        with open('data/local/' + scope_name + '.dmp') as f:
            hin = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
        newLabels = GraphGenerator.getNewLabels(hin)
        # this call is kept only for its newIds; the graph itself is rebuilt
        # below from the precomputed laplacian-reweighted feature matrix
        graph, newIds = GraphGenerator.generateCosineNeighborGraph(hin, 10, tf_param)
        with open('data/local/laplacian/' + scope_name + '.x') as f:
            X = pk.load(f)
        graph = GraphGenerator.generateCosineNeighborGraphfromX(X)
        print scope_name + ' lp+entity'
        result[i + 2, 4] = lp_experiment(scope, scope_name, count, graph,
                                         newLabels, newIds)
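# lp_experiment and SSLClassifier are implemented elsewhere in this repo; the
# sketch below only illustrates the propagation step they presumably run,
# following Zhou et al.'s local/global consistency. The alpha default matches
# the lp_param used above; the function name, iterative form, and iteration
# count are our assumptions, not the repo's API.
def _lp_sketch(W, Y, alpha=0.99, iterations=50):
    """Propagate a label matrix Y (n x c, seed rows one-hot) over graph W."""
    d = np.asarray(W.sum(axis=1)).flatten()
    d[d == 0] = 1.0
    D_inv_sqrt = sparse.diags(1.0 / np.sqrt(d))
    S = D_inv_sqrt * W * D_inv_sqrt          # symmetric normalization
    F = Y.copy()
    for _ in range(iterations):
        F = alpha * S.dot(F) + (1 - alpha) * Y
    return F                                 # argmax over rows gives labels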
def generate_train_test_split():
    # generate random train-test splits for the 2 datasets x 2 scopes
    repeat_times = 50
    lp_candidate = [5]

    # 20NG
    for i in range(2):
        scope_name = ng20_scope_names[i]
        scope = ng20_scopes[i]
        count = ng20_counts[i]
        experiment_path = 'data/local/split/' + scope_name + '/'
        if not os.path.exists('data/local/split/' + scope_name):
            os.makedirs('data/local/split/' + scope_name)
        with open('data/local/' + scope_name + '.dmp') as f:
            hin = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        lp_param = {'alpha': 0.99, 'normalization_factor': 0.01}
        graph, newIds = GraphGenerator.generateCosineNeighborGraph(
            hin, kNeighbors=10, tf_param=tf_param)
        new_label = GraphGenerator.getNewLabels(hin)
        for lp in lp_candidate:
            ssl = SSLClassifier(graph, new_label, scope, lp_param,
                                repeatTimes=repeat_times, trainNumbers=lp,
                                classCount=count)
            ssl.repeatedExperiment(savePathPrefix=experiment_path + 'lb' +
                                   str(lp).zfill(3) + '_')

    # GCAT
    for i in range(2):
        scope_name = gcat_scope_names[i]
        scope = gcat_scopes[i]
        count = gcat_counts[i]
        if not os.path.exists('data/local/split/' + scope_name):
            os.makedirs('data/local/split/' + scope_name)
        experiment_path = 'data/local/split/' + scope_name + '/'
        with open('data/local/' + scope_name + '.dmp') as f:
            hin = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        lp_param = {'alpha': 0.99, 'normalization_factor': 0.01}
        graph, newIds = GraphGenerator.generateCosineNeighborGraph(
            hin, kNeighbors=10, tf_param=tf_param)
        new_label = GraphGenerator.getNewLabels(hin)
        for lp in lp_candidate:
            ssl = SSLClassifier(graph, new_label, scope, lp_param,
                                repeatTimes=repeat_times, trainNumbers=lp,
                                classCount=count)
            ssl.repeatedExperiment(savePathPrefix=experiment_path + 'lb' +
                                   str(lp).zfill(3) + '_')
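# The split files written above are read back by the experiment functions
# below (knowsim_experiment, ensemble_cotrain_experiment, etc.) as pickled
# dicts mapping document id -> label string; the concrete ids/labels shown
# here are illustrative only:
#
#   with open('data/local/split/' + scope_name + '/lb005_000_train') as f:
#       trainLabel = pk.load(f)   # e.g. {12: 'rec.autos', 57: 'sci.med', ...}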
def knowsim_experiment(scope, scope_name, type_list, count, newLabels,
                       tau=1, kNeighbors=10, label_num=5):
    split_path = 'data/local/split/' + scope_name + '/'
    with open('data/local/' + scope_name + '.dmp') as f:
        hin = pk.load(f)
    repeats = 50

    tf_param = {'word': True, 'entity': False, 'we_weight': 0.1}
    X_word, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param)
    n = X_word.shape[0]
    knowsim = sparse.lil_matrix((n, n))
    for t in type_list:
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param, t)

        # make a symmetric kNN similarity graph for this entity type
        cosX = cosine_similarity(X_typed)
        graph = sparse.lil_matrix((n, n))
        for i in range(n):
            for j in np.argpartition(cosX[i], -kNeighbors)[-kNeighbors:]:
                if j == i:
                    continue
                graph[i, j] = cosX[i, j]  # np.exp(- (1 - cosX[i, j]) / 0.03)
                graph[j, i] = cosX[i, j]  # np.exp(- (1 - cosX[i, j]) / 0.03)

        # calculate laplacian scores
        row_sum = graph.sum(axis=1)
        laplacian_score = generate_laplacian_score(row_sum, X_word, kNeighbors)

        # add meta-path-based similarity to the knowsim
        knowsim = knowsim + np.exp(-tau * laplacian_score) * graph

    knowsim = knowsim.tocsr()
    print 'running lp'
    lp_param = {'alpha': 0.98, 'normalization_factor': 5}
    ssl = SSLClassifier(knowsim, newLabels, scope, lp_param,
                        repeatTimes=repeats, trainNumbers=label_num,
                        classCount=count)
    ssl.repeatedFixedExperimentwithNewIds(
        pathPrefix=split_path + 'lb' + str(label_num).zfill(3) + '_',
        newIds=newIds)
    return ssl.get_mean()
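# generate_laplacian_score is defined elsewhere in the repo (its real
# signature takes (row_sum, X_word, kNeighbors)); the sketch below is only an
# assumption of what such a graph-level score might compute, in the style of
# He et al.'s laplacian score averaged over word features, with L = D - W
# built from the meta-path graph passed in explicitly.
def _laplacian_score_sketch(row_sum, X_word, W):
    d = np.asarray(row_sum).flatten()
    D = sparse.diags(d)
    L = D - W                                  # unnormalized graph laplacian
    F = X_word.toarray() if sparse.issparse(X_word) else X_word
    F = F - F.mean(axis=0)                     # center each feature column
    num = np.einsum('ij,ij->j', L.dot(F), F)   # f_r^T L f_r per feature
    den = np.einsum('ij,ij->j', D.dot(F), F)   # f_r^T D f_r per feature
    den[den == 0] = 1.0
    return float(np.mean(num / den))           # scalar score for the graph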
def semihin_experiment(scope, scope_name, count, X, newIds, label_num=5):
    experiment_path = 'data/local/split/' + scope_name + '/'
    with open('data/local/' + scope_name + '.dmp') as f:
        hin = pk.load(f)

    n = X.shape[0]
    e = X.shape[1]
    if not type(X) is np.ndarray:
        X = X.toarray()
    graph = np.zeros((n + e, n + e))
    graph[0:n, n:n + e] = X
    graph[n:n + e, 0:n] = X.transpose()
    graph = sparse.csc_matrix(graph)

    newLabel = GraphGenerator.getNewLabels(hin)
    lp_param = {'alpha': 0.98, 'normalization_factor': 5, 'method': 'variant'}
    ssl = SSLClassifier(graph, newLabel, scope, lp_param, repeatTimes=50,
                        trainNumbers=label_num, classCount=count)
    ssl.repeatedFixedExperimentwithNewIds(
        pathPrefix=experiment_path + 'lb' + str(label_num).zfill(3) + '_',
        newIds=newIds)
    return ssl.get_mean()
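# Optional alternative (a sketch, not the repo's code): the dense
# (n + e) x (n + e) allocation above gets memory-heavy when the feature count
# e is large. For sparse X, the same bipartite document-feature graph can be
# assembled directly in sparse form; the helper name here is ours.
def _bipartite_graph_sketch(X):
    X_sp = sparse.csc_matrix(X)
    return sparse.bmat([[None, X_sp], [X_sp.transpose(), None]], format='csc')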
def generate_meta_graph(scope, scope_name, type_list, count):
    split_path = 'data/local/split/' + scope_name + '/'
    pred_path = 'data/local/metagraph/' + scope_name + '/'
    if not os.path.exists('data/local/metagraph/' + scope_name + '/'):
        os.makedirs('data/local/metagraph/' + scope_name + '/')

    with open('data/local/' + scope_name + '.dmp') as f:
        hin = pk.load(f)
    tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}

    for t in type_list:
        #print t
        X, newIds, entitynewIds = GraphGenerator.getTFVectorX(hin, tf_param, t)
        n = X.shape[0]
        e = X.shape[1]
        with open('data/local/laplacian/' + scope_name + '/' + str(t) +
                  '_scores') as f:
            laplacian_score = pk.load(f)
        laplacian_score = 20 * np.exp(-laplacian_score * 0.01)
        D = sparse.diags(laplacian_score)
        X = X * D

        X = X.toarray()
        graph = np.zeros((n + e, n + e))
        graph[0:n, n:n + e] = X
        graph[n:n + e, 0:n] = X.transpose()
        graph = sparse.csc_matrix(graph)

        newLabel = GraphGenerator.getNewLabels(hin)
        lp_param = {'alpha': 0.98, 'normalization_factor': 5}
        # 3-class classification
        lp_candidate = [5]
        for lp in lp_candidate:
            ssl = SSLClassifier(graph, newLabel, scope, lp_param,
                                repeatTimes=50, trainNumbers=lp,
                                classCount=count)
            if not os.path.exists(pred_path + str(t) + '/'):
                os.makedirs(pred_path + str(t) + '/')
            ssl.repeatedFixedExperimentwithNewIds(
                pathPrefix=split_path + 'lb' + str(lp).zfill(3) + '_',
                newIds=newIds, saveProb=True,
                savePathPrefix=pred_path + str(t) + '/' + 'lb' + str(lp).zfill(3))
def run_semihin():
    # 20NG
    for i in range(2):
        scope_name = ng20_scope_names[i]
        scope = ng20_scopes[i]
        count = ng20_counts[i]
        with open('data/local/' + scope_name + '.dmp') as f:
            hin = pk.load(f)
        newLabels = GraphGenerator.getNewLabels(hin)

        tf_param = {'word': True, 'entity': False, 'we_weight': 0.112}
        X, newIds, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        print scope_name + ' semihin'
        result[i, 7] = semihin_experiment(scope, scope_name, count, X, newIds)

        tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
        X, newIds, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        print scope_name + ' semihin+entity'
        result[i, 8] = semihin_experiment(scope, scope_name, count, X, newIds)

    # GCAT
    for i in range(2):
        scope_name = gcat_scope_names[i]
        scope = gcat_scopes[i]
        count = gcat_counts[i]
        with open('data/local/' + scope_name + '.dmp') as f:
            hin = pk.load(f)
        newLabels = GraphGenerator.getNewLabels(hin)

        tf_param = {'word': True, 'entity': False, 'we_weight': 0.112}
        X, newIds, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        print scope_name + ' semihin'
        # columns 7 and 8 keep these results aligned with the 20NG runs above
        # and avoid colliding with the knowsim column (6)
        result[i + 2, 7] = semihin_experiment(scope, scope_name, count, X, newIds)

        with open('data/local/laplacian/' + scope_name + '.x') as f:
            X = pk.load(f)
        print scope_name + ' semihin+entity'
        result[i + 2, 8] = semihin_experiment(scope, scope_name, count, X, newIds)
def run_generate_laplacian_score():
    print 'generate laplacian score for feature reweighting'
    # 20NG
    for i in range(2):
        scope_name = ng20_scope_names[i]
        with open('data/local/' + scope_name + '.dmp') as f:
            hin = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X_word, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param)
        for t in NG20TypeList:
            tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
            X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(
                hin, tf_param, t)
            laplacian_score = generate_laplacian_score_vector(X_typed, X_word, 100)
            if not os.path.exists('data/local/laplacian/' + scope_name):
                os.makedirs('data/local/laplacian/' + scope_name)
            with open('data/local/laplacian/' + scope_name + '/' + str(t) +
                      '_scores', 'w') as f:
                pk.dump(laplacian_score, f)

    # GCAT
    for i in range(2):
        scope_name = gcat_scope_names[i]
        type_list = GCATTypeList[i]
        with open('data/local/' + scope_name + '.dmp') as f:
            hin = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X_word, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param)
        for t in type_list:
            tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
            X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(
                hin, tf_param, t)
            laplacian_score = generate_laplacian_score_vector(X_typed, X_word, 100)
            if not os.path.exists('data/local/laplacian/' + scope_name):
                os.makedirs('data/local/laplacian/' + scope_name)
            with open('data/local/laplacian/' + scope_name + '/' + str(t) +
                      '_scores', 'w') as f:
                pk.dump(laplacian_score, f)
def run_svm():
    # 20NG
    for i in range(2):
        scope_name = ng20_scope_names[i]
        scope = ng20_scopes[i]
        with open('data/local/' + scope_name + '.dmp') as f:
            hin = pk.load(f)

        print scope_name + ' svm'
        tf_param = {'word': True, 'entity': False, 'we_weight': 0.1}
        X, doc_new_ids, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        y = GraphGenerator.gety(hin)
        result[i, 2] = svm_experiment(scope_name, X, y)

        print scope_name + ' svm+entity'
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X, doc_new_ids, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        y = GraphGenerator.gety(hin)
        result[i, 3] = svm_experiment(scope_name, X, y)

    # GCAT
    for i in range(2):
        scope_name = gcat_scope_names[i]
        scope = gcat_scopes[i]
        with open('data/local/' + scope_name + '.dmp') as f:
            hin = pk.load(f)

        print scope_name + ' svm'
        tf_param = {'word': True, 'entity': False, 'we_weight': 0.1}
        X, doc_new_ids, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        y = GraphGenerator.gety(hin)
        result[i + 2, 2] = svm_experiment(scope_name, X, y)

        print scope_name + ' svm+entity'
        with open('data/local/laplacian/' + scope_name + '.x') as f:
            X = pk.load(f)
        y = GraphGenerator.gety(hin)
        result[i + 2, 3] = svm_experiment(scope_name, X, y)
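# svm_experiment is defined elsewhere in the repo; the sketch below only
# illustrates what it presumably does given how it is called above: train a
# LinearSVC on each saved split and average test accuracy. The helper name,
# the fixed C, and the default lb/repeats are assumptions taken from the
# surrounding code, not the repo's actual implementation.
def _svm_experiment_sketch(scope_name, X, y, lb=5, repeats=50):
    results = []
    for r in range(repeats):
        with open('data/local/split/' + scope_name + '/lb' + str(lb).zfill(3) +
                  '_' + str(r).zfill(3) + '_train') as f:
            trainLabel = pk.load(f)
        with open('data/local/split/' + scope_name + '/lb' + str(lb).zfill(3) +
                  '_' + str(r).zfill(3) + '_test') as f:
            testLabel = pk.load(f)
        train_idx = list(trainLabel.keys())
        test_idx = list(testLabel.keys())
        clf = LinearSVC(C=0.1)
        clf.fit(X[train_idx], y[train_idx])
        pred = clf.predict(X[test_idx])
        results.append(np.mean(pred == y[test_idx]))
    return np.mean(results)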
def generate_data(V, E, T, datapath):
    print 'V = %d, E = %d, T = %d' % (V, E, T)
    graph = GraphGenerator.gen_adjacencies(V, E, T)
    DataGenerator.gen(graph, V, E, T, datapath)
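# Hypothetical usage (the sizes and output path are illustrative only):
#   generate_data(V=1000, E=5000, T=3, datapath='data/local/synthetic/')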
def ensemble_cotrain_experiment(scope, scope_name, type_list, threshold,
                                weight, count, label_num=5):
    pred_path = 'data/local/cotrain/' + scope_name + '/'
    split_path = 'data/local/split/' + scope_name + '/'
    if not os.path.exists(pred_path):
        os.makedirs(pred_path)

    with open('data/local/' + scope_name + '.dmp') as f:
        hin = pk.load(f)
    tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
    c = len(scope)
    lb_cand = [label_num]
    repeats = 50
    # rounds for alternating optimization
    rounds = 2

    best_res = 0
    X_s = {}

    tf_param = {'word': True, 'entity': False, 'we_weight': 0.112}
    X_word, newIds, entity_new_ids = GraphGenerator.getTFVectorX(
        hin, param=tf_param, entity_types=None)

    for t in type_list:
        if not os.path.exists(pred_path + str(t) + '/'):
            os.makedirs(pred_path + str(t) + '/')
        with open('data/local/laplacian/' + scope_name + '/' + str(t) +
                  '_scores') as f:
            laplacian_score = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
        X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param, t)
        laplacian_score = 20 * np.exp(-laplacian_score * 0.01)
        # laplacian_score = laplacian_score / np.sum(laplacian_score) * laplacian_score.shape[0]
        D = sparse.diags(laplacian_score)
        X_typed = X_typed * D
        X_s[str(t)] = X_typed

    for rd in range(rounds):
        round_best_res = 0
        round_best_t = ''

        # step 1: generate output of each meta-path
        for t in type_list:
            X = X_s[str(t)].toarray()
            n = X.shape[0]
            e = X.shape[1]
            graph = np.zeros((n + e, n + e))
            graph[0:n, n:n + e] = X
            graph[n:n + e, 0:n] = X.transpose()
            graph = sparse.csc_matrix(graph)

            newLabel = GraphGenerator.getNewLabels(hin)
            lp_param = {'alpha': 0.98, 'normalization_factor': 5,
                        'method': 'variant'}
            lb = label_num
            ssl = SSLClassifier(graph, newLabel, scope, lp_param,
                                repeatTimes=repeats, trainNumbers=lb,
                                classCount=count)
            if rd == 0:
                ssl.repeatedFixedExperimentwithNewIds(
                    pathPrefix=split_path + 'lb' + str(lb).zfill(3) + '_',
                    newIds=newIds, saveProb=True,
                    savePathPrefix=pred_path + str(t) + '/lb' + str(lb).zfill(3))
            else:
                inputPredPath = 'data/local/cotrain/' + scope_name + '/lb' + \
                    str(lb).zfill(3) + '_pred_rd_' + str(rd - 1).zfill(3)
                ssl.repeatedFixedExpeimentwithInput(
                    pathPrefix=split_path + 'lb' + str(lb).zfill(3) + '_',
                    newIds=newIds, saveProb=True,
                    savePathPrefix=pred_path + 'lb' + str(lb).zfill(3) + '_' + str(t),
                    inputPredPath=inputPredPath)
            res = ssl.get_mean()
            if res > best_res:
                best_res = res
                best_t = t
            if res > round_best_res:
                round_best_res = res
                round_best_t = t
        print 'Round %d\t%.4f\t%s' % (rd, round_best_res, str(round_best_t))

        # step 2: propagate pseudo-labels to the other paths
        for lb in lb_cand:
            results = []
            for r in range(repeats):
                with open('data/local/split/' + scope_name + '/lb' +
                          str(lb).zfill(3) + '_' + str(r).zfill(3) + '_train') as f:
                    trainLabel = pk.load(f)
                with open('data/local/split/' + scope_name + '/lb' +
                          str(lb).zfill(3) + '_' + str(r).zfill(3) + '_test') as f:
                    testLabel = pk.load(f)

                numTrain = len(trainLabel)
                numTest = len(testLabel)
                n = numTrain + numTest

                # write output probability
                outPred = np.zeros((n, c))
                for t in type_list:
                    typePred = np.zeros((n, c))
                    with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) +
                              '_' + str(r).zfill(3) + '_train') as f:
                        trainPred = pk.load(f)
                    for i, k in enumerate(trainLabel.keys()):
                        typePred[k, :] = trainPred[i, :]
                    with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) +
                              '_' + str(r).zfill(3) + '_test') as f:
                        testPred = pk.load(f)
                    for i, k in enumerate(testLabel.keys()):
                        # typePred[k, :] = testPred[i, :]
                        # potential improvement: set a threshold for the
                        # random-walk number to block 'unconfident' data points
                        max_prob = np.max(testPred[i, :])
                        if max_prob > threshold[str(t)]:
                            typePred[k, :] = testPred[i, :]
                    # add meta-path probability to global probability
                    outPred += typePred * weight[str(t)]
                with open('data/local/cotrain/' + scope_name + '/lb' +
                          str(lb).zfill(3) + '_pred_rd_' + str(rd).zfill(3) +
                          '_' + str(r).zfill(3), 'w') as f:
                    pk.dump(outPred, f)
    return best_res
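# The co-training rounds above only dump the fused probabilities to disk; a
# sketch of how one might score a dumped round against the saved test split.
# The helper name is ours, and the accuracy convention (argmax over fused
# class probabilities, mapped through scope) is inferred from
# ensemble_gal_experiment below, not taken from the repo.
def _score_cotrain_round_sketch(scope_name, scope, rd, lb=5, r=0):
    with open('data/local/cotrain/' + scope_name + '/lb' + str(lb).zfill(3) +
              '_pred_rd_' + str(rd).zfill(3) + '_' + str(r).zfill(3)) as f:
        outPred = pk.load(f)
    with open('data/local/split/' + scope_name + '/lb' + str(lb).zfill(3) +
              '_' + str(r).zfill(3) + '_test') as f:
        testLabel = pk.load(f)
    correct = sum(1 for k, v in testLabel.items()
                  if scope[np.argmax(outPred[k, :])] == v)
    return correct / float(len(testLabel))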
def ensemble_gal_experiment(scope, scope_name, type_list, threshold):
    # this section should be changed between different scopes
    pred_path = 'data/local/metagraph/' + scope_name + '/'
    lb_cand = [5]
    repeats = 50

    with open('data/local/' + scope_name + '.dmp') as f:
        hin = pk.load(f)
    #X, newIds = GraphGenerator.getTFVectorX(hin, param={'word': True, 'entity': False, 'we_weight': 0.1})
    y = GraphGenerator.gety(hin)

    if sys.platform == 'win32':
        command_file = open('galm.bat', 'a')
    else:
        command_file = open('galm.sh', 'a')

    for lb in lb_cand:
        results = []
        for r in range(repeats):
            with open('data/local/split/' + scope_name + '/lb' +
                      str(lb).zfill(3) + '_' + str(r).zfill(3) + '_train') as f:
                trainLabel = pk.load(f)
            with open('data/local/split/' + scope_name + '/lb' +
                      str(lb).zfill(3) + '_' + str(r).zfill(3) + '_test') as f:
                testLabel = pk.load(f)

            if not os.path.exists('data/local/gal/' + scope_name + '/'):
                os.makedirs('data/local/gal/' + scope_name + '/')
            label_file = open('data/local/gal/' + scope_name + '/lb' +
                              str(lb).zfill(3) + '_' + str(r).zfill(3) +
                              '_label.txt', 'w')
            gold_file = open('data/local/gal/' + scope_name + '/lb' +
                             str(lb).zfill(3) + '_' + str(r).zfill(3) +
                             '_gold.txt', 'w')
            eval_file = open('data/local/gal/' + scope_name + '/lb' +
                             str(lb).zfill(3) + '_' + str(r).zfill(3) +
                             '_eval.txt', 'w')

            # write get-another-label gold file
            for k, v in trainLabel.items():
                gold_file.write(str(k) + '\t' + v + '\n')

            # write get-another-label eval file
            for k, v in testLabel.items():
                eval_file.write(str(k) + '\t' + v + '\n')

            # write get-another-label label file
            for t in type_list:
                with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) +
                          '_' + str(r).zfill(3) + '_train') as f:
                    trainPred = pk.load(f)
                for i, k in enumerate(trainLabel.keys()):
                    v = scope[np.argmax(trainPred[i, :])]
                    label_file.write(str(t) + '\t' + str(k) + '\t' + v + '\n')
                with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) +
                          '_' + str(r).zfill(3) + '_test') as f:
                    testPred = pk.load(f)
                for i, k in enumerate(testLabel.keys()):
                    v = scope[np.argmax(testPred[i, :])]
                    max_prob = np.max(testPred[i, :])
                    if max_prob > threshold[str(t)]:
                        label_file.write(str(t) + '\t' + str(k) + '\t' + v + '\n')

            label_file.close()
            gold_file.close()
            eval_file.close()

            # run get-another-label batch
            if sys.platform == 'win32':
                command = r'call galm/bin/get-another-label.bat ' + \
                    '--categories galm/settings/' + scope_name + '_categories.txt ' + \
                    '--cost galm/settings/' + scope_name + '_costs.txt ' + \
                    '--gold data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) + \
                    '_' + str(r).zfill(3) + '_gold.txt ' + \
                    '--input data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) + \
                    '_' + str(r).zfill(3) + '_label.txt ' + \
                    '--eval data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) + \
                    '_' + str(r).zfill(3) + '_eval.txt ' + \
                    '> data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) + '_' + \
                    str(r).zfill(3) + '_result.txt'
            else:
                command = r'galm/bin/get-another-label.sh ' + \
                    '--categories /home/hejiang/results/gal/' + scope_name + '_categories.txt ' + \
                    '--cost /home/hejiang/results/gal/' + scope_name + '_costs.txt ' + \
                    '--gold data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) + \
                    '_' + str(r).zfill(3) + '_gold.txt ' + \
                    '--input data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) + \
                    '_' + str(r).zfill(3) + '_label.txt ' + \
                    '--eval data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) + \
                    '_' + str(r).zfill(3) + '_eval.txt ' + \
                    '> data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) + '_' + \
                    str(r).zfill(3) + '_result.txt'
            command_file.write(command + '\r\n')
    command_file.close()
def ensemble_svm_experiment(scope, scope_name, type_list, threshold):
    # this section should be changed between different scopes
    experiment_path = 'data/local/metagraph/' + scope_name + '/'
    lb_cand = [5]
    repeats = 50

    with open('data/local/' + scope_name + '.dmp') as f:
        hin = pk.load(f)
    #X, newIds = GraphGenerator.getTFVectorX(hin, param={'word': True, 'entity': False, 'we_weight': 0.1})
    y = GraphGenerator.gety(hin)

    for lb in lb_cand:
        results = []
        for r in range(repeats):
            with open('data/local/split/' + scope_name + '/lb' +
                      str(lb).zfill(3) + '_' + str(r).zfill(3) + '_train') as f:
                trainLabel = pk.load(f)
            with open('data/local/split/' + scope_name + '/lb' +
                      str(lb).zfill(3) + '_' + str(r).zfill(3) + '_test') as f:
                testLabel = pk.load(f)

            yTrain = y[trainLabel.keys()]
            yTest = y[testLabel.keys()]
            numTrain = len(trainLabel)
            numTest = len(testLabel)
            XTrain = np.zeros((numTrain, 0))
            XTest = np.zeros((numTest, 0))

            for t in type_list:
                with open(experiment_path + str(t) + '/lb' + str(lb).zfill(3) +
                          '_' + str(r).zfill(3) + '_train') as f:
                    trainPred = pk.load(f)
                with open(experiment_path + str(t) + '/lb' + str(lb).zfill(3) +
                          '_' + str(r).zfill(3) + '_test') as f:
                    testPred = pk.load(f)

                # threshold each meta-graph
                XTraint = np.zeros((numTrain, 3))
                XTestt = np.zeros((numTest, 3))
                for i, k in enumerate(trainLabel.items()):
                    v = np.argmax(trainPred[i, :])
                    max_prob = np.max(trainPred[i, :])
                    if max_prob > threshold[str(t)]:
                        # zero-one prediction
                        XTraint[i, v] = 1
                        # raw prediction
                        #XTraint[i, :] = trainPred[i, :]
                for i, k in enumerate(testLabel.items()):
                    v = np.argmax(testPred[i, :])
                    max_prob = np.max(testPred[i, :])
                    if max_prob > threshold[str(t)]:
                        # zero-one prediction
                        XTestt[i, v] = 1
                        # raw prediction
                        #XTestt[i, :] = testPred[i, :]
                XTrain = np.concatenate((XTrain, XTraint), axis=1)
                XTest = np.concatenate((XTest, XTestt), axis=1)
                # use raw input
                #XTrain = np.concatenate((XTrain,trainPred),axis=1)
                #XTest = np.concatenate((XTest,testPred),axis=1)

            # train
            clf = LinearSVC(C=0.1)
            clf.fit(XTrain, yTrain)

            # test
            pred = clf.predict(XTest)
            results.append(sum(pred == yTest) / float(yTest.shape[0]))
    return np.mean(results)
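# Hypothetical invocation of the stacking ensemble above; the threshold value
# is illustrative, not tuned, and assumes each meta-path type prints to a
# stable string (the keying convention used throughout this file):
#   threshold = {str(t): 0.5 for t in NG20TypeList}
#   acc = ensemble_svm_experiment(scope, scope_name, NG20TypeList, threshold)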
def lp_meta_experiment(scope, scope_name, type_list, threshold, weight, count,
                       label_num=5):
    pred_path = 'data/local/lpmeta/' + scope_name + '/'
    if not os.path.exists(pred_path):
        os.makedirs(pred_path)
    split_path = 'data/local/split/' + scope_name + '/'

    with open('data/local/' + scope_name + '.dmp') as f:
        hin = pk.load(f)
    tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
    c = len(scope)
    lb_cand = [label_num]
    repeats = 50
    # rounds for alternating optimization
    rounds = 2

    best_res = 0
    for rd in range(rounds):
        # step 1: generate output of each meta-path
        for t in type_list:
            if not os.path.exists(pred_path + str(t)):
                os.makedirs(pred_path + str(t))
            graph, newIds = GraphGenerator.getMetaPathGraph(hin, tf_param, t)

            newLabel = GraphGenerator.getNewLabels(hin)
            lp_param = {'alpha': 0.99, 'normalization_factor': 0.01}
            # lp_param = {'alpha':0.98, 'normalization_factor':5}
            # 3-class classification
            lb = label_num
            ssl = SSLClassifier(graph, newLabel, scope, lp_param,
                                repeatTimes=repeats, trainNumbers=lb,
                                classCount=count)
            if rd == 0:
                ssl.repeatedFixedExperimentwithNewIds(
                    pathPrefix=split_path + 'lb' + str(lb).zfill(3) + '_',
                    newIds=newIds, saveProb=True,
                    savePathPrefix=pred_path + str(t) + '/lb' + str(lb).zfill(3))
            else:
                inputPredPath = 'data/local/lpmeta/' + scope_name + '/lb' + \
                    str(lb).zfill(3) + '_pred_rd_' + str(rd - 1).zfill(3)
                ssl.repeatedFixedExpeimentwithInput(
                    pathPrefix=split_path + 'lb' + str(lb).zfill(3) + '_',
                    newIds=newIds, saveProb=True,
                    savePathPrefix=pred_path + str(t) + '/lb' + str(lb).zfill(3),
                    inputPredPath=inputPredPath)
            res = ssl.get_mean()
            if res > best_res:
                best_res = res

        # step 2: propagate pseudo-labels to the other paths
        for lb in lb_cand:
            results = []
            for r in range(repeats):
                with open(split_path + 'lb' + str(lb).zfill(3) + '_' +
                          str(r).zfill(3) + '_train') as f:
                    trainLabel = pk.load(f)
                with open(split_path + 'lb' + str(lb).zfill(3) + '_' +
                          str(r).zfill(3) + '_test') as f:
                    testLabel = pk.load(f)

                numTrain = len(trainLabel)
                numTest = len(testLabel)
                n = numTrain + numTest

                # write fused output probability for the next round
                outPred = np.zeros((n, c))
                for t in type_list:
                    typePred = np.zeros((n, c))
                    with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) +
                              '_' + str(r).zfill(3) + '_train') as f:
                        trainPred = pk.load(f)
                    for i, k in enumerate(trainLabel.keys()):
                        typePred[k, :] = trainPred[i, :]
                    with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) +
                              '_' + str(r).zfill(3) + '_test') as f:
                        testPred = pk.load(f)
                    for i, k in enumerate(testLabel.keys()):
                        typePred[k, :] = testPred[i, :]
                    # add meta-path probability to global probability
                    outPred += typePred * weight[str(t)]
                with open('data/local/lpmeta/' + scope_name + '/lb' +
                          str(lb).zfill(3) + '_pred_rd_' + str(rd).zfill(3) +
                          '_' + str(r).zfill(3), 'w') as f:
                    pk.dump(outPred, f)
    return best_res