def run_lp():
    """Run the 'lp+entity' experiment for both 20NG and both GCAT scopes.

    For each scope, the dumped HIN is loaded from
    ``data/local/<scope_name>.dmp``, labels and a 10-nearest-neighbour
    cosine graph are built with ``GraphGenerator``, and the value returned
    by ``lp_experiment`` is written into column 4 of the module-level
    ``result`` table: rows 0-1 for 20NG, rows 2-3 for GCAT.

    For the GCAT scopes the graph passed to ``lp_experiment`` is rebuilt
    from the matrix stored in ``data/local/laplacian/<scope_name>.x``; the
    graph built from the HIN is only used to obtain ``newIds``.

    Takes no arguments and returns nothing; all inputs and outputs are
    module-level names (``ng20_*``, ``gcat_*``, ``result``) and files.
    """
    # (scope names, scopes, class counts, first row in `result`,
    #  whether the graph is rebuilt from the precomputed matrix X)
    datasets = (
        (ng20_scope_names, ng20_scopes, ng20_counts, 0, False),  # 20NG
        (gcat_scope_names, gcat_scopes, gcat_counts, 2, True),   # GCAT
    )

    for names, scopes, counts, row_offset, rebuild_from_x in datasets:
        for i in range(2):
            scope_name = names[i]
            scope = scopes[i]
            count = counts[i]

            # Serialized dumps must be read in binary mode; text mode can
            # corrupt the stream on platforms with newline translation.
            # NOTE(review): `pk` is presumably pickle/cPickle - only load
            # dumps that come from a trusted source.
            with open('data/local/' + scope_name + '.dmp', 'rb') as f:
                hin = pk.load(f)

            new_labels = GraphGenerator.getNewLabels(hin)
            # A fresh dict per iteration, as in the original, in case the
            # graph generator mutates its parameters.
            tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
            graph, new_ids = GraphGenerator.generateCosineNeighborGraph(
                hin, 10, tf_param)

            if rebuild_from_x:
                # The HIN-based graph above is discarded here; it was only
                # needed for `new_ids`.
                x_path = 'data/local/laplacian/' + scope_name + '.x'
                with open(x_path, 'rb') as f:
                    x_matrix = pk.load(f)
                graph = GraphGenerator.generateCosineNeighborGraphfromX(
                    x_matrix)

            # Single-argument parenthesised form prints the same text under
            # Python 2 and is also valid Python 3.
            print(scope_name + ' lp+entity')
            result[row_offset + i, 4] = lp_experiment(
                scope, scope_name, count, graph, new_labels, new_ids)
def generate_train_test_split():
    """Generate random train/test splits for 2 data sets * 2 scopes.

    For each 20NG and GCAT scope, the dumped HIN is loaded from
    ``data/local/<scope_name>.dmp``, a 10-nearest-neighbour cosine graph
    and the labels are built with ``GraphGenerator``, and for every
    labelled-set size in ``lp_candidate`` an ``SSLClassifier`` runs
    ``repeatedExperiment`` with the save prefix
    ``data/local/split/<scope_name>/lb<NNN>_`` (``NNN`` is the zero-padded
    size). The per-scope output directory is created if it is missing.

    Takes no arguments and returns nothing; inputs are the module-level
    ``ng20_*`` / ``gcat_*`` tables and the dump files on disk.
    """
    repeat_times = 50
    lp_candidate = [5]

    # (scope names, scopes, class counts)
    datasets = (
        (ng20_scope_names, ng20_scopes, ng20_counts),  # 20ng
        (gcat_scope_names, gcat_scopes, gcat_counts),  # gcat
    )

    for names, scopes, counts in datasets:
        for i in range(2):
            scope_name = names[i]
            scope = scopes[i]
            count = counts[i]

            split_dir = 'data/local/split/' + scope_name
            if not os.path.exists(split_dir):
                os.makedirs(split_dir)
            experiment_path = split_dir + '/'

            # Serialized dumps must be read in binary mode; text mode can
            # corrupt the stream on platforms with newline translation.
            # NOTE(review): `pk` is presumably pickle/cPickle - only load
            # dumps that come from a trusted source.
            with open('data/local/' + scope_name + '.dmp', 'rb') as f:
                hin = pk.load(f)

            tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
            lp_param = {'alpha': 0.99, 'normalization_factor': 0.01}
            # The second return value (new ids) is not needed here.
            graph, _ = GraphGenerator.generateCosineNeighborGraph(
                hin, kNeighbors=10, tf_param=tf_param)
            new_label = GraphGenerator.getNewLabels(hin)

            for lp in lp_candidate:
                ssl = SSLClassifier(graph, new_label, scope, lp_param,
                                    repeatTimes=repeat_times,
                                    trainNumbers=lp,
                                    classCount=count)
                ssl.repeatedExperiment(
                    savePathPrefix=experiment_path + 'lb'
                    + str(lp).zfill(3) + '_')