def Doc2vecFeatureEngineering():
    benchmark = 'trainData'
    curpath = os.path.abspath(os.curdir)
    parenDir = os.path.abspath(os.path.pardir)
    path = parenDir + '/' + benchmark + '/'
    print(path)
    #train_X,train_Y,verify_X,verify_Y = readHornClausesAndHints_resplitTrainAndVerifyData(path, dataset='train', discardNegativeData=True)
    train_X = pickleRead('trainData_X')
    train_Y = pickleRead('trainData_Y')
    verify_X = pickleRead('verifyData_X')
    verify_Y = pickleRead('verifyData_Y')
    # train_X = train_X[0:10]  # cut training size for debug
    # train_Y = train_Y[0:10]  # cut training size for debug

    # split data into training and verifying sets
    #train_X, verify_X, train_Y, verify_Y = train_test_split(train_X, train_Y, test_size=0.2, random_state=42)

    # load trained Doc2Vec models
    programDoc2VecModel = gensim.models.doc2vec.Doc2Vec.load(parenDir + '/models/programDoc2VecModel')
    hintsDoc2VecModel = gensim.models.doc2vec.Doc2Vec.load(parenDir + '/models/hintsDoc2VecModel')

    # encode programs and hints with the Doc2Vec models
    transformDatatoFeatures_doc2vec(train_X, verify_X, programDoc2VecModel, hintsDoc2VecModel)
    #transformDatatoFeatures_node2vec(train_X, verify_X)

    pickleWrite(train_Y, 'train_Y')
    pickleWrite(verify_Y, 'verify_Y')
def main():
    programList = readMultiplePrograms()
    trainData, testData = shuffleData(programList, 0.8)
    trainData_X, trainData_Y = transformDataToTrainingVector(pickleRead("argumentTrainData", path="../"))
    testData_X, testData_Y = transformDataToTrainingVector(pickleRead("argumentTestData", path="../"))
    pickleWrite(trainData_X, "argumentTrainData_X", path="../")
    pickleWrite(trainData_Y, "argumentTrainData_Y", path="../")
    pickleWrite(testData_X, "argumentTestData_X", path="../")
    pickleWrite(testData_Y, "argumentTestData_Y", path="../")
def Node2vecFeatureEngineering():
    benchmark = 'trainData'
    curpath = os.path.abspath(os.curdir)
    parenDir = os.path.abspath(os.path.pardir)
    path = parenDir + '/' + benchmark + '/'
    print(path)
    #train_X,train_Y,verify_X,verify_Y = readHornClausesAndHints_resplitTrainAndVerifyData(path, dataset='train', discardNegativeData=True)
    train_X = pickleRead('trainData_X')
    train_Y = pickleRead('trainData_Y')
    verify_X = pickleRead('verifyData_X')
    verify_Y = pickleRead('verifyData_Y')

    transformDatatoFeatures_node2vec(train_X, verify_X)

    pickleWrite(train_Y, 'train_Y')
    pickleWrite(verify_Y, 'verify_Y')
def predict_unseen_set(params, trained_model_path, file_list=[], set_max_nodes_per_batch=True):
    benchmark_fold = params["benchmark"] + "-" + "predict"
    path = "../benchmarks/" + benchmark_fold + "/"
    benchmark_name = path[len("../benchmarks/"):-1]
    parameters = pickleRead(params["benchmark"] + "-" + params["label"] + "-parameters", "../src/trained_model/")
    parameters["benchmark"] = benchmark_name
    print("vocabulary size:", parameters["node_vocab_size"])
    if set_max_nodes_per_batch == True:
        parameters['max_nodes_per_batch'] = params["max_nodes_per_batch"]
    if params["force_read"] == True:
        write_graph_to_pickle(benchmark_name,
                              data_fold=["test"],
                              label=params["label"],
                              path=path,
                              file_type=".smt2",
                              graph_type=params["graph_type"],
                              max_nodes_per_batch=params['max_nodes_per_batch'],
                              vocabulary_name=params["benchmark"],
                              file_list=file_list)
    else:
        print("Use pickle data for training")
    # if form_label == True and not os.path.isfile("../pickleData/" + label + "-" + benchmark_name + "-gnnInput_train_data.txt"):
    if params["form_label"] == True:
        form_GNN_inputs_and_labels(label=params["label"],
                                   datafold=["test"],
                                   benchmark=benchmark_name,
                                   graph_type=params["graph_type"],
                                   gathered_nodes_binary_classification_task=params["gathered_nodes_binary_classification_task"])

    dataset = HornGraphDataset(parameters)
    dataset.load_data([DataFold.TEST])
    test_data = dataset.get_tensorflow_dataset(DataFold.TEST)
    loaded_model = tf2_gnn.cli_utils.model_utils.load_model_for_prediction(trained_model_path, dataset)
    return get_predicted_results(params, loaded_model, test_data)
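# Illustrative (hypothetical) call to predict_unseen_set. The key names below are inferred
# from how `params` is read inside the function; the concrete values and paths are
# placeholders for illustration, not settings taken from the repository.
# example_params = {
#     "benchmark": "trainData",
#     "label": "template_relevance",
#     "graph_type": "hyperEdgeHornGraph",
#     "force_read": True,
#     "form_label": True,
#     "max_nodes_per_batch": 10000,
#     "gathered_nodes_binary_classification_task": ["template_relevance"],
# }
# predicted = predict_unseen_set(example_params, trained_model_path="../src/trained_model/")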
def predictAndOutputHints(model, programDoc2VecModel, hintsDoc2VecModel, programGraph2VecModel, hintsGraph2VecModel):
    if not os.path.exists("../predictedHints/"):
        os.makedirs("../predictedHints/")
    test_X = pickleRead('testData_X')
    parenDir = os.path.abspath(os.path.pardir)
    path = parenDir + "/predictedHints/"

    # embedding
    encodedPrograms_test, encodedHints_test = doc2vecModelInferNewData(test_X, programDoc2VecModel, hintsDoc2VecModel)
    graphEncodedPrograms_test, graphEncodedHints_test = graph2vecModelInferNewData(test_X, programGraph2VecModel, hintsGraph2VecModel)

    # predict
    sigmoidOutput = model.predict([encodedPrograms_test, encodedHints_test,
                                   graphEncodedPrograms_test, graphEncodedHints_test])

    # transform probabilities to binary classifications
    predicted_y = sigmoidOutput.copy()
    predicted_y[predicted_y > 0.5] = int(1)  # convert decimals to 0 and 1
    predicted_y[predicted_y <= 0.5] = int(0)  # convert decimals to 0 and 1

    print("Show one example")
    print("test_X[0][0]", test_X[0][0])  # program
    print("test_X[0][1]", test_X[0][1])  # hint text (head \n hint)
    print("test_X[0][2]", test_X[0][2])  # program graph embedding
    print("test_X[0][3]", test_X[0][3])  # hint graph
    print("test_X[0][4]", test_X[0][4])  # hint ID
    print("test_X len:", len(test_X))
    print(predicted_y[0])

    # write results to file, one .optimizedHints file per input program
    fileList = list()
    for X in test_X:
        fileList.append(X[5])
    fileList = list(set(fileList))
    for fileName in fileList:
        predictedHintListWithID = list()
        print(fileName)
        f = open(path + fileName + ".optimizedHints", "w+")
        #print("sorted")
        # sort hints by predicted score, highest first
        for X, y, score in sorted(zip(test_X, predicted_y, sigmoidOutput), key=lambda t: t[2], reverse=True):
            #print(X[4],X[1],y,score)
            if (X[5] == fileName):
                head = X[1][:X[1].find("\n")]
                #head=head[:head.find("/")]
                hint = X[1][X[1].find("\n") + 1:]
                predictedHintListWithID.append([X[4], head, hint, y, score])  # ID, head, hint, predicted result, score
                content = X[4] + ":" + head + ":" + hint + ":" + "".join(map(str, np.around(y, 0))) + ":" + "".join(map(str, score)) + "\n"
                f.write(content)
        f.close()
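# Layout of each test_X entry, as used in predictAndOutputHints above (inferred from the
# indices accessed in that function):
#   X[0] program text, X[1] hint text ("head\nhint"), X[2] program graph embedding,
#   X[3] hint graph embedding, X[4] hint ID, X[5] source file name.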
def Graph2vecFeatureEngineering():
    benchmark = 'trainData'
    curpath = os.path.abspath(os.curdir)
    parenDir = os.path.abspath(os.path.pardir)
    path = parenDir + '/' + benchmark + '/'
    print(path)
    #train_X,train_Y,verify_X,verify_Y = readHornClausesAndHints_resplitTrainAndVerifyData(path, dataset='train', discardNegativeData=True)
    train_X = pickleRead('trainData_X')
    train_Y = pickleRead('trainData_Y')
    verify_X = pickleRead('verifyData_X')
    verify_Y = pickleRead('verifyData_Y')

    programGraph2VecModel = gensim.models.doc2vec.Doc2Vec.load(parenDir + '/models/programGraph2VecModel')
    hintsGraph2VecModel = gensim.models.doc2vec.Doc2Vec.load(parenDir + '/models/hintsGraph2VecModel')

    transformDatatoFeatures_graph2vec(train_X, verify_X, programGraph2VecModel, hintsGraph2VecModel)

    pickleWrite(train_Y, 'train_Y')
    pickleWrite(verify_Y, 'verify_Y')
def trainDoc2VecModelfunction(program_dim=100, hint_dim=20):
    X_train = pickleRead('trainData_X')
    # extract programs and hints from dataset
    programs_train, hints_train, graphProgram_train, graphHint_train = data2list(X_train)
    programs_train = list(set(programs_train))
    # transform to TaggedDocument
    programs_trainTaggedDocument, programsMaxLength, programsAverageLength = transform2TaggedDocument(programs_train)
    hints_trainTaggedDocument, hintsMaxLength, hintsAverageLength = transform2TaggedDocument(hints_train)
    # print('programsMaxLength', programsMaxLength)
    # print('programsAverageLength', programsAverageLength)
    # print('hintsMaxLength', hintsMaxLength)
    # print('hintsAverageLength', hintsAverageLength)

    # create Doc2Vec models
    # parameters window=2
    programDoc2VecModel = gensim.models.doc2vec.Doc2Vec(vector_size=program_dim,
                                                         min_count=0,
                                                         window=programsAverageLength,
                                                         epochs=50)
    hintsDoc2VecModel = gensim.models.doc2vec.Doc2Vec(vector_size=hint_dim,
                                                      min_count=0,
                                                      window=hintsMaxLength,
                                                      epochs=50)
    # build vocabulary
    programDoc2VecModel.build_vocab(programs_trainTaggedDocument)
    hintsDoc2VecModel.build_vocab(hints_trainTaggedDocument)
    # train Doc2Vec models
    programDoc2VecModel.train(programs_trainTaggedDocument,
                              total_examples=programDoc2VecModel.corpus_count,
                              epochs=programDoc2VecModel.epochs)
    hintsDoc2VecModel.train(hints_trainTaggedDocument,
                            total_examples=hintsDoc2VecModel.corpus_count,
                            epochs=hintsDoc2VecModel.epochs)
    # save trained Doc2Vec models
    parenDir = os.path.abspath(os.path.pardir)
    programDoc2VecModel.save(parenDir + '/models/programDoc2VecModel')
    hintsDoc2VecModel.save(parenDir + '/models/hintsDoc2VecModel')
    return programDoc2VecModel, hintsDoc2VecModel
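# Minimal sketch (not part of the repository) of how a saved gensim Doc2Vec model is applied
# to unseen text: tokens are embedded with infer_vector, which is the kind of step the
# doc2vecModelInferNewData helper is assumed to perform for programs and hints.
# import gensim
# programDoc2VecModel = gensim.models.doc2vec.Doc2Vec.load('../models/programDoc2VecModel')
# unseen_tokens = "assert ( x >= 0 )".split()  # hypothetical tokenised program text
# program_vector = programDoc2VecModel.infer_vector(unseen_tokens)  # array of length program_dim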
def main():
    print("Start")
    #benchmark='dillig'
    benchmark = 'trainData'
    curpath = os.path.abspath(os.curdir)
    parenDir = os.path.abspath(os.path.pardir)
    path = parenDir + '/' + benchmark + '/'
    print(path)

    # load Doc2Vec models
    #programDoc2VecModel=gensim.models.doc2vec.Doc2Vec.load(parenDir+'/models/programDoc2VecModel')
    #hintsDoc2VecModel=gensim.models.doc2vec.Doc2Vec.load(parenDir+'/models/hintsDoc2VecModel')

    # load features
    encodedPrograms_train = pickleRead('encodedPrograms_train')
    encodedPrograms_test = pickleRead('encodedPrograms_test')
    encodedHints_train = pickleRead('encodedHints_train')
    encodedHints_test = pickleRead('encodedHints_test')
    graphEncodedPrograms_train = pickleRead('graphEncodedPrograms_train')
    graphEncodedPrograms_test = pickleRead('graphEncodedPrograms_test')
    train_Y = pickleRead('train_Y')
    verify_Y = pickleRead('verify_Y')

    # train
    batch_size = int(encodedPrograms_train.shape[0] / 100)
    epochs = 100
    #history,model=train(encodedPrograms_train,encodedPrograms_test,encodedHints_train,encodedHints_test,train_Y,verify_Y,batch_size,epochs)
    history, model = train(encodedPrograms_train, encodedPrograms_test,
                           graphEncodedPrograms_train, graphEncodedPrograms_test,
                           encodedHints_train, encodedHints_test,
                           train_Y, verify_Y, batch_size, epochs)
    plotHistory(history)
def main():
    print("Start")
    # remove files in testData, pickleData, and models
    if os.path.exists("../testData"):
        shutil.rmtree("../testData/")
        shutil.rmtree("../pickleData/")
        shutil.rmtree("../models/")
        os.mkdir("../testData")
        os.mkdir("../pickleData")
        os.mkdir("../models")

    benchmark = 'trainData'
    curpath = os.path.abspath(os.curdir)
    parenDir = os.path.abspath(os.path.pardir)
    path = parenDir + '/' + benchmark + '/'
    print(path)

    # get graph data
    #callEldaricaGenerateGraphs('trainData')
    # transformOneFiletoFeatures(path)
    train_X, train_Y, verify_X, verify_Y = \
        readHornClausesAndHints_resplitTrainAndVerifyData(path,
                                                          dataset='train',
                                                          discardNegativeData=False,
                                                          smallTrain=False,
                                                          smallTrainSize=50,
                                                          trainDataSplitRate=0.8)
    # train_X = pickleRead('trainData_X')
    # train_Y = pickleRead('trainData_Y')
    # verify_X = pickleRead('verifyData_X')
    # verify_Y = pickleRead('verifyData_Y')
    #train_X = train_X[0:40]  # cut training size for debug
    #train_Y = train_Y[0:40]  # cut training size for debug

    # train and save Doc2Vec models
    print("train Doc2Vec model (text) begin")
    trainDoc2VecModelfunction(program_dim=100, hint_dim=20)
    print("train Doc2Vec model (text) end")
    print("train Doc2Vec model (graph) begin")
    trainGraph2VecModelfunction(program_dim=100, hint_dim=20)
    print("train Doc2Vec model (graph) end")

    # load Doc2Vec models
    #programDoc2VecModel = gensim.models.doc2vec.Doc2Vec.load(parenDir + '/models/programDoc2VecModel')
    #hintsDoc2VecModel = gensim.models.doc2vec.Doc2Vec.load(parenDir + '/models/hintsDoc2VecModel')
    #programGraph2VecModel = gensim.models.doc2vec.Doc2Vec.load(parenDir + '/models/programGraph2VecModel')
    #hintsGraph2VecModel = gensim.models.doc2vec.Doc2Vec.load(parenDir + '/models/hintsGraph2VecModel')

    # split data into training and verifying sets
    # train_X, verify_X, train_Y, verify_Y = train_test_split(train_X, train_Y, test_size=0.2, random_state=42)
    # checkSplitData(X_train, X_test, y_train, y_test)

    # feature engineering
    # encodedPrograms_train,encodedPrograms_test,encodedHints_train,encodedHints_test = transformDatatoFeatures_tokennizer(train_X, verify_X)
    # encodedPrograms_train,encodedPrograms_test,encodedHints_train,encodedHints_test = transformDatatoFeatures_doc2vec(train_X, verify_X, programDoc2VecModel, hintsDoc2VecModel)
    from Data2Features import Doc2vecFeatureEngineering, Node2vecFeatureEngineering, Graph2vecFeatureEngineering
    Doc2vecFeatureEngineering()
    Graph2vecFeatureEngineering()
    #Node2vecFeatureEngineering()

    # load features
    encodedPrograms_train = pickleRead('encodedPrograms_train')
    encodedPrograms_test = pickleRead('encodedPrograms_verify')
    graphEncodedPrograms_train = pickleRead('graphEncodedPrograms_train')
    graphEncodedPrograms_test = pickleRead('graphEncodedPrograms_verify')
    encodedHints_train = pickleRead('encodedHints_train')
    encodedHints_test = pickleRead('encodedHints_verify')
    graphencodedHints_train = pickleRead('graphEncodedHints_train')
    graphencodedHints_test = pickleRead('graphEncodedHints_verify')
    train_Y = pickleRead('train_Y')
    verify_Y = pickleRead('verify_Y')

    # train
    batch_size = round(encodedPrograms_train.shape[0] / 100)
    if batch_size < 1:
        batch_size = 1
    epochs = 100
    # #without graph
    # history, model = train2(encodedPrograms_train, encodedPrograms_test,
    #                         encodedHints_train, encodedHints_test,
    #                         train_Y, verify_Y, batch_size, epochs)
    # #with program graph
    # history, model = train3(encodedPrograms_train, encodedPrograms_test,
    #                         graphEncodedPrograms_train, graphEncodedPrograms_test,
    #                         encodedHints_train, encodedHints_test,
    #                         train_Y, verify_Y, batch_size, epochs)
    # with program graph and hint graph
    history, model = train4(encodedPrograms_train, encodedPrograms_test,
                            graphEncodedPrograms_train, graphEncodedPrograms_test,
                            encodedHints_train, encodedHints_test,
                            graphencodedHints_train, graphencodedHints_test,
                            train_Y, verify_Y, batch_size, epochs)
def wrapped_prediction(trained_model_path,
                       benchmark,
                       benchmark_fold,
                       label="template_relevance",
                       force_read=True,
                       form_label=True,
                       json_type=".hyperEdgeHornGraph.JSON",
                       graph_type="hyperEdgeHornGraph",
                       gathered_nodes_binary_classification_task=["template_relevance"],
                       hyper_parameter={},
                       set_max_nodes_per_batch=False,
                       file_list=[]):
    path = "../benchmarks/" + benchmark_fold + "/"
    benchmark_name = path[len("../benchmarks/"):-1]
    parameters = pickleRead(benchmark + "-" + label + "-parameters", "../src/trained_model/")
    parameters["benchmark"] = benchmark_name
    print("vocabulary size:", parameters["node_vocab_size"])
    if set_max_nodes_per_batch == True:
        parameters['max_nodes_per_batch'] = hyper_parameter["max_nodes_per_batch"]
    if force_read == True:
        write_graph_to_pickle(benchmark_name,
                              data_fold=["test"],
                              label=label,
                              path=path,
                              file_type=".smt2",
                              graph_type=graph_type,
                              max_nodes_per_batch=parameters['max_nodes_per_batch'],
                              vocabulary_name=benchmark,
                              file_list=file_list)
    else:
        print("Use pickle data for training")
    # if form_label == True and not os.path.isfile("../pickleData/" + label + "-" + benchmark_name + "-gnnInput_train_data.txt"):
    if form_label == True:
        form_GNN_inputs_and_labels(label=label,
                                   datafold=["test"],
                                   benchmark=benchmark_name,
                                   graph_type=graph_type,
                                   gathered_nodes_binary_classification_task=gathered_nodes_binary_classification_task)

    quiet = False
    dataset = HornGraphDataset(parameters)
    dataset.load_data([DataFold.TEST])
    test_data = dataset.get_tensorflow_dataset(DataFold.TEST)
    loaded_model = tf2_gnn.cli_utils.model_utils.load_model_for_prediction(trained_model_path, dataset)
    _, _, test_results = loaded_model.run_one_epoch(test_data, training=False, quiet=quiet)
    test_metric, test_metric_string = loaded_model.compute_epoch_metrics(test_results)
    predicted_Y_loaded_model = loaded_model.predict(test_data)
    print("test_metric_string", test_metric_string)
    print("test_metric", test_metric)

    # test measurement
    true_Y = []
    true_Y_by_file = []
    true_Y_file_list = []
    for data in iter(test_data):
        true_Y.extend(np.array(data[1]["node_labels"]))
    for data in dataset._label_list["test"]:
        true_Y_by_file.append(np.array(data))
    for file_name in dataset._file_list["test"]:
        true_Y_file_list.append(file_name)
    mse_loaded_model = tf.keras.losses.MSE(true_Y, predicted_Y_loaded_model)
    print("\n mse_loaded_model_predicted_Y_and_True_Y", mse_loaded_model)
    mse_mean = tf.keras.losses.MSE([np.mean(true_Y)] * len(true_Y), true_Y)
    print("\n mse_mean_Y_and_True_Y", mse_mean)

    # either reuse a previously stored threshold or compute and store a new one
    best_set_threshold = (lambda: hyper_parameter["best_threshold_set"]
                          if hyper_parameter["read_best_threshold"]
                          else write_best_threshod_to_pickle(parameters, true_Y, predicted_Y_loaded_model, label, benchmark))()
    best_set_ranks = (lambda: {"top_percentage": 0, "accuracy": 0}
                      if hyper_parameter["read_best_threshold"]
                      else wrapped_set_threshold_by_ranks(true_Y, true_Y_by_file, predicted_Y_loaded_model, true_Y_file_list))()

    print("----------", benchmark_fold, "-----", label, "----------")
    print(hyper_parameter)
    positive_label_number = sum(true_Y)
    negative_label_number = len(true_Y) - positive_label_number
    print("best_set_threshold", best_set_threshold)
    print("positive_label_percentage", positive_label_number / len(true_Y))
    print("negative_label_percentage", negative_label_number / len(true_Y))
    print("best_set_threshold", "threshold value:", best_set_threshold["threshold"],
          "accuracy:", best_set_threshold["accuracy"])
    print("best_set_ranks", "top_percentage:", best_set_ranks["top_percentage"],
          "accuracy:", best_set_ranks["accuracy"])
    random_guess_accuracy = max(positive_label_number / len(true_Y), negative_label_number / len(true_Y))
    print("{0:.2%}".format(max(best_set_threshold["accuracy"], best_set_ranks["accuracy"]) - random_guess_accuracy),
          "better than random guess")
    return {
        "trained_model_path": trained_model_path,
        "best_set_threshold": best_set_threshold["accuracy"],
        "best_set_ranks": best_set_ranks["accuracy"],
        "benchmark_fold": benchmark_fold,
        "label": label,
        "hyper_parameter": hyper_parameter,
        "positive_label_percentage": positive_label_number / len(true_Y),
        "negative_label_number": negative_label_number / len(true_Y),
        "dataset": dataset,
        "predicted_Y_loaded_model": predicted_Y_loaded_model,
        "best_threshold": best_set_threshold["threshold"],
    }
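# Illustrative (hypothetical) call to wrapped_prediction. The hyper_parameter keys are the
# ones read inside the function (read_best_threshold, best_threshold_set, max_nodes_per_batch);
# the concrete values, benchmark names, and paths are placeholders, not repository settings.
# summary = wrapped_prediction(
#     trained_model_path="../src/trained_model/",
#     benchmark="trainData",
#     benchmark_fold="trainData-predict",
#     hyper_parameter={"read_best_threshold": False,
#                      "best_threshold_set": None,
#                      "max_nodes_per_batch": 10000},
#     set_max_nodes_per_batch=False)
# print(summary["best_set_threshold"], summary["best_set_ranks"], summary["best_threshold"])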