def Doc2vecFeatureEngineering():
    """Embed the pickled train/verify inputs with the saved text Doc2Vec models.

    Steps, in order:
      1. Read ``trainData_X``, ``trainData_Y``, ``verifyData_X`` and
         ``verifyData_Y`` through ``pickleRead``.
      2. Load ``programDoc2VecModel`` and ``hintsDoc2VecModel`` from the
         ``models`` directory under the parent of the current working
         directory.
      3. Pass the inputs and both models to
         ``transformDatatoFeatures_doc2vec``; its return value is ignored.
      4. Write the labels back out as ``train_Y`` and ``verify_Y`` through
         ``pickleWrite``.

    Returns:
        None.
    """
    benchmark = 'trainData'
    # ".." is resolved against the current working directory, so the result
    # depends on where the script is launched from.
    parenDir = os.path.abspath(os.path.pardir)
    # Only printed for diagnostics; the data itself comes from the pickles.
    path = parenDir + '/' + benchmark + '/'
    print(path)

    train_X = pickleRead('trainData_X')
    train_Y = pickleRead('trainData_Y')
    verify_X = pickleRead('verifyData_X')
    verify_Y = pickleRead('verifyData_Y')

    # Load the Doc2Vec models.
    programDoc2VecModel = gensim.models.doc2vec.Doc2Vec.load(
        parenDir + '/models/programDoc2VecModel')
    hintsDoc2VecModel = gensim.models.doc2vec.Doc2Vec.load(
        parenDir + '/models/hintsDoc2VecModel')

    transformDatatoFeatures_doc2vec(train_X, verify_X,
                                    programDoc2VecModel, hintsDoc2VecModel)

    pickleWrite(train_Y, 'train_Y')
    pickleWrite(verify_Y, 'verify_Y')
def write_best_threshod_to_pickle(parameters, true_Y, predicted_Y_loaded_model, label, benchmark):
    """Compute the best threshold set, store it in ``parameters`` and pickle them.

    Args:
        parameters: Mapping that is updated IN PLACE with the key
            ``"best_threshold_set"`` and then written out.
        true_Y: Ground-truth values, forwarded to ``set_threshold_by_roundings``.
        predicted_Y_loaded_model: Model predictions, forwarded to
            ``set_threshold_by_roundings``.
        label: Middle part of the pickle name.
        benchmark: Leading part of the pickle name.

    Returns:
        Whatever ``set_threshold_by_roundings`` returned.
    """
    threshold_set = set_threshold_by_roundings(true_Y, predicted_Y_loaded_model)
    parameters["best_threshold_set"] = threshold_set

    # Name pattern: "<benchmark>-<label>-parameters".
    pickle_name = benchmark + "-" + label + "-parameters"
    pickleWrite(parameters, pickle_name, "../src/trained_model/")

    return threshold_set
def Node2vecFeatureEngineering():
    """Turn the pickled train/verify inputs into node2vec feature pickles.

    Steps, in order:
      1. Read ``trainData_X``, ``trainData_Y``, ``verifyData_X`` and
         ``verifyData_Y`` through ``pickleRead``.
      2. Pass the inputs to ``transformDatatoFeatures_node2vec``; its return
         value is ignored.
      3. Write the labels back out as ``train_Y`` and ``verify_Y`` through
         ``pickleWrite``.

    Returns:
        None.
    """
    benchmark = 'trainData'
    # ".." is resolved against the current working directory, so the result
    # depends on where the script is launched from.
    parenDir = os.path.abspath(os.path.pardir)
    # Only printed for diagnostics; the data itself comes from the pickles.
    path = parenDir + '/' + benchmark + '/'
    print(path)

    train_X = pickleRead('trainData_X')
    train_Y = pickleRead('trainData_Y')
    verify_X = pickleRead('verifyData_X')
    verify_Y = pickleRead('verifyData_Y')

    transformDatatoFeatures_node2vec(train_X, verify_X)

    pickleWrite(train_Y, 'train_Y')
    pickleWrite(verify_Y, 'verify_Y')
def main():
    """Split the programs, vectorise both splits and pickle the vectors.

    ``shuffleData`` is called with a 0.8 split rate; its return value is not
    used here. The splits are instead re-read from the ``argumentTrainData``
    and ``argumentTestData`` pickles in ``../``, converted with
    ``transformDataToTrainingVector``, and the resulting X/Y parts are
    written to ``../`` as ``argument{Train,Test}Data_{X,Y}``.
    """
    shuffleData(readMultiplePrograms(), 0.8)

    train_vectors_X, train_vectors_Y = transformDataToTrainingVector(
        pickleRead("argumentTrainData", path="../"))
    test_vectors_X, test_vectors_Y = transformDataToTrainingVector(
        pickleRead("argumentTestData", path="../"))

    # Written in this fixed order: train X, train Y, test X, test Y.
    outputs = (
        (train_vectors_X, "argumentTrainData_X"),
        (train_vectors_Y, "argumentTrainData_Y"),
        (test_vectors_X, "argumentTestData_X"),
        (test_vectors_Y, "argumentTestData_Y"),
    )
    for vectors, pickle_name in outputs:
        pickleWrite(vectors, pickle_name, path="../")
def Graph2vecFeatureEngineering():
    """Embed the pickled train/verify inputs with the saved graph2vec models.

    Steps, in order:
      1. Read ``trainData_X``, ``trainData_Y``, ``verifyData_X`` and
         ``verifyData_Y`` through ``pickleRead``.
      2. Load ``programGraph2VecModel`` and ``hintsGraph2VecModel`` (stored as
         gensim ``Doc2Vec`` models) from the ``models`` directory under the
         parent of the current working directory.
      3. Pass the inputs and both models to
         ``transformDatatoFeatures_graph2vec``; its return value is ignored.
      4. Write the labels back out as ``train_Y`` and ``verify_Y`` through
         ``pickleWrite``.

    Returns:
        None.
    """
    benchmark = 'trainData'
    # ".." is resolved against the current working directory, so the result
    # depends on where the script is launched from.
    parenDir = os.path.abspath(os.path.pardir)
    # Only printed for diagnostics; the data itself comes from the pickles.
    path = parenDir + '/' + benchmark + '/'
    print(path)

    train_X = pickleRead('trainData_X')
    train_Y = pickleRead('trainData_Y')
    verify_X = pickleRead('verifyData_X')
    verify_Y = pickleRead('verifyData_Y')

    programGraph2VecModel = gensim.models.doc2vec.Doc2Vec.load(
        parenDir + '/models/programGraph2VecModel')
    hintsGraph2VecModel = gensim.models.doc2vec.Doc2Vec.load(
        parenDir + '/models/hintsGraph2VecModel')

    transformDatatoFeatures_graph2vec(train_X, verify_X,
                                      programGraph2VecModel, hintsGraph2VecModel)

    pickleWrite(train_Y, 'train_Y')
    pickleWrite(verify_Y, 'verify_Y')
def shuffleData(programList, trainDataSplitRate):
    """Shuffle the programs, split them, and move the test share's files.

    NOTE: ``programList`` is shuffled IN PLACE, and any existing
    ``../../testData`` directory is deleted and recreated on every call.

    For each test program whose ``.arguments`` file exists in
    ``../../trainData/``, that file and its eight companion files are moved
    to ``../../testData/``. Only the ``.arguments`` file is checked for
    existence, so a missing companion file makes ``shutil.move`` raise.

    Args:
        programList: Mutable sequence of objects exposing ``programName``.
        trainDataSplitRate: Fraction of the shuffled list kept for training.

    Returns:
        ``(trainData, testData)``. Both are also pickled to ``../`` as
        ``argumentTrainData`` and ``argumentTestData``.
    """
    # Split into train and test data.
    random.shuffle(programList)
    cut = int(trainDataSplitRate * len(programList))
    trainData, testData = programList[:cut], programList[cut:]

    # Start from an empty test directory.
    if os.path.exists("../../testData"):
        shutil.rmtree("../../testData/")
    os.mkdir("../../testData")

    # Files belonging to one program, moved in this order.
    suffixes = (
        ".arguments",
        ".gv",
        ".hints.graphs",
        ".horn",
        ".HornGraph",
        ".initialHints",
        ".negativeHints",
        ".positiveHints",
        ".smt2",
    )
    for program in testData:
        stem = program.programName
        if not os.path.exists("../../trainData/" + stem + ".arguments"):
            continue
        for suffix in suffixes:
            shutil.move("../../trainData/" + stem + suffix,
                        "../../testData/" + stem + suffix)

    pickleWrite(trainData, "argumentTrainData", path="../")
    pickleWrite(testData, "argumentTestData", path="../")
    return trainData, testData
def transformDatatoFeatures_doc2vec(X_train, X_test, programDoc2VecModel, hintsDoc2VecModel):
    """Infer text Doc2Vec embeddings for both splits and pickle them.

    Both splits go through ``doc2vecModelInferNewData`` with the same two
    pre-trained models. The four results are written with ``pickleWrite``
    as ``encodedPrograms_train``, ``encodedHints_train``,
    ``encodedPrograms_verify`` and ``encodedHints_verify``.

    Args:
        X_train: Training inputs forwarded to ``doc2vecModelInferNewData``.
        X_test: Verification inputs forwarded to ``doc2vecModelInferNewData``.
        programDoc2VecModel: Model used for the program embeddings.
        hintsDoc2VecModel: Model used for the hint embeddings.

    Returns:
        ``(programs_train, programs_verify, hints_train, hints_verify)``.
        Note this order differs from the order in which they are written.
    """
    print("Doc2Vec (text) inferring begin")
    programs_train, hints_train = doc2vecModelInferNewData(
        X_train, programDoc2VecModel, hintsDoc2VecModel)
    programs_verify, hints_verify = doc2vecModelInferNewData(
        X_test, programDoc2VecModel, hintsDoc2VecModel)
    print("Doc2Vec (text) inferring end")

    print('write infered train and test data to files')
    artifacts = (
        ('encodedPrograms_train', programs_train),
        ('encodedHints_train', hints_train),
        ('encodedPrograms_verify', programs_verify),
        ('encodedHints_verify', hints_verify),
    )
    for pickle_name, embedding in artifacts:
        pickleWrite(content=embedding, name=pickle_name)

    return programs_train, programs_verify, hints_train, hints_verify
def transformDatatoFeatures_graph2vec(X_train, X_test, programGraph2VecModel, hintsGraph2VecModel):
    """Infer graph2vec embeddings for both splits and pickle them.

    Both splits go through ``graph2vecModelInferNewData`` with the same two
    pre-trained models. The four results are written with ``pickleWrite``
    as ``graphEncodedPrograms_train``, ``graphEncodedHints_train``,
    ``graphEncodedPrograms_verify`` and ``graphEncodedHints_verify``.

    Args:
        X_train: Training inputs forwarded to ``graph2vecModelInferNewData``.
        X_test: Verification inputs forwarded to ``graph2vecModelInferNewData``.
        programGraph2VecModel: Model used for the program embeddings.
        hintsGraph2VecModel: Model used for the hint embeddings.

    Returns:
        ``(programs_train, programs_verify, hints_train, hints_verify)``.
        Note this order differs from the order in which they are written.
    """
    print("Doc2Vec (graph) inferring begin")
    programs_train, hints_train = graph2vecModelInferNewData(
        X_train, programGraph2VecModel, hintsGraph2VecModel)
    programs_verify, hints_verify = graph2vecModelInferNewData(
        X_test, programGraph2VecModel, hintsGraph2VecModel)

    print('write infered train and test data to files')
    artifacts = (
        ('graphEncodedPrograms_train', programs_train),
        ('graphEncodedHints_train', hints_train),
        ('graphEncodedPrograms_verify', programs_verify),
        ('graphEncodedHints_verify', hints_verify),
    )
    for pickle_name, embedding in artifacts:
        pickleWrite(content=embedding, name=pickle_name)

    return programs_train, programs_verify, hints_train, hints_verify
def transformDatatoFeatures_node2vec(X_train, X_test):
    """Extract precomputed embeddings from each sample and pickle them.

    Element ``[2]`` of every sample is taken as the program embedding and
    element ``[3]`` as the hint embedding. Each collected list is passed
    through ``np.expand_dims(..., axis=2)`` before being written.

    The verification arrays are stored under names ending in ``_test``
    (``graphEncodedPrograms_test``, ``graphEncodedHints_test``), while the
    training arrays use the ``_train`` suffix.

    Args:
        X_train: Iterable of indexable training samples.
        X_test: Iterable of indexable verification samples.

    Returns:
        ``(programs_train, programs_verify, hints_train, hints_verify)``.
    """
    # Program embeddings live at index 2 of each sample.
    programs_train = np.expand_dims([sample[2] for sample in X_train], axis=2)
    programs_verify = np.expand_dims([sample[2] for sample in X_test], axis=2)

    print('write train and test graph embedding data to files')
    pickleWrite(content=programs_train, name='graphEncodedPrograms_train')
    pickleWrite(content=programs_verify, name='graphEncodedPrograms_test')

    # Hint embeddings live at index 3 of each sample.
    hints_train = np.expand_dims([sample[3] for sample in X_train], axis=2)
    hints_verify = np.expand_dims([sample[3] for sample in X_test], axis=2)

    pickleWrite(content=hints_train, name='graphEncodedHints_train')
    pickleWrite(content=hints_verify, name='graphEncodedHints_test')

    return programs_train, programs_verify, hints_train, hints_verify
def main():
    """Convert graph files into pickles, driven by seven command-line arguments.

    Positional arguments:
        ``sys.argv[1]`` path: second argument to ``DotToGraphInfo``.
        ``sys.argv[2]`` df: ``df + "_data"`` is the first argument to
            ``DotToGraphInfo``; also the pickle-name prefix in
            ``"gnn_inputs"`` mode.
        ``sys.argv[3]`` cursor (int): stored as ``_split_flag`` and appended
            to every pickle name.
        ``sys.argv[4]`` file_type: stored as ``_file_type``.
        ``sys.argv[5]`` label: pickle-name prefix in the non-``"gnn_inputs"``
            mode.
        ``sys.argv[6]`` buckets (int): stored as ``_buckets``.
        ``sys.argv[7]`` reading_type: ``"gnn_inputs"`` selects
            ``getHornGraphSample_no_offset``; any other value selects
            ``getHornGraphSample_analysis``.

    Each returned component is written with ``pickleWrite`` under
    ``<prefix>-<component>-<cursor>``, then the elapsed time is printed.
    """
    path = sys.argv[1]
    df = sys.argv[2]
    split_cursor = int(sys.argv[3])
    file_type = sys.argv[4]
    label = sys.argv[5]
    buckets = sys.argv[6]
    reading_type = sys.argv[7]

    converter = DotToGraphInfo(df + "_data", path)
    converter._split_flag = split_cursor
    converter._file_type = file_type
    converter._buckets = int(buckets)

    started = time.time()
    if reading_type == "gnn_inputs":
        print("reading_type", reading_type)
        (node_label_ids, argument_indices, adjacency_lists, argument_scores,
         node_total, control_flow_nodes,
         graph_infos) = converter.getHornGraphSample_no_offset()
        prefix = df
        outputs = [
            (node_label_ids, "-graphs_node_label_ids-"),
            (argument_indices, "-graphs_argument_indices-"),
            (adjacency_lists, "-graphs_adjacency_lists-"),
            (argument_scores, "-graphs_argument_scores-"),
            (node_total, "-total_number_of_node-"),
            (control_flow_nodes, "-total_control_flow_node_list-"),
            (graph_infos, "-graphs_graph_info_list-"),
        ]
    else:
        (node_label_ids, argument_indices, adjacency_lists, argument_scores,
         node_total, graph_infos) = converter.getHornGraphSample_analysis()
        prefix = label
        outputs = [
            (node_label_ids, "-graphs_node_label_ids-"),
            (argument_indices, "-graphs_argument_indices-"),
            (adjacency_lists, "-graphs_adjacency_lists-"),
            (argument_scores, "-graphs_argument_scores-"),
            (node_total, "-total_number_of_node-"),
            (graph_infos, "-graphs_graph_info_list-"),
        ]

    # Components are written in the order listed above.
    for payload, tag in outputs:
        pickleWrite(payload, prefix + tag + str(split_cursor))

    print("--time for transform dot to GNN input", time.time() - started, "--")