def crossvalidation(folds_folder, number_of_folds, combination_name_indexes, qrels, summary_file):
    torch.multiprocessing.set_start_method("spawn")
    # hyper-parameter grid for the neural ranker
    lrs = [0.01, 0.001]
    batch_sizes = [3]
    epochs = [5, 10, 17]
    # epochs = [1]
    momentums = [0.9]
    # dropouts = [0.2,0.5]
    scores = {}
    models = {}
    # "eval" here is presumably the project's evaluation helper class (it shadows the builtin eval)
    evaluator = eval(metrics=["map", "ndcg", "P.2", "P.5"])
    test_trec_file = "NN_test_trec_file.txt"
    for fold in range(1, number_of_folds + 1):
        print("in fold:", fold)
        models[fold] = {}
        scores[fold] = {}
        training_folder = folds_folder + str(fold) + "/train/"
        validation_folder = folds_folder + str(fold) + "/validation/"
        test_folder = folds_folder + str(fold) + "/test/"
        validation_results_folder = folds_folder + str(fold) + "/validation_results/"
        if not os.path.exists(validation_results_folder):
            os.makedirs(validation_results_folder)
        current_labels_file = "labels_fold_" + str(fold) + ".pkl"
        for lr in lrs:
            for epoch in epochs:
                for momentum in momentums:
                    for batch_size in batch_sizes:
                        model_name = "_".join((str(lr), str(epoch), str(momentum), str(batch_size)))
                        model, model_file = train_model(lr, momentum, current_labels_file,
                                                        training_folder, batch_size, epoch, fold)
                        results = predict_folder_content(validation_folder, model)
                        trec_file_name = validation_results_folder + "NN_" + model_name + ".txt"
                        evaluator.create_trec_eval_file_nn(results, combination_name_indexes["val"][fold],
                                                           trec_file_name)
                        score = evaluator.run_trec_eval(trec_file_name, qrels)
                        scores[fold][model_name] = float(score)
                        models[fold][model_name] = model_file
        best_model = max(scores[fold].items(), key=operator.itemgetter(1))[0]
        print("chosen model on fold", fold, ":", best_model)
        test_model = torch.load(models[fold][best_model])
        results = predict_folder_content(test_folder, test_model)
        evaluator.create_trec_eval_file_nn(results, combination_name_indexes["test"][fold],
                                           test_trec_file, True)
    final_trec_file = evaluator.order_trec_file(test_trec_file)
    run_bash_command("rm " + test_trec_file)
    evaluator.run_trec_eval_on_test(summary_file=summary_file, qrels=qrels, method="NN",
                                    trec_file=final_trec_file)

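# Illustrative usage sketch for crossvalidation() -- not part of the original code.
# The folder layout, fold count, qrels path and summary file below are assumptions;
# combination_name_indexes is expected to map "val"/"test" to per-fold name indexes
# built elsewhere in the project.
def example_crossvalidation_run(combination_name_indexes):
    folds_folder = "folds/"            # assumed layout: folds/<fold>/{train,validation,test}/
    qrels = "qrels_file"               # hypothetical qrels path
    summary_file = "NN_summary.txt"    # hypothetical output file
    crossvalidation(folds_folder, 5, combination_name_indexes, qrels, summary_file)
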
def create_tfidf_features_and_features_file(features_dir, index_path, sentence_file, top_doc_files,
                                            input_query, past_winners_file, key):
    query = input_query + key
    command = "~/jdk1.8.0_181/bin/java -Djava.library.path=/home/greg/indri-5.6/swig/obj/java/ -cp indri.jar Main " + index_path + " " + sentence_file + " " + top_doc_files + " " + past_winners_file + " " + query
    print(run_bash_command(command))
    command = "mv doc*_* " + features_dir
    run_bash_command(command)

def run():
    for i in [j / 10 for j in range(11)]:
        run_name1 = str(i)
        command = "rm -r /lv_local/home/sgregory/auto_seo/new_merged_index*"
        run_bash_command(command)
        command = "nohup python pagerank_experiment_platform.py 1 " + run_name1 + " &"
        run_bash_command(command)
        # busy-wait until the launched experiment signals completion by creating its stop file
        while True:
            if os.path.isfile("stop.stop_1" + run_name1.replace(".", "")):
                break

def merge_indexes_for_experiments(index1, index2, merged_index):
    if os.path.isdir(merged_index):
        print("merged index exists, deleting the index")
        run_bash_command("rm -r " + merged_index)
        print("deletion of old merged index is done")
    command = '/home/greg/indri_test/bin/dumpindex ' + merged_index + ' merge ' + index1 + ' ' + index2
    print("merging command:", command)
    sys.stdout.flush()
    out = run_bash_command(command)
    print("merging out command:", out)
    return merged_index

def run_model(self, test_file, fold, trees, leaves, model_path):
    # TODO: add to main functionality + test file
    # score_file = self.code_base_path + "lm_scores/" + str(fold) + "/score" + str(trees) + "_" + str(leaves)
    score_file = "lm_score/" + str(fold) + "/score" + str(trees) + "_" + str(leaves)
    if not os.path.exists(os.path.dirname(score_file)):
        os.makedirs(os.path.dirname(score_file))
    run_bash_command('touch ' + score_file)
    command = self.java_path + " -jar " + self.jar_path + " -load " + model_path + " -rank " + test_file + " -score " + score_file
    run_bash_command(command)
    return score_file

def run_model(test_file, run_name=""):
    java_path = "/home/greg/jdk1.8.0_181/bin/java"
    jar_path = "/home/greg/SEO_CODE/model_running/RankLib.jar"
    score_file = "scores_winners/scores_of_seo_run" + run_name
    if not os.path.exists("scores_winners/"):
        os.makedirs("scores_winners/")
    features = test_file
    model_path = "/home/greg/auto_seo/CrossValidationUtils/model_bot_group"
    run_bash_command('touch ' + score_file)
    command = java_path + " -jar " + jar_path + " -load " + model_path + " -rank " + features + " -score " + score_file
    out = run_bash_command(command)
    print(out)
    return score_file

def create_features(reference_docs, past_winners_file_index, doc_ids_file, index_path, top_docs, doc_text):
    print("loading w2v model")
    model = load_model()
    print("loading done")
    final_features_dir = "sentence_feature_files_test/"
    features_file = final_features_dir + "new_data_sentence_features_test"
    features_dir = "sentence_feature_values_test/"
    if not os.path.exists(features_dir):
        os.makedirs(features_dir)
    if not os.path.exists(final_features_dir):
        os.makedirs(final_features_dir)
    total_working_set_file = "total_working_set_file_test"
    run_bash_command("touch " + total_working_set_file)
    for key in reference_docs:
        past_winners_file = past_winners_file_index[key]
        for query in reference_docs[key]:
            print("working on", query)
            doc = reference_docs[key][query]
            print("working on", doc)
            top_docs_file = create_top_docs_per_ref_doc(top_docs, key, doc, query)
            print("top_doc_file is created")
            sentence_file_name, sentences_index = create_sentence_file(top_docs_file, doc, query, key, doc_text)
            if len(sentences_index[query + key][doc]) < 2:
                continue
            print("sentence_file is created")
            working_set_file = create_sentence_working_set(doc, sentence_file_name, query, key)
            run_bash_command("cat " + working_set_file + " >> " + total_working_set_file)
            print("sentence working-set is created")
            create_w2v_features(sentence_file_name, top_docs_file, doc_ids_file, past_winners_file,
                                model, query, key)
            print("created seo w2v features")
            # create_coherency_features(sentences_index, doc, query, model, key)
            # print("created coherency features")
            create_tfidf_features_and_features_file(features_dir, index_path, sentence_file_name,
                                                    top_docs_file, query, past_winners_file, key)
            print("created tf-idf features")
    print("creating all features")
    create_features_from_dir(features_dir, features_file, total_working_set_file)
    return features_file

def check(self): """Check pipeline run results.""" if self._run_pipeline: ###### Monitor Job ###### try: start_time = datetime.now() response = self._client.wait_for_run_completion( self._run_id, self._test_timeout ) succ = (response.run.status.lower() == 'succeeded') end_time = datetime.now() elapsed_time = (end_time - start_time).seconds utils.add_junit_test( self._test_cases, 'job completion', succ, 'waiting for job completion failure', elapsed_time ) finally: ###### Output Argo Log for Debugging ###### workflow_json = self._client._get_workflow_json(self._run_id) workflow_id = workflow_json['metadata']['name'] print("Argo Workflow Name: ", workflow_id) argo_log, _ = utils.run_bash_command( 'argo logs {} -n {}'.format( workflow_id, self._namespace ) ) print('=========Argo Workflow Log=========') print(argo_log) if not succ: utils.write_junit_xml( self._test_name, self._result, self._test_cases ) exit(1) ###### Validate the results for specific test cases ###### if self._testname == 'xgboost_training_cm': # For xgboost sample, check its confusion matrix. cm_tar_path = './confusion_matrix.tar.gz' utils.get_artifact_in_minio( workflow_json, 'confusion-matrix', cm_tar_path, 'mlpipeline-ui-metadata' ) with tarfile.open(cm_tar_path) as tar_handle: file_handles = tar_handle.getmembers() assert len(file_handles) == 1 with tar_handle.extractfile(file_handles[0]) as f: cm_data = f.read() utils.add_junit_test( self._test_cases, 'confusion matrix format', (len(cm_data) > 0), 'the confusion matrix file is empty' ) ###### Delete Job ###### #TODO: add deletion when the backend API offers the interface. ###### Write out the test result in junit xml ###### utils.write_junit_xml(self._test_name, self._result, self._test_cases)
def merge_indices(new_index, new_index_name):
    path_to_folder = '/lv_local/home/sgregory/Bots/'
    command = '/lv_local/home/sgregory/indri/bin/dumpindex ' + new_index_name + ' merge ' + new_index + ' ' + '/lv_local/home/sgregory/cluewebindex'
    print("merging command:", command)
    out = run_bash_command(command)
    print("merging out command:", out)
    return new_index_name

def delete_doc_from_index(index, doc, dic, run_name=""):
    did = dic[doc]
    command = '/lv_local/home/sgregory/indri_test/bin/dumpindex ' + index + ' delete ' + did
    print("deleting command:", command)
    sys.stdout.flush()
    out = run_bash_command(command)
    print("deleting out command:", out)

def create_features_file_original(features_dir, index_path, queries_file, new_features_file, run_name=""):
    run_bash_command("rm -r " + features_dir)
    if not os.path.exists(features_dir):
        os.makedirs(features_dir)
    command = "/home/greg/auto_seo/past_winners/LTRFeatures " + queries_file + ' -stream=doc -index=' + index_path + ' -repository=' + index_path + ' -useWorkingSet=true -workingSetFile=/home/greg/auto_seo/SentenceRanking/working_set' + run_name + ' -workingSetFormat=trec'
    print(command)
    out = run_bash_command(command)
    print(out)
    # command = '/home/greg/auto_seo/past_winners/Cent ' + queries_file + ' -index=' + index_path + ' -useWorkingSet=true -workingSetFile=/home/greg/auto_seo/SentenceRanking/working_set' + run_name + ' -workingSetFormat=trec'
    # print(command)
    # out = run_bash_command(command)
    # print(out)
    run_bash_command("mv doc*_* " + features_dir)
    command = "perl /home/greg/auto_seo/past_winners/generate.pl " + features_dir + " /home/greg/auto_seo/SentenceRanking/working_set" + run_name
    print(command)
    out = run_bash_command(command)
    print(out)
    command = "mv features " + new_features_file
    print(command)
    out = run_bash_command(command)
    print(out)

def run_svm_light_model(self, test_file, model_file, fold):
    predictions_folder = "svm_light_score/" + str(fold) + "/"
    if not os.path.exists(predictions_folder):
        os.makedirs(predictions_folder)
    predictions_file = predictions_folder + os.path.basename(model_file)
    command = "./svm-predict " + test_file + " " + model_file + " " + predictions_file
    out = run_bash_command(command)
    print(out)
    return predictions_file

def learn_svm_rank_model(self, train_file, fold, C):
    models_folder = "svm_rank_models/" + str(fold) + "/"
    if not os.path.exists(models_folder):
        os.makedirs(models_folder)
    model_file = models_folder + "model_" + str(C) + ".txt"
    command = "./svm_rank_learn -c " + str(C) + " " + train_file + " " + model_file
    out = run_bash_command(command)
    # print(out)
    return model_file

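# Illustrative sketch (not from the original code): a simple sweep over the SVMrank
# regularization parameter C using learn_svm_rank_model(). The C grid and train-file
# path are assumptions; `ranker` stands for whatever object owns this method.
def example_svm_rank_c_sweep(ranker, fold):
    train_file = "features_train_fold_" + str(fold)   # hypothetical training features file
    model_files = {}
    for C in [0.01, 0.1, 1, 10]:                      # hypothetical C grid
        model_files[C] = ranker.learn_svm_rank_model(train_file, fold, C)
    return model_files
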
def check(self): """ Check the pipeline running results of the notebook sample. """ test_cases = [] test_name = self._testname + ' Sample Test' ###### Write the script exit code log ###### utils.add_junit_test( test_cases, 'test script execution', (self._exit_code == '0'), 'test script failure with exit code: ' + self._exit_code) try: with open(DEFAULT_CONFIG, 'r') as f: raw_args = yaml.safe_load(f) except yaml.YAMLError as yamlerr: raise RuntimeError('Illegal default config:{}'.format(yamlerr)) except OSError as ose: raise FileExistsError('Default config not found:{}'.format(ose)) else: test_timeout = raw_args['test_timeout'] if self._run_pipeline: experiment = self._experiment_name ###### Initialization ###### client = Client(host=self._host) ###### Get experiments ###### experiment_id = client.get_experiment( experiment_name=experiment).id ###### Get runs ###### list_runs_response = client.list_runs(page_size=RUN_LIST_PAGE_SIZE, experiment_id=experiment_id) ###### Check all runs ###### for run in list_runs_response.runs: run_id = run.id response = client.wait_for_run_completion(run_id, test_timeout) succ = (response.run.status.lower() == 'succeeded') utils.add_junit_test(test_cases, 'job completion', succ, 'waiting for job completion failure') ###### Output Argo Log for Debugging ###### workflow_json = client._get_workflow_json(run_id) workflow_id = workflow_json['metadata']['name'] print("Argo Workflow Name: ", workflow_id) argo_log, _ = utils.run_bash_command( 'argo logs {} -n {}'.format(workflow_id, self._namespace)) print("=========Argo Workflow Log=========") print(argo_log) if not succ: utils.write_junit_xml(test_name, self._result, test_cases) exit(1) ###### Write out the test result in junit xml ###### utils.write_junit_xml(test_name, self._result, test_cases)
def main():
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Initialization ######
    client = Client(namespace=args.namespace)

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file', os.path.exists(args.input),
                         'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        print('Error: job not found.')
        exit(1)

    ###### Create Experiment ######
    experiment_name = args.testname + ' sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = args.testname + '_sample'
    params = {}
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    start_time = datetime.now()
    response = client.wait_for_run_completion(run_id, 1200)
    succ = (response.run.status.lower() == 'succeeded')
    end_time = datetime.now()
    elapsed_time = (end_time - start_time).seconds
    utils.add_junit_test(test_cases, 'job completion', succ,
                         'waiting for job completion failure', elapsed_time)

    ###### Output Argo Log for Debugging ######
    workflow_json = client._get_workflow_json(run_id)
    workflow_id = workflow_json['metadata']['name']
    argo_log, _ = utils.run_bash_command(
        'argo logs -n {} -w {}'.format(args.namespace, workflow_id))
    print("=========Argo Workflow Log=========")
    print(argo_log)

    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit(1)

    ###### Delete Job ######
    # TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)

def learn_svm_light_model(self, train_file, fold, C, number_of_queries):
    models_folder = "svm_light_models/" + str(fold) + "/"
    if not os.path.exists(models_folder):
        os.makedirs(models_folder)
    model_file = models_folder + "model_" + str(C) + ".txt"
    # command = "./svm_learn -z p -c " + str(C) + " -m 70000 " + train_file + " " + model_file
    command = "./svm-train -c " + str(C) + " " + train_file + " " + model_file
    out = run_bash_command(command)
    print(out)
    return model_file

def init_top_doc_vectors(top_docs, doc_ids, model):
    top_docs_vectors = {}
    for query in top_docs:
        docs = top_docs[query]
        command = "~/jdk1.8.0_181/bin/java -Djava.library.path=/home/greg/indri-5.6/swig/obj/java/ -cp /home/greg/auto_seo/scripts/indri.jar DocStems ~/mergedindex \"" + " ".join([doc_ids[d.rstrip()].strip() for d in docs]) + "\""
        print(command)
        print(run_bash_command(command))
        top_docs_vectors[query] = []
        with open("/home/greg/auto_seo/SentenceRanking/docsForVectors") as docs:
            for i, doc in enumerate(docs):
                top_docs_vectors[query].append(get_document_vector(doc, model))
    return top_docs_vectors

def create_model_LambdaMart(self, number_of_trees, number_of_leaves, train_file, fold, test=False):
    if test:
        add = "test"
    else:
        add = ""
    # model_path = self.model_base_path + str(fold) + "/" + add + 'model_' + str(number_of_trees) + "_" + str(number_of_leaves)
    model_path = "lm_models/" + str(fold) + "/" + add + 'model_' + str(number_of_trees) + "_" + str(number_of_leaves)
    if not os.path.exists(os.path.dirname(model_path)):
        os.makedirs(os.path.dirname(model_path))
    command = self.java_path + ' -jar ' + self.jar_path + ' -train ' + train_file + ' -ranker 6 -metric2t NDCG@5' \
              ' -tree ' + str(number_of_trees) + ' -leaf ' + str(number_of_leaves) + ' -save ' + model_path
    print("command = ", command)
    run_bash_command(command)
    return model_path

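# Illustrative sketch (not from the original code): a grid over LambdaMART tree/leaf
# settings using create_model_LambdaMart() and run_model() above. The grids and the
# validation-file name are assumptions; `ranker` stands for the object that owns these
# methods and carries java_path / jar_path.
def example_lambdamart_grid(ranker, train_file, validation_file, fold):
    score_files = {}
    for trees in [250, 500]:        # hypothetical tree grid
        for leaves in [5, 10]:      # hypothetical leaf grid
            model_path = ranker.create_model_LambdaMart(trees, leaves, train_file, fold)
            score_files[(trees, leaves)] = ranker.run_model(validation_file, fold, trees, leaves, model_path)
    return score_files
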
def create_tfidf_features_and_features_file(sentence_working_set, features_file, features_dir, index_path,
                                            sentence_file, top_doc_files, query, past_winners_file):
    command = "~/jdk1.8.0_181/bin/java -Djava.library.path=/home/greg/indri-5.6/swig/obj/java/ -cp indri.jar Main " + index_path + " " + sentence_file + " " + top_doc_files + " " + past_winners_file + " " + query
    print(run_bash_command(command))
    command = "mv doc*_* " + features_dir
    run_bash_command(command)
    command = "perl " + params.sentence_feature_creator + " " + features_dir + " " + sentence_working_set
    run_bash_command(command)
    command = "mv features " + features_file
    run_bash_command(command)

def merge_indices(new_index, run_name="", new_index_name=""):
    path_to_folder = '/home/greg/auto_seo'
    if new_index_name == "":
        new_index_name = path_to_folder + '/new_merged_index' + run_name
    # print("deleting old merged index repository")
    # command = "rm -r " + path_to_folder + '/new_merged_index*'
    # print("delete command = ", command)
    # run_bash_command(command)
    # print("delete finished")
    command = '/home/greg/indri_test/bin/dumpindex ' + new_index_name + ' merge ' + new_index + ' ' + params.corpus_path_56
    print("merging command:", command)
    sys.stdout.flush()
    out = run_bash_command(command)
    print("merging out command:", out)
    # run_command(command)
    return new_index_name

def main():
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Write the script exit code log ######
    utils.add_junit_test(test_cases, 'test script execution', (args.exit_code == '0'),
                         'test script failure with exit code: ' + args.exit_code)

    if args.experiment is not None:
        ###### Initialization ######
        host = 'ml-pipeline.%s.svc.cluster.local:8888' % args.namespace
        client = Client(host=host)

        ###### Get experiments ######
        experiment_id = client.get_experiment(experiment_name=args.experiment).id

        ###### Get runs ######
        list_runs_response = client.list_runs(page_size=1000, experiment_id=experiment_id)

        ###### Check all runs ######
        for run in list_runs_response.runs:
            run_id = run.id
            response = client.wait_for_run_completion(run_id, 1200)
            succ = (response.run.status.lower() == 'succeeded')
            utils.add_junit_test(test_cases, 'job completion', succ,
                                 'waiting for job completion failure')

            ###### Output Argo Log for Debugging ######
            workflow_json = client._get_workflow_json(run_id)
            workflow_id = workflow_json['metadata']['name']
            argo_log, _ = utils.run_bash_command(
                'argo logs -n {} -w {}'.format(args.namespace, workflow_id))
            print("=========Argo Workflow Log=========")
            print(argo_log)

            if not succ:
                utils.write_junit_xml(test_name, args.result, test_cases)
                exit(1)

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)

def main():
    args = parse_arguments()
    test_cases = []
    test_name = args.testname + ' Sample Test'

    ###### Initialization ######
    client = Client(namespace=args.namespace)

    ###### Get experiments ######
    list_experiments_response = client.list_experiments(page_size=100)
    for experiment in list_experiments_response.experiments:
        if experiment.name == args.experiment:
            experiment_id = experiment.id

    ###### Get runs ######
    import kfp_run
    resource_reference_key_type = kfp_run.models.api_resource_type.ApiResourceType.EXPERIMENT
    resource_reference_key_id = experiment_id
    list_runs_response = client.list_runs(page_size=1000,
                                          resource_reference_key_type=resource_reference_key_type,
                                          resource_reference_key_id=resource_reference_key_id)

    ###### Check all runs ######
    for run in list_runs_response.runs:
        run_id = run.id
        response = client.wait_for_run_completion(run_id, 1200)
        succ = (response.run.status.lower() == 'succeeded')
        utils.add_junit_test(test_cases, 'job completion', succ,
                             'waiting for job completion failure')

        ###### Output Argo Log for Debugging ######
        workflow_json = client._get_workflow_json(run_id)
        workflow_id = workflow_json['metadata']['name']
        argo_log, _ = utils.run_bash_command(
            'argo logs -n {} -w {}'.format(args.namespace, workflow_id))
        print("=========Argo Workflow Log=========")
        print(argo_log)

        if not succ:
            utils.write_junit_xml(test_name, args.result, test_cases)
            exit(1)

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)

def add_docs_to_index(index, run_name=""):
    """
    Parse the given trectext file and add its documents to the index.
    """
    path_to_folder = '/lv_local/home/sgregory/auto_seo'
    indri_build_index = '/lv_local/home/sgregory/indri_test/bin/IndriBuildIndex'
    corpus_path = params.new_trec_text_file + run_name
    corpus_class = 'trectext'
    memory = '1G'
    stemmer = 'krovetz'
    os.popen('mkdir -p ' + path_to_folder)
    if not os.path.exists(path_to_folder + "/index/"):
        os.makedirs(path_to_folder + "/index/")
    command = indri_build_index + ' -corpus.path=' + corpus_path + ' -corpus.class=' + corpus_class + ' -index=' + index + ' -memory=' + memory + ' -stemmer.name=' + stemmer
    print(command)
    out = run_bash_command(command)
    print(out)
    return index

def check(self):
    test_cases = []
    test_name = self._testname + ' Sample Test'

    ###### Write the script exit code log ######
    utils.add_junit_test(test_cases, 'test script execution', (self._exit_code == '0'),
                         'test script failure with exit code: ' + self._exit_code)

    if self._experiment is not None:  # Bypassing dsl type check sample.
        ###### Initialization ######
        host = 'ml-pipeline.%s.svc.cluster.local:8888' % self._namespace
        client = Client(host=host)

        ###### Get experiments ######
        experiment_id = client.get_experiment(experiment_name=self._experiment).id

        ###### Get runs ######
        list_runs_response = client.list_runs(page_size=_RUN_LIST_PAGE_SIZE,
                                              experiment_id=experiment_id)

        ###### Check all runs ######
        for run in list_runs_response.runs:
            run_id = run.id
            response = client.wait_for_run_completion(run_id, _TEST_TIMEOUT)
            succ = (response.run.status.lower() == 'succeeded')
            utils.add_junit_test(test_cases, 'job completion', succ,
                                 'waiting for job completion failure')

            ###### Output Argo Log for Debugging ######
            workflow_json = client._get_workflow_json(run_id)
            workflow_id = workflow_json['metadata']['name']
            argo_log, _ = utils.run_bash_command(
                'argo logs -n {} -w {}'.format(self._namespace, workflow_id))
            print("=========Argo Workflow Log=========")
            print(argo_log)

            if not succ:
                utils.write_junit_xml(test_name, self._result, test_cases)
                exit(1)

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, self._result, test_cases)

def create_index(trec_text_file, run_name=""):
    """
    Parse the trectext file given, and create an index.
    """
    path_to_folder = '/home/greg/auto_seo'
    indri_build_index = '/home/greg/indri_test/bin/IndriBuildIndex'
    corpus_path = trec_text_file
    corpus_class = 'trectext'
    memory = '1G'
    index = path_to_folder + "/index/new_index" + run_name
    stemmer = 'krovetz'
    os.popen('mkdir -p ' + path_to_folder)
    if not os.path.isdir(path_to_folder + "/index/"):
        os.makedirs(path_to_folder + "/index/")
    command = indri_build_index + ' -corpus.path=' + corpus_path + ' -corpus.class=' + corpus_class + ' -index=' + index + ' -memory=' + memory + ' -stemmer.name=' + stemmer
    print(command)
    out = run_bash_command(command)
    print(out)
    return index

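# Illustrative sketch (not from the original code): build an index from a trectext file
# with create_index() and fold it into the main corpus with merge_indices(). The trectext
# path and run name are assumptions; merge_indices() takes the corpus location from
# params.corpus_path_56.
def example_index_and_merge(run_name="_example"):
    new_index = create_index("updated_docs.trectext", run_name)   # hypothetical trectext file
    merged_index = merge_indices(new_index, run_name)
    return merged_index
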
def run_model_on_test(self, test_file, fold):
    trees, leaves = self.chosen_model_per_fold[fold]
    score_file = "lm_score/" + str(fold) + "/score" + str(trees) + "_" + str(leaves)
    if not os.path.exists(os.path.dirname(score_file)):
        os.makedirs(os.path.dirname(score_file))
    run_bash_command('touch ' + score_file)
    model_path = "lm_models/" + str(fold) + "/model_" + str(trees) + "_" + str(leaves)
    command = self.java_path + " -jar " + self.jar_path + " -load " + model_path + " -rank " + test_file + " -score " + score_file
    run_bash_command(command)
    return score_file

def create_features_file_sentence_exp(features_dir, index_path, queries_file, new_features_file, working_set):
    run_bash_command("rm -r " + features_dir)
    if not os.path.exists(features_dir):
        os.makedirs(features_dir)
    command = params.ltr_features_script + " " + queries_file + ' -stream=doc -index=' + index_path + ' -repository=' + index_path + ' -useWorkingSet=true -workingSetFile=' + working_set + ' -workingSetFormat=trec'
    print(command)
    out = run_bash_command(command)
    print(out)
    run_bash_command("mv doc*_* " + features_dir)
    command = "perl " + params.features_generator_script_path + " " + features_dir + " " + working_set
    print(command)
    out = run_bash_command(command)
    print(out)
    command = "mv features_ " + new_features_file
    print(command)
    out = run_bash_command(command)
    print(out)

def create_features_file(features_dir, index_path, queries_file, new_features_file, add_remove_file,
                         run_name, working_set):
    run_bash_command("rm -r " + features_dir)
    if not os.path.exists(features_dir):
        os.makedirs(features_dir)
    # command = params.ltr_features_script + " " + queries_file + ' -stream=doc -index=' + index_path + ' -repository=' + index_path + ' -useWorkingSet=true -workingSetFile=' + params.working_set_file + run_name + ' -workingSetFormat=trec'
    command = " java -Djava.library.path=/home/greg/indri-5.6/swig/obj/java/ -cp /home/greg/auto_seo/scripts/indri.jar LTRFeaturesCreator " + add_remove_file + " " + working_set + " docIDs"
    print(command)
    out = run_bash_command(command)
    print(out)
    # command = params.cent_script + ' ' + queries_file + ' -index=' + index_path + ' -useWorkingSet=true -workingSetFile=' + params.working_set_file + run_name + ' -workingSetFormat=trec'
    # print(command)
    # out = run_bash_command(command)
    # print(out)
    run_bash_command("mv doc*_* " + features_dir)
    command = "perl " + params.features_generator_script_path + " " + features_dir + " " + working_set + " " + run_name
    print(command)
    out = run_bash_command(command)
    print(out)

def main():
    args = parse_arguments()
    test_cases = []
    test_name = 'Kubeflow Sample Test'

    ###### Initialization ######
    client = Client()

    ###### Check Input File ######
    utils.add_junit_test(test_cases, 'input generated yaml file', os.path.exists(args.input),
                         'yaml file is not generated')
    if not os.path.exists(args.input):
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit()

    ###### Create Experiment ######
    experiment_name = 'kubeflow sample experiment'
    response = client.create_experiment(experiment_name)
    experiment_id = response.id
    utils.add_junit_test(test_cases, 'create experiment', True)

    ###### Create Job ######
    job_name = 'kubeflow_sample'
    params = {
        'output': args.output,
        'project': 'ml-pipeline-test',
        'evaluation': 'gs://ml-pipeline-dataset/sample-test/flower/eval15.csv',
        'train': 'gs://ml-pipeline-dataset/sample-test/flower/train30.csv',
        'hidden-layer-size': '10,5',
        'steps': '5'
    }
    response = client.run_pipeline(experiment_id, job_name, args.input, params)
    run_id = response.id
    utils.add_junit_test(test_cases, 'create pipeline run', True)

    ###### Monitor Job ######
    start_time = datetime.now()
    response = client.wait_for_run_completion(run_id, 1200)
    succ = (response.run.status.lower() == 'succeeded')
    end_time = datetime.now()
    elapsed_time = (end_time - start_time).seconds
    utils.add_junit_test(test_cases, 'job completion', succ,
                         'waiting for job completion failure', elapsed_time)
    if not succ:
        utils.write_junit_xml(test_name, args.result, test_cases)
        exit()

    ###### Output Argo Log for Debugging ######
    workflow_json = client._get_workflow_json(run_id)
    workflow_id = workflow_json['metadata']['name']
    # TODO: remove the namespace dependency or make it configurable.
    argo_log, _ = utils.run_bash_command(
        'argo logs -n kubeflow -w {}'.format(workflow_id))
    print("=========Argo Workflow Log=========")
    print(argo_log)

    ###### Validate the results ######
    # The confusion matrix should show three columns for the flower data:
    # target, predicted, count.
    cm_tar_path = './confusion_matrix.tar.gz'
    cm_filename = 'mlpipeline-ui-metadata.json'
    utils.get_artifact_in_minio(workflow_json, 'confusionmatrix', cm_tar_path)
    tar_handler = tarfile.open(cm_tar_path)
    tar_handler.extractall()
    with open(cm_filename, 'r') as f:
        cm_data = json.load(f)
    utils.add_junit_test(test_cases, 'confusion matrix format',
                         (len(cm_data['outputs'][0]['schema']) == 3),
                         'the column number of the confusion matrix output is not equal to three')

    ###### Delete Job ######
    # TODO: add deletion when the backend API offers the interface.

    ###### Write out the test result in junit xml ######
    utils.write_junit_xml(test_name, args.result, test_cases)

def move_feature_file(feature_file, run_name):
    command = 'mv ' + feature_file + ' ' + feature_file + run_name
    run_bash_command(command)
    print("feature file moved")