def run_model(test_file,home_path,java_path,jar_path,score_file,model_path):
    """
    Scores a feature file with a saved model by running a jar through the shell.

    The java binary is resolved as ``home_path + "/" + java_path + "/bin/java"``
    and the jar is invoked with ``-load model_path -rank test_file -score score_file``.
    Arguments are concatenated into the shell command as-is (no quoting).

    :param test_file: feature file handed to ``-rank``
    :param home_path: prefix under which the java installation lives
    :param java_path: java installation directory, relative to home_path
    :param jar_path: jar to execute
    :param score_file: file handed to ``-score``; its directory is created if missing
    :param model_path: model handed to ``-load``
    :return: score_file, unchanged
    """
    java_binary = home_path + "/" + java_path + "/bin/java"
    # dirname is '' for a bare file name and os.makedirs('') raises, so only
    # create the directory when there actually is one. exist_ok covers the case
    # where another process creates it between the check and the call.
    score_dir = os.path.dirname(score_file)
    if score_dir and not os.path.exists(score_dir):
        os.makedirs(score_dir, exist_ok=True)
    # Make sure the score file exists before the jar is started.
    run_bash_command('touch ' + score_file)
    command = java_binary + " -jar " + jar_path + " -load " + model_path + " -rank " + test_file + " -score " + score_file
    out = run_bash_command(command)
    print(str(out))
    return score_file
Пример #2
0
def create_features_file(features_dir, index_path, queries_file,
                         new_features_file, working_set_file, scripts_path):
    """
    Creates  a feature file via a given index and a given working set file

    Steps, all executed through the shell from the current working directory:
      1. wipe and recreate features_dir
      2. run ``scripts_path + "LTRFeatures"`` on queries_file against index_path,
         restricted to the working set
      3. move the ``doc*_*`` files it leaves in the working directory into features_dir
      4. run ``generate.pl`` (from scripts_path) on features_dir and the working set
      5. move the resulting ``features`` file to new_features_file and ``featureID``
         next to it

    :param features_dir: scratch directory for the per-feature files (deleted first!)
    :param index_path: index used both as ``-index`` and ``-repository``
    :param queries_file: queries file passed to LTRFeatures
    :param new_features_file: destination of the final features file
    :param working_set_file: working set file in trec format
    :param scripts_path: directory holding LTRFeatures and generate.pl; it is
        concatenated directly, so it must carry its trailing separator
    :return: new_features_file, unchanged
    """
    run_bash_command("rm -r " + features_dir)
    if not os.path.exists(features_dir):
        os.makedirs(features_dir, exist_ok=True)
    # dirname is '' when new_features_file is a bare file name; os.makedirs('')
    # raises, so only create (and later move into) a real directory.
    output_dir = os.path.dirname(new_features_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    command = scripts_path + "LTRFeatures " + queries_file + ' -stream=doc -index=' + index_path + ' -repository=' + index_path + ' -useWorkingSet=true -workingSetFile=' + working_set_file + ' -workingSetFormat=trec'
    print(command)
    out = run_bash_command(command)
    print(out)
    run_bash_command("mv doc*_* " + features_dir)

    command = "perl " + scripts_path + "generate.pl " + features_dir + " " + working_set_file
    print(command)
    out = run_bash_command(command)
    print(out)

    command = "mv features " + new_features_file
    print(command)
    out = run_bash_command(command)
    print(out)

    # With no directory component the destination is the working directory,
    # where featureID is expected to be already, so there is nothing to move.
    if output_dir:
        run_bash_command("mv featureID " + output_dir)
    return new_features_file
Пример #3
0
def feature_creation_parallel(raw_dataset_file, ranked_lists, doc_texts, top_doc_index, ref_doc_index, doc_tfidf_vectors_dir, tfidf_sentence_dir, queries, output_feature_files_dir,output_final_features_dir,workingset_file):
    """
    Builds the feature files for every query and assembles the final features file.

    The raw dataset is read, a working set is written from it, ``create_features``
    is dispatched over the query ids via ``list_multiprocessing`` (presumably one
    call per query id -- confirm against that helper), and finally
    ``generateSentences.pl`` is run and the ``features`` file it leaves in the
    working directory is moved into output_final_features_dir.

    :param raw_dataset_file: file parsed by ``read_raw_ds``
    :param queries: iterable of query ids (iterating a dict yields its keys)
    :param output_feature_files_dir: directory for the per-feature files; created if missing
    :param output_final_features_dir: destination directory of ``features``; created if missing
    :param workingset_file: working set file written by ``create_ws``
    The remaining parameters are forwarded untouched to ``create_features``.
    """
    global word_embd_model
    query_ids = list(queries)
    for directory in (output_feature_files_dir, output_final_features_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    dataset = read_raw_ds(raw_dataset_file)
    create_ws(dataset, workingset_file, ref_doc_index)

    # Everything except the query id is fixed up front; the id is supplied per task.
    per_query_job = partial(
        create_features,
        dataset,
        ranked_lists,
        doc_texts,
        top_doc_index,
        ref_doc_index,
        doc_tfidf_vectors_dir,
        tfidf_sentence_dir,
        queries,
        output_feature_files_dir,
    )
    # Leave one core free for the parent process.
    list_multiprocessing(query_ids, per_query_job, workers=cpu_count() - 1)

    run_bash_command("perl generateSentences.pl " + output_feature_files_dir + " " + workingset_file)
    run_bash_command("mv features " + output_final_features_dir)
Пример #4
0
def create_features_file_diif(features_dir, base_index_path, new_index_path,
                              queries_file, new_features_file,
                              working_set_file, scripts_path):
    """
    Creates  a feature file via a given index and a given working set file

    Variant that computes features with the ``LTRFeatures`` class from
    ``seo_summarization.jar`` over two indices. The java binary, the swig library
    path, the stop-word list and the queries text file are hard-coded in the
    command; ``queries_file`` is accepted but not used.

    Steps, all executed through the shell from the current working directory:
      1. wipe and recreate features_dir
      2. run LTRFeatures, which is given features_dir as its last argument
      3. run ``generate.pl`` (from scripts_path) on features_dir and the working set
      4. move the resulting ``features`` file to new_features_file and ``featureID``
         next to it

    :param features_dir: scratch directory for the per-feature files (deleted first!)
    :param base_index_path: first index passed to LTRFeatures
    :param new_index_path: second index passed to LTRFeatures
    :param queries_file: unused, kept for interface compatibility
    :param new_features_file: destination of the final features file
    :param working_set_file: working set file
    :param scripts_path: directory holding generate.pl; concatenated directly, so
        it must carry its trailing separator
    :return: new_features_file, unchanged
    """
    run_bash_command("rm -r " + features_dir)
    if not os.path.exists(features_dir):
        os.makedirs(features_dir, exist_ok=True)
    # dirname is '' when new_features_file is a bare file name; os.makedirs('')
    # raises, so only create (and later move into) a real directory.
    output_dir = os.path.dirname(new_features_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    command = "~/jdk1.8.0_181/bin/java -Djava.library.path=/lv_local/home/sgregory/indri-5.6/swig/obj/java/ -cp seo_summarization.jar LTRFeatures " + base_index_path + " " + new_index_path + " data/stopWordsList data/working_comp_queries.txt " + working_set_file + " " + features_dir
    print(command)
    out = run_bash_command(command)
    print(out)

    command = "perl " + scripts_path + "generate.pl " + features_dir + " " + working_set_file
    print(command)
    out = run_bash_command(command)
    print(out)

    command = "mv features " + new_features_file
    print(command)
    out = run_bash_command(command)
    print(out)

    # With no directory component the destination is the working directory,
    # where featureID is expected to be already, so there is nothing to move.
    if output_dir:
        run_bash_command("mv featureID " + output_dir)
    return new_features_file
Пример #5
0
def run_svm_rank_model(test_file, model_file, predictions_folder):
    """
    Runs ``./svm_rank_classify`` on a test file and returns the predictions path.

    The predictions file is named after the model file and placed inside
    predictions_folder, which is created if it does not exist.

    :param test_file: feature file to classify
    :param model_file: model file; its base name becomes the predictions file name
    :param predictions_folder: directory receiving the predictions (with or
        without a trailing separator)
    :return: path of the predictions file
    """
    # An empty folder means "current directory": nothing to create, and
    # os.makedirs('') would raise.
    if predictions_folder and not os.path.exists(predictions_folder):
        os.makedirs(predictions_folder, exist_ok=True)
    # os.path.join inserts the separator only when it is missing, so a folder
    # given without a trailing slash no longer yields "<folder><model name>"
    # outside the directory that was just created.
    predictions_file = os.path.join(predictions_folder, os.path.basename(model_file))
    command = "./svm_rank_classify " + test_file + " " + model_file + " " + predictions_file
    print("##Running command: "+command+"##")
    out = run_bash_command(command)
    print("Output of ranking command: "+str(out),flush=True)
    return predictions_file
def merge_indices(merged_index,new_index_name, base_index, home_path ='/home/greg/', indri_path ="indri_test"):
    """
    merges two different indri indices into one

    Runs ``<home_path>/<indri_path>/bin/dumpindex <merged_index> merge
    <new_index_name> <base_index>`` through the shell.

    :param merged_index: path of the index produced by the merge; its parent
        directory is created if missing
    :param new_index_name: first index to merge
    :param base_index: second index to merge
    :param home_path: prefix under which the indri installation lives
    :param indri_path: indri installation directory, relative to home_path
    :return: new_index_name (NOTE(review): the merged result lives at
        merged_index, not at the returned path -- confirm callers expect this)
    """
    # dirname is '' for a bare name and os.makedirs('') raises, so only create
    # the parent directory when there is one.
    parent_dir = os.path.dirname(merged_index)
    if parent_dir and not os.path.exists(parent_dir):
        os.makedirs(parent_dir, exist_ok=True)
    command = home_path+"/"+indri_path+'/bin/dumpindex '+merged_index +' merge ' + new_index_name + ' ' + base_index
    print("##merging command:",command+"##",flush=True)
    out=run_bash_command(command)
    print("merging command output:"+str(out),flush=True)
    return new_index_name
def run_summarization_model(script_file,model_file,input_file,output_file,**kwargs):
    """
     Runs a summarization (translate) script through the shell.

     cmd example:
     nohup python ~/OpenNMT-py/translate.py --replace_unk  -beam_size 10 --model ~/OpenNMT-py/sum_transformer_model_acc_57.25_ppl_9.22_e16.pt
      --src input_transformer.txt --output transformer_real_par2.txt
      --batch_size 1  -min_length 1  -gpu 0 &

     :param script_file: python script to execute
     :param model_file: value of ``--model``
     :param input_file: value of ``--src``
     :param output_file: value of ``--output``
     :param kwargs: extra options, each appended as ``--<key> <value>``; values
         may be of any type and are converted with ``str``
     :return: None
    """
    command = "python "+script_file+" --replace_unk  -beam_size 10 --model "+model_file+" --src "+input_file+" --output "+output_file+" --batch_size 1 -gpu 0 "
    # str(value) lets callers pass numbers (e.g. min_length=1) instead of
    # failing with a TypeError on str + int concatenation.
    command += "".join("--" + key + " " + str(value) + " " for key, value in kwargs.items())
    print("##Running summarization command: "+command+"##",flush=True)
    out = run_bash_command(command)
    print("Summarization output= "+str(out),flush=True)
def create_index(trec_text_file,index_path,new_index_name,home_path = '/home/greg/',indri_path = "indri_test"):
    """
    Builds an index from a trectext corpus file with IndriBuildIndex.

    The index is written to ``index_path + "/" + new_index_name`` using the
    krovetz stemmer and 1G of memory. Both ``index_path`` and
    ``home_path + "/" + index_path`` are created when they do not exist.

    :param trec_text_file: corpus file in trectext format
    :param index_path: directory that will contain the index
    :param new_index_name: name of the index inside index_path
    :param home_path: prefix under which the indri installation lives
    :param indri_path: indri installation directory, relative to home_path
    :return: path of the index
    """
    builder = home_path + '/' + indri_path + '/bin/IndriBuildIndex'
    index = index_path + "/" + new_index_name

    for directory in (index_path, home_path + "/" + index_path):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # (flag, value) pairs, emitted in this order on the command line.
    settings = (
        (' -corpus.path=', trec_text_file),
        (' -corpus.class=', 'trectext'),
        (' -index=', index),
        (' -memory=', '1G'),
        (' -stemmer.name=', 'krovetz'),
    )
    command = builder + "".join(flag + value for flag, value in settings)

    print("##Running IndriBuildIndex command ="+command+"##",flush=True)
    result = run_bash_command(command)
    print("IndriBuildIndex output:"+str(result),flush=True)
    return index
def create_features_file_diff(features_dir, base_index_path, new_index_path, new_features_file, working_set_file, scripts_path,java_path,swig_path,stopwords_file,queries_text_file,home_path):
    """
    Creates  a feature file via a given index and a given working set file

    Computes features with the ``LTRFeatures`` class from ``seo_indri_utils.jar``
    over two indices, then post-processes them with ``generate.pl``.

    Steps, all executed through the shell from the current working directory:
      1. wipe and recreate features_dir
      2. run LTRFeatures, which is given features_dir as its last argument
      3. run ``generate.pl`` (from scripts_path) on features_dir and the working set
      4. move the resulting ``features`` file to new_features_file and ``featureID``
         next to it

    :param features_dir: scratch directory for the per-feature files (deleted first!)
    :param base_index_path: first index passed to LTRFeatures
    :param new_index_path: second index passed to LTRFeatures
    :param new_features_file: destination of the final features file
    :param working_set_file: working set file
    :param scripts_path: directory holding generate.pl; concatenated directly, so
        it must carry its trailing separator
    :param java_path: java installation directory; the binary is
        ``home_path + java_path + "/bin/java"`` (no separator is inserted
        between home_path and java_path)
    :param swig_path: value of ``-Djava.library.path``
    :param stopwords_file: stop-word list passed to LTRFeatures
    :param queries_text_file: queries text file passed to LTRFeatures
    :param home_path: prefix of java_path
    :return: new_features_file, unchanged
    """
    run_bash_command("rm -r "+features_dir)
    if not os.path.exists(features_dir):
        os.makedirs(features_dir, exist_ok=True)
    # dirname is '' when new_features_file is a bare file name; os.makedirs('')
    # raises, so only create (and later move into) a real directory.
    output_dir = os.path.dirname(new_features_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)

    command = home_path+java_path+"/bin/java -Djava.library.path="+swig_path+ " -cp seo_indri_utils.jar LTRFeatures "+base_index_path+" "+new_index_path+" "+stopwords_file+" "+queries_text_file+" "+working_set_file+" "+features_dir
    print(command)
    out = run_bash_command(command)
    print(out)

    command = "perl " + scripts_path + "generate.pl " + features_dir + " " + working_set_file
    print(command)
    out = run_bash_command(command)
    print(out)

    command = "mv features "+new_features_file
    print(command)
    out = run_bash_command(command)
    print(out)

    # With no directory component the destination is the working directory,
    # where featureID is expected to be already, so there is nothing to move.
    if output_dir:
        run_bash_command("mv featureID "+output_dir)
    return new_features_file
Пример #10
0
def create_sentence_vector_files(output_dir, raw_ds_file, index_path,java_path,swig_path,home_path):
    """
    Runs the ``PrepareTFIDFVectorsSentences`` class from ``seo_indri_utils.jar``.

    The java binary is ``home_path + java_path + "/bin/java"`` (no separator is
    inserted between the two) and the class receives index_path, raw_ds_file and
    output_dir, in that order. The command and its output are logged at INFO level.

    :param output_dir: last argument passed to the java class
    :param raw_ds_file: second argument passed to the java class
    :param index_path: first argument passed to the java class
    :param java_path: java installation directory, appended to home_path
    :param swig_path: value of ``-Djava.library.path``
    :param home_path: prefix of java_path
    """
    java_binary = home_path + java_path + "/bin/java"
    class_arguments = index_path + " " + raw_ds_file + " " + output_dir
    command = (java_binary
               + " -Djava.library.path=" + swig_path
               + " -cp seo_indri_utils.jar PrepareTFIDFVectorsSentences "
               + class_arguments)
    logger.info("##Running command: " + command + "##")
    result = run_bash_command(command)
    logger.info("Command output: " + str(result))
Пример #11
0
def create_sentence_vector_files(output_dir, raw_ds_file, index_path):
    """
    Runs the ``PrepareTFIDFVectorsSentences`` class from ``seo_summarization.jar``.

    The java binary and the swig library path are hard-coded. The class receives
    index_path, raw_ds_file and output_dir, in that order. The command and its
    output are logged at INFO level.

    :param output_dir: last argument passed to the java class
    :param raw_ds_file: second argument passed to the java class
    :param index_path: first argument passed to the java class
    """
    # Fixed part of the invocation; the leading space is kept as in the
    # command that has always been issued.
    java_invocation = (" ~/jdk1.8.0_181/bin/java"
                       " -Djava.library.path=/lv_local/home/sgregory/indri-5.6/swig/obj/java/"
                       " -cp seo_summarization.jar PrepareTFIDFVectorsSentences ")
    class_arguments = index_path + " " + raw_ds_file + " " + output_dir
    command = java_invocation + class_arguments
    logger.info("##Running command: " + command + "##")
    result = run_bash_command(command)
    logger.info("Command output: " + str(result))
Пример #12
0
     ref_index
 ) + " --top_docs_index=" + str(
     number_of_top_docs
 ) + " --doc_tfidf_dir=asr_tfidf_vectors/ --sentences_tfidf_dir=sentences_tfidf_vectors/ --queries_file=data/queries_seo_exp.xml --scores_dir=scores_bot_weighted_0_" + str(
     ref_index
 ) + "/ --trec_file=trecs/trec_file_original_sorted.txt --sentence_trec_file=trecs/bot_weighted_0_sentence_trec_file_" + str(
     ref_index
 ) + ".txt --output_feature_files_dir=Features_bot_weighted_0_" + str(
     ref_index
 ) + "/ --output_final_feature_file_dir=features_bot_weighted_0_" + str(
     ref_index
 ) + "/ --trectext_file=data/documents.trectext --new_trectext_file=data/bot_weighted_0_documents_" + str(
     ref_index
 ) + ".trectext --model_file=bot_exp_utils/word2vec_model --svm_model_file=bot_exp_utils/weighted_0 --workingset_file=data/workingset_bot_pairs_" + str(
     ref_index)
 out = run_bash_command(bot_command)
 print(out, flush=True)
 rerank_command = "python reranking_process.py --mode=all --features_dir=Features_bot_weighted_0_post_" + str(
     ref_index
 ) + "/ --merged_index=merged_indices/merged_index/ --queries_file=data/queries_seo_exp.xml --new_features_file=final_features_dir/features_bot_weighted_0_post_" + str(
     ref_index
 ) + " --workingset_file=data/workingset_original --scripts_path=scripts/ --java_path=jdk1.8.0_181 --jar_path=scripts/RankLib.jar --score_file=scores/scores_bot_weighted_0_post_" + str(
     ref_index
 ) + ".txt --model_file=rank_models/model_bot --trec_file=trecs/trec_file_bot_weighted_0_post_" + str(
     ref_index
 ) + " --trectext_file=data/bot_weighted_0_documents_" + str(
     ref_index
 ) + ".trectext --home_path=~/ --base_index=merged_indices/merged_index --new_index=new_indices/all_doc_bot_weighted_0_" + str(
     ref_index) + " --indri_path=work_indri"
 out = run_bash_command(rerank_command)
 print(out, flush=True)
Пример #13
0
    ) + ".txt --trec_file=trecs/trec_file_original_sorted.txt --number_of_top_docs=3 --trectext_file=data/documents.trectext --new_trectext_file=data/updated_documents_" + str(
        ref_index
    ) + ".trectext --new_ws_file=data/updated_workingset --model_file=/lv_local/home/sgregory/textGen/summarization/wiki.en.bin"

    # rerank_command = "python ranking_process.py --mode=all --features_dir=Features_post_"+str(ref_index)+" --merged_index=merged_indices/merged_index_post --queries_file=data/queries_seo_exp.xml --new_features_file=final_features_dir/features_post_"+str(ref_index)+" --workingset_file=data/workingset_original --scripts_path=scripts/ --java_path=jdk1.8.0_181 --jar_path=scripts/RankLib.jar --score_file=scores/scores_post_"+str(ref_index)+".txt --model_file=rank_models/model_bot --trec_file=trecs/trec_file_post_"+str(ref_index)+" --trectext_file=data/updated_documents_"+str(ref_index)+".trectext --home_path=~/ --base_index=~/cluewebindex --new_index=new_indices/all_doc_index_post_"+str(ref_index)+" --indri_path=work_indri"
    rerank_command = "python reranking_process.py --mode=all --features_dir=Features_post_" + str(
        ref_index
    ) + "/ --merged_index=merged_indices/merged_index --queries_file=data/queries_seo_exp.xml --new_features_file=final_features_dir/features_post_" + str(
        ref_index
    ) + " --workingset_file=data/workingset_original --scripts_path=scripts/ --java_path=jdk1.8.0_181 --jar_path=scripts/RankLib.jar --score_file=scores/scores_post_" + str(
        ref_index
    ) + ".txt --model_file=rank_models/model_bot --trec_file=trecs/trec_file_post_" + str(
        ref_index
    ) + " --trectext_file=data/updated_documents_" + str(
        ref_index
    ) + ".trectext --home_path=~/ --base_index=~/cluewebindex --new_index=new_indices/all_doc_index_post_" + str(
        ref_index) + " --indri_path=work_indri"

    out = run_bash_command(prep_vectors_command)
    print(out, flush=True)
    out = run_bash_command(summarization_command)
    print(out, flush=True)
    run_bash_command("rm -r summary_vectors/")
    out = run_bash_command(vectors_command)
    print(out, flush=True)
    out = run_bash_command(updata_text_command)
    print(out, flush=True)
    # run_bash_command("rm -r merged_indices/merged_index_post")
    out = run_bash_command(rerank_command)
    print(out, flush=True)
Пример #14
0
def order_trec_file(trec_file):
    """
    Sorts a trec file with the shell ``sort`` utility into a sibling file.

    The sort keys are field 1 numerically ascending, then field 5 numerically
    descending, then the ``-k2,1`` key exactly as it has always been passed.
    The output name is trec_file with every ".txt" replaced by "_sorted.txt";
    when trec_file contains no ".txt", "_sorted" is appended instead.

    :param trec_file: path of the file to sort
    :return: path of the sorted file
    """
    final = trec_file.replace(".txt", "_sorted.txt")
    if final == trec_file:
        # Nothing was replaced: redirecting sort's output onto its own input
        # would truncate the file before it is read, so use a distinct name.
        final = trec_file + "_sorted"
    command = "sort -k1,1n -k5nr -k2,1 " + trec_file + " > " + final
    print(command)
    run_bash_command(command)
    return final
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    parser = OptionParser()
    parser.add_option("--trec_file", dest="trec_file")
    parser.add_option("--trectext_file", dest="trectext_file")
    parser.add_option("--ref_index", dest="ref_index")
    parser.add_option("--index", dest="index")
    parser.add_option("--sentences_out_file", dest="sentences_out_file")
    parser.add_option("--vectors_output_dir", dest="vectors_output_dir")
    (options, args) = parser.parse_args()
    if not os.path.exists(options.vectors_output_dir):
        os.makedirs(options.vectors_output_dir)
    if not os.path.exists(os.path.dirname(options.sentences_out_file)):
        os.makedirs(os.path.dirname(options.sentences_out_file))
    reference_docs = get_reference_docs(options.trec_file,
                                        int(options.ref_index))
    document_text = load_file(options.trectext_file)
    with open(options.sentences_out_file, 'w') as out_file:
        for query in reference_docs:
            doc = reference_docs[query]
            doc_text = document_text[doc]
            sentences = nltk.sent_tokenize(doc_text)
            for i, sentence in enumerate(sentences):
                out_file.write(query + "\t" + doc + "_" + str(i) + "\t" +
                               sentence.rstrip().replace("\n", "") + "\n")

    command = " ~/jdk1.8.0_181/bin/java -Djava.library.path=/lv_local/home/sgregory/indri-5.6/swig/obj/java/ -cp seo_summarization.jar PrepareTFIDFVectorsReferenceDocs " + options.index + " " + options.sentences_out_file + " " + options.vectors_output_dir
    logger.info("## Running vector creation command: " + command + " ##")
    logger.info(run_bash_command(command))
    logger.info("Vector creation is DONE..")
Пример #16
0
from gen_utils import run_bash_command

for ref_index in [1, 2, 3, 4]:
    number_of_top_docs = str(min(3, ref_index))
    raw_summary_command = "python create_raw_ds_summaries.py " + str(ref_index)
    out = run_bash_command(raw_summary_command)
    print(out, flush=True)
    vectors_command = "~/jdk1.8.0_181/bin/java -Djava.library.path=/lv_local/home/sgregory/indri-5.6/swig/obj/java/ -cp seo_summarization.jar PrepareTFIDFVectorsSummaries merged_indices/merged_index/ docs.txt competition_doc_summaries summary_vectors_docs_" + str(
        ref_index) + "/"
    out = run_bash_command(vectors_command)
    print(out, flush=True)
    bot_summaries_command = "python bot_execution_summaries.py --mode=all --index_path=merged_indices/merged_index --raw_ds_out=data/raw_bot_summary_" + str(
        ref_index
    ) + ".txt --ref_index=" + str(
        ref_index
    ) + " --top_docs_index=" + number_of_top_docs + " --doc_tfidf_dir=asr_tfidf_vectors/ --sentences_tfidf_dir=sentence_ref_vectors_" + str(
        ref_index
    ) + "/ --summary_tfidf_dir=summary_vectors_docs_" + str(
        ref_index
    ) + "/ --queries_file=data/queries_seo_exp.xml --scores_dir=scores_summary_bot_1_" + str(
        ref_index
    ) + "/ --trec_file=trecs/trec_file_original_sorted.txt --sentence_trec_file=trecs/bot_summary_1_trec_file_" + str(
        ref_index
    ) + ".txt --output_feature_files_dir=Features_bot_1_summary_" + str(
        ref_index
    ) + "/ --output_final_feature_file_dir=features_bot_summary_1_" + str(
        ref_index
    ) + "/ --trectext_file=data/documents.trectext --new_trectext_file=data/bot_summary_1_documents_" + str(
        ref_index
    ) + ".trectext --model_file=/lv_local/home/sgregory/textGen/summarization/seo_experiment/bot_exp_utils/word2vec_model --svm_model_file=bot_exp_utils/harmonic_competition_model_all_data --workingset_file=data/workingset_bot_summary"
    out = run_bash_command(bot_summaries_command)