def summarization_ds(options):
    """Build the summarization dataset for one experiment run.

    Reads the queries/trec/trectext inputs named in ``options``, computes the
    reference and top documents, selects the sentences to be replaced, writes
    the input dataset file, and finally dispatches the summarization tasks in
    parallel, returning that call's result.
    """
    global model
    summarizer = options.sum_model
    logger.info("reading queries file")
    raw_queries = read_queries_file(options.queries_file)
    logger.info("reading trec file")
    ranked_lists = rtf(options.trec_file)
    logger.info("transforming queries")
    queries = transform_query_text(raw_queries)
    logger.info("reading trectext file")
    doc_texts = load_file(options.trectext_file)
    logger.info("calculating reference docs")
    reference_docs = get_reference_docs(options.trec_file,
                                        int(options.ref_index))
    logger.info("calculating top docs")
    top_docs = get_top_docs(options.trec_file, int(options.number_of_top_docs))
    logger.info("calculating sentences for replacement")
    replacement_sentences = get_sentences_for_replacement(
        doc_texts, reference_docs, queries, options.sentences_vectors_dir,
        options.documents_vectors_dir, top_docs, ranked_lists,
        int(options.starting_epoch), model)
    logger.info("writing input sentences file")
    dataset_file = write_input_dataset_file(replacement_sentences,
                                            reference_docs, doc_texts,
                                            options.suffix)
    logger.info("writing all files")
    return parrallel_create_summarization_task(dataset_file, queries,
                                               summarizer, doc_texts,
                                               reference_docs, top_docs,
                                               options.documents_vectors_dir,
                                               ranked_lists, options.suffix)
Пример #2
0
    # NOTE(review): this fragment starts mid-block — the OptionParser
    # construction and options read below but not declared here (--mode,
    # --raw_ds_out, --ref_index, --top_docs_index, --sentences_tfidf_dir,
    # --index_path) are presumably defined above this chunk boundary.
    # CLI options: input/output paths for a qrels/feature pipeline.
    parser.add_option("--queries_file", dest="queries_file")
    parser.add_option("--scores_dir", dest="scores_dir")
    parser.add_option("--trec_file", dest="trec_file")
    parser.add_option("--sentence_trec_file", dest="sentence_trec_file")
    parser.add_option("--output_feature_files_dir",
                      dest="output_feature_files_dir")
    parser.add_option("--output_final_feature_file_dir",
                      dest="output_final_feature_file_dir")
    parser.add_option("--trectext_file", dest="trectext_file")
    parser.add_option("--new_trectext_file", dest="new_trectext_file")
    parser.add_option("--model_file", dest="model_file")
    parser.add_option("--workingset_file", dest="workingset_file")
    parser.add_option("--svm_model_file", dest="svm_model_file")
    (options, args) = parser.parse_args()
    # Inputs shared by both modes below.
    ranked_lists = read_trec_file(options.trec_file)
    doc_texts = load_file(options.trectext_file)
    mode = options.mode

    if mode == "qrels":
        # Build the raw dataset and per-sentence vector files, then emit a
        # TREC qrels file for the chosen reference index.
        create_raw_dataset(ranked_lists, doc_texts, options.raw_ds_out,
                           int(options.ref_index), int(options.top_docs_index))
        create_sentence_vector_files(options.sentences_tfidf_dir,
                                     options.raw_ds_out, options.index_path)
        # raw_ds = read_raw_ds(options.raw_ds_out)
        create_qrels(options.raw_ds_out, options.trec_file,
                     "qrels_seo_bot" + options.ref_index + ".txt",
                     int(options.ref_index), "qrels_indices/", doc_texts,
                     options)
    if mode == "features":
        # Read and normalize the query text; the feature-extraction steps
        # presumably continue past this chunk boundary.
        queries = read_queries_file(options.queries_file)
        queries = transform_query_text(queries)
Пример #3
0
            "reference"]
    # NOTE(review): the line above is the tail of a list assignment
    # (presumably feature_list = [..., "reference"]) whose start lies before
    # this chunk boundary; feature_list, original_features_stats, trec,
    # post_trec, ranked_lists and queries are all defined above it.
    legends = ["reference", "next", "top", "post"]
    colors = ['b', 'r', "k", "y"]
    # One plot per feature: a curve per legend entry, over sorted epochs.
    for f in feature_list:
        ys = [[
            original_features_stats[f][k][e]
            for e in sorted(list(original_features_stats[f][k].keys()))
        ] for k in legends]
        x = sorted(list(original_features_stats[f]["next"].keys()))
        plot_metric(ys, x,
                    f.lower().replace('.', ''), f.replace('.', ''), "Epochs",
                    legends, colors)

    scores = read_trec_scores(trec)
    post_scores = read_trec_scores(post_trec)
    doc_texts = load_file("../data/documents.trectext")
    updated_doc_texts = load_file("../data/updated_documents.trectext")

    # Document-frequency stats before vs. after the document updates; the
    # updated run's "reference" series is re-plotted under the "post" legend.
    stats = doc_frequency_eval(ranked_lists, queries, doc_texts)
    updated_stats = doc_frequency_eval(ranked_lists, queries,
                                       updated_doc_texts)
    stats["post"] = updated_stats["reference"]
    legends = ["reference", "next", "top", "post"]
    colors = ['b', 'r', "k", "y"]
    ys = [[stats[k][e] for e in sorted(list(stats[k].keys()))]
          for k in legends]
    x = sorted(list(stats["reference"].keys()))
    plot_metric(ys, x, "plt/qtf_comp_docs", "Avg QTF", "Epochs", legends,
                colors)

    # NOTE(review): the compare_scores result is presumably consumed past
    # this chunk boundary.
    stats = compare_scores(scores, ranked_lists)
Пример #4
0
from summarization.seo_experiment.utils import load_file

# Collect the query ids embedded in each document name (third "-" field).
documents = load_file("data/documents.trectext")
queries = {doc_name.split("-")[2] for doc_name in documents}

# Keep only the query lines ("<id>:<text>") whose id occurs in the documents.
with open("data/queries.txt") as source, \
        open("data/queries_comp.txt", 'w') as filtered:
    for line in source:
        if line.split(":")[0] in queries:
            filtered.write(line)
Пример #5
0
if __name__ == "__main__":
    # CLI: <trectext_file_prefix> <trec_file> <fname_addition> <starting_epoch>
    trectext_file_prefix = sys.argv[1]
    trec_file = sys.argv[2]
    fname_addition = sys.argv[3]
    starting_epoch = int(sys.argv[4])

    # Hoisted loop invariants: these reads depend on neither ref_index nor the
    # epoch, so load them once instead of once per (ref_index, epoch) pair.
    # (Assumes the helpers below do not mutate these inputs — TODO confirm.)
    original_texts = load_file("data/documents.trectext")
    ranked_lists = read_trec_file(trec_file)

    for ref_index in ["1", "2", "3", "4"]:
        final_trec_name = "trecs/trec_file_" + fname_addition + "_post_" + str(
            ref_index)
        # Start from a clean output file: the epoch loop appends to it.
        if os.path.exists(final_trec_name):
            os.remove(final_trec_name)
        # Reference docs depend only on ref_index, not on the epoch.
        ref_docs = get_ref_docs(ranked_lists, int(ref_index))
        for r in range(starting_epoch, 8):
            trectext_fname = trectext_file_prefix + "_" + ref_index + ".trectext"
            trectext_fname_new = trectext_file_prefix + "_" + ref_index + "_" + str(
                r) + "_new.trectext"
            trectext_file_for_read = fix_xml_file(trectext_fname)
            texts = load_file(trectext_file_for_read)
            workingset_fname = "data/dynamic_experiment_workingset_" + ref_index + "_" + str(
                r) + ".txt"
            workingset_docs = create_working_set(ref_docs, texts, r, r + 1,
                                                 workingset_fname)
            create_trectext_dynamic(texts, original_texts, workingset_docs,
                                    trectext_fname_new)
            tmp_trec_file = run_reranking(workingset_fname, fname_addition, r,
                                          trectext_fname_new)
            append_to_file(tmp_trec_file, final_trec_name)
            os.remove(tmp_trec_file)
        # NOTE(review): order_trec_file presumably writes a sorted copy under a
        # derived name, after which the unsorted original is deleted — confirm
        # it does not sort in place, or this remove discards the result.
        order_trec_file(final_trec_name)
        os.remove(final_trec_name)
 # NOTE(review): this fragment starts mid-block (1-space indent, header not
 # visible before this chunk boundary).
 logging.root.setLevel(level=logging.INFO)
 logger.info("running %s" % ' '.join(sys.argv))
 # CLI options: tf-idf directories, data files, and the embedding model path.
 parser = OptionParser()
 parser.add_option("--doc_tfidf_dir", dest="doc_tfidf_dir")
 parser.add_option("--summaries_tfidf_dir", dest="summaries_tfidf_dir")
 parser.add_option("--queries_file", dest="queries_file")
 parser.add_option("--summaries_file", dest="summaries_file")
 parser.add_option("--input_data_file", dest="input_data_file")
 parser.add_option("--trec_file", dest="trec_file")
 parser.add_option("--number_of_top_docs", dest="number_of_top_docs")
 parser.add_option("--trectext_file", dest="trectext_file")
 parser.add_option("--new_trectext_file", dest="new_trectext_file")
 parser.add_option("--new_ws_file", dest="new_ws_file")
 parser.add_option("--model_file", dest="model_file")
 (options, args) = parser.parse_args()
 # Unpack the summaries bundle; the names suggest per-summary statistics, a
 # tf-idf filename index, replacement positions, query text and reference
 # docs — presumably as produced by read_summaries_data (verify at its def).
 summary_stats, summary_tfidf_fname_index, replacement_indexes, queries_text, reference_docs = read_summaries_data(
     options.summaries_file, options.input_data_file,
     options.summaries_tfidf_dir, options.queries_file)
 document_texts = load_file(options.trectext_file)
 ranked_lists = read_trec_file(options.trec_file)
 # model = gensim.models.FastText.load_fasttext_format(options.model_file)
 # Binary word2vec vectors, capped at the 700k most frequent entries to
 # bound memory use.
 model = gensim.models.KeyedVectors.load_word2vec_format(options.model_file,
                                                         binary=True,
                                                         limit=700000)
 # Splice the chosen summaries into the document texts, then write the new
 # trectext collection and working-set file.
 updated_texts = update_texts_with_replacement_summary(
     replacement_indexes, summary_stats, ranked_lists,
     options.doc_tfidf_dir, queries_text, document_texts, options.trec_file,
     int(options.number_of_top_docs), summary_tfidf_fname_index,
     reference_docs, model)
 create_trectext(updated_texts, options.new_trectext_file,
                 options.new_ws_file)
from summarization.seo_experiment.utils import load_file,read_trec_file,run_summarization_model
from summarization.seo_experiment.borda_mechanism import read_queries
import nltk

def create_summarization_ds(ranked_lists, texts):
    """Dump the round winners as a summarization dataset.

    For rounds 1-6, writes the top-ranked (winner) document of every query:
    its sentence-tagged text to texts_for_summary.txt and a matching
    epoch/query/doc-id record to summarization_data.txt.
    """
    with open("summarization_data.txt", 'w', encoding="utf-8") as sum_data, \
            open("texts_for_summary.txt", "w", encoding="utf-8") as text_data:
        for round_number in range(1, 7):
            epoch = str(round_number).zfill(2)
            for query, ranking in ranked_lists[epoch].items():
                winner = ranking[0]
                sentences = nltk.sent_tokenize(texts[winner])
                # Wrap each sentence in <t> ... </t> markers, newlines stripped.
                tagged = ("<t> " + s.replace("\n", "") + " </t>"
                          for s in sentences)
                text_data.write(" ".join(tagged) + "\n")
                sum_data.write(epoch + "\t" + query + "\t" + winner + "\n")

if __name__ == "__main__":
    # Fixed experiment inputs.
    source_trectext = "../data/documents.trectext"
    # queries_file = "../data/queries.txt"
    source_trec = "../trecs/trec_file_original_sorted.txt"
    # Decoding settings per summarization model family.
    summary_kwargs = {
        "lstm": {"min_length": "10", "block_ngram_repeat": "2"},
        "transformer": {"min_length": "3"},
    }
    rankings = read_trec_file(source_trec)
    documents = load_file(source_trectext)
    create_summarization_ds(rankings, documents)
    # Summarize the winner texts with the pretrained transformer model.
    run_summarization_model(
        "~/OpenNMT-py/translate.py",
        "../summarization_models/sum_transformer_model_acc_57.25_ppl_9.22_e16.pt",
        "texts_for_summary.txt",
        "top_docs_summaries.txt",
        **summary_kwargs["transformer"])
Пример #8
0
from summarization.seo_experiment.utils import load_file
import nltk
import numpy as np

# Average sentence count per document for rounds "06" and "07" only
# (the round id is the second "-" field of the document name).
texts = load_file("../data/documents.trectext")
stats = {}
for doc_name, doc_text in texts.items():
    round_id = doc_name.split("-")[1]
    if round_id in ("06", "07"):
        sentence_count = len(nltk.sent_tokenize(doc_text))
        stats.setdefault(round_id, []).append(sentence_count)

stats = {round_id: np.mean(counts) for round_id, counts in stats.items()}

print(stats)