def summarization_ds(options):
    global model
    sum_model = options.sum_model
    logger.info("reading queries file")
    raw_queries = read_queries_file(options.queries_file)
    logger.info("reading trec file")
    ranked_lists = rtf(options.trec_file)
    logger.info("transforming queries")
    queries = transform_query_text(raw_queries)
    logger.info("reading trectext file")
    doc_texts = load_file(options.trectext_file)
    logger.info("calculating reference docs")
    reference_docs = get_reference_docs(options.trec_file,
                                        int(options.ref_index))
    logger.info("calculating top docs")
    top_docs = get_top_docs(options.trec_file, int(options.number_of_top_docs))
    logger.info("calculating sentences for replacement")
    sentences_for_replacement = get_sentences_for_replacement(
        doc_texts, reference_docs, queries, options.sentences_vectors_dir,
        options.documents_vectors_dir, top_docs, ranked_lists,
        int(options.starting_epoch), model)
    logger.info("writing input sentences file")
    input_file = write_input_dataset_file(sentences_for_replacement,
                                          reference_docs, doc_texts,
                                          options.suffix)
    logger.info("writing all files")
    return parrallel_create_summarization_task(input_file, queries, sum_model,
                                               doc_texts, reference_docs,
                                               top_docs,
                                               options.documents_vectors_dir,
                                               ranked_lists, options.suffix)
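
# A minimal sketch of a command-line driver for summarization_ds. The flag
# names simply mirror the option attributes the function reads above; the
# repository's actual entry point may differ.
from optparse import OptionParser

def build_summarization_parser():
    parser = OptionParser()
    for flag in ("--sum_model", "--queries_file", "--trec_file",
                 "--trectext_file", "--ref_index", "--number_of_top_docs",
                 "--sentences_vectors_dir", "--documents_vectors_dir",
                 "--starting_epoch", "--suffix"):
        parser.add_option(flag, dest=flag.lstrip("-"))
    return parser

# options, _ = build_summarization_parser().parse_args()
# summarization_ds(options)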
Example #2
    parser.add_option("--workingset_file", dest="workingset_file")
    parser.add_option("--svm_model_file", dest="svm_model_file")
    (options, args) = parser.parse_args()
    ranked_lists = read_trec_file(options.trec_file)
    doc_texts = load_file(options.trectext_file)
    mode = options.mode

    if mode == "qrels":
        create_raw_dataset(ranked_lists, doc_texts, options.raw_ds_out,
                           int(options.ref_index), int(options.top_docs_index))
        create_sentence_vector_files(options.sentences_tfidf_dir,
                                     options.raw_ds_out, options.index_path)
        # raw_ds = read_raw_ds(options.raw_ds_out)
        create_qrels(options.raw_ds_out, options.trec_file,
                     "qrels_seo_bot" + options.ref_index + ".txt",
                     int(options.ref_index), "qrels_indices/", doc_texts,
                     options)
    if mode == "features":
        queries = read_queries_file(options.queries_file)
        queries = transform_query_text(queries)
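        # Note: load_fasttext_format is the pre-4.0 gensim API; gensim >= 4.0
        # replaces it with gensim.models.fasttext.load_facebook_model.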
        word_embd_model = gensim.models.FastText.load_fasttext_format(
            options.model_file)
        feature_creation_parallel(options.raw_ds_out, ranked_lists, doc_texts,
                                  int(options.top_docs_index),
                                  int(options.ref_index),
                                  options.doc_tfidf_dir,
                                  options.sentences_tfidf_dir, queries,
                                  options.output_feature_files_dir,
                                  options.output_final_feature_file_dir,
                                  options.workingset_file)
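
# Hypothetical invocations of the two modes above (the script name and all
# paths are placeholders, not taken from the repository):
#   python make_features.py --mode qrels --trec_file ... --raw_ds_out ...
#   python make_features.py --mode features --trec_file ... --model_file ...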
Example #3
                stats[f][k][epoch] = np.mean(stats[f][k][epoch])
    return stats
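
# Assumed shape of `stats`, inferred from the averaging loop above: a nested
# dict stats[feature][key][epoch] whose list of per-query values is collapsed
# in place to its mean before the dict is returned.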


if __name__ == "__main__":
    feature_list = [
        "docCoverQueryNum", "docCoverQueryRatio", "docLen", "docBM25",
        "docLMIR.DIR", "docLMIR.JM", "docEnt", "docStopCover", "docFracStops"
    ]
    features_dir = "Features"
    post_features_dir = "Features_post"

    trec = "../trecs/trec_file_original_sorted.txt"
    post_trec = "../trecs/trec_file_post_sorted.txt"
    queries_file = "../data/queries.xml"
    modified_queries = read_queries_file("../data/queries_seo_exp.xml")
    features_file = "../data/features_original"
    features = read_features_file(features_file)
    queries = read_queries_file(queries_file)

    ranked_lists = read_trec_file(trec)

    original_features = read_features_dir(features_dir, feature_list,
                                          modified_queries)
    post_features = read_features_dir(post_features_dir, feature_list,
                                      modified_queries)

    original_features_stats = analyze_raw_fetures(ranked_lists,
                                                  original_features)
    post_features_stats = analyze_raw_fetures(ranked_lists, post_features)
    for f in feature_list:
        pass  # per-feature reporting loop; body truncated in the source excerpt
Example #4
                            fixed_query = str(int(query)) + str(epoch).zfill(2)
                            chosen_index = chosen_indexes[ref_doc]
                            chosen_sentence = chosen_sentences[ref_doc]
                            for doc in top_docs:
                                text = texts[doc]
                                sentences = sent_tokenize(text)
                                for sentence in sentences:
                                    source_file.write(chosen_sentence + "\n")
                                    queries_file.write(queries[fixed_query] +
                                                       "\n")
                                    target_sentences.write(
                                        sentence.rstrip().replace("\n", " ") +
                                        "\n")
                                    all_input_file.write(
                                        fixed_query + "\t" + ref_doc + "\t" +
                                        chosen_index + "\t" +
                                        chosen_sentence.rstrip() + "\t" +
                                        sentence.rstrip().replace("\n", "") +
                                        "\n")


if __name__ == "__main__":
    trec_file = "trecs/trec_file_original_sorted.txt"
    queries_file = "data/queries_seo_exp.xml"
    queries = read_queries_file(queries_file)
    ranked_lists = read_trec_file(trec_file)
    texts = load_file("data/documents.trectext")
    chosen_indexes, chosen_sentences = read_summary_data(
        "input_data/all_data_transformer.txt")
    create_files(ranked_lists, -1, 3, queries, "top_sentences_borda/",
                 chosen_indexes, texts, chosen_sentences)
Example #5
from summarization.seo_experiment.borda_mechanism import query_term_freq, query_term_occ
from summarization.seo_experiment.utils import clean_texts
from summarization.seo_experiment.workingset_creator import read_queries_file
from summarization.seo_experiment.summarization_process import transform_query_text
import nltk
import numpy as np

queries = read_queries_file("../data/queries.xml")
queries = transform_query_text(queries)

summary_access = open("top_docs_summaries.txt")
summary_data_access = open("summarization_data.txt")
summaries = summary_access.readlines()
data_points = summary_data_access.readlines()

freqs = {"all": [], "first": []}
for i, summary in enumerate(summaries):
    data = data_points[i]
    qid = data.split("\t")[1]
    q_text = queries[qid]
    fixed_sum = summary.replace("<t>", "").replace("</t>", "").replace(
        ", .", ".").replace(". .", ".")
    freqs["all"].append(query_term_occ("sum", clean_texts(fixed_sum), q_text))
    first = nltk.sent_tokenize(fixed_sum)[0]
    freqs["first"].append(query_term_occ("sum", clean_texts(first), q_text))

for k in freqs:
    freqs[k] = np.mean(freqs[k])

print(freqs)
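
# The printed dict reports the mean number of query-term occurrences over the
# whole summary ("all") and over its first sentence ("first"); the qid is
# taken from the second tab-separated field of each summarization_data.txt
# line, as parsed above.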