Example #1
0
    # NOTE(review): fragment — the enclosing `def` is not visible in this
    # scraped excerpt; `text` is presumably the raw document text.
    sentences = retrieve_sentences(text)
    new_text = ""
    # Number each sentence ("1) ... <br><br>"), stripping the U+009D
    # control character and newlines, to build an HTML-ish display string.
    for j in range(len(sentences)):
        new_text += str(j + 1) + ") " + sentences[j].replace(
            u"\u009D", "").replace("\n", "") + " <br><br>\n"
    return new_text


ranked_lists = retrieve_ranked_lists("ranked_lists/trec_file04")
query_data = get_queries_data("topics.full.xml")
# Reference doc per query = last-ranked doc; EPOCH/ROUND naming normalized.
reference_docs = {
    q: ranked_lists[q][-1].replace("EPOCH", "ROUND")
    for q in ranked_lists
}
# Candidate ("winner") docs per query = top 3 of each ranked list.
winner_docs = {q: ranked_lists[q][:3] for q in ranked_lists}
a_doc_texts = load_file("documents.trectext")
doc_texts = {}
# Keep only round-04 documents.
for doc in a_doc_texts:
    if doc.__contains__("ROUND-04"):
        doc_texts[doc] = a_doc_texts[doc]
sentence_map = map_set_of_sentences(doc_texts, winner_docs)
rows = {}
i = 1
sentence_data = {}

for query in sentence_map:
    reference = reference_docs[query]
    # Drop the leading character and the U+009D control character.
    text = doc_texts[reference][1:].replace(u"\u009D", "")
    sentences = [s.replace("\"", "") for s in retrieve_sentences(text)]
    # NOTE(review): the body of this inner loop is cut off in this
    # scraped excerpt.
    for sentence in sentence_map[query]:
Example #2
0
def create_sentence_similarities(stats):
    """Build labeled similarity feature rows for sentence-replacement runs.

    Every candidate sentence (from the top-3 "winner" docs of each query)
    is paired with every sentence position of the query's reference
    document; word2vec cosine similarities between the candidate, the
    reference sentence, and its previous/next neighbours become features,
    labeled by a binarized mean of the run's stats.

    :param stats: dict mapping run names ("<sentence_id>_<position>") to a
        sequence of numeric judgments; rows are emitted only for run names
        present here, with score = 1 iff their mean exceeds 0.5.
    :return: dict mapping a running integer index to feature-row dicts.
    """
    rows = {}
    model = WordToVec().load_model()
    ranked_lists = retrieve_ranked_lists(params.ranked_lists_file)
    # Reference doc = last-ranked doc per query; EPOCH/ROUND normalized.
    reference_docs = {
        q: ranked_lists[q][-1].replace("EPOCH", "ROUND")
        for q in ranked_lists
    }
    # Candidate sentences come from each query's top-3 documents.
    winner_docs = {q: ranked_lists[q][:3] for q in ranked_lists}
    a_doc_texts = load_file(params.trec_text_file)
    doc_texts = {}
    index = 1
    for doc in a_doc_texts:
        # Only round-04 documents participate in this feature set.
        if "ROUND-04" in doc:
            doc_texts[doc] = a_doc_texts[doc]
    sentence_map = map_set_of_sentences(doc_texts, winner_docs)
    for query in sentence_map:
        ref_doc = reference_docs[query]
        text = doc_texts[ref_doc]
        ref_sentences = retrieve_sentences(text)
        for sentence in sentence_map[query]:
            sentence_vec = get_sentence_vector(sentence_map[query][sentence],
                                               model=model)
            for i, ref_sentence in enumerate(ref_sentences):
                row = {}
                run_name = sentence + "_" + str(i + 1)
                if run_name not in stats:
                    continue
                print("run name in stats")
                # window[0] = previous-sentence vector, window[1] = next;
                # an all-ones vector stands in for a missing neighbour at a
                # document boundary.
                window = []
                if len(ref_sentences) == 1:
                    # Single-sentence reference doc has no neighbours at
                    # all.  (The original code crashed here with an
                    # IndexError on ref_sentences[1].)
                    window.append(numpy.ones(300))
                    window.append(numpy.ones(300))
                elif i == 0:
                    window.append(numpy.ones(300))
                    window.append(get_sentence_vector(ref_sentences[1], model))
                elif i + 1 == len(ref_sentences):
                    window.append(
                        get_sentence_vector(ref_sentences[i - 1], model))
                    window.append(numpy.ones(300))
                else:
                    window.append(
                        get_sentence_vector(ref_sentences[i - 1], model))
                    window.append(
                        get_sentence_vector(ref_sentences[i + 1], model))

                ref_vector = get_sentence_vector(ref_sentence, model)
                similarity_to_ref_sentence = cosine_similarity(
                    ref_vector, sentence_vec)
                row["id"] = run_name
                row["similarity_to_prev"] = cosine_similarity(
                    sentence_vec, window[0])
                row["similarity_to_ref_sentence"] = similarity_to_ref_sentence
                # NOTE(review): "pred" presumably means the succeeding
                # ("next") sentence — key kept for compatibility.
                row["similarity_to_pred"] = cosine_similarity(
                    sentence_vec, window[1])
                row["similarity_to_prev_ref"] = cosine_similarity(
                    ref_vector, window[0])
                row["similarity_to_pred_ref"] = cosine_similarity(
                    ref_vector, window[1])
                # Binarize the label: positive iff mean judgment > 0.5.
                row["score"] = 1 if numpy.mean(stats[run_name]) > 0.5 else 0
                rows[index] = row
                index += 1
    return rows
 }
 # NOTE(review): scraped fragment — the dict closed by "}" above,
 # `top_docs`, `reference_docs`, and the `ranked_lists_new` read on the
 # next line are bound earlier in the original file (not visible here).
 top_docs["42"] = {q: ranked_lists_new[q][:1] for q in ranked_lists_new}
 # Load round-06 rankings and derive per-query reference/top docs for the
 # "65" and "62" settings ("42" handled above).
 ranked_lists_new = retrieve_ranked_lists("trec_file06")
 reference_docs["65"] = {
     q: ranked_lists_new[q][-1].replace("EPOCH", "ROUND")
     for q in ranked_lists_new
 }
 top_docs["65"] = {q: ranked_lists_new[q][:3] for q in ranked_lists_new}
 reference_docs["62"] = {
     q: ranked_lists_new[q][1].replace("EPOCH", "ROUND")
     for q in ranked_lists_new
 }
 top_docs["62"] = {q: ranked_lists_new[q][:1] for q in ranked_lists_new}
 # Crowdflower annotation files, taken in date order (first six only).
 dir = "../../Crowdflower/nimo_annotations"
 sorted_files = sort_files_by_date(dir)
 tmp_doc_texts = load_file(params.trec_text_file)
 doc_texts = {}
 # Keep only round-04 / round-06 documents.
 for doc in tmp_doc_texts:
     if doc.__contains__("ROUND-04") or doc.__contains__("ROUND-06"):
         doc_texts[doc] = tmp_doc_texts[doc]
 original_docs = retrieve_initial_documents()
 scores = {}
 # Accumulate annotation scores across the six files; k + 1 tags the batch.
 for k in range(6):
     needed_file = sorted_files[k]
     scores = get_scores(scores, dir + "/" + needed_file, original_docs,
                         k + 1)
 banned_queries = get_banned_queries(scores, reference_docs)
 # banned_queries = []
 # rounds = ["4","6"]
 rounds = ["4", "6"]
 ranks = ["2", "5"]
Example #4
0
def create_coherency_features(ref_index=-1,
                              ranked_list_new_file="",
                              doc_text_modified=""):
    """Compute coherency (similarity-window) features for candidate sentences.

    For each query, every candidate sentence from the new ranking's
    above-reference documents is compared against every sentence of the
    reference document (chosen by ``ref_index`` in the old ranking):
    cosine similarities to the reference sentence and to its previous/next
    neighbours are recorded as features.

    :param ref_index: position of the reference document inside each old
        ranked list (default -1 = last-ranked).
    :param ranked_list_new_file: path of the new (re-ranked) TREC run file.
    :param doc_text_modified: optional pre-loaded doc->text mapping; when
        falsy, texts are loaded from ``params.trec_text_file``.
    :return: (rows, max_min_stats) — rows maps run names to feature dicts,
        max_min_stats holds per-query extremes from save_max_mix_stats.
    """
    rows = {}
    max_min_stats = {}
    model = WordToVec().load_model()
    ranked_lists = retrieve_ranked_lists(params.ranked_lists_file)
    ranked_lists_new = retrieve_ranked_lists(ranked_list_new_file)
    reference_docs = {
        q: ranked_lists[q][ref_index].replace("EPOCH", "ROUND")
        for q in ranked_lists
    }
    # Candidates: docs ranked above the reference doc in the new lists.
    winner_docs = {
        q: ranked_lists_new[q]
        [:determine_indexes(reference_docs[q], ranked_lists_new[q])]
        for q in ranked_lists_new
    }
    skipped = []  # example counts dropped for unusable reference docs
    if doc_text_modified:
        a_doc_texts = doc_text_modified
    else:
        a_doc_texts = load_file(params.trec_text_file)
    doc_texts = {}
    for doc in a_doc_texts:
        # Only round-04 documents participate in this feature set.
        if "ROUND-04" in doc:
            doc_texts[doc] = a_doc_texts[doc]
    sentence_map = map_set_of_sentences(doc_texts, winner_docs)
    for query in sentence_map:
        ref_doc = reference_docs[query]
        text = doc_texts[ref_doc]
        ref_sentences = retrieve_sentences(text)
        if not ref_sentences:
            # Empty reference document: nothing to compare against.
            skipped.append(len(sentence_map[query]))
            continue
        for sentence in sentence_map[query]:
            sentence_vec = get_sentence_vector(sentence_map[query][sentence],
                                               model=model)
            for i, ref_sentence in enumerate(ref_sentences):
                row = {}
                run_name = sentence + "_" + str(i + 1)
                # window[0]/window[1] hold the previous/next reference
                # sentence vectors; at a document boundary the single
                # existing neighbour is duplicated into both slots.
                window = []
                if len(ref_sentences) == 1:
                    # No neighbours at all — reuse the sentence itself.
                    # (The original crashed here with an IndexError on
                    # ref_sentences[1].)
                    only_vec = get_sentence_vector(ref_sentences[0], model)
                    window.append(only_vec)
                    window.append(only_vec)
                elif i == 0:
                    next_vec = get_sentence_vector(ref_sentences[1], model)
                    window.append(next_vec)
                    window.append(next_vec)
                elif i + 1 == len(ref_sentences):
                    prev_vec = get_sentence_vector(ref_sentences[i - 1], model)
                    window.append(prev_vec)
                    window.append(prev_vec)
                else:
                    window.append(
                        get_sentence_vector(ref_sentences[i - 1], model))
                    window.append(
                        get_sentence_vector(ref_sentences[i + 1], model))
                ref_vector = get_sentence_vector(ref_sentence, model)
                # The run name encodes the query as its third "-" field
                # (presumably "ROUND-<nn>-<query>-..." — TODO confirm).
                # Bound to a fresh local instead of clobbering the outer
                # `query` loop variable as the original did.
                run_query = run_name.split("-")[2]
                row["similarity_to_prev"] = cosine_similarity(
                    sentence_vec, window[0])
                row["similarity_to_ref_sentence"] = cosine_similarity(
                    ref_vector, sentence_vec)
                row["similarity_to_pred"] = cosine_similarity(
                    sentence_vec, window[1])
                row["similarity_to_prev_ref"] = cosine_similarity(
                    ref_vector, window[0])
                row["similarity_to_pred_ref"] = cosine_similarity(
                    ref_vector, window[1])
                max_min_stats = save_max_mix_stats(max_min_stats, row,
                                                   run_query)
                rows[run_name] = row
    print("missed ", sum(skipped), "examples")
    return rows, max_min_stats