def compare_frequecies(summary_stats_file):
    """Compare average query-term frequency/occurrence of source sentences vs. their summaries, per epoch."""
    stats = {"sentence": {}, "summary": {}}
    occ_stats = {"sentence": {}, "summary": {}}
    with open(summary_stats_file, encoding="utf-8") as file:
        for line in file:
            query = line.split("\t")[0]
            doc = line.split("\t")[1]
            epoch = int(doc.split("-")[1])
            for k in stats:
                if epoch not in stats[k]:
                    stats[k][epoch] = []
                    if k in occ_stats:
                        occ_stats[k][epoch] = []
            sentence = line.split("\t")[2]
            summary = line.split("\t")[3]
            sentence_qtf = query_term_freq("avg", clean_texts(sentence), query)
            stats["sentence"][epoch].append(sentence_qtf)
            occ_stats["sentence"][epoch].append(
                query_term_occ("sum", clean_texts(sentence), query))
            summary_qtf = query_term_freq("avg", clean_texts(summary), query)
            stats["summary"][epoch].append(summary_qtf)
            occ_stats["summary"][epoch].append(
                query_term_occ("sum", clean_texts(summary), query))
    # Reduce the per-epoch lists to their means.
    for k in stats:
        for epoch in stats[k]:
            stats[k][epoch] = np.mean(stats[k][epoch])
            if k in occ_stats:
                occ_stats[k][epoch] = np.mean(occ_stats[k][epoch])
    return stats, occ_stats
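# Illustrative sketch (not part of the pipeline): compare_frequecies above assumes a
# TSV stats file with one record per summarized sentence -- query, doc name (epoch in
# the second dash-separated field), the original sentence, and its summary. The file
# name and values below are made up.
def write_example_stats_file(path="summary_stats_example.txt"):
    rows = [("dummy_query", "ROUND-03-dummy_query-00",
             "a source sentence .", "a short summary .")]
    with open(path, "w", encoding="utf-8") as f:
        for query, docname, sentence, summary in rows:
            f.write("\t".join([query, docname, sentence, summary]) + "\n")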
def doc_frequency_eval(lists, queries, texts):
    """Average query-term frequency per epoch for the top-3 docs, the reference doc (last), and the next-to-last doc."""
    stats = {"top": {}, "reference": {}, "next": {}}
    for epoch in lists:
        for k in stats:
            stats[k][epoch] = []
        for query in lists[epoch]:
            query_text = queries[query]
            top_docs = lists[epoch][query][:3]
            ref_doc = lists[epoch][query][-1]
            next_doc = lists[epoch][query][-2]
            stats["top"][epoch].append(
                np.mean([
                    query_term_freq("avg", clean_texts(texts[doc]), query_text)
                    for doc in top_docs
                ]))
            stats["reference"][epoch].append(
                query_term_freq("avg", clean_texts(texts[ref_doc]), query_text))
            stats["next"][epoch].append(
                query_term_freq("avg", clean_texts(texts[next_doc]), query_text))
    for k in stats:
        for epoch in stats[k]:
            stats[k][epoch] = np.mean(stats[k][epoch])
    return stats
def choose_highest_rank_summary(chosen_idxs, summaries, document_text, replacement_index):
    for idx in chosen_idxs:
        summary = summaries[idx]
        sentences = nltk.sent_tokenize(document_text)
        sentences[replacement_index] = summary
        new_text = " ".join(sentences)
        if len(clean_texts(new_text).split()) <= 150:
            return summary
    return None
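# Illustrative sketch of the replace-and-check step above, using a plain sentence list
# instead of nltk.sent_tokenize and a whitespace split instead of clean_texts; the
# 150-word cap matches the constraint in choose_highest_rank_summary.
def fits_after_replacement(sentences, replacement_index, candidate, max_words=150):
    swapped = list(sentences)
    swapped[replacement_index] = candidate
    return len(" ".join(swapped).split()) <= max_words
# e.g. fits_after_replacement(["First.", "Second.", "Third."], 1, "A longer candidate.")  -> True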
def context_similarity(replacement_index, ref_sentences, sentence_compared, mode, model, stemmer=None):
    """Embedding-centroid similarity between a candidate sentence and its context in the reference document."""
    if mode == "own":
        # Compare against the sentence being replaced.
        ref_sentence = ref_sentences[replacement_index]
        return centroid_similarity(clean_texts(ref_sentence), clean_texts(sentence_compared), model, stemmer)
    if mode == "pred":
        # Compare against the following sentence (or the sentence itself at the end of the document).
        if replacement_index + 1 == len(ref_sentences):
            sentence = ref_sentences[replacement_index]
        else:
            sentence = ref_sentences[replacement_index + 1]
        return centroid_similarity(clean_texts(sentence), clean_texts(sentence_compared), model, stemmer)
    if mode == "prev":
        # Compare against the preceding sentence (or the sentence itself at the start of the document).
        if replacement_index == 0:
            sentence = ref_sentences[replacement_index]
        else:
            sentence = ref_sentences[replacement_index - 1]
        return centroid_similarity(clean_texts(sentence), clean_texts(sentence_compared), model, stemmer)
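# Illustrative helper (assumption, not used by the pipeline): which reference-sentence
# index each context_similarity mode compares against, clamped at the document
# boundaries exactly as in the function above.
def context_index(replacement_index, num_sentences, mode):
    if mode == "own":
        return replacement_index
    if mode == "pred":
        return replacement_index if replacement_index + 1 == num_sentences else replacement_index + 1
    if mode == "prev":
        return replacement_index if replacement_index == 0 else replacement_index - 1
# e.g. [context_index(0, 3, m) for m in ("prev", "own", "pred")]  -> [0, 0, 1]
#      [context_index(2, 3, m) for m in ("prev", "own", "pred")]  -> [1, 2, 2]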
def write_input_dataset_file(replacements, reference_docs, texts, suffix):
    input_dir = 'input_data/'
    if not os.path.exists(input_dir):
        os.makedirs(input_dir)
    with open(input_dir + "senteces_for_replace_" + suffix + ".txt", 'w') as file:
        file.write("query\tdocname\tsentence_index\tsentence\n")
        for query in replacements:
            index = replacements[query]
            docname = reference_docs[query]
            text = texts[docname]
            sentence = clean_texts(sent_tokenize(text)[index])
            file.write("\t".join([query, docname, str(index), sentence]) + '\n')
    return input_dir + "senteces_for_replace_" + suffix + ".txt"
def past_winners_centroid(past_winners, texts, model, stemmer=None):
    """Recency-weighted centroid of the embedding centroids of past winning documents."""
    sum_vector = None
    decay_factors = [
        0.01 * math.exp(-0.01 * (len(past_winners) - i))
        for i in range(len(past_winners))
    ]
    denominator = sum(decay_factors)
    for i, doc in enumerate(past_winners):
        text = texts[doc]
        vector = get_text_centroid(clean_texts(text), model, stemmer)
        if sum_vector is None:
            sum_vector = np.zeros(vector.shape[0])
        sum_vector += vector * decay_factors[i] / denominator
    return sum_vector
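# Illustrative sketch (assumption, not part of the pipeline): the recency weights used
# in past_winners_centroid are w_i = 0.01 * exp(-0.01 * (n - i)) normalized by their
# sum, so later entries in the past_winners list get slightly larger weights.
def past_winner_weights(n):
    decay = [0.01 * math.exp(-0.01 * (n - i)) for i in range(n)]
    total = sum(decay)
    return [d / total for d in decay]
# e.g. past_winner_weights(4)  -> roughly [0.2463, 0.2487, 0.2512, 0.2538]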
def update_texts(doc_texts, pairs_ranked_lists, sentence_data):
    """Replace one sentence in each reference document with the top-ranked candidate that keeps it within 150 words."""
    new_texts = {}
    for qid in pairs_ranked_lists:
        for chosen_pair in pairs_ranked_lists[qid]:
            ref_doc = chosen_pair.split("_")[0]
            replacement_index = int(chosen_pair.split("_")[1])
            sentence_in = sentence_data[qid][chosen_pair]["in"]
            sentences = sent_tokenize(doc_texts[ref_doc])
            sentences[replacement_index] = sentence_in
            new_text = "\n".join(sentences)
            if len(clean_texts(deepcopy(new_text)).split()) > 150:
                continue
            new_texts[ref_doc] = new_text
            break
    # Documents without an accepted replacement keep their original text.
    for doc in doc_texts:
        if doc not in new_texts:
            new_texts[doc] = doc_texts[doc]
    return new_texts
def calcualte_former_documents(current_epoch, qid, document_texts):
    """Collect earlier-epoch documents for the same query, skipping epoch 0 and exact duplicates."""
    former_docs = []
    seen_texts = []
    for doc in document_texts:
        query = doc.split("-")[2]
        epoch = int(doc.split("-")[1])
        if epoch == 0:
            continue
        if qid != query:
            continue
        if epoch >= current_epoch:
            continue
        doc_text = document_texts[doc]
        cleaned_text = clean_texts(doc_text)
        cleaned_text = cleaned_text.replace(" ", "")
        if cleaned_text in seen_texts:
            continue
        former_docs.append(doc)
        seen_texts.append(cleaned_text)
    return former_docs
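# Illustrative helper (format assumed from the indexing above): document identifiers
# carry the epoch in field 1 and the query id in field 2 when split on "-"; only the
# field positions are taken from the code, the surrounding name format is an assumption.
def parse_doc_name(doc):
    parts = doc.split("-")
    return int(parts[1]), parts[2]
# e.g. parse_doc_name("ROUND-03-051-00")  -> (3, "051")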
def create_features(raw_ds, ranked_lists, doc_texts, top_doc_index, ref_doc_index,
                    doc_tfidf_vectors_dir, tfidf_sentence_dir, queries, output_dir, qid):
    """Compute the sentence-replacement feature set for one query and write it to the feature files."""
    global word_embd_model
    feature_vals = {}
    relevant_pairs = raw_ds[qid]
    feature_list = [
        "FractionOfQueryWordsIn", "FractionOfQueryWordsOut", "CosineToCentroidIn",
        "CosineToCentroidInVec", "CosineToCentroidOut", "CosineToCentroidOutVec",
        "CosineToWinnerCentroidInVec", "CosineToWinnerCentroidOutVec",
        "CosineToWinnerCentroidIn", "CosineToWinnerCentroidOut", "SimilarityToPrev",
        "SimilarityToRefSentence", "SimilarityToPred", "SimilarityToPrevRef",
        "SimilarityToPredRef"
    ]
    for feature in feature_list:
        feature_vals[feature] = {}
    epoch, qid_original = reverese_query(qid)
    if epoch not in ["04", "06"]:
        return
    past_winners = get_past_winners(ranked_lists, epoch, qid_original)
    past_winners_semantic_centroid_vector = past_winners_centroid(
        past_winners, doc_texts, word_embd_model)
    past_winners_tfidf_centroid_vector = get_past_winners_tfidf_centroid(
        past_winners, doc_tfidf_vectors_dir)
    top_docs = ranked_lists[epoch][qid_original][:top_doc_index]
    ref_doc = ranked_lists[epoch][qid_original][ref_doc_index]
    ref_sentences = sent_tokenize(doc_texts[ref_doc])
    top_docs_tfidf_centroid = document_centroid(
        [get_java_object(doc_tfidf_vectors_dir + doc) for doc in top_docs])
    for pair in relevant_pairs:
        sentence_in = relevant_pairs[pair]["in"]
        sentence_out = relevant_pairs[pair]["out"]
        in_vec = get_text_centroid(clean_texts(sentence_in), word_embd_model, True)
        out_vec = get_text_centroid(clean_texts(sentence_out), word_embd_model, True)
        replace_index = int(pair.split("_")[1])
        query = queries[qid]
        # Query-term coverage of the incoming and outgoing sentences.
        feature_vals['FractionOfQueryWordsIn'][pair] = query_term_freq(
            "avg", clean_texts(sentence_in), clean_texts(query))
        feature_vals['FractionOfQueryWordsOut'][pair] = query_term_freq(
            "avg", clean_texts(sentence_out), clean_texts(query))
        # TF-IDF similarity to the centroid of the top-ranked documents.
        feature_vals['CosineToCentroidIn'][pair] = calculate_similarity_to_docs_centroid_tf_idf(
            tfidf_sentence_dir + pair.split("$")[1].split("_")[0] + "_" + pair.split("_")[2],
            top_docs_tfidf_centroid)
        feature_vals['CosineToCentroidOut'][pair] = calculate_similarity_to_docs_centroid_tf_idf(
            tfidf_sentence_dir + pair.split("$")[0] + "_" + pair.split("_")[1],
            top_docs_tfidf_centroid)
        # Embedding similarity to the top-ranked documents.
        feature_vals["CosineToCentroidInVec"][pair] = calculate_semantic_similarity_to_top_docs(
            sentence_in, top_docs, doc_texts, word_embd_model)
        feature_vals["CosineToCentroidOutVec"][pair] = calculate_semantic_similarity_to_top_docs(
            sentence_out, top_docs, doc_texts, word_embd_model)
        # Similarity to the centroid of past winning documents (embedding and TF-IDF).
        feature_vals['CosineToWinnerCentroidInVec'][pair] = cosine_similarity(
            in_vec, past_winners_semantic_centroid_vector)
        feature_vals['CosineToWinnerCentroidOutVec'][pair] = cosine_similarity(
            out_vec, past_winners_semantic_centroid_vector)
        feature_vals['CosineToWinnerCentroidIn'][pair] = calculate_similarity_to_docs_centroid_tf_idf(
            tfidf_sentence_dir + pair.split("$")[1].split("_")[0] + "_" + pair.split("_")[2],
            past_winners_tfidf_centroid_vector)
        feature_vals['CosineToWinnerCentroidOut'][pair] = calculate_similarity_to_docs_centroid_tf_idf(
            tfidf_sentence_dir + pair.split("$")[0] + "_" + pair.split("_")[1],
            past_winners_tfidf_centroid_vector)
        # Contextual similarity of the incoming/outgoing sentence to its neighbours in the reference document.
        feature_vals['SimilarityToPrev'][pair] = context_similarity(
            replace_index, ref_sentences, sentence_in, "prev", word_embd_model)
        feature_vals['SimilarityToRefSentence'][pair] = context_similarity(
            replace_index, ref_sentences, sentence_in, "own", word_embd_model)
        feature_vals['SimilarityToPred'][pair] = context_similarity(
            replace_index, ref_sentences, sentence_in, "pred", word_embd_model)
        feature_vals['SimilarityToPrevRef'][pair] = context_similarity(
            replace_index, ref_sentences, sentence_out, "prev", word_embd_model)
        feature_vals['SimilarityToPredRef'][pair] = context_similarity(
            replace_index, ref_sentences, sentence_out, "pred", word_embd_model)
    write_files(feature_list, feature_vals, output_dir, qid, ref_doc_index)
def cover(text, query):
    numerator = 0
    for q in query.split():
        if q in clean_texts(text):
            numerator += 1
    return numerator / len(query.split())
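# Standalone illustration of cover() above: note the check is substring containment in
# the cleaned text, so "cat" also matches "catalog". Uses a trivial lower-casing
# cleaner in place of clean_texts.
def cover_demo(text, query):
    cleaned = text.lower()
    hits = sum(1 for q in query.split() if q in cleaned)
    return hits / len(query.split())
# e.g. cover_demo("The catalog lists many dogs", "cat dog bird")  -> 0.666...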
def create_features(summaries_dir, ranked_lists, doc_texts, top_doc_index, ref_doc_index,
                    doc_tfidf_vectors_dir, tfidf_sentence_dir, summary_tfidf_dir, queries,
                    output_dir, qid):
    """Compute the feature set for every (reference sentence, summary sentence) pair of one query."""
    global word_embd_model
    feature_vals = {}
    feature_list = [
        "FractionOfQueryWordsIn", "FractionOfQueryWordsOut", "CosineToCentroidIn",
        "CosineToCentroidInVec", "CosineToCentroidOut", "CosineToCentroidOutVec",
        "CosineToWinnerCentroidInVec", "CosineToWinnerCentroidOutVec",
        "CosineToWinnerCentroidIn", "CosineToWinnerCentroidOut", "SimilarityToPrev",
        "SimilarityToRefSentence", "SimilarityToPred", "SimilarityToPrevRef",
        "SimilarityToPredRef"
    ]
    for feature in feature_list:
        feature_vals[feature] = {}
    epoch, qid_original = reverese_query(qid)
    query = queries[qid]
    past_winners = get_past_winners(ranked_lists, epoch, qid_original)
    past_winners_semantic_centroid_vector = past_winners_centroid(
        past_winners, doc_texts, word_embd_model)
    past_winners_tfidf_centroid_vector = get_past_winners_tfidf_centroid(
        past_winners, doc_tfidf_vectors_dir)
    top_docs = ranked_lists[epoch][qid_original][:top_doc_index]
    ref_doc = ranked_lists[epoch][qid_original][ref_doc_index]
    ref_sentences = sent_tokenize(doc_texts[ref_doc])
    top_docs_tfidf_centroid = document_centroid(
        [get_java_object(doc_tfidf_vectors_dir + doc) for doc in top_docs])
    with open(summaries_dir + "_".join(query.split())) as summaries_file:
        # Read the summaries once; iterating the file handle inside the outer loop
        # would exhaust it after the first reference sentence.
        summaries = summaries_file.readlines()
    for i, sentence_out in enumerate(ref_sentences):
        for j, sentence_in in enumerate(summaries):
            pair = ref_doc + "_" + str(i) + "_" + str(j)
            replace_index = i
            in_vec = get_text_centroid(clean_texts(sentence_in), word_embd_model, True)
            out_vec = get_text_centroid(clean_texts(sentence_out), word_embd_model, True)
            feature_vals['FractionOfQueryWordsIn'][pair] = query_term_freq(
                "avg", clean_texts(sentence_in), clean_texts(query))
            feature_vals['FractionOfQueryWordsOut'][pair] = query_term_freq(
                "avg", clean_texts(sentence_out), clean_texts(query))
            feature_vals['CosineToCentroidIn'][pair] = calculate_similarity_to_docs_centroid_tf_idf(
                summary_tfidf_dir + pair, top_docs_tfidf_centroid)
            feature_vals['CosineToCentroidOut'][pair] = calculate_similarity_to_docs_centroid_tf_idf(
                tfidf_sentence_dir + pair.split("_")[0] + "_" + pair.split("_")[1],
                top_docs_tfidf_centroid)
            feature_vals["CosineToCentroidInVec"][pair] = calculate_semantic_similarity_to_top_docs(
                sentence_in, top_docs, doc_texts, word_embd_model, True)
            feature_vals["CosineToCentroidOutVec"][pair] = calculate_semantic_similarity_to_top_docs(
                sentence_out, top_docs, doc_texts, word_embd_model, True)
            feature_vals['CosineToWinnerCentroidInVec'][pair] = cosine_similarity(
                in_vec, past_winners_semantic_centroid_vector)
            feature_vals['CosineToWinnerCentroidOutVec'][pair] = cosine_similarity(
                out_vec, past_winners_semantic_centroid_vector)
            feature_vals['CosineToWinnerCentroidIn'][pair] = calculate_similarity_to_docs_centroid_tf_idf(
                summary_tfidf_dir + pair, past_winners_tfidf_centroid_vector)
            feature_vals['CosineToWinnerCentroidOut'][pair] = calculate_similarity_to_docs_centroid_tf_idf(
                tfidf_sentence_dir + pair.split("_")[0] + "_" + pair.split("_")[1],
                past_winners_tfidf_centroid_vector)
            feature_vals['SimilarityToPrev'][pair] = context_similarity(
                replace_index, ref_sentences, sentence_in, "prev", word_embd_model, True)
            feature_vals['SimilarityToRefSentence'][pair] = context_similarity(
                replace_index, ref_sentences, sentence_in, "own", word_embd_model, True)
            feature_vals['SimilarityToPred'][pair] = context_similarity(
                replace_index, ref_sentences, sentence_in, "pred", word_embd_model, True)
            feature_vals['SimilarityToPrevRef'][pair] = context_similarity(
                replace_index, ref_sentences, sentence_out, "prev", word_embd_model, True)
            feature_vals['SimilarityToPredRef'][pair] = context_similarity(
                replace_index, ref_sentences, sentence_out, "pred", word_embd_model, True)
    write_files(feature_list, feature_vals, output_dir, qid)
from summarization.seo_experiment.borda_mechanism import query_term_freq, query_term_occ
from summarization.seo_experiment.utils import clean_texts
from summarization.seo_experiment.workingset_creator import read_queries_file
from summarization.seo_experiment.summarization_process import transform_query_text
import nltk
import numpy as np

queries = read_queries_file("../data/queries.xml")
queries = transform_query_text(queries)

with open("top_docs_summaries.txt") as summary_access, \
        open("summarization_data.txt") as summary_data_access:
    summaries = summary_access.readlines()
    data_points = summary_data_access.readlines()

freqs = {"all": [], "first": []}
for i, summary in enumerate(summaries):
    data = data_points[i]
    qid = data.split("\t")[1]
    q_text = queries[qid]
    # Strip the summarizer's sentence tags and patch up the punctuation they leave behind.
    fixed_sum = summary.replace("<t>", "").replace("</t>", "").replace(", .", ".").replace(". .", ".")
    freqs["all"].append(query_term_occ("sum", clean_texts(fixed_sum), q_text))
    first = nltk.sent_tokenize(fixed_sum)[0]
    freqs["first"].append(query_term_occ("sum", clean_texts(first), q_text))

for k in freqs:
    freqs[k] = np.mean(freqs[k])
print(freqs)