def compute_rouge_abstract_score(sentence, abstract):
    """
    Score one sentence against a paper's abstract with ROUGE.

    :param sentence: the sentence to score, given as a list of words.
    :param abstract: the abstract to score against, given as a list of strings.
    :return: whatever Rouge.calc_score returns for the space-joined sentence and the abstract.
    """
    scorer = Rouge()

    # calc_score is given the candidate as a one-element list holding the sentence as a single string
    candidate = [" ".join(sentence)]
    return scorer.calc_score(candidate, abstract)
def __init__(self):
    """
    Set up the ROUGE based summariser.

    The summariser compares every sentence of a paper with the abstract to find the sentences that summarise
    the abstract best, on the assumption that those sentences also make good highlights for the paper.
    """
    # ROUGE scorer held on the instance
    self.r = Rouge()

    # NOTE(review): presumably the number of sentences in a generated summary - confirm against its users
    self.summary_length = 10
def __init__(self, visualise):
    """
    Set up the Oracle summariser.

    The Oracle is not a usable summariser: it picks the best possible sentences from a paper by comparing them
    to the gold summaries, and so marks the highest ROUGE score a summariser could achieve.

    :param visualise: stored unchanged on the instance as self.visualise.
    """
    self.visualise = visualise

    # ROUGE scorer held on the instance
    self.r = Rouge()

    # NOTE(review): presumably the number of sentences in a generated summary - confirm against its users
    self.summary_length = 10
def __init__(self):
    """
    Set up the summariser: fixed settings, a ROUGE scorer, a preprocessor and the computation graph.

    The summariser is described as ROUGE based: it compares each sentence in the paper to the abstract to see
    which ones make the best summaries for the abstract, assuming those sentences are also good highlights.

    NOTE(review): this constructor also builds a computation graph and reads prediction probabilities from
    it, so confirm that the description above still fits this class.
    """
    # Fixed settings
    self.summary_length = 10
    self.similarity_threshold = 0.75

    # Helper objects, constructed in the same order as before
    self.r = Rouge()
    self.preprocessor = AbstractNetPreprocessor()

    # Build the graph once and keep direct handles to the two entries that get used
    built_graph = graph()
    self.computation_graph = built_graph
    self.features_input = built_graph["features_input"]
    self.prediction_probs = built_graph["prediction_probs"]
def __init__(self):
    """
    Set up the summariser: fixed settings, a ROUGE scorer and a preprocessor.

    The summariser is described as ROUGE based: it compares each sentence in the paper to the abstract to see
    which ones make the best summaries for the abstract, assuming those sentences are also good highlights.

    NOTE(review): this constructor also sets a weight for a "feature probability", so confirm that the
    description above still fits this class.
    """
    # Hyperparameter to tune weight given to feature probability
    self.C = 0.30

    # NOTE(review): presumably a summary size and a minimum sentence length - confirm units against users
    self.summary_length = 10
    self.min_sent_len = 10

    # Helper objects, constructed in the same order as before
    self.r = Rouge()
    self.preprocessor = AbstractNetPreprocessor()
with open(REFERENCE_DIR + filename, "rb") as f: generated = f.readlines() golds.append((gold, filename)) generateds.append((generated, filename)) # Sanity checks assert(type(golds) is list) assert(type(generateds) is list) assert(len(golds) > 0) assert(len(generateds) > 0) assert(type(golds[0]) is tuple) assert(type(generateds[0]) is tuple) assert(type(golds[0][0][0]) is str) assert(type(generateds[0][0][0]) is str) r = Rouge() summaries_and_scores = [] for gold, ref in zip(golds, generateds): avg_score = 0 count = 0 for sent in ref[0]: score = r.calc_score([sent], gold[0]) avg_score += score count += 1 if count > 0: avg_score = avg_score / count else: avg_score = 0
from operator import itemgetter from multiprocessing import Pool import numpy as np sys.path.insert(0, os.environ['SCRATCH']+"/MATH689/TextSum") from Dev.DataTools import useful_functions from Dev.DataTools.useful_functions import wait, BASE_DIR, PAPER_SOURCE, GLOBAL_WORDCOUNT_WRITE_LOC,\ TRAINING_DATA_WRITE_LOC, Color, NUMBER_OF_PAPERS from Dev.Evaluation.rouge import Rouge # =================================== # ======== CONFIG VARS ======== # Create a ROUGE evaluation object rouge = Rouge() # The number of summary sentences to find in each paper num_summary = 20 # ============================= # ======== FUNCTIONS ======== def process_paper(filename): """ The concurrent function which processes each paper into its training data format. :param filename: the filename of the paper to process. :return: none, but write the preprocessed file to "Data/Training_Data/" """
# Module-level configuration and mutable state used by the rest of this script.

# One thirtieth of the paper count.
# NOTE(review): presumably the step size of a loading bar; whether this is an int or a float depends on the
# interpreter's division semantics - confirm.
LOADING_SECTION_SIZE = NUMBER_OF_PAPERS / 30

# NOTE(review): presumably the directory graphs are saved to - confirm against its users.
GRAPH_SAVE_DIR = BASE_DIR + "/Analysis/Graphs/"

# Count of how many papers have been processed.
count = 0

# Sentences as vectors with their classification
data = []

# Count of positive data
pos_count = 0

# Count of negative data
neg_count = 0

# ROUGE scorer instance
r = Rouge()

# ThreadPool built with argument 2 (presumably the number of worker threads - confirm)
pool = ThreadPool(2)

# True if already processed
PREPROCESSED = False

# True if data is prepared for plotting
PREPARED = False

# To count number of highlights
num_highlights = 0

# Number of highlights in first 150 papers
num_highlights_150 = 0
def prepare_data(self):
    """
    Builds balanced, labelled sentence data for every paper found in PAPER_SOURCE.

    For each ".txt" paper, every sentence outside the abstract is scored with ROUGE against the paper's
    highlights. The self.num_summary highest scoring sentences are labelled 1 and an equal number of the lowest
    scoring sentences are labelled 0; the labelled sentences are then shuffled. A paper is skipped when fewer
    sentences remain after the top ones than were taken as positives. Sentences are kept as lists of words -
    no word vectors are computed here.

    Whenever the processed-paper counter reaches a multiple of 1000, everything gathered so far is pickled to
    "data.pkl" in the AbstractNet generated-data directory under BASE_DIR.

    :return: a list with one dictionary per kept paper, with keys "filename", "title", "gold", "abstract",
             "sentences" (a list of (sentence, section, classification) tuples) and "description".
    """

    # Count of how many papers have been processed. It is incremented for every ".txt" file, including
    # papers that are later skipped for having too few sentences.
    count = 0

    # One dictionary per kept paper
    data = []

    r = Rouge()

    # Iterate over every file in the paper directory
    for filename in os.listdir(PAPER_SOURCE):

        # Ignores files which are not papers e.g. hidden files
        if not filename.endswith(".txt"):
            continue

        # Display a loading bar of progress
        useful_functions.loading_bar(self.loading_section_size, count, self.number_of_papers)
        count += 1

        # Opens the paper as a dictionary, with keys corresponding to section titles and values corresponding
        # to the text in that section. The text is given as a list of lists, each list being a list of words
        # corresponding to a sentence.
        paper = useful_functions.read_in_paper(filename, sentences_as_lists=True)

        # The highlights are the references every sentence is scored against, joined back into strings
        highlights_join = [" ".join(x) for x in paper["HIGHLIGHTS"]]
        abstract = paper["ABSTRACT"]

        # Score every sentence of every section except the abstract.
        # NOTE(review): only "ABSTRACT" is excluded, so sentences of the "HIGHLIGHTS" section itself are
        # scored against the highlights as well - confirm this is intended.
        scored = []
        for section, sents in paper.iteritems():

            # We don't want to calculate ROUGE for the abstract
            if section == "ABSTRACT":
                continue

            for sentence in sents:
                r_score = r.calc_score([" ".join(sentence)], highlights_join)
                scored.append((sentence, r_score, section))

        # Best score first. reversed(sorted(...)) is kept on purpose: sorted(..., reverse=True) would order
        # sentences with equal scores differently and so could change which sentences are selected.
        ranked = [(sentence, section)
                  for sentence, _, section in reversed(sorted(scored, key=itemgetter(1)))]

        sents_pos = ranked[:self.num_summary]
        sents_neg = ranked[self.num_summary:]

        # Not enough remaining sentences to balance the positives, so drop the paper
        if len(sents_neg) < len(sents_pos):
            continue

        # Take as many of the worst scoring sentences (worst first) as there are positives
        sents_neg = sents_neg[::-1][:len(sents_pos)]

        # Each item in the sentence list has form (sentence, section, classification)
        sents_class = ([(sentence, section, 1) for sentence, section in sents_pos] +
                       [(sentence, section, 0) for sentence, section in sents_neg])
        random.shuffle(sents_class)

        data.append({
            "filename": filename,
            "title": paper["MAIN-TITLE"],
            "gold": paper["HIGHLIGHTS"],
            "abstract": abstract,
            "sentences": sents_class,
            "description": "All text data is given in the form of a list of words."
        })

        # Periodic checkpoint of everything gathered so far
        if count % 1000 == 0:
            print("\nWriting data...")
            write_dir = BASE_DIR + "/Data/Generated_Data/Sentences_And_SummaryBool/Abstract_Neg/AbstractNet/"
            with open(write_dir + "data.pkl", "wb") as f:
                pickle.dump(data, f)
            print("Done")

    return data
LOADING_SECTION_SIZE = NUMBER_OF_PAPERS / 30 GRAPH_SAVE_DIR = BASE_DIR + "/Analysis/Graphs/" # Count of how many papers have been processed. count = 0 # Sentences as vectors with their classification data = [] # Count of positive data pos_count = 0 # Count of negative data neg_count = 0 r = Rouge() pool = ThreadPool(2) # True if already processed PREPROCESSED = False # True if data is prepared for plotting PREPARED = False if not PREPROCESSED: # Holds the ROUGE scores by section rouge_by_section = defaultdict(list) # Iterate over every file in the paper directory for filename in os.listdir(PAPER_SOURCE):
LOADING_SECTION_SIZE = NUMBER_OF_PAPERS / 30 GRAPH_SAVE_DIR = BASE_DIR + "/Analysis/Graphs/" # Count of how many papers have been processed. count = 0 # Sentences as vectors with their classification data = [] # Count of positive data pos_count = 0 # Count of negative data neg_count = 0 r = Rouge() pool = ThreadPool(2) # True if already processed PREPROCESSED = True # True if data is prepared for plotting PREPARED = True if not PREPROCESSED: # Holds the ROUGE scores by section rouge_by_section = defaultdict(list) # Iterate over every file in the paper directory for filename in os.listdir(PAPER_SOURCE):