def compute_rouge_abstract_score(sentence, abstract):
    """
    Scores a single sentence against a paper's abstract using ROUGE.
    :param sentence: the sentence to score, given as a list of words.
    :param abstract: the abstract of the paper to score against, given as a list of strings.
    :return: whatever Rouge.calc_score returns for the sentence measured against the abstract.
    """
    scorer = Rouge()

    # calc_score is handed the candidate as a one-element list holding the space-joined sentence
    candidate = [" ".join(sentence)]
    return scorer.calc_score(candidate, abstract)
Пример #2
0
 def __init__(self):
     """
     Sets up the ROUGE based summariser.

     The summariser scores every sentence in the paper against the abstract to find the sentences that summarise
     the abstract best, on the assumption that those sentences will also make good highlights for the paper.
     """
     # ROUGE scorer held by the instance
     self.r = Rouge()

     # presumably the number of sentences in a generated summary - confirm against the methods that read it
     self.summary_length = 10
Пример #3
0
 def __init__(self, visualise):
     """
     Sets up the Oracle summariser.

     The Oracle is not a summariser that can be used in practice: it picks the best possible sentences from the
     paper by comparing them to the gold summaries, so it marks the upper bound on the ROUGE score that any
     summariser could achieve.
     :param visualise: stored unchanged on the instance as self.visualise.
     """
     self.visualise = visualise

     # ROUGE scorer held by the instance
     self.r = Rouge()

     # presumably the number of sentences in a generated summary - confirm against the methods that read it
     self.summary_length = 10
 def __init__(self):
     """
     Initialises the summariser's settings, ROUGE scorer, preprocessor and computation graph.

     The class is described as a ROUGE based summariser: one that compares each sentence in the paper to the
     abstract to find the best summary sentences, assuming those sentences also make good highlights.
     NOTE(review): this constructor also builds a computation graph and keeps handles to its "features_input"
     and "prediction_probs" entries, so that description may have been copied from another class - confirm.
     """
     # Scalar settings
     self.summary_length = 10
     self.similarity_threshold = 0.75

     # Helper objects
     self.r = Rouge()
     self.preprocessor = AbstractNetPreprocessor()

     # Build the graph once and keep direct references to two of its entries
     built_graph = graph()
     self.computation_graph = built_graph
     self.features_input = built_graph["features_input"]
     self.prediction_probs = built_graph["prediction_probs"]
Пример #5
0
    def __init__(self):
        """
        Initialises the summariser's settings, ROUGE scorer and preprocessor.

        The class is described as a ROUGE based summariser: one that compares each sentence in the paper to the
        abstract to find the best summary sentences, assuming those sentences also make good highlights.
        NOTE(review): the feature-probability weight C set here suggests the class does more than plain ROUGE
        ranking - confirm that the description is still accurate.
        """
        # Hyperparameter to tune weight given to feature probability
        self.C = 0.30

        # Length settings; presumably a summary size and a minimum sentence length - confirm units with callers
        self.min_sent_len = 10
        self.summary_length = 10

        # Helper objects
        self.r = Rouge()
        self.preprocessor = AbstractNetPreprocessor()
Пример #6
0
        with open(REFERENCE_DIR + filename, "rb") as f:
            generated = f.readlines()
        golds.append((gold, filename))
        generateds.append((generated, filename))

# Sanity checks: golds and generateds must both be non-empty lists of tuples, and the first item inside the
# first element of each tuple must be a str.
assert(type(golds) is list)
assert(type(generateds) is list)
assert(len(golds) > 0)
assert(len(generateds) > 0)
assert(type(golds[0]) is tuple)
assert(type(generateds[0]) is tuple)
assert(type(golds[0][0][0]) is str)
assert(type(generateds[0][0][0]) is str)

# One scorer is reused for every gold/generated pair.
r = Rouge()
# NOTE(review): nothing is appended to summaries_and_scores in the visible part of this script.
summaries_and_scores = []
# Pairs are matched purely by position; despite its name, "ref" holds the entry taken from generateds.
for gold, ref in zip(golds, generateds):

    # Running total of the per-sentence scores, and the number of sentences scored
    avg_score = 0
    count = 0

    # Score each generated sentence on its own against the whole gold summary
    for sent in ref[0]:
        score = r.calc_score([sent], gold[0])
        avg_score += score
        count += 1

    # Turn the total into a mean; with no sentences to score the average is left at 0
    if count > 0:
        avg_score = avg_score / count
    else:
        avg_score = 0
Пример #7
0
from operator import itemgetter
from multiprocessing import Pool
import numpy as np
sys.path.insert(0, os.environ['SCRATCH']+"/MATH689/TextSum")
from Dev.DataTools import useful_functions
from Dev.DataTools.useful_functions import wait, BASE_DIR, PAPER_SOURCE, GLOBAL_WORDCOUNT_WRITE_LOC,\
    TRAINING_DATA_WRITE_LOC, Color, NUMBER_OF_PAPERS
from Dev.Evaluation.rouge import Rouge


# ===================================

# ======== CONFIG VARS ========

# How many summary sentences to pick out of each paper
num_summary = 20

# Module-wide ROUGE evaluation object
rouge = Rouge()

# =============================

# ======== FUNCTIONS ========


def process_paper(filename):
    """
    The concurrent function which processes each paper into its training data format.

    NOTE(review): only the signature and this docstring are visible here - the body is missing, so the behaviour
    described below is the stated intent and has not been checked against an implementation.

    :param filename: the filename of the paper to process.
    :return: none; the preprocessed file is meant to be written to "Data/Training_Data/".
    """
# One thirtieth of the paper count (presumably the loading-bar step), and the directory for saved graphs
LOADING_SECTION_SIZE = NUMBER_OF_PAPERS / 30
GRAPH_SAVE_DIR = BASE_DIR + "/Analysis/Graphs/"

# Running counters: papers processed, positive data, negative data
count, pos_count, neg_count = 0, 0, 0

# Sentences as vectors with their classification
data = []

# Shared ROUGE scorer, followed by a ThreadPool created with an argument of 2
r = Rouge()
pool = ThreadPool(2)

# PREPROCESSED: True if already processed. PREPARED: True if data is prepared for plotting.
PREPROCESSED, PREPARED = False, False

# Highlight counters: the overall number, and the number in the first 150 papers
num_highlights, num_highlights_150 = 0, 0
Пример #9
0
    def prepare_data(self):
        """
        Builds a balanced set of labelled sentences from every paper in PAPER_SOURCE.

        For each ".txt" paper, every sentence outside the abstract is scored with ROUGE against the paper's
        highlights. The self.num_summary highest scoring sentences are labelled 1 and the same number of the
        lowest scoring sentences are labelled 0. A paper is skipped when it has fewer remaining sentences than
        positive ones, which keeps the two classes the same size.

        The data gathered so far is pickled to "data.pkl" each time a retained paper brings the processed count
        to a multiple of 1000.
        NOTE(review): nothing is written after the loop ends, so papers handled after the last checkpoint are
        only returned, not saved - confirm that callers persist the return value.

        :return: a list with one dictionary per retained paper, with the keys "filename", "title", "gold",
                 "abstract", "sentences" and "description". Every item of "sentences" is a tuple of
                 (sentence, section, classification), where the sentence is a list of words.
        """

        # Count of how many papers have been processed.
        count = 0

        # One dictionary per retained paper
        data = []

        r = Rouge()

        # Iterate over every file in the paper directory
        for filename in os.listdir(PAPER_SOURCE):

            # Ignores files which are not papers e.g. hidden files
            if not filename.endswith(".txt"):
                continue

            # Display a loading bar of progress
            useful_functions.loading_bar(self.loading_section_size, count, self.number_of_papers)
            count += 1

            # Opens the paper as a dictionary, with keys corresponding to section titles and values corresponding
            # to the text in that section. The text is given as a list of lists, each list being a list of words
            # corresponding to a sentence.
            paper = useful_functions.read_in_paper(filename, sentences_as_lists=True)

            # The highlights are the reference that every sentence is scored against
            highlights = paper["HIGHLIGHTS"]
            highlights_join = [" ".join(x) for x in highlights]
            abstract = paper["ABSTRACT"]

            scored = []

            # items() is used because iteritems() only exists on Python 2 dictionaries
            for section, sents in paper.items():

                # We don't want to calculate ROUGE for the abstract
                if section == "ABSTRACT":
                    continue

                for sentence in sents:
                    r_score = r.calc_score([" ".join(sentence)], highlights_join)
                    scored.append((sentence, r_score, section))

            # Best score first. reversed(sorted(...)) is kept on purpose: sorted(..., reverse=True) would order
            # sentences with equal scores differently.
            ranked = [(sent, sec) for sent, _, sec in reversed(sorted(scored, key=itemgetter(1)))]

            sents_pos = ranked[:self.num_summary]
            sents_neg = ranked[self.num_summary:]

            # Not enough remaining sentences to balance the positives, so leave this paper out
            if len(sents_neg) < len(sents_pos):
                continue

            # Negatives are taken from the worst scoring end of the ranking, as many as there are positives
            sents_neg = sents_neg[::-1][:len(sents_pos)]

            sents_pos = [(sent, sec, 1) for sent, sec in sents_pos]
            sents_neg = [(sent, sec, 0) for sent, sec in sents_neg]
            sents_class = sents_pos + sents_neg
            random.shuffle(sents_class)

            # Each item in the sentence list has form (sentence, section, classification)
            data.append({
                "filename": filename,
                "title": paper["MAIN-TITLE"],
                "gold": paper["HIGHLIGHTS"],
                "abstract": abstract,
                "sentences": sents_class,
                "description": "All text data is given in the form of a list of words."
            })

            # Periodic checkpoint of everything gathered so far
            if count % 1000 == 0:
                print("\nWriting data...")
                write_dir = BASE_DIR + "/Data/Generated_Data/Sentences_And_SummaryBool/Abstract_Neg/AbstractNet/"
                with open(write_dir + "data.pkl", "wb") as f:
                    pickle.dump(data, f)
                print("Done")

        return data
Пример #10
0
# One thirtieth of the paper count (presumably the loading-bar step), and the directory for saved graphs
LOADING_SECTION_SIZE = NUMBER_OF_PAPERS / 30
GRAPH_SAVE_DIR = BASE_DIR + "/Analysis/Graphs/"

# Counters for papers processed, positive data and negative data
count, pos_count, neg_count = 0, 0, 0

# Sentences as vectors with their classification
data = []

# Shared ROUGE scorer, followed by a ThreadPool created with an argument of 2
r = Rouge()
pool = ThreadPool(2)

# PREPROCESSED: True if already processed. PREPARED: True if data is prepared for plotting.
PREPROCESSED, PREPARED = False, False

if not PREPROCESSED:
    # Holds the ROUGE scores by section
    rouge_by_section = defaultdict(list)

    # Iterate over every file in the paper directory
    for filename in os.listdir(PAPER_SOURCE):
Пример #11
0
# One thirtieth of the paper count (presumably the loading-bar step), and the directory for saved graphs
LOADING_SECTION_SIZE = NUMBER_OF_PAPERS / 30
GRAPH_SAVE_DIR = BASE_DIR + "/Analysis/Graphs/"

# Counters for papers processed, positive data and negative data
count, pos_count, neg_count = 0, 0, 0

# Sentences as vectors with their classification
data = []

# Shared ROUGE scorer, followed by a ThreadPool created with an argument of 2
r = Rouge()
pool = ThreadPool(2)

# PREPROCESSED: True if already processed. PREPARED: True if data is prepared for plotting.
PREPROCESSED, PREPARED = True, True

if not PREPROCESSED:
    # Holds the ROUGE scores by section
    rouge_by_section = defaultdict(list)

    # Iterate over every file in the paper directory
    for filename in os.listdir(PAPER_SOURCE):