def calculate_features(self, sentence, bag_of_words, keyphrases, abstract, title, section, shorter=False):
        """
        Builds the feature vector for a single sentence.
        :param sentence: the sentence, as a list of word strings.
        :param bag_of_words: dict bag-of-words for the paper; keys are words, values are counts.
        :param keyphrases: the keyphrases of the paper.
        :param abstract: the paper's abstract as a list of strings.
        :param title: the title of the paper.
        :param section: name of the paper section this sentence came from.
        :param shorter: if True, return only the scalar features (no word vectors).
        :return: a tuple of features for the sentence.
        """
        # Scalar similarity / salience scores.
        rouge_vs_abstract = useful_functions.compute_rouge_abstract_score(sentence, abstract)
        tf_idf = useful_functions.calculate_tf_idf(sentence, self.global_paper_count, bag_of_words)
        doc_tf_idf = useful_functions.calculate_document_tf_idf(sentence, bag_of_words)
        kp_score = useful_functions.calculate_keyphrase_score(sentence, keyphrases)
        title_words = {x for x in title if x not in STOPWORDS}
        title_score = useful_functions.calculate_title_score(sentence, title_words)

        # Simple surface features.
        sent_len = len(sentence)
        numeric_count = sum(1 for word in sentence if useful_functions.is_number(word))

        # Map the section name onto its categorical code; first match wins.
        sec = OTHER
        for markers, label in (
                (("HIGHLIGHT",), HIGHLIGHT),
                (("ABSTRACT",), ABSTRACT),
                (("INTRODUCTION",), INTRODUCTION),
                (("RESULT", "DISCUSSION"), RESULT_DISCUSSION),
                (("CONCLUSION",), CONCLUSION),
                (("METHOD",), METHOD)):
            if any(marker in section for marker in markers):
                sec = label
                break

        if shorter:
            return rouge_vs_abstract, tf_idf, doc_tf_idf, kp_score, title_score, numeric_count, \
                   sent_len, sec

        # Word-embedding features for the first word and the first word pair;
        # fall back to zero vectors for very short sentences or OOV words.
        zeros = [0] * self.word2vec_feature_nums
        if sent_len > 2 and sentence[0] in self.vocab:
            first_word = self.word2vec[sentence[0]]
        else:
            first_word = zeros

        if sent_len > 2 and sentence[0] in self.vocab and sentence[1] in self.vocab:
            first_pair = np.concatenate((self.word2vec[sentence[0]], self.word2vec[sentence[1]]))
        else:
            first_pair = [0] * (self.word2vec_feature_nums * 2)

        return rouge_vs_abstract, tf_idf, doc_tf_idf, kp_score, title_score, numeric_count, \
               sent_len, sec, first_word, first_pair
# Example no. 2
    def summarise(self, filename):
        """
        Generates a summary of the paper.
        :param filename: the name of the file to summarise.
        :return: a summary of the paper as a list of (sentence, position) tuples;
                 the summary is also written to SUMMARY_WRITE_LOC and printed.
        """

        paper = self.prepare_paper(filename)

        bag_of_words = self.paper_bags_of_words[filename]
        # NOTE(review): looked up but not used below — kept for its KeyError
        # check that keyphrases exist for this paper; confirm before removing.
        paper_keyphrases = self.keyphrases[filename]

        # We don't want to make any predictions for the Abstract or Highlights
        # as these are already summaries.
        # BUG FIX: dict.iteritems() is Python 2 only; use items() (file already
        # uses Python 3 print() calls).
        sections_to_predict_for = []
        for section, text in paper.items():
            if section != "ABSTRACT" and section != "HIGHLIGHTS":
                sections_to_predict_for.append(text)

        # Sorts the sections according to the order in which they appear in the
        # paper (each item is (sentences, section_position_in_paper)).
        sorted_sections_to_predict_for = sorted(sections_to_predict_for, key=itemgetter(1))

        # Flatten into a list of sentences in paper order; each sentence is a
        # list of words.
        sentence_list = []
        for section_sentences, _position in sorted_sections_to_predict_for:
            sentence_list.extend(section_sentences)

        # Score each sentence; here the tf-idf score stands in as the
        # "summary-worthiness" prediction.
        predictions = []
        for sentence_text in sentence_list:
            tf_idf = useful_functions.calculate_tf_idf(sentence_text, self.global_paper_count, bag_of_words)
            predictions.append(tf_idf)

        # List of (sentence_text, sentence_index_in_paper, tf_idf_score).
        sentence_list_with_predictions = list(zip(sentence_list, range(len(sentence_list)), predictions))

        # Sort by score, highest first.
        sorted_predictions = sorted(sentence_list_with_predictions, key=itemgetter(-1), reverse=True)

        # Take the top few sentences to form the summary.
        summary_sents = sorted_predictions[0:self.summary_length]

        # Order summary sentences according to the order they appear in the paper.
        ordered_summary = sorted(summary_sents, key=itemgetter(1))

        # Build (sentence_string, position_in_paper) pairs.
        summary = []
        for words, sentence_position, _score in ordered_summary:
            summary.append((" ".join(words), sentence_position))

        # BUG FIX: filename.strip(".txt") strips the CHARACTERS '.', 't', 'x'
        # from both ends (e.g. "text.txt" -> "e"); remove the suffix instead.
        base_name = filename[:-len(".txt")] if filename.endswith(".txt") else filename
        useful_functions.write_summary(SUMMARY_WRITE_LOC, summary, base_name)

        for sentence in summary:
            print(sentence)
            print()