Example #1
    def __init__(self):
        """
        Double Dataset based Summariser.
        """
        self.summary_length = 10
        print("Reading bags of words...")
        t = time.time()
        self.paper_bags_of_words = useful_functions.load_pickled_object(PAPER_BAG_OF_WORDS_LOC)
        print("Done, took ", time.time() - t, " seconds.")

        # Dictionary holding the keyphrases for each paper
        print("Reading keyphrases...")
        t = time.time()
        self.keyphrases = useful_functions.load_pickled_object(KEYPHRASES_LOC)
        print("Done, took ", time.time() - t, " seconds.")

        # Dictionary which contains the counts of the number of different papers that a word occurs in
        print("Reading global count...")
        t = time.time()
        self.global_paper_count = useful_functions.load_pickled_object(GLOBAL_COUNT_LOC)
        print("Done, took ", time.time() - t, " seconds.")

        # The word2vec model used to represent the first word, and the first and second words, of the sentence
        print("Loading word2vec...")
        t = time.time()
        self.word2vec_model = useful_functions.load_word2vec()
        self.vocab = set(self.word2vec_model.index2word)
        print("Done, took ", time.time() - t, " seconds.")
Example #2
    def __init__(self):
        """
        Preprocesses data into a form suitable for use with networks that use the abstract vector as part of their
        input for classification.
        """

        # The number of summary sentences to extract from the paper as training data
        self.num_summary = 20

        # The number of papers to process and the loading section size which will be used to print a loading bar as
        # the papers are processed
        self.number_of_papers = len([
            name for name in os.listdir(PAPER_SOURCE) if name.endswith(".txt")
        ])
        self.loading_section_size = self.number_of_papers // 30

        # Number of classes to classify into
        self.num_classes = 2

        # A thread pool for parallel processing of data
        self.pool2 = ThreadPool(4)

        # Load the word2vec model
        print("Reading word2vec...")
        t = time.time()
        self.word2vec = useful_functions.load_word2vec()
        self.vocab = set(self.word2vec.index2word)
        print("Done, took ", time.time() - t, " seconds.")

        # Dictionary which contains bag of words representations for every paper
        print("Reading bags of words...")
        t = time.time()
        self.paper_bags_of_words = useful_functions.load_pickled_object(
            PAPER_BAG_OF_WORDS_LOC)
        print("Done, took ", time.time() - t, " seconds.")

        # Dictionary holding the keyphrases for each paper
        print("Reading keyphrases...")
        t = time.time()
        self.keyphrases = useful_functions.load_pickled_object(KEYPHRASES_LOC)
        print("Done, took ", time.time() - t, " seconds.")

        # Dictionary which contains the counts of the number of different papers that a word occurs in
        print("Reading global count...")
        t = time.time()
        self.global_paper_count = useful_functions.load_pickled_object(
            GLOBAL_COUNT_LOC)
        print("Done, took ", time.time() - t, " seconds.")

        # Start time, used to measure the total running time
        self.start_time = time.time()
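
The comment in Example #2 notes that loading_section_size is meant to drive a loading bar while the papers are processed. A minimal sketch of that idea, assuming one bar slot per completed section (the function name and bar width are assumptions):

def print_loading_bar(papers_processed, loading_section_size, width=30):
    """Hypothetical progress bar: prints one '#' per completed loading section."""
    filled = min(width, int(papers_processed / loading_section_size))
    print("[" + "#" * filled + "-" * (width - filled) + "]", end="\r")
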
Example #3
def get_data():
    """
    Loads the pickled data and flattens it into (sentence, features, label) tuples,
    truncating any sentence longer than MAX_SENT_LEN.
    """
    print("Loading Data...")
    t = time.time()
    data = useful_functions.load_pickled_object(DATA_DIR)
    sentences_class = []
    for item in data:
        sentences = item["sentences"]
        features = item["sentence_features"]

        for sentence, feat in zip(sentences, features):
            sent = sentence[0]
            sec = sentence[1]
            y = sentence[2]
            sentences_class.append((sent, feat, y))
    data = sentences_class
    print("Done, took ", time.time() - t, " seconds")

    print("Processing Data...")

    #new_data = [x for x in data if len(x[0]) < MAX_SENT_LEN]
    new_data = []
    for sent, feat, y in data:
        if len(sent) > MAX_SENT_LEN:
            new_sent = sent[0:MAX_SENT_LEN]
        else:
            new_sent = sent
        new_data.append((new_sent, feat, y))

    print("Done")

    return new_data
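
The list returned by get_data is typically split before training. A sketch of one way to do that, mirroring the one-third test split used in Example #7; the shuffle and its seed are assumptions:

import random

def split_data(data, test_fraction=1 / 3, seed=0):
    """Shuffle the (sentence, features, label) tuples and split them into test and train lists."""
    data = list(data)
    random.Random(seed).shuffle(data)
    test_len = int(len(data) * test_fraction)
    return data[:test_len], data[test_len:]
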
Example #4
def get_data():
    """
    Loads the data from the data directory given above and puts it into the form required by the summarisers. For this
    summariser the required data is: the raw sentences, the abstract vector and the sentence features.
    :return: The data, with sentences longer than the maximum length truncated to MAX_SENT_LEN.
    """

    print("Loading Data...")
    t = time.time()

    # The data is a pickled object
    data = useful_functions.load_pickled_object(DATA_DIR)

    # Data list
    sents_absvec_feats_class = []

    for item in data:

        sentences = item["sentences"]
        abstract_vec = item["abstract_vec"]
        features = item["sentence_features"]

        for sentence, feat in zip(sentences, features):
            sent = sentence[0]
            sec = sentence[1]
            y = sentence[2]
            sents_absvec_feats_class.append((sent, abstract_vec, feat, y))

    data = sents_absvec_feats_class

    print("Done, took ", time.time() - t, " seconds")

    print("Processing Data...")

    new_data = []
    for sent, abs_vec, feat, y in data:

        # Drop the first element of the feature tuple and keep the remaining seven features
        new_feat = (feat[1], feat[2], feat[3], feat[4], feat[5], feat[6],
                    feat[7])

        if len(sent) > MAX_SENT_LEN:
            new_sent = sent[0:MAX_SENT_LEN]
        else:
            new_sent = sent
        new_data.append((new_sent, abs_vec, new_feat, y))

    return new_data
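
get_data only truncates over-long sentences; before batching into a fixed-size network input the shorter ones usually need padding as well. A sketch of such a helper, where the pad token is an assumption:

def pad_sentence(sent, max_len, pad_token="<PAD>"):
    """Hypothetical helper: right-pad (or truncate) a tokenised sentence to exactly max_len entries."""
    if len(sent) >= max_len:
        return sent[:max_len]
    return sent + [pad_token] * (max_len - len(sent))
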
Example #5
    def extra_processing(self):
        """
        Reads the previously generated sentence data, maps self.process_item over it in parallel
        and pickles the result to the AbstractNet data file.
        """
        data_dir = BASE_DIR + "/Data/Generated_Data/Sentences_And_SummaryBool/Abstract_Neg/AbstractNet/data.pkl"
        write_dir = BASE_DIR + \
            "/Data/Generated_Data/Sentences_And_SummaryBool/Abstract_Neg/AbstractNet/abstractnet_data.pkl"

        print("----> Reading data...")
        t = time.time()
        data = useful_functions.load_pickled_object(data_dir)
        print("----> Done, took ", time.time() - t, " seconds")

        print("----> Beginning processing...")
        t = time.time()
        self.start_time = t
        new_data = self.pool2.map(self.process_item, data)
        # new_data = np.concatenate(new_data, axis=0)
        print("----> Done, took ", (time.time() - t) / 60, " minutes")

        useful_functions.pickle_list(new_data, write_dir)
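
extra_processing ends by handing the new list to useful_functions.pickle_list. A minimal sketch of what such a writer could look like, assuming it simply pickles the whole list to one file (the real helper is not shown here):

import pickle

def pickle_list(items, path):
    """Hypothetical counterpart to load_pickled_object: write a list of items to disk as one pickle."""
    with open(path, "wb") as f:
        pickle.dump(items, f)
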
Example #6
def get_data():
    """
    Loads the pickled data and returns a dictionary mapping "sentences" to the raw sentences
    and "labels" to their one-hot class labels.
    """
    print("Loading Data...")
    t = time.time()

    data = useful_functions.load_pickled_object(DATA_DIR)
    sents = []
    labs = []
    for item in data:
        sentences = item["sentences"]
        for sent, sec, y in sentences:
            sents.append(sent)
            labs.append(num2onehot(y, NUM_CLASSES))

    print("Done, took ", time.time() - t, " seconds")

    data = {
        "sentences": sents,
        "labels": labs
    }

    return data
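
The labels above are converted with num2onehot before being returned. A minimal sketch of that conversion, assuming integer class labels and a NumPy one-hot vector (the actual implementation is not shown in these examples):

import numpy as np

def num2onehot(label, num_classes):
    """Hypothetical helper: turn an integer class label into a one-hot vector of length num_classes."""
    onehot = np.zeros(num_classes, dtype=np.float32)
    onehot[label] = 1.0
    return onehot
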
Example #7
    correct_answers = graph_outputs["correct_answers"]

    # Accuracy operation
    accuracy = graph_outputs["accuracy"]

    with tf.Session() as sess:

        # Initialise all variables
        sess.run(tf.global_variables_initializer())

        # Saving object
        saver = tf.train.Saver()

        print("Loading Data...")
        t = time.time()
        data = useful_functions.load_pickled_object(DATA_DIR)
        sentvec_abstractvec_features_class = []
        for item in data:
            sentence_vecs = item["sentence_vecs"]
            abstract_vec = item["abstract_vec"]
            features = item["sentence_features"]
            for sent, feat in zip(sentence_vecs, features):
                vec = np.concatenate((sent[0], abstract_vec, feat))
                sentvec_abstractvec_features_class.append((vec, sent[2]))
        data = sentvec_abstractvec_features_class
        print("Done, took ", time.time() - t, " seconds")

        test_len = int(len(data) * (1/3))
        test_data = data[0:test_len]
        train_data = data[test_len:]
html.append("</div>")
html.append("<div id=\"gold\" class=\"text\">")
html.append("<h2>Human Written Summary</h2>")
html.append("<hr>")
html.append("<br>")
html.append("<p>")

highlights = paper["HIGHLIGHTS"]
print("Reading stuff...")
bag_of_words = defaultdict(float)
for key, val in paper.items():
    sents = val[0]
    for sent in sents:
        for word in sent:
            bag_of_words[word] += 1.0
global_paper_count = useful_functions.load_pickled_object(GLOBAL_COUNT_LOC)
print("Done")

sents_and_scores = []
for sentence in highlights[0]:
    sents_and_scores.append(
        calculate_tf_idf(sentence, global_paper_count, bag_of_words))
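
# calculate_tf_idf is assumed to return a list of (word, tf-idf score) pairs for one
# sentence, using the paper's bag of words as term frequencies and global_paper_count
# as document frequencies. A hypothetical sketch under that assumption (the _sketch
# name, the df fallback and the num_papers argument are not taken from this file):
import math

def calculate_tf_idf_sketch(sentence, global_paper_count, bag_of_words, num_papers=1000):
    scores = []
    for word in sentence:
        tf = bag_of_words[word]                  # within-paper term frequency
        df = global_paper_count.get(word, 1.0)   # number of papers containing the word
        scores.append((word, tf * math.log(num_papers / df)))
    return scores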

max_tf_idf = -1
for sentence in sents_and_scores:
    for word, score in sentence:
        if score > max_tf_idf:
            max_tf_idf = score

highlights_and_scores = []
for sentence in sents_and_scores: