Example #1
    @classmethod
    def create_my_summarizer(cls,
                             cae_model_path,
                             word_vector_model_path="vector/100",
                             mode=0):
        word_vectors = WordVectors.load(word_vector_model_path)
        convae = ConvolutionAutoEncoder.rebuild_for_testing(
            mini_batch_size=1, filemodel=cae_model_path)
        return CAESummarizer(convae, word_vectors, mode)
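A minimal usage sketch, assuming this factory lives on CAESummarizer (the class it instantiates); the model paths are the ones used elsewhere in this listing, and the call itself is illustrative rather than taken from the project:

# Illustrative only: build a summarizer from the saved CAE model and word vectors.
summarizer = CAESummarizer.create_my_summarizer("model/CAE.model",
                                                word_vector_model_path="vector/100",
                                                mode=0)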
Example #2
def statistic_freq():

    wordvectors = WordVectors.load("model/wordvector.txt")

    freq_array = [0] * 500

    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        senttmp = " ".join(treebank_sents[i])
        words = nltk.word_tokenize(senttmp)
        freq_array[len(words)] += 1

    # Brown
    brown_sents = brown.sents()
    for i in range(len(brown_sents)):
        senttmp = " ".join(brown_sents[i])
        words = nltk.word_tokenize(senttmp)
        freq_array[len(words)] += 1

    # DUC data
    folder_path = "/Users/HyNguyen/Documents/Research/Data/DUC20042005/duc2004/DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs"
    clusters_name = os.listdir(folder_path)
    for cluster_name in clusters_name:
        if cluster_name[0] == ".":
            # skip hidden files such as .DS_Store
            continue
        files_name = os.listdir(folder_path + "/" + cluster_name)
        for file_name in files_name:
            if file_name[0] == ".":
                # skip hidden files such as .DS_Store
                continue
            file_path = folder_path + "/" + cluster_name + "/" + file_name
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()
                text_tag = root[3]  # the fourth child should be <TEXT>
                if text_tag.tag == "TEXT":
                    text = text_tag.text.replace("\n", "")
                sentences = nltk.tokenize.sent_tokenize(text)
                for sentence in sentences:
                    words = nltk.word_tokenize(sentence)
                    freq_array[len(words)] += 1
            except:
                print("exception parse XML: ", file_name)
                continue
        print("Finish cluster name:", cluster_name, " , Wordvector size: ",
              str(wordvectors.embed_matrix.shape[0]))

    plt.plot(range(200), freq_array[:200], color='red', marker='.')
    plt.show()
Example #3
def create_summary_format_duc2004(ducpath, wordvectors_path, summary_path):
    wordvectors = WordVectors.load(wordvectors_path)
    clusters = []
    for cluster_id in os.listdir(ducpath):
        if cluster_id[0] == ".":
            continue
        cluster = Cluster.load_from_folder_duc(cluster_id,
                                               ducpath + "/" + cluster_id,
                                               wordvectors)
        summary = CAESummarizer.summary(cluster, 100)
        file_summary = summary_path + "/" + cluster_id[:-1].upper() + ".M.100.T.1"
        with open(file_summary, mode="w") as f:
            for line in summary:
                f.write(line + "\n")
        clusters.append(cluster)
        print("Finish loading cluster_id: ", cluster_id)
    return clusters
Example #4
def create_summary_format_opinosis(opinosis_path, wordvectors_path, summary_path):
    wordvectors = WordVectors.load(wordvectors_path)
    clusters = []
    for cluster_id in os.listdir(opinosis_path):
        if cluster_id[0] == ".":
            continue
        cluster = Cluster.load_from_opinosis(cluster_id,
                                             opinosis_path + "/" + cluster_id,
                                             wordvectors)
        summary = CAESummarizer.summary(cluster, 25, "kmean_simple")
        if len(summary) == 0:
            print("ttdt")
        cluster_id, _, _ = cluster_id.split(".")
        folder_summary = summary_path + "/" + cluster_id
        if not os.path.isdir(folder_summary):
            os.makedirs(folder_summary)
        file_summary = folder_summary + "/" + cluster_id + ".1.txt"
        with open(file_summary, mode="w") as f:
            for line in summary:
                f.write(line + "\n")
        clusters.append(cluster)
        print("Finish loading cluster_id: ", folder_summary)
    return clusters
Example #5
        self.length = content.count(" ")
        self.sentece_id = -1


import numpy as np
import time
import cPickle

if __name__ == "__main__":

    clusterpath = "data/vietnamesemds/cluster_1/"
    vectormodel = "model/word2vec/100"
    vietnamesemds_path = "data/vietnamesemds/"

    start = time.time()
    w2v = WordVectors.load("vector/100")
    end = time.time()

    convae = ConvolutionAutoEncoder.rebuild_for_testing(
        mini_batch_size=1, filemodel="model/CAE.model")
    clusters = [None] * 201

    counter = 1
    for cluster_id in os.listdir(vietnamesemds_path):
        _, id = cluster_id.split("_")
        cluster = Cluster.load_from_folder(
            cluster_id, vietnamesemds_path + cluster_id + "/")
        print("Cluster ", counter)
        counter += 1
        for document in cluster.list_documents:
            for sentence in document.list_sentences:
Example #6
            if len(elements) < 1:
                print(elements)
            idx = sentence.count(" ")
            try:
                histo[idx] += 1
                if idx > 70 and idx < 100:
                    print(sentence)
            except:
                pass

    for khung in histo:
        print(khung)

if __name__ == "__main__":

    wordvector_w2v = WordVectors.load_from_text_format("model/word2vec.txt", name="word2vec")
    prepare_indexs_score_of_file("data/sentence.score.dailymail.txt", wordvector_w2v, "dailymail")
    prepare_indexs_score_of_file("data/sentence.score.duc04.txt", wordvector_w2v, "duc04")
    prepare_indexs_score_of_file("data/sentence.score.duc05.txt", wordvector_w2v, "duc05")

    wordvector_glove = WordVectors.load_from_text_format("model/glove.filter.txt", name="glove")
    prepare_indexs_score_of_file("data/sentence.score.dailymail.txt", wordvector_glove, "dailymail")
    prepare_indexs_score_of_file("data/sentence.score.duc04.txt", wordvector_glove, "duc04")
    prepare_indexs_score_of_file("data/sentence.score.duc05.txt", wordvector_glove, "duc05")

    wordvector_cw = WordVectors.load_from_text_format("model/cwvector.txt", name="cw")
    prepare_indexs_score_of_file("data/sentence.score.dailymail.txt", wordvector_cw, "dailymail")
    prepare_indexs_score_of_file("data/sentence.score.duc04.txt", wordvector_cw, "duc04")
    prepare_indexs_score_of_file("data/sentence.score.duc05.txt", wordvector_cw, "duc05")
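    Since the three embedding models above are run through the same three score files, the block is equivalent to a small loop; the refactor below is purely illustrative (paths and names are copied from the calls above):

    # Illustrative refactor of the block above: identical calls, expressed as a loop.
    models = [("model/word2vec.txt", "word2vec"),
              ("model/glove.filter.txt", "glove"),
              ("model/cwvector.txt", "cw")]
    score_files = [("data/sentence.score.dailymail.txt", "dailymail"),
                   ("data/sentence.score.duc04.txt", "duc04"),
                   ("data/sentence.score.duc05.txt", "duc05")]
    for model_path, model_name in models:
        wv = WordVectors.load_from_text_format(model_path, name=model_name)
        for score_path, tag in score_files:
            prepare_indexs_score_of_file(score_path, wv, tag)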

Example #7
File: rnn.py Project: giahy2507/rnn
            print("Grad check failed for dWh: sum of error = %.9f" % (err / count))



from nltk.parse.stanford import StanfordParser
from nltk.treetransforms import chomsky_normal_form
from nltk.tree import Tree
from vector.wordvectors import WordVectors
parser = StanfordParser(
    path_to_jar="/Users/HyNguyen/Downloads/stanford-parser-full-2015-12-09/stanford-parser.jar",
    path_to_models_jar="/Users/HyNguyen/Downloads/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar",
    model_path="/Users/HyNguyen/Downloads/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

if __name__ == "__main__":

    rng = np.random.RandomState(4488)
    wordvector = WordVectors.load_from_text_format("model/word2vec.txt", "word2vec")
    pos_sent = []
    neg_sent = []
    with open("data/rt-polarity.neg.txt", mode="r") as f:
        neg_sent.append(f.readline())
        neg_sent.append(f.readline())
        neg_sent.append(f.readline())

    with open("data/rt-polarity.pos.txt", mode="r") as f:
        pos_sent.append(f.readline())
        pos_sent.append(f.readline())
        pos_sent.append(f.readline())


    trees = []
    labels = [0]*3 + [1]*3
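    The snippet cuts off before the trees list is filled. A sketch of how the StanfordParser and chomsky_normal_form imports above could be used to build binarized trees for these six sentences, assuming label 0 marks the negative lines (illustrative only, not the original continuation):

    # Illustrative continuation: parse each sentence, binarize it, keep (tree, label) pairs.
    for sent, label in zip(neg_sent + pos_sent, labels):
        parsed = next(parser.raw_parse(sent.strip()))  # raw_parse yields an iterator of nltk Trees
        chomsky_normal_form(parsed)                    # converts the tree to Chomsky normal form in place
        trees.append((parsed, label))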
Example #8
import sys
import time

from mpi4py import MPI
from vector.wordvectors import WordVectors

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

if __name__ == "__main__":
    data_scatters = []
    start_total = 0
    max_word = 70
    min_word = 10
    if rank == 0:
        start_total = time.time()
        wordvectors = WordVectors.load_from_text_format("model/cwvector.txt",
                                                        name="word2vec")
        print("Finished read wordvectors ...")
        with open("data/sentence.score.duc04.txt", mode="r") as f:
            traindata = f.readlines()
        size_sample = int(len(traindata) / size)
        for i in range(size):
            if i * size_sample + size_sample > len(traindata):
                data_scatters.append(traindata[i * size_sample:])
            else:
                data_scatters.append(
                    traindata[i * size_sample:i * size_sample + size_sample])
    else:
        wordvectors = None
        data_scatter = None

    wordvectors = comm.bcast(wordvectors, root=0)
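    The snippet stops right after the broadcast; judging from the near-identical MPI script in Example #12, the next step would be to scatter the per-rank slices, roughly as sketched here (illustrative, not the original lines):

    # Illustrative continuation, mirroring Example #12: each rank receives its own slice.
    data_scatter = comm.scatter(data_scatters, root=0)
    print("Process:", rank, "received", len(data_scatter), "training lines")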
Example #9
File: rnn.py Project: giahy2507/summarynew
    Returns:
        total_cost: cost of the forward pass
        total_grad: gradient with respect to theta
    """


    cost = 0
    grad = 0
    forward = instances
    return cost, grad


if __name__ == "__main__":

    rng = np.random.RandomState(4488)
    wordvector = WordVectors.load_from_text_format("../model/cwvector.txt", "cwvector")

    rnn = RecursiveNeuralNetworl(embsize=wordvector.embsize, mnb_size=30, lr=0.1,
                                 wordvector=wordvector, act_func=act.tanh)
    X_train, X_valid = load_sentiment_data()

    rnn.train(X_train, X_valid)

    # with open("../data/rt-polarity.neg.out.txt", mode="r") as f:
    #     neg_trees_str = f.readlines()
    #
    # X_neg = []
    # for neg_tree_str in neg_trees_str[:5]:
    #     t = Tree(neg_tree_str)
    #     t = merge_bin_tree(t)
    #     t.label = 0
    #     X_neg.append(t)
Example #10
                print(elements)
            idx = sentence.count(" ")
            try:
                histo[idx] += 1
                if idx > 70 and idx < 100:
                    print(sentence)
            except:
                pass

    for khung in histo:
        print(khung)


if __name__ == "__main__":

    wordvector_w2v = WordVectors.load_from_text_format("model/word2vec.txt",
                                                       name="word2vec")
    prepare_indexs_score_of_file("data/sentence.score.dailymail.txt",
                                 wordvector_w2v, "dailymail")
    prepare_indexs_score_of_file("data/sentence.score.duc04.txt",
                                 wordvector_w2v, "duc04")
    prepare_indexs_score_of_file("data/sentence.score.duc05.txt",
                                 wordvector_w2v, "duc05")

    wordvector_glove = WordVectors.load_from_text_format(
        "model/glove.filter.txt", name="glove")
    prepare_indexs_score_of_file("data/sentence.score.dailymail.txt",
                                 wordvector_glove, "dailymail")
    prepare_indexs_score_of_file("data/sentence.score.duc04.txt",
                                 wordvector_glove, "duc04")
    prepare_indexs_score_of_file("data/sentence.score.duc05.txt",
                                 wordvector_glove, "duc05")
Example #11
                text_tag = root[3]  # the fourth child should be <TEXT>
                if text_tag.tag == "TEXT":
                    text = text_tag.text.replace("\n", "")
                sentences = nltk.tokenize.sent_tokenize(text)
                for sentence in sentences:
                    words = nltk.word_tokenize(sentence)
                    samples.append(words)
            except:
                print("exception parse XML: ", file_name)
                continue
    print("Finish collecting training data from DUC2004")
    print("length of samples", len(samples))
    end_collect = time.time()
    print("Total time for collecting training data: " +
          str(end_collect - start_collect))
    return samples


if __name__ == "__main__":

    wordvectors = WordVectors.load("model/wordvector.txt")
    train_data = collect_data_from_ptb_brow_duc2004()
    final_array = []
    for i, words in enumerate(train_data):
        words_array = wordvectors.cae_prepare_data_from_words(words, 10, 100)
        final_array.append(words_array)
        if i == 69:
            break
    final_array = np.array(final_array)
    print(final_array.shape)
Example #12
    sys.stdout.write("Finish collecting training data from DUC2004\n")
    sys.stdout.flush()
    sys.stdout.write("length of samples " + str(len(samples)) + "\n")
    sys.stdout.flush()
    end_collect = time.time()
    sys.stdout.write("Total time for collecting training data: " +
                     str(end_collect - start_collect) + "\n")
    sys.stdout.flush()
    return samples


if __name__ == "__main__":
    data_scatters = []
    start_total = 0
    if rank == 0:
        start_total = time.time()
        wordvectors = WordVectors.load("model/wordvector.txt")
        print("Finished read wordvectors ...")
        traindata = collect_data_from_ptb_brow_duc2004()
        size_sample = int(len(traindata) / size)
        for i in range(size):
            if i * size_sample + size_sample > len(traindata):
                data_scatters.append(traindata[i * size_sample:])
            else:
                data_scatters.append(
                    traindata[i * size_sample:i * size_sample + size_sample])
    else:
        wordvectors = None
        data_scatter = None

    wordvectors = comm.bcast(wordvectors, root=0)
    print("Process:", rank, "broadcasted wordvectors ...")
    data_scatter = comm.scatter(data_scatters, root=0)
Example #13
        self.vector = vector
        self.length = content.count(" ")
        self.sentece_id = -1

import numpy as np
import time
import cPickle

if __name__ == "__main__":

    clusterpath = "data/vietnamesemds/cluster_1/"
    vectormodel = "model/word2vec/100"
    vietnamesemds_path = "data/vietnamesemds/"

    start = time.time()
    w2v = WordVectors.load("vector/100")
    end = time.time()

    convae = ConvolutionAutoEncoder.rebuild_for_testing(mini_batch_size=1,
                                                        filemodel="model/CAE.model")
    clusters = [None] * 201

    counter = 1
    for cluster_id in os.listdir(vietnamesemds_path):
        _, id = cluster_id.split("_")
        cluster = Cluster.load_from_folder(cluster_id,
                                           vietnamesemds_path + cluster_id + "/")
        print("Cluster ", counter)
        counter += 1
        for document in cluster.list_documents:
            for sentence in document.list_sentences:
                sentence_matrix = w2v.cae_prepare_data(sentence.content)
                if sentence_matrix is None: