def main(transform_func = None, n = 10):
    parser=StanfordParser(
        path_to_jar = "/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser.jar",
        path_to_models_jar = "/cs/fs/home/hxiao/code/stanford-parser-full-2015-01-30/stanford-parser-3.5.1-models.jar",
        model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
    )

    test_sents = treebank.sents()[-n:]

    print "len(test_sents) = %d" %(len(test_sents))

    if transform_func and callable(transform_func):
        print "transforming it using ", transform_func
        test_sents = [[transform_func(w) for w in s] 
                      for s in test_sents] # transform it

    print test_sents[:10]

    print "predicting"
    pred_parses = parser.parse_sents(test_sents)
    
    gold_parses = treebank.parsed_sents()[-n:]  # align the gold trees with the held-out test sentences
    
    print "evaluating"

    correct_n = gold_n = predicted_n = 0.0
    
    for gparse, pparse in zip(gold_parses, pred_parses):
        cn, gn, pn = precision_and_recall_stat(get_nodes_with_range(gparse), 
                                               get_nodes_with_range(pparse))
        correct_n += cn
        gold_n += gn
        predicted_n += pn
        
    print "Prediction: %f, Recall: %f" %(correct_n / predicted_n, correct_n / gold_n)
Example #2
 def __init__(self):
     nltk.download('treebank')
     nltk.download('stopwords')
     ids = nltk.corpus.treebank.fileids()
     self.sents=[]
     for id in ids:
         self.sents+=list(treebank.sents(id))
     self.wc = {}  # word -> list of (sentence index, token index) occurrences
     
     stop_words = set(stopwords.words('english'))
     for i in range(len(self.sents)):
         self.sents[i] = [word for word in self.sents[i] if word[0] != '*']  # drop PTB trace tokens such as *T*-1
         for j in range(len(self.sents[i])):
             if not self.sents[i][j] in self.wc:
                 self.wc[self.sents[i][j]]=[]
             self.wc[self.sents[i][j]].append((i,j))
     self.word_set=[]
     frequency = 10
     for i in self.wc:
         if len(self.wc[i])>= frequency and i.isalpha() and not i in stop_words:
             self.word_set.append(i)
     self.n=len(self.word_set)
     self.n_s=len(self.word_set)
     print(len(self.sents))
     print(len(self.wc))
     print(len(self.word_set))
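# A hedged usage sketch: the enclosing class is not shown above, so the name
# "TreebankIndex" is hypothetical. It uses the self.wc index built in __init__
# to list a few positions where a frequent word occurs.
idx = TreebankIndex()
word = idx.word_set[0]
for i, j in idx.wc[word][:5]:
    context = " ".join(idx.sents[i][max(0, j - 3):j + 4])
    print(word, "->", context)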
def get_word2vec(
        train_fn="data/rap/input.txt",
        saved_model_fn="save/save/GoogleNews-vectors-negative300.bin"):
    try:
        print "loading word2vec model at {0}".format(saved_model_fn)
        model = Word2Vec.load_word2vec_format(saved_model_fn, binary=True)
        print "model loaded"
        return model
    except IOError:
        print "no word2vec model found at {0}".format(saved_model_fn)
        with open(train_fn) as f:
            data = f.read()
            clean = TextLoader.clean_str(data)
            lines = [line.split(" ") for line in clean.split('\n')]
            full_data = brown.sents() + movie_reviews.sents() + treebank.sents(
            ) + lines
            print "training word2vec model"
            model = Word2Vec(workers=8)
            model.build_vocab(full_data)
            for i in xrange(0, 5):
                print "epoch " + str(i + 1)
                # full_data = shuffle(full_data)
                pb = ProgressBar(maxval=len(full_data))
                chunk_size = len(full_data) / 100
                j = 0
                pb.start()
                while j + chunk_size < len(full_data):
                    model.train(full_data[j:j + chunk_size])
                    j += chunk_size
                    pb.update(j)

            print "done training"
            model.save(saved_model_fn)
            return model
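# A hedged usage sketch of the helper above (old gensim API, matching the snippet;
# in newer gensim these lookups live on model.wv instead):
if __name__ == "__main__":
    w2v_model = get_word2vec()
    print w2v_model.most_similar("money", topn=5)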
Example #4
def test():
    model = torch.load('./ckpt/model0.pt')
    leafmodel = LeafNet()
    x = treebank.sents('wsj_0003.mrg')[0]
    y = treebank.parsed_sents('wsj_0003.mrg')[0]
    preprocess(y)
    # embed_x is the list of embedding vectors of x
    embed_x = []
    x_list = []
    l = int(len(x))

    for i in range(0, l):
        txlist = []
        x[i] = x[i].lower()
        txlist.append(x[i])
        tembed = torch.Tensor(get_embed(x[i]))
        embed_x.append(tembed)

        pred = leafmodel(embed_x[i])
        gt = (torch.argmax(pred)).item()
        txlist.append(gt)
        x_list.append(txlist)

    # we got the (sentence,gt) list, embedding vector list for the leafs
    xscore = 0.0
    while (len(x_list) != 1):
        x_list, embed_x, tscore = calculate_score(x_list, embed_x, model)
        xscore = xscore + tscore
    x_list = str(x_list).replace('[', '(').replace(']', ')').replace(
        '\'', '').replace(',', '')
    x_list_tree = Tree.fromstring((x_list))

    draw_trees(x_list_tree)
    draw_trees(y)
Example #5
def create_input_dataset():
	print 'Loading input'
	input_data = []
	tags = []
	sents = wsj.sents()
	json_file  = open('data.json','w') 
	counter = 0
	for i,sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
		prev = None
		prev_prev = None
		for j,word in enumerate(sentence):
			datapoint = {}
			temp = []
			len_sentence = len(sentence)

			
			if(j > 0):
				temp.append(sents[i][j-1])
			else:
				temp.append('*')
			if(j > 1):
				temp.append(sents[i][j-2])
			else:
				temp.append('*')
			temp.append(sents[i][j])
			if(j < len_sentence-1):
				temp.append(sents[i][j+1])
			else:
				temp.append('*')
			if(j < len_sentence-2):
				temp.append(sents[i][j+2])
			else:
				temp.append('*')

			datapoint['wn'] = temp
			
			datapoint['index'] = j

			datapoint['i'] = counter
			counter += 1
			if(prev == None):
				datapoint['t_minus_one'] = '*'
			else:
				datapoint['t_minus_one'] = prev[1]
			if(prev_prev == None):
				datapoint['t_minus_two'] = '*'
			else:
				datapoint['t_minus_two'] = prev_prev[1]

			prev_prev = prev
			prev = word
			# print datapoint,word[1]
			datapoint['tag'] = word[1]
			json_file.write(json.dumps(datapoint))
			json_file.write('\n')
			input_data.append(datapoint)
			tags.append(word[1])
	print 'Done'
	json_file.close()
	return input_data, tags
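# A hedged sketch of how the JSON-lines file written above can be read back, one
# datapoint per line (assumes create_input_dataset() has already been run and that
# 'data.json' is the file it produced).
import json

def load_input_dataset(path='data.json'):
    with open(path) as f:
        return [json.loads(line) for line in f]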
Example #6
def create_filter_index():
        sents = [" ".join(sent) for sent in treebank.sents()[:no_comp_reqs]]
        ids = list(range(no_comp_reqs))
        tags = ["" for sent in sents]
        filter_index,_,_,_ = glossary_extraction(sents, ids, tags, tag_mode="load tagger", filter_mode="threshold", threshold_coverage=1)
        with open('../temp/filter_index.pickle','wb') as f:
            pickle.dump(filter_index,f)
Example #7
def build_index(out_filename, in_filename = None):
    '''Builds data files for word lookup. Can take an optional input file
    to add to the data pool which is processed (not working).
    Data is then dumped to a pickle file.'''

    sents_data = []
    try:
        with open(in_filename) as in_file:
            sents_data += sent_tokenize(in_file.read())
    except:
        print("Warning: Failed to load external file for building.")

    sents_data += brown.sents() + treebank.sents()

    # get sentences, chop off their ambiguous heads, and look at their words
    mysents = [sent[1:] for sent in sents_data]
    # flatten the sublists of words into a single list of words
    mywords = [word for sent in mysents for word in sent]
    cfd = ConditionalFreqDist((word.lower(), word) for word in mywords)
    # look up the most frequent form of a lowercase word with cfd['word'].max(),
    # but check for the existence of the word in cfd first

    # made pickle file too large and slow
    # wordlist = set(words.words())
    # wordlist.update(brown.words())
    # wordlist.update(treebank.words())
    # common_words_lower = set([w for w in wordlist if w.islower()])
    # common_words_titlecase = set([w.lower() for w in wordlist if (w.istitle() and w not in common_words_lower)])

    out_file = open(out_filename, 'wb')
    pickle.dump(cfd, out_file, 2)
    # pickle.dump(common_words_lower, out_file, 2)
    # pickle.dump(common_words_titlecase, out_file, 2)
    out_file.close()
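# A hedged usage sketch of the pickled index: restore the ConditionalFreqDist and
# look up the most frequent surface form of a lowercased word, checking membership
# first as the comment above suggests ('index.pickle' stands in for whatever
# out_filename was passed to build_index).
import pickle

with open('index.pickle', 'rb') as f:
    cfd = pickle.load(f)
word = 'pierre'
if word in cfd:
    print(cfd[word].max())  # most frequent original-case form, e.g. 'Pierre'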
Example #8
def collect_data_from_ptb_brow_duc2004():

    start_collect = time.time()
    samples = []
    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        senttmp = " ".join(treebank_sents[i])
        words = nltk.word_tokenize(senttmp)
        samples.append(words)

    sys.stdout.write("Finish collecting training data from Penn Tree Bank")
    sys.stdout.flush()

    # Brown
    brown_sents = brown.sents()
    for i in range(len(brown_sents)):
        senttmp = " ".join(brown_sents[i])
        words = nltk.word_tokenize(senttmp)
        samples.append(words)
    sys.stdout.write("Finish collecting training data from Brown")
    sys.stdout.flush()

    # DUC data
    folder_path = "/Users/HyNguyen/Documents/Research/Data/duc2004/DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs"
    clusters_name = os.listdir(folder_path)
    for cluster_name in clusters_name:
        if cluster_name[0] == ".":
            # skip .DS_Store files on macOS
            continue
        files_name = os.listdir(folder_path + "/" + cluster_name)
        for file_name in files_name:
            if file_name[0] == ".":
                # skip .DS_Store files on macOS
                continue
            file_path = folder_path + "/" + cluster_name + "/" + file_name
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()
                text_tag = root._children[3]
                if text_tag.tag == "TEXT":
                    text = text_tag.text.replace("\n", "")
                sentences = nltk.tokenize.sent_tokenize(text)
                for sentence in sentences:
                    words = nltk.word_tokenize(sentence)
                    samples.append(words)
            except:
                print "exception parse XML: ", file_name
                continue
    sys.stdout.write("Finish collecting training data from DUC2004")
    sys.stdout.flush()
    sys.stdout.write("length of samples" + str(len(samples)))
    sys.stdout.flush()
    end_collect = time.time()
    sys.stdout.write("Total time for collecting training data: " +
                     str(end_collect - start_collect))
    sys.stdout.flush()
    return samples
Example #9
def collect_data_from_ptb_brow_duc2004():

    start_collect = time.time()
    samples = []
    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        senttmp = " ".join(treebank_sents[i])
        words = nltk.word_tokenize(senttmp)
        samples.append(words)

    sys.stdout.write("Finish collecting training data from Penn Tree Bank")
    sys.stdout.flush()

    # Brown
    brown_sents = brown.sents()
    for i in range(len(brown_sents)):
        senttmp = " ".join(brown_sents[i])
        words = nltk.word_tokenize(senttmp)
        samples.append(words)
    sys.stdout.write("Finish collecting training data from Brown")
    sys.stdout.flush()

    # DUC data
    folder_path = "/Users/HyNguyen/Documents/Research/Data/duc2004/DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs"
    clusters_name = os.listdir(folder_path)
    for cluster_name in clusters_name:
        if cluster_name[0] == ".":
            # skip .DS_Store files on macOS
            continue
        files_name = os.listdir(folder_path + "/" + cluster_name)
        for file_name in files_name:
            if file_name[0] == ".":
                # skip .DS_Store files on macOS
                continue
            file_path = folder_path + "/" + cluster_name +"/"+ file_name
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()
                text_tag = root._children[3]
                if text_tag.tag == "TEXT":
                    text = text_tag.text.replace("\n", "")
                sentences = nltk.tokenize.sent_tokenize(text)
                for sentence in sentences:
                    words = nltk.word_tokenize(sentence)
                    samples.append(words)
            except:
                print "exception parse XML: ", file_name
                continue
    sys.stdout.write("Finish collecting training data from DUC2004")
    sys.stdout.flush()
    sys.stdout.write("length of samples" + str(len(samples)))
    sys.stdout.flush()
    end_collect = time.time()
    sys.stdout.write("Total time for collecting training data: " + str(end_collect - start_collect))
    sys.stdout.flush()
    return samples
Example #10
 def read_wsj_from_treebank(self, index):
     from nltk.corpus import treebank
     self.__reset()
     self.__input_text = 'wsj_000' + str(index) + '.mrg'
     self.__sents = treebank.sents(self.__input_text)
     self.__tagged_sents = treebank.parsed_sents(self.__input_text)
     if self.__verbose:
         self.__print_all()
     return self.__tagged_sents
Example #11
def read_treebank(input_vocab_size=10000, output_vocab_size=10000, seq_len=10):
    all_sents = []
    for fname in treebank.fileids():
        sents = treebank.sents(fname)
        if sents:
            all_sents.extend(sents)

    return read_dataset(all_sents, input_vocab_size, output_vocab_size,
                        seq_len)
Example #12
def create_dataset():
	#print 'Loading dataset'
	dataset = []
	tags = []
	sents = wsj.sents()

	for i,sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
		prev = None
		prev_prev = None
		for j,word in enumerate(sentence):
			datapoint = {}
			temp = []
			
			len_sentence = len(sentence)
			
			if(j > 0):
				temp.append(sents[i][j-1])
			else:
				temp.append('*')
			if(j > 1):
				temp.append(sents[i][j-2])
			else:
				temp.append('*')
			
			temp.append(sents[i][j])

			if(j < len_sentence-1):
				temp.append(sents[i][j+1])
			else:
				temp.append('*')
			if(j < len_sentence-2):
				temp.append(sents[i][j+2])
			else:
				temp.append('*')

			#what is WN ?
			datapoint['wn'] = temp
			
			datapoint['index'] = j
			if(prev == None):
				datapoint['t_minus_one'] = '*'
			else:
				datapoint['t_minus_one'] = prev[1]
			if(prev_prev == None):
				datapoint['t_minus_two'] = '*'
			else:
				datapoint['t_minus_two'] = prev_prev[1]

			prev_prev = prev
			prev = word
			# print datapoint,word[1]
			dataset.append(datapoint)
			tags.append(word[1])
	#print 'Done'
	return dataset, tags
Example #13
def create_dataset():
    print "Loading dataset"
    dataset = []
    tags = []
    sents = wsj.sents()

    for i, sentence in enumerate(wsj.tagged_sents()[:no_of_sentences]):
        prev = None
        prev_prev = None
        for j, word in enumerate(sentence):
            datapoint = {}
            temp = []
            len_sentence = len(sentence)

            if j > 0:
                temp.append(sents[i][j - 1])
            else:
                temp.append("*")
            if j > 1:
                temp.append(sents[i][j - 2])
            else:
                temp.append("*")

            temp.append(sents[i][j])

            if j < len_sentence - 1:
                temp.append(sents[i][j + 1])
            else:
                temp.append("*")
            if j < len_sentence - 2:
                temp.append(sents[i][j + 2])
            else:
                temp.append("*")

            datapoint["wn"] = temp

            datapoint["index"] = j
            if prev == None:
                datapoint["t_minus_one"] = "*"
            else:
                datapoint["t_minus_one"] = prev[1]
            if prev_prev == None:
                datapoint["t_minus_two"] = "*"
            else:
                datapoint["t_minus_two"] = prev_prev[1]

            prev_prev = prev
            prev = word
            # print datapoint,word[1]
            dataset.append(datapoint)
            tags.append(word[1])
    print "Done"
    return dataset, tags
Example #14
 def generator(self):
     for index, file in enumerate(self.file_ids):
         if index % 10 == 0:
             print("Processed " + str(index) + " of " + str(len(self.file_ids)) + " files")
         parsed_sentences = treebank.parsed_sents(file)
         sentences = treebank.sents(file)
         for i in range(len(parsed_sentences)):
             yield {
                 'file': file,
                 'id': i,
                 'raw': sentences[i],
                 'parsed': parsed_sentences[i]
             }
Example #15
def create_dataset():
    print 'Loading dataset'
    dataset = []
    tags = []
    sents = wsj.sents()

    for i, sentence in enumerate(wsj.tagged_sents()[:10]):
        prev = None
        prev_prev = None
        for j, word in enumerate(sentence):
            datapoint = {}
            temp = []
            len_sentence = len(sentence)

            temp.append(sents[i][j])
            if (j > 0):
                temp.append(sents[i][j - 1])
            else:
                temp.append('*')
            if (j > 1):
                temp.append(sents[i][j - 2])
            else:
                temp.append('*')
            if (j < len_sentence - 1):
                temp.append(sents[i][j + 1])
            else:
                temp.append('*')
            if (j < len_sentence - 2):
                temp.append(sents[i][j + 2])
            else:
                temp.append('*')

            datapoint['wn'] = temp

            datapoint['index'] = j
            if (prev == None):
                datapoint['t_minus_one'] = '*'
            else:
                datapoint['t_minus_one'] = prev[1]
            if (prev_prev == None):
                datapoint['t_minus_two'] = '*'
            else:
                datapoint['t_minus_two'] = prev_prev[1]

            prev_prev = prev
            prev = word
            # print datapoint,word[1]
            dataset.append(datapoint)
            tags.append(word[1])
    print 'Done'
    return dataset, tags
Example #16
def statistic_freq():

    wordvectors = WordVectors.load("model/wordvector.txt")

    freq_array = [0] * 500

    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        senttmp = " ".join(treebank_sents[i])
        words = nltk.word_tokenize(senttmp)
        freq_array[len(words)] += 1

    # Brown
    brown_sents = brown.sents()
    for i in range(len(brown_sents)):
        senttmp = " ".join(brown_sents[i])
        words = nltk.word_tokenize(senttmp)
        freq_array[len(words)] += 1

    # DUC data
    folder_path = "/Users/HyNguyen/Documents/Research/Data/DUC20042005/duc2004/DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs"
    clusters_name = os.listdir(folder_path)
    for cluster_name in clusters_name:
        if cluster_name[0] == ".":
            # skip .DS_Store files on macOS
            continue
        files_name = os.listdir(folder_path + "/" + cluster_name)
        for file_name in files_name:
            if file_name[0] == ".":
                # skip .DS_Store files on macOS
                continue
            file_path = folder_path + "/" + cluster_name + "/" + file_name
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()
                text_tag = root._children[3]
                if text_tag.tag == "TEXT":
                    text = text_tag.text.replace("\n", "")
                sentences = nltk.tokenize.sent_tokenize(text)
                for sentence in sentences:
                    words = nltk.word_tokenize(sentence)
                    freq_array[len(words)] += 1
            except:
                print "exception parse XML: ", file_name
                continue
        print("Finish cluster name:", cluster_name, " , Wordvector size: ",
              str(wordvectors.embed_matrix.shape[0]))

    plt.plot(range(200), freq_array[:200], color='red', marker='.')
    plt.show()
def get_trees_sentences():
    trees = []
    sentences = []
    for file in treebank.fileids():
        for tree in treebank.parsed_sents(file):
            tree_str = str(tree)
            trees.append(tree_str)
        for sentence in treebank.sents(file):
            s = ""
            s = " ".join(words for words in sentence)
            sentences.append(s)
    assert len(trees) == len(sentences)
    sentences = list(map(lambda x: x.lower(), sentences))
    return (trees, sentences)
Example #18
def collect_word_from_data():
    vocab = {}

    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        for word in treebank_sents[i]:
            vocab[str(word)] = 1
    print("Finish Penn Tree Bank corpus, vocab size: ", str(len(vocab.keys())))

    # Brown
    brown_sents = brown.sents()
    for i in range(len(brown_sents)):
        for word in brown_sents[i]:
            vocab[str(word)] = 1
    print("Finish Broww corpus, vocab size: ", str(len(vocab.keys())))

    def parse_xml(file_path):
        try:
            tree = ET.parse(file_path)
            return tree
        except:
            return None

    # dailymail data
    with open("../data/sentence.score.dailymail.txt", mode="r") as f:
        for line in f:
            sentence, score = line.split("hynguyensplit")
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word)] = 1

    # duc04 data
    with open("../data/sentence.score.duc04.txt", mode="r") as f:
        for line in f:
            sentence, score = line.split("hynguyensplit")
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word)] = 1

    # duc05 data
    with open("../data/sentence.score.duc05.txt", mode="r") as f:
        for line in f:
            sentence, score = line.split("hynguyensplit")
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word)] = 1

    print("Finish reading vocab size: ", str(len(vocab.keys())))
    return vocab
Example #19
def statistic_freq():

    wordvectors = WordVectors.load("model/wordvector.txt")

    freq_array = [0] * 500

    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        senttmp = " ".join(treebank_sents[i])
        words = nltk.word_tokenize(senttmp)
        freq_array[len(words)] +=1

    # Brown
    brown_sents = brown.sents()
    for i in range(len(brown_sents)):
        senttmp = " ".join(brown_sents[i])
        words = nltk.word_tokenize(senttmp)
        freq_array[len(words)] +=1

    # DUC data
    folder_path = "/Users/HyNguyen/Documents/Research/Data/DUC20042005/duc2004/DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs"
    clusters_name = os.listdir(folder_path)
    for cluster_name in clusters_name:
        if cluster_name[0] == ".":
            # skip .DS_Store files on macOS
            continue
        files_name = os.listdir(folder_path + "/" + cluster_name)
        for file_name in files_name:
            if file_name[0] == ".":
                # skip .DS_Store files on macOS
                continue
            file_path = folder_path + "/" + cluster_name +"/"+ file_name
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()
                text_tag = root._children[3]
                if text_tag.tag == "TEXT":
                    text = text_tag.text.replace("\n", "")
                sentences = nltk.tokenize.sent_tokenize(text)
                for sentence in sentences:
                    words = nltk.word_tokenize(sentence)
                    freq_array[len(words)] +=1
            except:
                print "exception parse XML: ", file_name
                continue
        print("Finish cluster name:", cluster_name," , Wordvector size: ", str(wordvectors.embed_matrix.shape[0]))

    plt.plot(range(200), freq_array[:200], color='red', marker='.')
    plt.show()
Example #20
    def _init_train(self):
        lemmas = [
            tup[0].split() for tup in self.db.loadProcessed("lemmatized")
        ]

        model = FastText(min_count=5)
        model.build_vocab(brown.sents())
        model.train(
            brown.sents(),
            total_examples=model.corpus_count,
            total_words=model.corpus_total_words,
            epochs=model.epochs,
        )
        model.build_vocab(treebank.sents(), update=True)
        model.train(
            treebank.sents(),
            total_examples=model.corpus_count,
            total_words=model.corpus_total_words,
            epochs=model.epochs,
        )
        model.build_vocab(movie_reviews.sents(), update=True)
        model.train(
            movie_reviews.sents(),
            total_examples=model.corpus_count,
            total_words=model.corpus_total_words,
            epochs=model.epochs,
        )
        model.build_vocab(lemmas, update=True)
        model.train(
            lemmas,
            total_examples=model.corpus_count,
            total_words=model.corpus_total_words,
            epochs=model.epochs,
        )

        return model
Example #21
def preprocess_corpora():
	brown_words = brown.tagged_words(simplify_tags=True)
	treebank_words = treebank.tagged_words(simplify_tags=True)
	'''
	#this takes forever.
	bwog_corpus = nltk.corpus.PlaintextCorpusReader('../bwog-corpus-txt', '.*\.txt')
	bwog_sents = bwog_corpus.sents(bwog_corpus.fileids())
	bwog_words = []
	for s_i in xrange(0, len(bwog_sents)/100000):
		#TODO: skip punctuation
		simp_tagged_sent = [(word,simp_tag(tag)) for word,tag in nltk.pos_tag(bwog_sents[s_i])]
		bwog_words.extend(simp_tagged_sent)
	'''
	all_tagged_words = brown_words + treebank_words #+ bwog_words
	all_sents = brown.sents() + treebank.sents() #+ bwog_sents
	compute_concordance(all_tagged_words)
Example #22
def collect_word_from_data():
    vocab = {}
    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        for word in treebank_sents[i]:
            vocab[str(word).lower()] = 1
    print("Finish Penn Tree Bank corpus, vocab size: ", str(len(vocab.keys())))

    # Brown
    brown_sents = brown.sents()
    for i in range(len(brown_sents)):
        for word in brown_sents[i]:
            vocab[str(word).lower()] = 1
    print("Finish Broww corpus, vocab size: ", str(len(vocab.keys())))

    # dailymail data
    with open("../data/sentence.score.dailymail.txt", mode="r") as f:
        for line in f:
            sentence, score = line.split("hynguyensplit")
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word).lower()] = 1

    # duc04 data
    with open("../data/sentence.score.duc04.txt", mode="r") as f:
        for line in f:
            sentence, score = line.split("hynguyensplit")
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word).lower()] = 1

    # duc05 data
    with open("../data/sentence.score.duc05.txt", mode="r") as f:
        for line in f:
            sentence, score = line.split("hynguyensplit")
            words = nltk.word_tokenize(sentence)
            for word in words:
                vocab[str(word).lower()] = 1

    print("Finish reading vocab size: ", str(len(vocab.keys())))
    return vocab
Example #23
def main():
    k = 10  # k-cross validation
    correctSum = 0.
    totalSum = 0.
    untagged = corpus.sents()
    tagged = corpus.tagged_sents()  # optional parameter: tagset='universal'
    share = int(len(tagged) / k)

    print("####", k, "Fold Cross Validation ####")
    for i in range(k):
        print("Round", i + 1, end='\t')
        testRange = (i * share, (i + 1) * share)
        test_data = untagged[testRange[0]:testRange[1]]
        train_data = tagged[:testRange[0]] + tagged[testRange[1]:]
        ans_data = tagged[testRange[0]:testRange[1]]

        eva = validation(corpus, train_data, test_data, ans_data)
        correctSum += eva[0]
        totalSum += eva[1]
    print("### Average accuracy: %.4f" % (correctSum / totalSum), "###")
Example #24
def syntax(tagtokens):
    print "\n"
    print "Step 3: Syntax Analysis\n"

    dataset_size = len(treebank.parsed_sents())
    split_size = int(dataset_size * 0.97)
    learning_set = treebank.parsed_sents()[:split_size]
    test_set = treebank.parsed_sents()[split_size:]

    #create a set containing the raw sentences
    sents = treebank.sents()
    raw_test_set = [[w for w in sents[i]]
                    for i in range(split_size, dataset_size)]

    #construct the PCFG
    tbank_productions = []

    for sent in learning_set:
        for production in sent.productions():
            tbank_productions.append(production)

    for word, tag in tagtokens:
        t = Tree.fromstring("(" + tag + " " + word + ")")
        for production in t.productions():
            tbank_productions.append(production)

    tbank_grammar = nltk.grammar.induce_pcfg(Nonterminal('S'),
                                             tbank_productions)
    chart_parser = nltk.ChartParser(tbank_grammar)

    tokens = [word for word, tag in tagtokens]  # parse the raw tokens of the tagged input
    trees = chart_parser.parse(tokens)
    count = 1
    for tree in trees:
        print tree  # show first few trees
        if (count == 3):
            break
        else:
            count += 1
    return trees
def main():

    # Read sentence i of the treebank, with n1 <= i < n2
    n1 = 0
    n2 = 10

    # Parse sentence j of the treebank, with m1 <= j < m2
    # NB: usually, one should _NOT_ test a parser on the same sentences used to train it!!!
    #     But, in this example, the grammar is so small that it is unlikely to parse a new
    #     "unknown" sentence. The "unknown" sentence should use the same terminal symbols
    #     that the grammar "knows", and should be parseable with the productions the grammar "knows"...
    m1 = 0
    m2 = 1

    # Induce grammar from a subset of the treebank parsed sentences. Allocate parsers
    cfg_earley_parser, pcfg_pchart_parser = generate_grammar_and_parsers(
        treebank.parsed_sents()[n1:n2])

    # Parse sentences from the treebank
    for i in range(m1, m2):
        sentence = treebank.sents()[i]  # the sentence to parse
        print("Parsing:", sentence)

        gold_tree = treebank.parsed_sents()[i]  # the right parse tree

        # Parse the sentence with parsers we define;
        # see:
        # http://www.nltk.org/book/ch08-extras.html
        # http://www.nltk.org/book/ch08.html
        earley(
            cfg_earley_parser, sentence,
            gold_tree)  # here do not return any tree... just show all of them
        tree = pchart(pcfg_pchart_parser, sentence,
                      gold_tree)  # here get the best tree
        print("\nBEST TREE WITH PROB.: %.12e" % tree.prob())
        tree.draw()  # draw the tree
# Extracts Penn Treebank from NLTK.
from nltk.corpus import treebank
from operator import itemgetter
import codecs
words = treebank.sents()
tagged_words = [map(itemgetter(1), sent) for sent in treebank.tagged_sents()]
parsed_sents = treebank.parsed_sents()

total_sents = len(parsed_sents)

f = codecs.open('../data/penn_treebank','w','utf-8')
assert (len(words) == len(tagged_words) and len(words) == len(parsed_sents)), ' '.join(map(str, [len(words), len(tagged_words), len(parsed_sents)]))
f.write(str(total_sents) + '\n')
for i in xrange(total_sents):
	sent_len = len(words[i])
	f.write(str(sent_len) + '\n')
	
	sent = ' '.join(words[i])
	pos = ' '.join(tagged_words[i])
	assert(sent.count('\n') == 0 and pos.count('\n') == 0 and len(sent.split(' ')) == sent_len and len(pos.split(' ')) == sent_len)
	f.write(sent + '\n')
	f.write(pos + '\n')
	
	tree = str(parsed_sents[i]).split('\n')
	f.write(str(len(tree)) + '\n')
	f.write('\n'.join(tree) + '\n')
Example #27
from nltk.corpus import treebank as wsj

#file_in=open('wsj_0003.pos','r')

#content=file_in.readlines()
#file_in.close()

#print content[:5]

file_out_tag=open('tagged_sent_sample','w')
file_out_sent=open('untagged_sent_sample','w')

out1=wsj.sents()[:20]
out2=wsj.tagged_sents()[:20]
line=''
for i in out1 :
	#file_out_sent.write('\n\n')
	line=' '.join(i)
	file_out_sent.write(line)
	file_out_sent.write('\n')
	#print line	

for i in out2 :
	file_out_tag.write('\n\n')
	words=''
	for j in i :
		#print j
		words='/'.join(j)
		#print words
		file_out_tag.write(words)
		file_out_tag.write('\n')
Example #28
        elif token[1] in ["RB", "RBR", "RBS"]:
            mapped_tag = "SRB"
        else:
            mapped_tag = "MISC"
        mapped_sentence.append((token[0], mapped_tag))

    return mapped_sentence

#Training step
train_sents = treebank.tagged_sents()[:500]
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)

#Testing step
test_sents = treebank.sents()[500:1000]
tagged_sents = treebank.tagged_sents()[500:1000]
id = 0
file = open('Method-A-predictions.tsv', 'w')
w = csv.writer(file, delimiter='\t')
for actual_tagged_sent, actual_sent in zip(tagged_sents, test_sents):
    predicted_sent = t2.tag(actual_sent)
    evaluate(coarse_map(actual_tagged_sent), coarse_map(predicted_sent))
    row = id,coarse_map(actual_tagged_sent), coarse_map(predicted_sent)
    w.writerow(row)
    id += 1

table = []
total_tokens = 0
total_correct_tokens = 0
for k1, v1 in total_count.items():
Example #29
import nltk
import MeCab
from nltk import Tree
from nltk.corpus import brown, gutenberg, treebank
from nltk.tokenize.api import TokenizerI

# <markdowncell>

# ### Corpora
# 
# NLTK has several built-in corpora and resources

# <codecell>

treebank.sents()

# <codecell>

nltk.download('treebank')

# <codecell>

print treebank.parsed_sents()[0]

# <markdowncell>

# NLTK's CorpusReader classes manage files:

# <codecell>
Example #30
######## UNIGRAM TAGGER ##########

from nltk.tag import UnigramTagger
from nltk.corpus import treebank

#We use the first 3000 sentences of the treebank corpus as the training set to initialize
#the UnigramTagger class
#Unigram tagger can be trained by giving it a list of tagged sentences at initialization.
train_sents=treebank.tagged_sents()[:3000]
tagger=UnigramTagger(train_sents)
print treebank.sents()[0]
print tagger.tag(treebank.sents()[0])

test_sents=treebank.tagged_sents()[3000:]
print tagger.evaluate(test_sents)



tagger=UnigramTagger(model={'Pierre':'NN'})
tagger.tag(treebank.sents()[0])
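# A small, hedged extension of the cookbook example above: give the unigram tagger
# a default-tag backoff so tokens unseen in train_sents still receive a tag, which
# typically improves the evaluation score slightly.
from nltk.tag import DefaultTagger

unigram_with_backoff = UnigramTagger(train_sents, backoff=DefaultTagger('NN'))
print unigram_with_backoff.evaluate(test_sents)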
Example #31
'''
Created on Jul 2, 2015

@author: dongx
'''

import nltk
from nltk.corpus import brown, treebank
from nltk.tag import untag, tnt, DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger, brill, brill_trainer
from nltk.corpus.reader import ChunkedCorpusReader

#trainning data
sent = treebank.sents()[0]
brown_train_sents = brown.tagged_sents(categories='news')[1001:]
brown_test_sents = brown.tagged_sents(categories='news')[:1000]

#form multiple tagger in a tagging chain
def backoff_tagger(tagged_sents, tagger_classes, backoff=None):
    if not backoff:
        backoff = tagger_classes[0](tagged_sents)
        del tagger_classes[0]
 
    for cls in tagger_classes:
        tagger = cls(tagged_sents, backoff=backoff)
        backoff = tagger
 
    return backoff
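# A hedged usage sketch of backoff_tagger (reusing the names defined above in this
# snippet): chain Unigram -> Bigram -> Trigram taggers over the Brown training
# split, then score the resulting chain on the held-out split.
chain = backoff_tagger(brown_train_sents,
                       [UnigramTagger, BigramTagger, TrigramTagger],
                       backoff=DefaultTagger('NN'))
print(chain.evaluate(brown_test_sents))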

def train_brill_tagger(initial_tagger, train_sents, **kwargs):
    templates = [
        brill.Template(brill.Pos([-1])), #a rule can be generated using the previous part-of-speech tag
Example #32
def validate(model, leafmodel):
    model.eval()
    loss_list = []
    tick = time.time()
    total_avg_loss = 0
    for name in val_data_list:
        print('loaded file ', name)
        filex = treebank.sents(name)
        filey = treebank.parsed_sents(name)

        for s in range(0, len(filex)):
            print('working on sentence ', s)
            x = filex[s]
            y = filey[s]
            preprocess(y)
            # embed_x is the list of embedding vectors of x
            embed_x = []
            x_list = []
            l = int(len(x))

            for i in range(0, l):
                txlist = []
                x[i] = x[i].lower()
                txlist.append(x[i])
                tembed = torch.Tensor(get_embed(x[i]))
                embed_x.append(tembed)

                pred = leafmodel(embed_x[i])
                gt = (torch.argmax(pred)).item()
                txlist.append(gt)
                x_list.append(txlist)

            # we got the (sentence,gt) list, embedding vector list for the leafs
            xscore = 0.0
            while (len(x_list) != 1):
                x_list, embed_x, tscore = calculate_score(
                    x_list, embed_x, model)
                xscore = xscore + tscore

            x_list = str(x_list).replace('[', '(').replace(']', ')').replace(
                '\'', '').replace(',', '')
            x_list_tree = Tree.fromstring((x_list))

            # print('xscore is .....', xscore)
            yscore, _, celoss = compute_gtscore(y, model)
            delta_loss = compute_delta(x_list_tree, y)
            flist = []
            flist.append(delta_loss)
            delta_loss = torch.Tensor(flist)
            delta_loss = delta_loss.detach()

            # print('xscore is .....', xscore)
            yscore, _, celoss = compute_gtscore(y, model)
            # print('yscore is .....', yscore)
            # print('classification celosss is ....', celoss)
            loss = (xscore - yscore) + celoss + delta_loss
            total_avg_loss = total_avg_loss + loss.item()

    total_avg_loss = total_avg_loss / (len(filex) * len(val_data_list))
    print('validated for this epoch')
    return total_avg_loss
Example #33

bigram.evaluate(test_sents)


# # Find most frequent nouns
# The most frequent nouns usually provide information on the subject of a text. Below, the most frequent nouns of an already-tagged text from the *Treebank* corpus are determined. Let's see if we can infer the text's subject.

# In[20]:


from nltk.corpus import treebank
from nltk import FreqDist
from nltk import bigrams

print("\nTreebank sentences: ", treebank.sents(fileids="wsj_0003.mrg"))


# In[21]:


tagged0003=treebank.tagged_words(tagset="universal",fileids="wsj_0003.mrg")
print("File tagged0003: ",tagged0003)


# In[22]:


fdist=FreqDist(a[0].lower() for a in tagged0003 if a[1]=="NOUN")
#fdist.tabulate(20)
print(fdist.most_common(20))
Example #34
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


if __name__ == "__main__":
    # Load Word2Vec from Google
    w2v = word2vec.Word2Vec.load_word2vec_format("/Users/HyNguyen/Documents/Research/Data/GoogleNews-vectors-negative300.bin",binary=True)

    # Create object WordVectors

    wordvectors = WordVectors(300,np.empty((0,300),dtype=float),{})

    # wordvectors = WordVectors.load("model/wordvector.txt")

    # Penn Tree Bank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        senttmp = " ".join(treebank_sents[i])
        words = nltk.word_tokenize(senttmp)
        wordvectors.add_wordvector_from_w2vmodel(w2v,words)
    print("Finish penn tree bank corpus, Wordvector size: ", str(wordvectors.embed_matrix.shape[0]))



    # Brown
    brown_sents = brown.sents()
    for i in range(len(brown_sents)):
        if i % 1000 == 0:
            print("brow, process line: ", i)
        senttmp = " ".join(brown_sents[i])
        words = nltk.word_tokenize(senttmp)
Example #35
    for command in commands:
        c.execute(command)
    
print "Building clean words list..."
words = [w.lower() for w in brown.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]
words.extend([w.lower() for w in treebank.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in words_list.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in abc.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in movie_reviews.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in genesis.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])

print "Building clean sentences list"
sentences = []
for s in brown.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in treebank.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in abc.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in movie_reviews.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in genesis.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))

    
def singles(words):
        if len(words) < 1:
            return
        for w in words:
            if re.match("[a-zA-Z'-]+", w) and w.strip() != "''":
                yield w
Example #36
categories_df = {cat: pd.read_csv(f"./data/{cat}.csv") for cat in categories}

negative_sample_size = int(len(categories_df[selected_category]) / 4)
print(f"Selected Category: {selected_category}")
for category in categories_df:
    categories_df[category].drop('URL', 1, inplace=True)
    if category != selected_category:
        categories_df[category] = categories_df[category].sample(
            negative_sample_size)
    categories_df[category] = categories_df[category].assign(
        **{selected_category: category == selected_category})
    print("{} has {} samples;".format(category, len(categories_df[category])))
    #print(categories_df[category].head())
treebank_background = pd.DataFrame(
    map(lambda sent: ' '.join(sent),
        random.sample(list(treebank.sents()), negative_sample_size)),
    columns=["excerpt"]).assign(description=False)
#print("Treebank has {} samples.".format(len(treebank_background)))
#print("categories_df")
corpus = pd.concat(categories_df.values(), ignore_index=True, sort=False)
corpus = corpus.append(treebank_background, ignore_index=True, sort=False)  # DataFrame.append returns a new frame
corpus.fillna(value='', inplace=True)
#print(corpus)

pipeline = make_pipeline(TfidfVectorizer(),
                         LogisticRegression(solver='liblinear'))

X, y = corpus.excerpt, corpus[selected_category]

#cross validation
cv_results = cross_validate(pipeline, X, y, cv=5)
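# A hedged follow-up: summarize the five folds. cross_validate returns a dict whose
# 'test_score' entry is an array with one accuracy value per fold.
print(f"Mean CV accuracy: {cv_results['test_score'].mean():.3f}")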
Example #37
    print("false positives: ", falsePositives, "out of", amountPosTest,
          "tests")
    print("false negatives: ", falseNegative, "out of", amountNegTest, "tests")
    print("correctly parsed noun phrases:", posSucess, "out of",
          posSucess + falseNegative, "in gold standard")


#----------------------------------- MAIN -----------------------------------

sentences = treebank.parsed_sents()

train_len = round(0.9 * len(sentences))
data_train = sentences[0:train_len]
data_test = sentences[train_len + 1:-1]

sents = treebank.sents()
raw_sents = [[w for w in sents[i]]
             for i in range(train_len + 1, len(sentences))]

train_rules = chunker(data_train)
test_rules = chunker(data_test)

# All the rules with a probability under the value of
# prob_thresh will be discarded
prob_thresh = 0

rulesExclusivelyInTrain, rulesExclusivelyInTest, trainAmount, testAmount = accuracy(
    train_rules, test_rules, prob_thresh)

print('Noun phrase rules that occur exclusively in the train corpus: ',
      rulesExclusivelyInTrain)
Example #38
def train(model, leafmodel, optimizer, total_epochs):
    model.train()
    loss_list = []
    val_loss_list = []
    for t in range(total_epochs):
        tick = time.time()
        total_avg_loss = 0
        print('epoch is .....', t)
        for name in train_data_list:
            print('loaded file ', name)
            filex = treebank.sents(name)
            filey = treebank.parsed_sents(name)

            for s in range(0, len(filex)):
                print('working on sentence ', s)
                x = filex[s]
                y = filey[s]
                preprocess(y)
                # embed_x is the list of embedding vectors of x
                embed_x = []
                x_list = []
                l = int(len(x))
                optimizer.zero_grad()

                for i in range(0, l):
                    txlist = []
                    x[i] = x[i].lower()
                    txlist.append(x[i])
                    tembed = torch.Tensor(get_embed(x[i]))
                    embed_x.append(tembed)

                    pred = leafmodel(embed_x[i])
                    gt = (torch.argmax(pred)).item()
                    txlist.append(gt)
                    x_list.append(txlist)

                # we got the (sentence,gt) list, embedding vector list for the leafs
                xscore = 0.0
                while (len(x_list) != 1):
                    x_list, embed_x, tscore = calculate_score(
                        x_list, embed_x, model)
                    xscore = xscore + tscore
                x_list = str(x_list).replace('[',
                                             '(').replace(']', ')').replace(
                                                 '\'', '').replace(',', '')
                x_list_tree = Tree.fromstring((x_list))

                # print('xscore is .....', xscore)
                yscore, _, celoss = compute_gtscore(y, model)
                delta_loss = compute_delta(x_list_tree, y)
                flist = []
                flist.append(delta_loss)
                delta_loss = torch.Tensor(flist)
                delta_loss = delta_loss.detach()

                # print('yscore is .....', yscore)
                # print('classification celosss is ....', celoss)
                loss = (xscore + delta_loss - yscore) + celoss
                loss.backward()
                optimizer.step()
                total_avg_loss = total_avg_loss + loss.item()

        total_avg_loss = total_avg_loss / (len(filex) * len(train_data_list))
        loss_list.append(total_avg_loss)
        print('************************************************')
        tock = time.time()
        print('epoch_time ====', (tock - tick))
        val_loss = validate(model, leafmodel)
        val_loss_list.append(val_loss)

        torch.save(model, './ckpt/model' + str(t) + '.pt')
    loss_array = np.array(loss_list)
    val_loss_array = np.array(val_loss_list)
    np.savetxt("train_loss.csv", loss_array, delimiter=",")
    np.savetxt("val_loss.csv", val_loss_array, delimiter=",")
Example #39
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

# train
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)

print(treebank.sents()[0])
print(tagger.tag(treebank.sents()[0]))

# test
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))

Example #40
from nltk.corpus import treebank as wsj

#file_in=open('wsj_0003.pos','r')

#content=file_in.readlines()
#file_in.close()

#print content[:5]

file_out_tag=open('tagged_sent_sample','w')
file_out_sent=open('untagged_sent_sample','w')
out1=wsj.sents()[:10]
out2=wsj.tagged_sents()[:10]
line=''
for i in out1 :
	#file_out_sent.write('\n\n')
	line=' '.join(i)
	file_out_sent.write(line)
	file_out_sent.write('\n')
	#print line	

for i in out2 :
	file_out_tag.write('\n\n')
	words=''
	for j in i :
		#print j
		words='/'.join(j)
		#print words
		file_out_tag.write(words)
		file_out_tag.write('\n')
	
Example #41
print("ABC to sentences")
genesis_corp_sents = genesis.sents()
print("Genesis to sents")
frame_net_corp_sents = fn.sents()
print("Frame_net to sents")
state_union_corp_sents = state_union.sents()
print('state union to sents')
subject_corp_sents = subjectivity.sents()
print('Subjectivity to sents')
brown_corp_sents = brown.sents()
print("Brown corpus to sents")
movie_reviews_corp_sents = movie_reviews.sents()
print("Movie reviews to sents ")
guttenberg_corp_sents = gutenberg.sents()
print("Guttenberg to sents")
treebank_corb_sents = treebank.sents()
print("Freebank to sents")
reuters_corp_sents = reuters.sents()
print("Reuters to sents")
webtext_corp_sents = webtext.sents()
print("Webtext to sents")

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

print("Cleaning data ...")

discard_punctuation_and_lowercased_sents_condll2007 = [[
    word.lower() for word in sent if word not in punctuation
] for sent in conll2007_corp_sents]
# -*- coding: utf-8 -*-
#!/usr/bin/env python 
from gensim.models import Word2Vec 
from nltk.corpus import brown, movie_reviews, treebank 

if __name__ == '__main__':
	brown_sentences = Word2Vec(brown.sents())
	movie_sentences = Word2Vec(movie_reviews.sents()) 
	treebank_sentences = Word2Vec(treebank.sents()) 

	print brown_sentences.most_similar('money', topn=5) 
	print movie_sentences.most_similar('money', topn=5) 
	print treebank_sentences.most_similar('money', topn=5) 

Example #43
def build_corpus(selected_category):
    categories_df = {cat : pd.read_csv(f"../data/{cat}.csv") for cat in categories}
    negative_sample_size = int(len(categories_df[selected_category]) / 4)
    print(f"Selected Category: {selected_category}")
    for category in categories_df:
        categories_df[category].drop('URL', 1, inplace=True)
        if category != selected_category:
            categories_df[category] = categories_df[category].sample(negative_sample_size)
        categories_df[category] = categories_df[category].assign(**{selected_category: category == selected_category})
        print("{} has {} samples;".format(category, len(categories_df[category])))
        #print(categories_df[category].head())
    treebank_background = pd.DataFrame(map(lambda sent: ' '.join(sent), random.sample(list(treebank.sents()), negative_sample_size)), columns=["excerpt"]).assign(description=False)
    #print("Treebank has {} samples.".format(len(treebank_background)))
    #print("categories_df")
    corpus = pd.concat(categories_df.values(), ignore_index=True, sort=False)
    corpus = corpus.append(treebank_background, ignore_index=True, sort=False)  # DataFrame.append returns a new frame
    corpus.fillna(value='', inplace=True)
    return corpus
class _BertTxtContainer:  # stores the longest stretch of text that BERT can evaluate
    def __init__(self):
        self.tokN = 0
        self.txt = ''

    def addTxtArr(self, inputTxt):
        tokenized = tokenizer.tokenize(inputTxt)
        if len(tokenized) + self.tokN <= 512:
            self.txt += inputTxt + ' '
            self.tokN += len(tokenized)
            return True
        return False


# compute perplexity on the Penn Treebank
N = len(treebank.sents())
perplexity = []
print('Sentences:', N)
bert_txt = _BertTxtContainer()
c = 0

for sent in treebank.sents()[:N]:
    c += 1
    sentTxt = ' '.join(sent)
    # if the limit was exceeded, compute perplexity and put this sentence into a new container
    if not bert_txt.addTxtArr(sentTxt):
        perplexity.append(BERT_model.get_score(bert_txt.txt))
        # print('tokN', bert_txt.tokN, 'toks:', bert_txt.txt)
        bert_txt = _BertTxtContainer()
        bert_txt.addTxtArr(sentTxt)
    print(100 * c / N, '%')
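# A hedged follow-up (using BERT_model.get_score exactly as above): the loop never
# scores the final, partially filled container, so flush it here and report the
# mean perplexity over all chunks.
if bert_txt.tokN > 0:
    perplexity.append(BERT_model.get_score(bert_txt.txt))
print('Mean perplexity:', sum(perplexity) / len(perplexity))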
Example #45
import nltk
from nltk.tag import hmm
from nltk.probability import LaplaceProbDist
from typing import List, Dict, AnyStr
from numpy import mean
import argparse
from nltk.corpus import treebank
import re


treebank_sentence = [' '.join(sentence) for sentence in treebank.sents()]

lower_cased = [sentence.lower() for sentence in treebank_sentence]
allowed_states = re.compile('[^a-z,.\s]')

print(allowed_states.sub('', 'a'))
''.join([allowed_states.sub('', character) for character in lower_cased[0]])

for i in treebank_sentence[0]:
    print(type(i))

for i in additional_text_transitions.keys():
    if i not in tagger._transitions.keys():

        print(i)



Example #46
from __future__ import division
from gensim.models import word2vec
from nltk.corpus import brown, treebank
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

tb_lens = 0
for i, sent in enumerate(treebank.sents()):
	tb_lens += len(sent)

model = word2vec.Word2Vec(iter=1, size=300, window=10, min_count=1, sg=0)
model.build_vocab(brown.sents())
model.train(treebank.sents(), total_examples=tb_lens, epochs=model.iter)

print(" < -o- > vocab length:")
print(len(model.wv.vocab))

model.save('model')
import nltk
from gensim.models import Word2Vec
from nltk.corpus import brown, movie_reviews, treebank

b = Word2Vec(brown.sents())
mr = Word2Vec(movie_reviews.sents())
t = Word2Vec(treebank.sents())

print(b.most_similar('money', topn=5))

print('aew')
Example #48
with open('tc_tags.txt', 'w') as f:
    for tag in tc_tags:
        f.write('%s\n' % tag)

treebank_tags = []
for t in treebank.tagged_words():
    treebank_tags.append(t[1])

with open('treebank_tags.txt', 'w') as f:
    for tag in treebank_tags:
        f.write('%s\n' % tag)

with open('tc_sent_lengths.txt', 'w') as f:
    for sent in tc['sents']:
        f.write('%s\n' % len(sent))

with open('treebank_sent_lengths.txt', 'w') as f:
    for sent in treebank.sents():
        f.write('%s\n' % len(sent))

#tc_tags_series = pd.Series(tc_tags)
#tc_tag_freq = tc_tags_series.value_counts()
#tc_tag_freq.plot(kind='bar')

#treebank_tags_series = pd.Series(treebank_tags)
#treebank_tag_freq = treebank_tags_series.value_counts()
#treebank_tag_freq.plot(kind='bar')

#plt.show()

Example #49
from nltk.corpus import treebank
from nltk.tree import *
import nltk
from nltk.grammar import *
import numpy

treebank.ensure_loaded()

# building the grammar and test set
tbank_productions = treebank.parsed_sents()
grammar_used = tbank_productions[:int(len(tbank_productions) * 0.8)]

# normalize the c structures
for t in grammar_used:
    t.chomsky_normal_form()
tbank_productions2 = list(treebank.sents())
test_part = tbank_productions2[int(len(tbank_productions) * 0.8):]


# prodcutions
productions = []
for t in grammar_used:
    productions += Tree.productions(t)

# induce PCFG
S = nltk.Nonterminal("S")
grammar = nltk.induce_pcfg(S, productions)
prod = grammar.productions()

# helper function to get the probability of a production
def findProb(lhsa, rhsa, prod):
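    # The listing is cut off here; a minimal, hedged sketch of the lookup this
    # helper presumably performs (an assumption, not the original body): return
    # the probability of the production lhsa -> rhsa from the induced PCFG.
    for p in prod:
        if p.lhs() == lhsa and p.rhs() == rhsa:
            return p.prob()
    return 0.0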
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
training= treebank.tagged_sents()[:7000]
unitagger=UnigramTagger(training)
print(treebank.sents()[0])
print(unitagger.tag(treebank.sents()[0]))
Example #51
import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger
unitag = UnigramTagger(model={'Vinken': 'NN'})
print(unitag.tag(treebank.sents()[0]))
Example #52
    brown_POS_tags = set()

    for word, pos in brown.tagged_words(tagset='universal'):
        brown_vocab.add(word.lower())
        brown_POS_tags.add(pos)

    hmm = SequentialHMM(brown_vocab, brown_POS_tags)

    brown_corpus = [[(word.lower(), pos) for word, pos in sent]
                    for sent in brown.tagged_sents(tagset='universal')]

    print("Not allowing unknown words:")
    hmm.train(brown_corpus)

    print("1 example in treebank:")
    for sent in treebank.sents():
        new_sent = [
            word.lower() for word in sent if word.lower() in brown_vocab
        ]
        if len(new_sent) == len(sent):
            print(hmm.decode(new_sent))
            break

    print("--------")
    print("Allowing unknown words:")

    hmm = SequentialHMM(brown_vocab, brown_POS_tags, allow_unknown=True)
    hmm.train(brown_corpus)

    print("5 examples in treebank:")
    for sent in treebank.sents()[:5]:
import nltk
from nltk.tag import BigramTagger
from nltk.corpus import treebank
training_1= treebank.tagged_sents()[:7000]
bigramtagger=BigramTagger(training_1)
print(treebank.sents()[0])
print(bigramtagger.tag(treebank.sents()[0]))
testing_1 = treebank.tagged_sents()[2000:]
print(bigramtagger.evaluate(testing_1))