Example #1
import nltk
from nltk.corpus import abc, brown
from gensim.models import FastText


def train_model():
    # read_json_file is assumed to be a project helper defined elsewhere.
    data = read_json_file("data/bible_kjv_wrangled.json")
    sentences = list(data.values())
    # Do we want everything in lowercase?
    sentences = [s.lower() for s in sentences]

    print("-----------Tokenize corpus-------------")
    tokenized_sentences = []
    for s in sentences:
        tokens = nltk.word_tokenize(s)
        tokenized_sentences.append(tokens)

    for s in abc.sents():
        s = list(filter(lambda x: x.isalpha() and len(x) > 1, s))
        s = [x.lower() for x in s]  # Do we want everything in lowercase?
        tokenized_sentences.append(s)

    for s in brown.sents():
        s = list(filter(lambda x: x.isalpha() and len(x) > 1, s))
        s = [x.lower() for x in s]  # Do we want everything in lowercase?
        tokenized_sentences.append(s)

    print("------------TRAINING FASTTEXT-----------")

    model = FastText(tokenized_sentences,
                     size=100,  # renamed to vector_size in gensim >= 4.0
                     window=5,
                     min_count=5,
                     workers=4,
                     sg=1)

    print("----------------DONE-------------")
    return model
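
# Hedged usage sketch for the model returned above. It assumes read_json_file,
# the JSON file and the NLTK corpora are available; the query word and the
# output filename are arbitrary choices, not part of the original code.
model = train_model()
print(model.wv.most_similar("shepherd", topn=5))
model.save("fasttext_kjv_abc_brown.model")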
Example #2
from nltk.corpus import abc


def init_data():
    all_sentence_data = abc.sents()
    all_words = []
    for sent in all_sentence_data:
        for word in sent:
            if word not in ["!", ",", "?", '"', "(", ")", ".", ":", ";"]:
                all_words.append(word.lower())
    return all_words
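
# Hedged usage sketch: inspect the most frequent tokens that init_data() returns
# (assumes the abc corpus has already been fetched with nltk.download('abc')).
from collections import Counter

all_words = init_data()
print(Counter(all_words).most_common(10))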
Example #3
from nltk.corpus import abc


def ari(fileid):
    """Compute the Automated Readability Index (ARI) for an abc corpus file."""
    print(fileid)
    num_chars = len(abc.raw(fileid))
    num_words = len(abc.words(fileid))
    num_sents = len(abc.sents(fileid))

    avg_word_len = num_chars / num_words
    avg_sent_len = num_words / num_sents

    return avg_word_len * 4.71 + avg_sent_len * 0.5 - 21.43
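
# Hedged usage sketch: the abc corpus ships two files ('rural.txt' and
# 'science.txt'), so the score can be computed for each of them.
for fileid in abc.fileids():
    print(ari(fileid))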
Example #4
def main():
	corpus = [sentence for sentence in abc_corpus.sents()]
	corpus = clean_corpus(corpus)
	corpus_size = len(corpus)

	w2v = word2vec(corpus)

	print ("Corpus Size - {}".format(corpus_size))
	print ("Vocab size - {}".format(w2v.vocab_size))
	
	if mode == "cbow":
		model = cbow(w2v)
		train(model, generate_cbow_train_data, corpus, w2v)
	elif mode == "skp":
		model = skip_gram(w2v)
		train(model, generate_skp_train_data, corpus, w2v)
Example #5
import os
import pickle
import string

from nltk.corpus import abc, stopwords


def pre_process():
    """Remove stop words and punctuation marks from corpus
    """
    if 'cleaned_corpus.pkl' not in os.listdir(
            os.curdir) or 'cleaned_sentences.pkl' not in os.listdir(os.curdir):
        print('Pre-processing...')
        words = abc.words()
        words = [w for w in words]

        sentences = abc.sents()
        sentences = [s for s in sentences]

        stop_words = set(stopwords.words('english'))
        punctuation = set(string.punctuation)

        def is_noise(token):
            # A token is noise if it consists only of punctuation characters
            # or if it is an English stop word.
            return not (set(token) - punctuation) or token.lower() in stop_words

        # Rebuild the lists instead of calling .remove() while iterating,
        # which silently skips elements.
        words = [w for w in words if not is_noise(w)]
        sentences = [[t for t in s if not is_noise(t)] for s in sentences]

        # Drop sentences that are empty or a single token after cleaning.
        sentences = [s for s in sentences if len(s) > 1]

        pickle.dump(words, open('cleaned_corpus.pkl', 'wb'))
        pickle.dump(sentences, open('cleaned_sentences.pkl', 'wb'))

    else:
        print('Pre-processed data already present...')
        words = pickle.load(open('cleaned_corpus.pkl', 'rb'))
        sentences = pickle.load(open('cleaned_sentences.pkl', 'rb'))

    return words, sentences
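
# Hedged follow-up sketch: the cleaned sentences are a natural input for gensim's
# Word2Vec. gensim is not used in the original snippet, so this is an assumption;
# parameter names follow the gensim 4.x API and the query word is arbitrary.
from gensim.models import Word2Vec

words, sentences = pre_process()
w2v = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=5, workers=4)
print(w2v.wv.most_similar('wheat', topn=5))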
Example #6
    if len(raw_sentence) > 0:
        book_sentences.append(sentence_to_wordlist(raw_sentence))

#print(raw_sentences[5])
#print(book_sentences[5])

conll2000_corp_sents = conll2000.sents()
print("condll2000 to sents")
conll2002_corp_sents = conll2002.sents()
print("conll2002 to sents")

conll2007_corp_sents = conll2007.sents()
print("condll2007 to sents")
inaugural_corp_sents = inaugural.sents()
print("inaugural to sents")
abc_corp_sents = abc.sents()
print("ABC to sentences")
genesis_corp_sents = genesis.sents()
print("Genesis to sents")
frame_net_corp_sents = fn.sents()
print("Frame_net to sents")
state_union_corp_sents = state_union.sents()
print('state union to sents')
subject_corp_sents = subjectivity.sents()
print('Subjectivity to sents')
brown_corp_sents = brown.sents()
print("Brown corpus to sents")
movie_reviews_corp_sents = movie_reviews.sents()
print("Movie reviews to sents ")
gutenberg_corp_sents = gutenberg.sents()
print("Gutenberg to sents")
Example #7
import itertools
import re
import string

import numpy as np
import torch
import torch.nn.functional as F
from torch.autograd import Variable

import nltk
from nltk.corpus import abc, stopwords

nltk.download('abc')
nltk.download('stopwords')
"""**Get sentence out of the data**"""

nltk.download('punkt')
data = [" ".join(list_of_words) for list_of_words in abc.sents()]

#print(len(data))
new_data = data[0:50]
print(len(new_data))
"""**Pre Processing and Tokenize**"""


def preprocess_tokenize_text(new_data):

    corpus = []

    for sentence in new_data:
        text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', sentence)
        text = re.sub(' +', ' ', text)
        text = text.replace('\n', ' ')  # str.replace returns a new string; the original discarded the result
Example #8
######################  3  #####################################################
############# Training Phase ###################################################
################################################################################

##########################  Training a couple models  ###################
#Training a model with the vocabulary from Pro_Lyrics_list
RM_model = Word2Vec(Pro_Lyrics_list, min_count=2, size=150, workers=15, window=15)  # `size` was renamed `vector_size` in gensim >= 4.0
print()
#Saving the model
RM_model.save("word2vec.RM_model")
RM_model.save("RM_model.bin")
print()

#Training a model with the imported vocabulary from abc.sents()
abc_model = gensim.models.Word2Vec(abc.sents(), min_count=2, size=150, workers=15, window=15)  # `size` was renamed `vector_size` in gensim >= 4.0
print()
#Saving the model
abc_model.save("word2vec.abc_model")
abc_model.save("abc_model.bin")
print()


####################### Storing vectors gen by models #################
# Store the vectors for train data in following file
### Finish <----------------------------------------------------------------------------------------Incomplete
#word2vec_filename = OUTPUT_FOLDER + 'train_review_word2vec.csv'
#RM_vectors_filename = r'C:\Users\hprob\Desktop\ErdosProjectMay2020\Sample_project\RM_vectors.csv'
#with open(RM_vectors_filename, 'w+') as word2vec_file:
#    for index, row in Lyrics_train.iterrows():
#        model_vector = (np.mean([RM_model[token] for token in row['lyrics']], axis=0)).tolist()
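
# Hedged completion sketch for the block marked "Incomplete" above. Lyrics_train
# and the 'lyrics' column come from the commented-out code; the CSV layout and
# the output file name are assumptions, not the original author's choices.
import csv
import numpy as np

with open('RM_vectors.csv', 'w', newline='') as word2vec_file:
    writer = csv.writer(word2vec_file)
    for index, row in Lyrics_train.iterrows():
        # Average the vectors of the tokens that are actually in the vocabulary.
        tokens = [t for t in row['lyrics'] if t in RM_model.wv]
        model_vector = np.mean([RM_model.wv[t] for t in tokens], axis=0).tolist()
        writer.writerow([index] + model_vector)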
Example #9
import numpy as np
import nltk
from nltk.corpus import abc, stopwords
from sklearn.manifold import TSNE

model = TSNE(n_components=3, random_state=0)
np.set_printoptions(suppress=True)

nltk.download('stopwords')
nltk.download('abc')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

stop_words = list(stop_words)

words = abc.words()

len(abc.sents())

sentences = abc.sents()[0:100]
# sentences=[['he', 'is', 'the', 'king'], ['the', 'king', 'is', 'royal'], ['she', 'is', 'the', 'royal', 'queen']]

sentences

words = []
sent = []
for i in sentences:
    temp = []
    for j in i:
        if j not in stop_words and j.isalpha():
            words.append(j)
            temp.append(j)
    sent.append(temp)
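
# The snippet stops after building `sent`. A hedged continuation, assuming gensim
# is available: train a small Word2Vec model on those sentences and project its
# vectors with the 3-component TSNE `model` created at the top of the example.
from gensim.models import Word2Vec

w2v = Word2Vec(sentences=sent, vector_size=100, window=5, min_count=1, workers=4)

vocab = w2v.wv.index_to_key   # vocabulary of the trained model (gensim 4.x API)
vectors = w2v.wv[vocab]       # shape: (len(vocab), 100)

projected = model.fit_transform(vectors)   # 3-D coordinates for inspection
print(projected.shape)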
Example #10
print "Adding gutenberg sentence structures ({0}) ...".format(
    len(gutenberg.sents()))
for sentence in gutenberg.sents():
    processed_count += 1
    try:
        blob = TextBlob(filter(lambda x: x in string.printable,
                               " ".join(sentence)),
                        pos_tagger=PerceptronTagger())
        tags = tuple([tag[1] for tag in blob.tags])
        sentences.add(tags)
    except:
        print "\r",
    print "Processed {0} sentences\r".format(processed_count),
print "Current Structure total: {0}".format(len(sentences))

print "Adding abc sentence structures ({0})...".format(len(abc.sents()))
for sentence in abc.sents():
    processed_count += 1
    try:
        blob = TextBlob(filter(lambda x: x in string.printable,
                               " ".join(sentence)),
                        pos_tagger=PerceptronTagger())
        tags = tuple([tag[1] for tag in blob.tags])
        sentences.add(tags)
    except:
        print "\r",
    print "Processed {0} sentences\r".format(processed_count),
print "Current Structure total: {0}".format(len(sentences))

print "Adding reuters sentence structures ({0})...".format(len(
    reuters.sents()))
Example #11
# Example of comparison of reading difficulty score (ARI) for two NLTK corpora.

from nltk.corpus import abc


def avg(lst):
    lentotal = 0.0
    for word in lst:
        lentotal = lentotal + len(word)
    return lentotal / len(lst)


def ari(corpus_words, corpus_sents):
    avgchar = avg(corpus_words)
    avgsent = avg(corpus_sents)
    ari = 4.71 * avgchar + 0.5 * avgsent - 21.43
    return ari

print(ari(abc.words('rural.txt'), abc.sents('rural.txt')))
print(ari(abc.words('science.txt'), abc.sents('science.txt')))
Example #12
#This is a single class text classifier example using a Naive Bayes algorithm. It is an adaptation of the tutorial by http://textblob.readthedocs.io/

from textblob.classifiers import NaiveBayesClassifier
from textblob import TextBlob
from nltk.corpus import gutenberg
from nltk.corpus import abc as corpus

# lenght = len (corpus.raw())
# print (corpus.readme())
#print (gutenberg.fileids())
#print (corpus.fileids())
#head = corpus.raw('science.txt')
train = []
for sent in corpus.sents('science.txt')[50:150]:
    train.append((' '.join(sent), 'science'))
for sent in gutenberg.sents('austen-emma.txt')[50:150]:
    train.append((' '.join(sent), 'austen'))
for sent in gutenberg.sents('shakespeare-hamlet.txt')[5:150]:
    train.append((' '.join(sent), 'shakes'))
for sent in gutenberg.sents('melville-moby_dick.txt')[5:150]:
    train.append((' '.join(sent), 'melville'))
    #print ("new_____" , ' '.join(sent))
#print (train2)
# # print (corpus.words('science.txt'))

test = []
for sent in corpus.sents('science.txt')[400:420]:
    test.append((' '.join(sent), 'science'))
for sent in gutenberg.sents('austen-emma.txt')[400:420]:
    test.append((' '.join(sent), 'austen'))
for sent in gutenberg.sents('shakespeare-hamlet.txt')[400:420]:
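    # Hedged completion: the scrape cuts the snippet off here; the lines below
    # follow the same pattern as the training loops above and the TextBlob
    # tutorial cited in the header comment.
    test.append((' '.join(sent), 'shakes'))
for sent in gutenberg.sents('melville-moby_dick.txt')[400:420]:
    test.append((' '.join(sent), 'melville'))

cl = NaiveBayesClassifier(train)
print("Accuracy on held-out sentences: {0}".format(cl.accuracy(test)))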
Example #13
 bible = genesis.sents('english-kjv.txt')
 blake = gutenberg.sents('blake-poems.txt')
 bryant = gutenberg.sents('bryant-stories.txt')
 burgess = gutenberg.sents('burgess-busterbrown.txt')
 carroll = gutenberg.sents('carroll-alice.txt')
 ch_ball = gutenberg.sents('chesterton-ball.txt')
 ch_brown = gutenberg.sents('chesterton-brown.txt')
 ch_thurs = gutenberg.sents('chesterton-thursday.txt')
 edge = gutenberg.sents('edgeworth-parents.txt')
 mel = gutenberg.sents('melville-moby_dick.txt')
 mil = gutenberg.sents('milton-paradise.txt')
 caesar = gutenberg.sents('shakespeare-caesar.txt')
 hamlet = gutenberg.sents('shakespeare-hamlet.txt')
 macbeth = gutenberg.sents('shakespeare-macbeth.txt')
 whit = gutenberg.sents('whitman-leaves.txt')
 rural = abc.sents('rural.txt')
 science = abc.sents('science.txt')
 plots = subjectivity.sents('plot.tok.gt9.5000')
 quotes = subjectivity.sents('quote.tok.gt9.5000')
 austen = sense + emma + persuasion
 shakespeare = caesar + hamlet + macbeth
 facts = rural + science
 opinions = plots + quotes
 gute = bryant + burgess + carroll + edge + mel + mil + whit
 chester = ch_ball + ch_brown + ch_thurs
 total = austen + shakespeare + facts + opinions + gute + chester + b + sents
 #print(plots)
 #print(science)
 #print(bible)
 g = Word2Vec(total)
 g.wv.save_word2vec_format('model.bin', binary=True)
Example #14
from DocumentFeatureSelection import interface
from DocumentFeatureSelection.models import PersistentDict
from sqlitedict import SqliteDict
from nltk.corpus import abc, genesis, webtext, gutenberg
import time
import os
"""This example shows you how to work on huge dataset.
For persisted-dict object you can choose PersistentDict or SqliteDict
You're supposed to be ready to use following corpora object in nltk
- abc
- genesis
- web
- gutenberg
"""

#----------------------------------------------------------
abc_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()

# Case of PersistentDict
persistent_dict_obj = PersistentDict('demo.json', 'c', format='json')
persistent_dict_obj['abc'] = list(abc_corpus)
persistent_dict_obj['genesis'] = list(genesis_corpus)
persistent_dict_obj['web'] = list(web_corpus)
persistent_dict_obj['gutenberg'] = list(gutenberg_corpus)

start = time.time()
# If you put is_use_cache=True, it uses cache object for keeping huge objects during computation
# If you put is_use_memmap=True, it uses memmap for keeping matrix during computation
scored_matrix_obj = interface.run_feature_selection(
    input_dict=persistent_dict_obj,
    method='pmi',
    n_jobs=-1,
    use_cython=True,
    is_use_cache=True,   # keep huge objects in a cache during computation
    is_use_memmap=True)  # keep the matrix in a memmap during computation
elapsed_time = time.time() - start
print("elapsed_time with cython:{} [sec]".format(elapsed_time))

Example #15
from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import genesis
from nltk.corpus import abc

abs_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()

input_corpus = {
    'abs': list(abs_corpus),
    'genesis': list(genesis_corpus),
    'web': list(web_corpus),
    'gutenberg': list(gutenberg_corpus)
    }

pmi_with_cython(input_corpus)
pmi_with_parallel(input_corpus)
#pmi_with_threading(input_corpus)
Example #16

def saveContextWords(data):
    context = {}
    for i in range(len(data)):
        context[i] = data[i]
    dumpPickle("contextWords.pkl", context)


if __name__ == '__main__':
    obj = Word2Vec(2)

    # corpus = "natural language processing and machine learning is fun and exciting".split(" ")
    # sentences = [corpus]

    sentences = list(abc.sents())

    data = obj.preprocessing(sentences)
    print("preprocessing done")
    trainDataX, trainDataY = obj.targetAndContext(data)
    print("training Data is generated")
    exit(0)  # NOTE: early exit for debugging; the pickle dumps below never run
    dumpPickle("index2WordMap.pkl", obj.index2WordMap)
    dumpPickle("word2IndexMap.pkl", obj.word2IndexMap)
    saveContextWords(trainDataX)

    # network = NN(obj.uniqueCount, 50)
    # network.train(trainDataX,trainDataY,100)
    # obj.nn = network
    # obj.findSimilarWords("natural",3)
Example #17
import numpy as np
import torch
import torch.nn.functional as F
from torch.autograd import Variable
import nltk
nltk.download('abc')
from nltk.corpus import abc
import itertools, re

corpus = []

for text_id in abc.fileids():
    raw_text = list(itertools.chain.from_iterable(abc.sents(text_id)))
    text = ' '.join(raw_text)
    text = text.lower()
    text = text.replace('\n', ' ')  # str.replace returns a new string; the original discarded the result
    text = re.sub('[^a-z ]+', '', text)
    corpus.append([w for w in text.split() if w != ''])

from collections import Counter
import random, math


def subsample_frequent_words(corpus):
    filtered_corpus = []
    word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
    sum_word_counts = sum(list(word_counts.values()))
    word_counts = {
        word: word_counts[word] / float(sum_word_counts)
        for word in word_counts
Example #18

print("Building clean words list...")
words = [w.lower() for w in brown.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]
words.extend([w.lower() for w in treebank.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in words_list.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in abc.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in movie_reviews.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in genesis.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])

print "Building clean sentences list"
sentences = []
for s in brown.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in treebank.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in abc.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in movie_reviews.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in genesis.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))

    
def singles(words):
    if len(words) < 1:
        return
    for w in words:
        if re.match("[a-zA-Z'-]+", w) and w.strip() != "''":
            yield w

def doubles(sentences):
Example #19
import nltk
import gensim
from nltk.corpus import abc

a = 1
print("Model is training")
model = gensim.models.Word2Vec(abc.sents())
print("1")
X = list(model.wv.vocab)
print("2")
data = model.wv.most_similar('science')  # most_similar() lives on model.wv; the model-level alias is deprecated
print("3")
print(data)
print("Training Completed")
Example #20
import gensim
from nltk.corpus import abc


def similar_words(word):
    model = gensim.models.Word2Vec(abc.sents())
    x = list(model.wv.vocab)
    data = model.wv.most_similar(word)
    print(data)
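
# Hedged usage sketch (assumes gensim is installed and the abc corpus has been
# downloaded); the query word is an arbitrary choice.
similar_words('science')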
Example #21
import logging
import time

from DocumentFeatureSelection import interface


def pmi_with_cython(input_corpus):
    logging.debug(msg='With cython is True')
    start = time.time()
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus, method='pmi', n_jobs=-1, use_cython=True)
    elapsed_time = time.time() - start
    print(("elapsed_time with cython:{} [sec]".format(elapsed_time)))


from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import genesis
from nltk.corpus import abc

abs_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()

input_corpus = {
    'abs': list(abs_corpus),
    'genesis': list(genesis_corpus),
    'web': list(web_corpus),
    'gutenberg': list(gutenberg_corpus)
}

pmi_with_cython(input_corpus)
pmi_with_parallel(input_corpus)
#pmi_with_threading(input_corpus)
Example #22
print "Current Structure total: {0}".format(len(sentences))
del blob

print "Adding gutenberg sentence structures ({0}) ...".format(len(gutenberg.sents()))
for sentence in gutenberg.sents():
	processed_count += 1
	try:
		blob = TextBlob(filter(lambda x: x in string.printable, " ".join(sentence)), pos_tagger=PerceptronTagger())
		tags = tuple([tag[1] for tag in blob.tags])
		sentences.add(tags)
	except:
		print "\r",
	print "Processed {0} sentences\r".format(processed_count),
print "Current Structure total: {0}".format(len(sentences))

print "Adding abc sentence structures ({0})...".format(len(abc.sents()))
for sentence in abc.sents():
	processed_count += 1
	try:
		blob = TextBlob(filter(lambda x: x in string.printable, " ".join(sentence)), pos_tagger=PerceptronTagger())
		tags = tuple([tag[1] for tag in blob.tags])
		sentences.add(tags)
	except:
		print "\r",
	print "Processed {0} sentences\r".format(processed_count),
print "Current Structure total: {0}".format(len(sentences))

print "Adding reuters sentence structures ({0})...".format(len(reuters.sents()))
for sentence in reuters.sents():
	processed_count += 1
	try:
Example #23
from nltk.corpus import abc, stopwords
from string import punctuation
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
import matplotlib.pyplot as plt

sents = abc.sents()
#print(sents[:10])
puncs = list(punctuation)
stop = set(stopwords.words('english') + puncs + ["''" , "``"])
processed_sents = []
for sent in sents:
    temp = []
    for word in sent:
        if word not in stop:
            temp.append(word.lower())
    processed_sents.append(temp)
print(processed_sents[:10])

#Output
#[['pm', 'denies', 'knowledge', 'awb', 'kickbacks', 'the', 'prime', 'minister', 'denied', 'knew', 'awb', 'paying', 'kickbacks', 'iraq', 'despite', 'writing', 'wheat', 'exporter', 'asking', 'kept', 'fully', 'informed', 'iraq', 'wheat', 'sales'], ['letters', 'john', 'howard', 'deputy', 'prime', 'minister', 'mark', 'vaile', 'awb', 'released', 'cole', 'inquiry', 'oil', 'food', 'program'], ['in', 'one', 'letters', 'mr', 'howard', 'asks', 'awb', 'managing', 'director', 'andrew', 'lindberg', 'remain', 'close', 'contact', 'government', 'iraq', 'wheat', 'sales'], ['the', 'opposition', 'gavan', 'o', 'connor', 'says', 'letter', 'sent', '2002', 'time', 'awb', 'paying', 'kickbacks', 'iraq', 'though', 'jordanian', 'trucking', 'company'], ['he', 'says', 'government', 'longer', 'wipe', 'hands', 'illicit', 'payments', 'totalled', '290', 'million'], ['the', 'responsibility', 'must', 'lay', 'may', 'squarely', 'feet', 'coalition', 'ministers', 'trade', 'agriculture', 'prime', 'minister', ',"', 'said'], ['but', 'prime', 'minister#', 'says', 'letters', 'show', 'inquiring', 'future', 'wheat', 'sales', 'iraq', 'prove', 'government', 'knew', 'payments'], ['it', 'would', 'astonishing', '2002', 'prime', 'minister', 'i', 'done', 'anything', 'i', 'possibly', 'could', 'preserve', 'australia', 'valuable', 'wheat', 'market', ',"', 'said'], ['email', 'questions', 'today', 'inquiry', 'awb', 'trading', 'manager', 'peter', 'geary', 'questioned', 'email', 'received', 'may', '2000'], ['it', 'indicated', 'iraqi', 'grains', 'board', 'approached', 'awb', 'provide', 'sales', 'service', '".']]

embeddings = Word2Vec(sentences=processed_sents, size=300, min_count=20, workers=4, sg=0, iter=5, hs=0)  # size/iter were renamed vector_size/epochs in gensim >= 4.0
print(embeddings.wv.most_similar('government'))
vocab = list(embeddings.wv.vocab)
X = embeddings.wv[vocab]  # index the KeyedVectors; plain model[...] lookup is deprecated
tsne_model = TSNE(n_components=2)
X_tsne = tsne_model.fit_transform(X)
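
# matplotlib and pandas are imported above but never used in the scraped portion.
# A minimal, hedged plotting sketch for the 2-D t-SNE projection computed above:
df = pd.DataFrame(X_tsne, index=vocab, columns=['x', 'y'])
plt.figure(figsize=(12, 12))
plt.scatter(df['x'], df['y'], s=3)
for word, row in df.head(50).iterrows():   # label only the first 50 words
    plt.annotate(word, (row['x'], row['y']), fontsize=8)
plt.show()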

Example #24
from DocumentFeatureSelection import interface
from DocumentFeatureSelection.models import PersistentDict
from sqlitedict import SqliteDict
from nltk.corpus import abc, genesis, webtext, gutenberg
import time
import os

"""This example shows you how to work on huge dataset.
For persisted-dict object you can choose PersistentDict or SqliteDict
You're supposed to be ready to use following corpora object in nltk
- abc
- genesis
- web
- gutenberg
"""

#----------------------------------------------------------
abc_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()

# Case of PersistentDict
persistent_dict_obj = PersistentDict('demo.json', 'c', format='json')
persistent_dict_obj['abc'] = list(abc_corpus)
persistent_dict_obj['genesis'] = list(genesis_corpus)
persistent_dict_obj['web'] = list(web_corpus)
persistent_dict_obj['gutenberg'] = list(gutenberg_corpus)

start = time.time()
# If you put is_use_cache=True, it uses cache object for keeping huge objects during computation
# If you put is_use_memmap=True, it uses memmap for keeping matrix during computation
scored_matrix_obj = interface.run_feature_selection(