def fit(self, train, save=True, load=True):
    '''
    Trains the predictor.

    Parameters
    --------
    train : pandas.DataFrame
        Training data. It contains the transactions of the sessions. It has one column for
        session IDs, one for item IDs and one for the timestamp of the events (unix timestamps).
        It must have a header. Column names are arbitrary, but must correspond to the ones you
        set during the initialization of the network (session_key, item_key, time_key properties).
    save : bool
        Flag to save the model to file after training. (default: True)
    load : bool
        Flag to load the trained model if it exists. (default: True)
    '''
    if load and os.path.isfile(self.path_trained + self.file_prefix + self.file_suffix()):
        print("Model already trained! Loading...", end="")
        self.load_w2v_model()
        print("done.")
    else:
        print("Creating session vocabulary...", end="")
        train[self.item_key] = train[self.item_key].astype(str)
        sequences = train.groupby(self.session_key)[self.item_key].apply(list)
        print("done.")

        print("Training model...", end="")
        if self.seed > 0:
            # A fixed seed requires a single worker for reproducible results
            self.model = w2v(sequences, size=self.factors, sg=self.sg, window=self.window,
                             workers=1, hs=self.hs, iter=self.epochs, min_count=1, seed=self.seed)
        else:
            self.model = w2v(sequences, size=self.factors, sg=self.sg, window=self.window,
                             workers=self.workers, hs=self.hs, iter=self.epochs, min_count=1)
        print("done.")

        if save:
            self.save_w2v_model()
            print("Model saved!")

        train[self.item_key] = train[self.item_key].astype(int)
def make_model(self):
    """Build and train the word2vec model on words from tweets."""
    # Define parameters for the w2v model
    num_features = 300
    min_word_count = 3
    num_workers = multiprocessing.cpu_count()
    context_size = 7
    downsampling = 1e-3
    seed = 1

    # Build the model
    self.tweet2vec = w2v(
        sg=1,
        seed=seed,
        workers=num_workers,
        size=num_features,
        min_count=min_word_count,
        window=context_size,
        sample=downsampling
    )

    # Build the vocabulary
    self.tweet2vec.build_vocab(self.sentences)

    # Train the model
    self.tweet2vec.train(self.sentences,
                         epochs=10,
                         total_examples=len(self.sentences))
def fit_word_vectors(self, train, save_model=True):
    self.fit_session_vocabulary(train)

    # Load word vectors if the model already exists
    if os.path.isfile(self.path_trained + self.file_prefix + self.file_suffix()):
        print("Model already trained! Loading...", end="")
        self.load_w2v_model()
        print("done.")
    else:
        # Generate word vectors
        print("Generating word vectors...", end="")
        self.model = w2v(self.all_session_items.values, size=self.factors, window=self.window,
                         sg=self.sg, workers=4, hs=self.hs, iter=self.epochs, min_count=1)
        print("done.")
        if save_model:
            self.save_w2v_model()
            print("Model saved!")

    self.wv = self.model.wv
    del self.model  # Keep only the trained word vectors and discard the full model
def train_model(text, filename, phrases=True, workers=8, window=3, overwrite=False):
    # Reuse a previously trained model unless overwrite is requested
    if os.path.exists(filename) and not overwrite:
        return w2v.load(filename)

    # Tokenize the texts in parallel (tokenize and flatten are project-level helpers)
    pool = Pool(workers)
    tokens = pool.map(tokenize, text)
    cs = flatten(tokens)

    # Optionally merge frequent collocations into phrase tokens
    if phrases:
        sentences_phrases = Phrases(cs)
        sentences = sentences_phrases[cs]
    else:
        sentences = cs

    model_titulo = w2v(sentences, workers=workers, window=window, min_count=1, size=300)
    model_titulo.save(filename)
    return model_titulo
def create_w2v(data, emb_dim=300, window=5, min_count=5, negative=5, iterations=10):
    from gensim.models import Word2Vec as w2v
    import multiprocessing

    workers = multiprocessing.cpu_count()
    # Avoid shadowing the imported class with the trained model instance
    model = w2v(data, size=emb_dim, window=window, min_count=min_count,
                negative=negative, iter=iterations, workers=workers)
    print('Word2Vec model created.')
    return model
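# A minimal usage sketch for create_w2v above. The toy corpus is made up for
# illustration, and the call assumes the gensim 3.x API that the function itself
# uses (size=/iter= keyword arguments).
corpus = [
    ["user", "bought", "item_42", "item_7"],
    ["user", "viewed", "item_7", "item_13"],
    ["user", "bought", "item_13", "item_42"],
]
# min_count=1 so the tiny corpus is not filtered away entirely
model = create_w2v(corpus, emb_dim=50, window=2, min_count=1, iterations=5)
print(model.wv["item_42"].shape)                 # (50,)
print(model.wv.most_similar("item_7", topn=2))   # two nearest items by cosine similarity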
#-------------------------------------------------------------------------------
with open('s1test_x', 'rb') as test_x:
    s1test_x = pickle.load(test_x)
with open('s1train_x', 'rb') as train_x:
    s1train_x = pickle.load(train_x)
with open('s2test_x', 'rb') as test_x:
    s2test_x = pickle.load(test_x)
with open('s2train_x', 'rb') as train_x:
    s2train_x = pickle.load(train_x)

#-------------------------------------------------------------------------------
cores = multiprocessing.cpu_count()

w2v_model = w2v(min_count=10,
                window=10,
                size=300,
                sample=1e-5,
                alpha=0.03,
                min_alpha=0.0007,
                negative=5,
                workers=cores - 1)

sentences = s1train_x + s2train_x
print(sentences)

#-------------------------------------------------------------------------------
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

t = time()
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1.0)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
from gensim.models import Word2Vec as w2v


class MySentences(object):
    """Stream tokenized sentences from a text file, one line at a time."""

    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        for line in open(self.filename):
            yield line.split()


if __name__ == '__main__':
    srcfile = MySentences('/global-mt/lpeng/academic/neural-moses/corpus/bilingual/1227K-lowercase/3-clean/1227K-lowercase.chi-eng.tok.norm.clean.chi')
    cnmodel = w2v(srcfile, workers=4, size=4)
    cnmodel.save('/data/disk1/private/zy/phrase_str2vec/src/input/cnmodel')

    trgfile = MySentences('/global-mt/lpeng/academic/neural-moses/corpus/bilingual/1227K-lowercase/3-clean/1227K-lowercase.chi-eng.tok.norm.clean.eng')
    enmodel = w2v(trgfile, workers=4, size=4)
    enmodel.save('/data/disk1/private/zy/phrase_str2vec/src/input/enmodel')
def get_w2v_model(sentences):
    # Train a Word2Vec model on the tokenized sentences
    model = w2v(sentences, min_count=1, workers=4)
    print("Word2Vec model trained successfully.")
    # Look up the vector for every word in the vocabulary (gensim 3.x style indexing)
    features = model[model.wv.vocab]
    return features
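# A hypothetical usage sketch for get_w2v_model above: toy tokenized sentences,
# gensim 3.x assumed, where indexing the model with its vocabulary returns one
# row per word (here 6 words, 100-dimensional default vectors).
sentences = [
    ["the", "quick", "brown", "fox"],
    ["the", "lazy", "dog"],
    ["the", "quick", "dog"],
]
features = get_w2v_model(sentences)
print(features.shape)   # (6, 100)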
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

base_dir = "/Users/clavance/Desktop/Dropbox/Individual_project/EURLEX/html_tokenised_lemmatised/"
directory = os.fsencode(base_dir)

# initialise an empty model
# min_count: ignore words with lower frequency than the count
# window: maximum distance between the current and predicted word within a sentence
# size: dimensionality of the feature vectors
# alpha: learning rate
model = w2v(min_count=10, window=2, sample=6e-5, negative=20, alpha=0.03, min_alpha=0.0007, size=300)

# initialise empty list of dictionaries to create pandas dataframe
items = []

for file in os.listdir(directory):
    dict = {}
    filename = os.fsdecode(file)
    id = filename.split(".txt", 1)[0]
    dict["ID"] = id

    # text is already tokenised using lexnlp
    r = open(base_dir + filename, "r", encoding='latin1').read()
def __init__(self, train_data, size=200, window=5, min_count=2, workers=8, sg=1, hs=1):
    self.model = w2v(train_data, size=size, window=window, min_count=min_count,
                     workers=workers, sg=sg, hs=hs)
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

base_dir = "/Users/clavance/Desktop/Dropbox/Individual_project/EURLEX/html_test/"
directory = os.fsencode(base_dir)

# initialise an empty model
# min_count: ignore words with lower frequency than the count
# window: maximum distance between the current and predicted word within a sentence
# size: dimensionality of the feature vectors
# alpha: learning rate
model = w2v(min_count=10, window=2, size=300)

# initialise empty list of dictionaries to create pandas dataframe
items = []

for file in os.listdir(directory):
    dict = {}
    filename = os.fsdecode(file)
    id = filename.split(".txt", 1)[0]
    dict["ID"] = id

    # text is already tokenised using lexnlp
    r = open(base_dir + filename, "r", encoding='latin1').read()
    s = r.split("Class: ", 1)[1]
    classes = s.split("\nText: `` ", 1)[0]
    dict["Class"] = classes
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')  # Keep only alphanumeric characters as tokens

for idx_e in range(len(data)):
    for idx_n in range(len(data[idx_e]["tweets"])):
        text = data[idx_e]["tweets"][idx_n]["text"]
        text = re.sub(r"http\S+", "", text)   # remove urls
        text = text.lower()                   # convert to lowercase
        tokens = tokenizer.tokenize(text)     # tokenize
        tokenSet.append(tokens)
        magLabels.append(data[idx_e]["magnitude"])  # every tokenized tweet has a magnitude label

# Remove stopwords, numbers, singleton characters, and lemmatize
stopwords_nltk = nltk.corpus.stopwords.words('english')
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
tokenSet = [[lemmatizer.lemmatize(token) for token in doc
             if not (token in stopwords_nltk or token.isnumeric() or len(token) <= 1)]
            for doc in tokenSet]

print('Preprocessing Completed. Total earthquakes: ', len(data),
      '. Total tweets: ', len(tokenSet))
print(tokenSet[0:10])

### Deploy w2v
#models = w2v(tokenSet[0:20], min_count=1, size=10)
model = w2v(tokenSet, min_count=1, size=10)
print('Vector for \'earthquake\': ', model['earthquake'])
def train_corpus(corpus):
    # Train a Word2Vec model on an iterable of tokenized sentences and return it
    model = w2v(corpus)
    return model
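# A hedged usage sketch for train_corpus above, training on the NLTK Brown corpus;
# it assumes nltk and gensim are installed and that common Brown words such as
# 'government' survive the default min_count filter.
import nltk
from nltk.corpus import brown
from gensim.models import Word2Vec as w2v

nltk.download('brown')  # fetch the Brown corpus if it is not already cached
model = train_corpus(brown.sents())
print(model.wv.most_similar('government', topn=3))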
def create_model(self, min_count=1, size=100, window=5, sg=0):
    # filename, text, and the tokenizer helpers are assumed to be provided elsewhere
    # (e.g. as module-level names or instance attributes)
    if filename is not None:
        token = file_tokenize(filename)
    else:
        token = text_tokenize(text)
    return w2v(token, min_count=min_count, size=size, window=window, sg=sg)
def train(self, sentences):
    # Word2Vec expects string tokens, so cast every token in every sentence
    corpus = [[str(token) for token in sent] for sent in sentences]
    model = w2v(corpus, size=self.size, min_count=self.min_count)
    # Copy the learned vectors into this object's vocabulary embeddings
    for word in model.wv.vocab.keys():
        self.vocab.embedding[word] = model.wv[word]
# TEST = sys.argv[3]

## parameters of word2vec
WINDOW = 10
VEC_DIM = 100

# load (label, sentence)
y_train, x_train = utils.load_data(TRAIN, file_type='train')
x_train_nolab = utils.load_data(TRAIN_NO_LAB, file_type='train_nolabel')
_, x_test = utils.load_data(TEST, file_type='test')
sentence = x_train + x_train_nolab + x_test

# sentence to word list, e.g. 'fxxk you' to ['fxxk', 'you']
sentence_split = []
for line in sentence:
    words = text.text_to_word_sequence(
        line, filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
    sentence_split.append(words)
# print(sentence_split[0:10])  # just for test

dic = corpora.Dictionary(sentence_split)
dic.save('./dictionary')
print(dic)

# word to vector
model = w2v(sentence_split, window=WINDOW, size=VEC_DIM)
model.save('./word_vec')
# print(model.most_similar('get'))
# print(model.similarity('get', 'getting'))