def initModel(self):
    path = self.getModelFilePath()
    modelFull = self.config.getBooleanConfig("common.model.full")[0]
    if modelFull:
        if self.model is None:
            self.model = Word2Vec.load(path)
            self.wv = self.model.wv
    else:
        if self.wv is None:
            self.wv = KeyedVectors.load(path, mmap='r')
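The branch above keeps a full, retrainable model when common.model.full is set, and otherwise memory-maps just the vectors. A minimal sketch of the same trade-off outside the class (the file names are hypothetical):

from gensim.models import Word2Vec, KeyedVectors

# Full model: needed only if training is to continue later.
model = Word2Vec.load("word2vec.model")  # hypothetical path
wv = model.wv

# Vectors only: smaller, and mmap='r' shares the arrays read-only across processes.
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')  # hypothetical path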
def __init__(self):
    self.cmdpairs = {
        "!similar": self.execute_cnb,
        "!similaryle": self.execute_yle,
        "!similarn": self.execute_n_cnb,
        "!similarnyle": self.execute_n_yle,
        "!similarnr": self.execute_n_cnb_r,
        "!similarnyler": self.execute_n_yle_r,
        "!xminusyplusz": self.execute_xyz_cnb,
        "!xminusypluszyle": self.execute_xyz_yle,
        # "!xminusyplusz": self.execute_x_minus_y_plus_z
    }
    self.cnb_wv = gensim.models.Word2Vec.load("./Resources/word2vec_2014-2019_04.model").wv
    self.yle_wv = KeyedVectors.load("./Resources/word2vec_yle_dersb")
clusters = [0] * 10
i = 0
while i < 10:
    RandGenerated = random.randint(1, len(data) - 2)
    if RandGenerated in clusters:
        i = i - 1  # duplicate draw: stay on the same slot and retry
    else:
        clusters[i] = RandGenerated
    i = i + 1
clusters = sorted(clusters)
print(clusters)
clusters = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]  # fixed indices override the random picks above

# In[3]:

model = KeyedVectors.load('newmodel')
# model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)
# model.save('newmodel')

# In[4]:

def countDistance(sentenceConst, WholeGroup):
    distance = [0] * len(WholeGroup)
    sentenceFirstString = str(sentenceConst)
    sentenceFirstString = sentenceFirstString.lower().split()
    sentenceFirstString = [w for w in sentenceFirstString if w not in stop_words]
    i = 0
    for sentence in WholeGroup:
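Drawing k distinct indices is simpler with random.sample, which never repeats a value (a sketch over the same range as randint(1, len(data) - 2)):

import random
clusters = sorted(random.sample(range(1, len(data) - 1), 10))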
from rake_nltk import Rake
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter, defaultdict
from math import log, floor
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import textrank
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

model = KeyedVectors.load("models/normalized.model")
stop_words = set(stopwords.words('english'))

def equals(prediction, expected):
    return 1 if prediction == expected else 0

def get_prediction(scores):
    total = sum(scores.values())
    if total != 0:
        for score in scores:
            scores[score] = scores[score] / total
my_df = pd.read_csv("clean_tweet.csv", index_col=0)
my_df.dropna(inplace=True)
my_df.reset_index(drop=True, inplace=True)

x = my_df.text
y = my_df.target

SEED = 2000
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=.02, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)

model_ug_cbow = KeyedVectors.load('w2v_model_ug_cbow.word2vec')
model_ug_sg = KeyedVectors.load('w2v_model_ug_sg.word2vec')

embeddings_index = {}
for w in model_ug_cbow.wv.vocab.keys():
    embeddings_index[w] = np.append(model_ug_cbow.wv[w], model_ug_sg.wv[w])

tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(x_train)
tokenizer_json = tokenizer.to_json()
with io.open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))
print("Saved")
sequences = tokenizer.texts_to_sequences(x_train)
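Each embeddings_index entry concatenates a word's CBOW and skip-gram vectors; assuming both models are 100-dimensional (as in the training snippet further down), every value is a 200-dimensional vector:

v = embeddings_index['happy']  # hypothetical in-vocabulary word
assert v.shape == (200,)       # 100 (CBOW) + 100 (skip-gram)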
### Setup
os.chdir('/Users/chrki23/Documents/Insight_Project')
path = os.getcwd()

### File Loading
lookup_path = path + '/data/cleaned/lookup_table.data'
fileloader = open(lookup_path, 'rb')
lookup_table = pickle.load(fileloader)
fileloader.close()

vector_path = path + '/data/cleaned/final_vectors.kv'
fname = get_tmpfile(vector_path)
word_vectors = KeyedVectors.load(fname, mmap="r")

ingredients_path = path + '/data/cleaned/ingredients_used.data'
fileloader = open(ingredients_path, 'rb')
ingredients_used = pickle.load(fileloader)
fileloader.close()

### Helper functions
def display_choice(user_input):
    missing_product = lookup_table[(lookup_table.name.isin(user_input))]
    return missing_product.iloc[:, 0:4]

def get_ingredient(user_input):
def createDatasetSplit(params):
    filename = set_name(params)
    if path.exists(filename):
        ##### REMOVE LATER ######
        #dataset=collect_data(params)
        pass
    else:
        dataset = collect_data(params)

    if path.exists(filename[:-7]):
        with open(filename[:-7] + '/train_data.pickle', 'rb') as f:
            X_train = pickle.load(f)
        with open(filename[:-7] + '/val_data.pickle', 'rb') as f:
            X_val = pickle.load(f)
        with open(filename[:-7] + '/test_data.pickle', 'rb') as f:
            X_test = pickle.load(f)
        if params['bert_tokens'] == False:
            with open(filename[:-7] + '/vocab_own.pickle', 'rb') as f:
                vocab_own = pickle.load(f)
    else:
        if params['bert_tokens'] == False:
            word2vecmodel1 = KeyedVectors.load("Data/word2vec.model")
            vector = word2vecmodel1['easy']
            assert len(vector) == 300
        dataset = pd.read_pickle(filename)
        #X_train_dev, X_test = train_test_split(dataset, test_size=0.1, random_state=1, stratify=dataset['Label'])
        #X_train, X_val = train_test_split(X_train_dev, test_size=0.11, random_state=1, stratify=X_train_dev['Label'])
        with open('Data/post_id_divisions.json', 'r') as fp:
            post_id_dict = json.load(fp)
        X_train = dataset[dataset['Post_id'].isin(post_id_dict['train'])]
        X_val = dataset[dataset['Post_id'].isin(post_id_dict['val'])]
        X_test = dataset[dataset['Post_id'].isin(post_id_dict['test'])]

        if params['bert_tokens']:
            vocab_own = None
            vocab_size = 0
            padding_idx = 0
        else:
            vocab_own = Vocab_own(X_train, word2vecmodel1)
            vocab_own.create_vocab()
            padding_idx = vocab_own.stoi['<pad>']
            vocab_size = len(vocab_own.vocab)

        X_train = encodeData(X_train, vocab_own, params)
        X_val = encodeData(X_val, vocab_own, params)
        X_test = encodeData(X_test, vocab_own, params)
        print("total dataset size:", len(X_train) + len(X_val) + len(X_test))

        os.mkdir(filename[:-7])
        with open(filename[:-7] + '/train_data.pickle', 'wb') as f:
            pickle.dump(X_train, f)
        with open(filename[:-7] + '/val_data.pickle', 'wb') as f:
            pickle.dump(X_val, f)
        with open(filename[:-7] + '/test_data.pickle', 'wb') as f:
            pickle.dump(X_test, f)
        if params['bert_tokens'] == False:
            with open(filename[:-7] + '/vocab_own.pickle', 'wb') as f:
                pickle.dump(vocab_own, f)

    if params['bert_tokens'] == False:
        return X_train, X_val, X_test, vocab_own
    else:
        return X_train, X_val, X_test
movie_popularities = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/movie_popularities.p", "rb"))
book_popularities = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/book_popularities.p", "rb"))
common_tropes = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/common_tropes.p", "rb"))
col_to_trope_list = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/col_to_trope_list.p", "rb"))
movie_titles = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/movie_titles.p", "rb"))
book_titles = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/book_titles.p", "rb"))
with open('./app/irsystem/controllers/SPARSE OR NECESSARY/book_word_to_trope.json', 'r') as f:
    book_word_to_trope = json.load(f)
with open('./app/irsystem/controllers/SPARSE OR NECESSARY/movie_word_to_trope.json', 'r') as f:
    movie_word_to_trope = json.load(f)
book_to_movie_vectorizer = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/book_to_movie_vectorizer.pickle", "rb"))
movie_to_book_vectorizer = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/movie_to_book_vectorizer.pickle", "rb"))
movie_tf_idf = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/movie_tf_idf.pickle", "rb"))
book_tf_idf = pickle.load(open("./app/irsystem/controllers/SPARSE OR NECESSARY/book_tf_idf.pickle", "rb"))
model = KeyedVectors.load("./app/irsystem/controllers/SPARSE OR NECESSARY/tbwb_model.bin")

def get_closest_tropes_to_keyword(keyword, word_to_trope, model, top_k=5):
    if keyword in model.vocab:
        all_words = [word for word in word_to_trope.keys() if word in model.vocab]
        dists = model.distances(keyword, all_words)
        sorted_indices = np.argsort(dists)
        sorted_keyword_match = [all_words[idx] for idx in sorted_indices[:top_k]]
        trope_matches = list(itertools.chain.from_iterable(
            [word_to_trope[word] for word in sorted_keyword_match if word in word_to_trope]))
        return trope_matches[:top_k]
    else:
        print('`{}` not in model vocabulary, cannot enhance search with keyword'.format(keyword))
        return []
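A hypothetical call against the book mapping loaded above ('heist' is an example keyword, not from the source):

tropes = get_closest_tropes_to_keyword('heist', book_word_to_trope, model, top_k=5)
print(tropes)  # up to five tropes reached via the embedding-nearest words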
import json
import requests
import bs4
import numpy as np
import time
from gensim.models import KeyedVectors
import multiprocessing

# Program extracts information from CLEF's JSON at https://raw.githubusercontent.com/ag-sc/QALD/master/9/data/qald-9-train-multilingual.json
# Do note that this program takes a while to complete.

modelPath = "/alignedEnVecs"  # Path to file produced by utils.vec2File()
model = KeyedVectors.load(modelPath, mmap="r")  # Input file path to appropriate model
stopWords = [line.split()[0] for line in open("SMART Stopwords.txt", "r")]

# Various statistics that may be useful
df = {}   # Document term frequency
ctf = {}  # Collection term frequency
mtf = {}  # Mean term frequency (like collection term frequency but normalized for document length)
totalDocCount = 0  # Total number of documents

def vectorize(string, model):
    # Calculates the tf-idf averaged vector of whatever string is passed, using whatever model is passed
    terms = string.lower().split()
    vector = np.zeros((1, 300))
    totalTfIdf = 0
    for term in terms:
        if term not in stopWords:
            try:
                tfidf = string.count(term) * np.log10(totalDocCount / df[term])
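The loop is building a tf-idf weighted mean of word vectors, v(d) = sum_t tfidf(t) * v(t) / sum_t tfidf(t). A self-contained sketch of that computation, assuming 300-dimensional vectors as in the excerpt (the accumulation details after the cutoff are assumptions, and the helper name is hypothetical):

import numpy as np

def tfidf_mean_vector(text, model, df, total_doc_count, stop_words, dim=300):
    # Hypothetical helper: weighted average of word vectors, weight = tf * idf.
    terms = text.lower().split()
    vector = np.zeros(dim)
    total_tfidf = 0.0
    for term in terms:
        if term in stop_words or term not in df or term not in model:
            continue
        tfidf = terms.count(term) * np.log10(total_doc_count / df[term])
        vector += tfidf * model[term]
        total_tfidf += tfidf
    return vector / total_tfidf if total_tfidf else vector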
def training(self, config):
    DATASET_PATH = config['file']['input']['trainingtweets']
    df = pd.read_csv(DATASET_PATH)
    df = df[["choose_one", "text", "choose_one:confidence"]]
    #df = df[df['choose_one:confidence'] == 1.0]
    print("There are %d items in df" % len(df))
    df['target'] = df.choose_one.map({'Relevant': 1, 'Not Relevant': 0})
    df = df[df.target.isnull() == False]
    df = df.drop_duplicates(subset=["text"]).reset_index()
    print("Total unique tweets: %d" % len(df))

    tokenizer_it = twitter_parser.Tokenizer()
    df['tokenized'] = df["text"].apply(tokenizer_it.tweet_to_tokens)
    list_tokenized_tweets = []
    for index, row in df.iterrows():
        tokens = row['tokenized']
        list_tokenized_tweets.append(' '.join(tokens))  # avoids shadowing the built-in str
    new_column = pd.Series(list_tokenized_tweets)
    df['tokenized_text'] = new_column.values
    print(df.columns)
    print("Statistics to check for skewed data:")
    print("On-topic Tweets: %d" % len(df[df['target'] == 1]))
    print("Off-topic Tweets: %d" % len(df[df['target'] == 0]))
    df.drop('index', axis=1, inplace=True)
    print(df.head())
    df.info()

    x = df.tokenized_text
    y = df.target
    print(y)

    # sklearn.cross_validation was removed in scikit-learn 0.20
    from sklearn.model_selection import train_test_split
    SEED = 2000
    x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=0.2, random_state=SEED)
    x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=0.5, random_state=SEED)
    print("Train set has total %d entries" % len(x_train))
    print("Train set has total %.2f percent Relevant tweets" % (len(x_train[y_train == 1.]) / len(x_train)))
    print("Train set has total %.2f percent Not Relevant tweets" % (len(x_train[y_train == 0.]) / len(x_train)))
    print("--------------------------------------------")
    print("Validation set has total %d entries" % len(x_validation))
    print("Validation set has total %.2f percent Relevant tweets" % (len(x_validation[y_validation == 1.]) / len(x_validation)))
    print("Validation set has total %.2f percent Not Relevant tweets" % (len(x_validation[y_validation == 0.]) / len(x_validation)))
    print("--------------------------------------------")
    print("Test set has total %d entries" % len(x_test))
    print("Test set has total %.2f percent Relevant tweets" % (len(x_test[y_test == 1.]) / len(x_test)))
    print("Test set has total %.2f percent Not Relevant tweets" % (len(x_test[y_test == 0.]) / len(x_test)))
    print("--------------------------------------------")

    from tqdm import tqdm
    tqdm.pandas(desc="progress-bar")
    import gensim
    from gensim.models.word2vec import Word2Vec
    from gensim.models.doc2vec import TaggedDocument
    import multiprocessing
    from sklearn import utils

    def labelize_tweets_ug(tweets, label):
        result = []
        prefix = label
        for i, t in zip(tweets.index, tweets):
            result.append(TaggedDocument(t.split(), [prefix + '_%s' % i]))
        return result

    all_x = pd.concat([x_train, x_validation, x_test])
    all_x_w2v = labelize_tweets_ug(all_x, 'all')
    cores = multiprocessing.cpu_count()

    model_ug_cbow = Word2Vec(sg=0, size=100, negative=5, window=2, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
    model_ug_cbow.build_vocab([x.words for x in tqdm(all_x_w2v)])
    for epoch in range(30):
        model_ug_cbow.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
        model_ug_cbow.alpha -= 0.002
        model_ug_cbow.min_alpha = model_ug_cbow.alpha

    model_ug_sg = Word2Vec(sg=1, size=100, negative=5, window=2, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
    model_ug_sg.build_vocab([x.words for x in tqdm(all_x_w2v)])
    for epoch in range(30):
        model_ug_sg.train(utils.shuffle([x.words for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
        model_ug_sg.alpha -= 0.002
        model_ug_sg.min_alpha = model_ug_sg.alpha

    model_ug_cbow.save('../models/CrowFlowd/w2v_model_ug_cbow.word2vec')
    model_ug_sg.save('../models/CrowFlowd/w2v_model_ug_sg.word2vec')

    from gensim.models import KeyedVectors
    model_ug_cbow = KeyedVectors.load('../models/CrowFlowd/w2v_model_ug_cbow.word2vec')
    model_ug_sg = KeyedVectors.load('../models/CrowFlowd/w2v_model_ug_sg.word2vec')
    print(len(model_ug_cbow.wv.vocab.keys()))

    embeddings_index = {}
    import numpy as np
    for w in model_ug_cbow.wv.vocab.keys():
        embeddings_index[w] = np.append(model_ug_cbow.wv[w], model_ug_sg.wv[w])
    print('Found %s word vectors.' % len(embeddings_index))

    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    import pickle

    tokenizer = Tokenizer(num_words=100000)
    tokenizer.fit_on_texts(x_train)
    sequences = tokenizer.texts_to_sequences(x_train)
    print(len(tokenizer.word_index))
    self._tokenizer = tokenizer
    with open('../models/CrowFlowd/tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    length = []
    for x in x_train:
        length.append(len(x.split()))
    print(max(length))

    x_train_seq = pad_sequences(sequences, maxlen=45)
    print('Shape of data tensor:', x_train_seq.shape)
    sequences_val = tokenizer.texts_to_sequences(x_validation)
    x_val_seq = pad_sequences(sequences_val, maxlen=45)
    sequences_test = tokenizer.texts_to_sequences(x_test)
    x_test_seq = pad_sequences(sequences_test, maxlen=45)
    with open('../models/CrowFlowd/x_test_seq.obj', 'wb') as handle:
        pickle.dump(x_test_seq, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('../models/CrowFlowd/y_test.obj', 'wb') as handle:
        pickle.dump(y_test, handle, protocol=pickle.HIGHEST_PROTOCOL)

    num_words = 100000
    embedding_matrix = np.zeros((num_words, 200))
    for word, i in tokenizer.word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    seed = 7
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    from keras.layers import Flatten
    from keras.layers.embeddings import Embedding
    from keras import backend as K

    def f1(y_true, y_pred):
        def recall(y_true, y_pred):
            true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
            possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
            return true_positives / (possible_positives + K.epsilon())

        def precision(y_true, y_pred):
            true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
            predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
            return true_positives / (predicted_positives + K.epsilon())

        precision = precision(y_true, y_pred)
        recall = recall(y_true, y_pred)
        return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

    from keras.layers import Input, Dense, concatenate, Activation, Conv1D, GlobalMaxPooling1D
    from keras.models import Model

    tweet_input = Input(shape=(45,), dtype='int32')
    tweet_encoder = Embedding(100000, 200, weights=[embedding_matrix], input_length=45, trainable=True)(tweet_input)
    bigram_branch = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_encoder)
    bigram_branch = GlobalMaxPooling1D()(bigram_branch)
    trigram_branch = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder)
    trigram_branch = GlobalMaxPooling1D()(trigram_branch)
    fourgram_branch = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(tweet_encoder)
    fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)
    merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)

    merged = Dense(256, activation='relu')(merged)
    merged = Dropout(0.2)(merged)
    merged = Dense(1)(merged)
    output = Activation('sigmoid')(merged)
    model = Model(inputs=[tweet_input], outputs=[output])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    from keras.callbacks import ModelCheckpoint
    import numpy as np
    from keras.callbacks import Callback
    from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

    class Metrics(Callback):
        def on_train_begin(self, logs={}):
            self.val_f1s = []
            self.val_recalls = []
            self.val_precisions = []

        def on_epoch_end(self, epoch, logs={}):
            val_predict = (np.asarray(self.model.predict(x_val_seq))).round()
            val_targ = y_validation
            _val_f1 = f1_score(val_targ, val_predict)
            _val_recall = recall_score(val_targ, val_predict)
            _val_precision = precision_score(val_targ, val_predict)
            self.val_f1s.append(_val_f1)
            self.val_recalls.append(_val_recall)
            self.val_precisions.append(_val_precision)
            print(" - val_f1: %f - val_precision: %f - val_recall %f" % (_val_f1, _val_precision, _val_recall))
            return

    metric = Metrics()
    filepath = "../models/Crowflowd/CNN_best_weights.{epoch:02d}-{val_acc:.4f}.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    model.fit(x_train_seq, y_train, batch_size=64, epochs=5, validation_data=(x_val_seq, y_validation), callbacks=[metric, checkpoint])
    self._model = model
    scores = model.evaluate(x_test_seq, y_test, verbose=1)
    print("Accuracy: %.2f%%" % (scores[1] * 100))
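Note that the f1 helper above is defined but never referenced; if per-epoch F1 were wanted from Keras itself, it could be passed as an extra metric (a sketch, not the original call):

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1])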
def main():
    parser = argparse.ArgumentParser(description="Train the FAN or the HAN model")
    parser.add_argument(
        "dataset",
        choices=["yelp", "yahoo", "amazon", "synthetic"],
        help="Choose the dataset",
    )
    parser.add_argument(
        "model",
        choices=["fan", "han"],
        help="Choose the model to be trained (flat or hierarchical)",
    )
    args = parser.parse_args()

    if args.dataset == "yelp":
        dataset_config = Yelp
    elif args.dataset == "yahoo":
        dataset_config = Yahoo
    elif args.dataset == "amazon":
        dataset_config = Amazon
    elif args.dataset == "synthetic":
        dataset_config = Synthetic
    else:  # should not end there
        exit()

    wv = KeyedVectors.load(dataset_config.EMBEDDING_FILE)

    train_df = pd.read_csv(dataset_config.TRAIN_DATASET).fillna("")
    train_documents = train_df.text
    train_labels = train_df.label
    if args.model == "fan":
        train_dataset = FlatDataset(
            train_documents,
            train_labels,
            wv.vocab,
            dataset_config.WORDS_PER_DOC[PADDING],
        )
    else:
        train_dataset = HierarchicalDataset(
            train_documents,
            train_labels,
            wv.vocab,
            dataset_config.SENT_PER_DOC[PADDING],
            dataset_config.WORDS_PER_SENT[PADDING],
        )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=2,
    )

    val_df = pd.read_csv(dataset_config.VAL_DATASET).fillna("")
    val_documents = val_df.text
    val_labels = val_df.label
    if args.model == "fan":
        val_dataset = FlatDataset(
            val_documents,
            val_labels,
            wv.vocab,
            dataset_config.WORDS_PER_DOC[PADDING],
        )
    else:
        val_dataset = HierarchicalDataset(
            val_documents,
            val_labels,
            wv.vocab,
            dataset_config.SENT_PER_DOC[PADDING],
            dataset_config.WORDS_PER_SENT[PADDING],
        )
    val_data_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=BATCH_SIZE, shuffle=True
    )

    logdir = Path(f"{LOG_DIR}/{args.dataset}/{args.model}")
    logdir.mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter(str(logdir / f"{PADDING}pad"))

    if args.model == "fan":
        model = Fan(
            embedding_matrix=wv.vectors,
            word_hidden_size=WORD_HIDDEN_SIZE,
            num_classes=len(train_labels.unique()),
            batch_size=BATCH_SIZE,
        ).to(DEVICE)
    else:
        model = Han(
            embedding_matrix=wv.vectors,
            word_hidden_size=WORD_HIDDEN_SIZE,
            sent_hidden_size=SENT_HIDDEN_SIZE,
            num_classes=len(train_labels.unique()),
            batch_size=BATCH_SIZE,
        ).to(DEVICE)

    criterion = torch.nn.NLLLoss().to(DEVICE)
    optimizer = torch.optim.SGD(
        (p for p in model.parameters() if p.requires_grad),
        lr=LEARNING_RATE,
        momentum=MOMENTUM,
    )
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.1, patience=PATIENCE - 2, verbose=True,
    )

    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []
    best_val_loss = 1_000_000
    best_state_dict = model.state_dict()
    actual_patience = 0
    for epoch in range(1, EPOCHS + 1):
        train_loss, train_acc = train_func(
            model, train_data_loader, criterion, optimizer, writer
        )
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        val_loss, val_acc = test_func(model, val_data_loader, criterion)
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        print(f"Epoch {epoch}")
        print(f"  Train loss: {train_loss:.4}, Train acc: {train_acc * 100:.1f}%")
        print(f"  Val loss: {val_loss:.4}, Val acc: {val_acc * 100:.1f}%")

        lr_scheduler.step(val_loss)

        writer.add_scalar("Train/Loss", train_loss, epoch)
        writer.add_scalar("Train/Accuracy", train_acc, epoch)
        writer.add_scalar("Validation/Loss", val_loss, epoch)
        writer.add_scalar("Validation/Accuracy", val_acc, epoch)
        writer.add_scalar("Learning rate", optimizer.param_groups[0]["lr"], epoch)

        # Early stopping with patience
        if val_loss < best_val_loss:
            actual_patience = 0
            best_val_loss = val_loss
            best_state_dict = model.state_dict()
        else:
            actual_patience += 1
            if actual_patience == PATIENCE:
                model.load_state_dict(best_state_dict)
                break

    writer.add_text(
        "Hyperparameters",
        f"BATCH_SIZE = {BATCH_SIZE}; "
        f"MOMENTUM = {MOMENTUM}; "
        f"PATIENCE = {PATIENCE}; "
        f"PADDING = {PADDING}",
    )
    writer.close()

    modeldir = Path(MODEL_DIR)
    modeldir.mkdir(parents=True, exist_ok=True)
    torch.save(
        model.state_dict(),
        f"{modeldir}/{args.dataset}-{args.model}-{PADDING}pad.pth",
    )
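A hypothetical invocation, assuming the snippet lives in a script called train.py:

python train.py yelp han    # dataset in {yelp, yahoo, amazon, synthetic}, model in {fan, han}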
from gensim.models import KeyedVectors
import time, timeit
import sys
import io
import pdb

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
pdb.set_trace()

t0 = time.perf_counter()  # time.clock() was removed in Python 3.8
#wv_from_text = KeyedVectors.load_word2vec_format('/Users/jagenzhao/dataprocess/word2vec/Tencent_AILab_ChineseEmbedding.txt')
#wv_from_text.init_sims(replace=True)
#wv_from_text.save('/Users/Jagen/Downloads/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.models')
model = KeyedVectors.load('/Users/jagenzhao/dataprocess/word2vec/Tencent_AILab_ChineseEmbedding.models', mmap='r')
print('load models ok.', time.perf_counter() - t0, 's\n')

t0 = time.perf_counter()
print(u'白发')
keys = model.similar_by_word('白发', topn=100)
for key in keys:
    print(key)
print('cost %f' % (time.perf_counter() - t0))

t0 = time.perf_counter()
print('\n')
print('斑丘疹')
for key in model.similar_by_word('斑丘疹', topn=100):
    print(key)
        # window = 2,
        # min_count=50,
        # workers=4,
        # iter=100,
        sg=1,  # skip-gram
    )
    return model


if __name__ == '__main__':
    saved_model_name = 'model.wv'

    # Load a saved model if one exists; otherwise train and save a new one
    try:
        model = KeyedVectors.load(saved_model_name, mmap='r')
    except FileNotFoundError:
        # input_data = read_data_files('./data/aclImdb/train/pos')
        # input_data = word2vec.Text8Corpus(datapath('./data/aclImdb/train/pos'))
        sentences = word2vec.PathLineSentences(
            # datapath(
            #     os.path.expanduser(
            #         os.path.join('data', 'aclImdb', 'train', 'pos')
            #     )
            # )
            'C:\\Users\\jinai\\git_projects\\D-RNN\\experimental\\sejin\\word2vec_test\\data\\aclImdb\\test\\pos'
        )
        model = make_word2vec_model(list(sentences))
        model.save(saved_model_name)

    # Remove unnecessary data from the memory
# create data source plain txt lib
def create_java_source_code_data():
    with open(code_dot_data_cfg_generated, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        next(csv_reader)  # skip a row (DictReader has already consumed the header)
        for item in csv_reader:
            plant_source_code_text = read_source_code(item[target_java_file])
            write2file(save_source_code_file, plant_source_code_text.encode("utf-8", 'ignore'))

# create_java_source_code_data()

doc2vec_cfg = Doc2Vec.load(CFG_EMBEDDING_FILE)
word2vec = KeyedVectors.load(SOURCE_CODE_EMBEDDING_FILE)

# with open(code_dot_data_cfg_generated, mode='r') as csv_file:
#     csv_reader = csv.DictReader(csv_file)
#     next(csv_reader)  # skip header
#     for item in csv_reader:
#         dot_list = item[dot_file_list].split('|')
#         dotfiledict = {}
#         for dfname in dot_list:
#             dot_fname = find_dotf_name(dfname).replace('(', '')
#             dotfiledict[dot_fname] = dfname
#         row = (
#             item['file_name'],
#             item['target_java_named_folder'],
#             item['target_java_named_folder'].replace('bcb_reduced_java_named_files', 'bcb_reduced') + '.java',
#             dotfiledict)
#         dotfile_data_dict[item['file_name']] = row
import torch
from model2 import PreTrainedEmbeddingEncoderRNN, AttnDecoderRNN
from preprocess import prepareData
from train_bidirectional import Trainer
from preprocess import buildPairs
from gensim.models import KeyedVectors
import params

use_cuda = torch.cuda.is_available()

trainfile = '/home/prosa/Works/Text/mt/dataset/filter-en-id/lenlim80/sorted/train.dummy'
src_lang, tgt_lang, pairs = prepareData(trainfile, reverse=False)

# Word vector
word_vectors = KeyedVectors.load(params.WORD_VECTORS_FILE)
hidden_size = word_vectors.vector_size
max_len = 8

encoder = PreTrainedEmbeddingEncoderRNN(word_vectors, max_len)
attn_decoder = AttnDecoderRNN(hidden_size, tgt_lang, dropout_p=0.1, max_length=max_len)

if use_cuda:
    encoder = encoder.cuda()
    attn_decoder = attn_decoder.cuda()

epoch = 100
num_iter = len(pairs)
def inference(self, model_input):
    """
    Internal inference method
    :param model_input: transformed model input data list
    :return: list of inference output in NDArray
    """
    stoplist = list(string.punctuation)
    language = self.model_params['language']
    if language == "de":
        language = "german"
    stoplist += stopwords.words(language)
    stoplist += ['archiviert', 'archiviertes', 'angeblich', 'angebliche', 'facebook', 'seien', 'sei',
                 'facebookpost', 'behauptung', 'sozialen', 'netzwerken', 'heißt', 'verbreitet', 'mögliche',
                 'höher', 'wort', 'teils', 'kaum', 'lassen', 'ersten', 'heraus', 'vergleich', 'simpsons',
                 'behauptet', 'etwa', 'worden', 'immer', 'post', 'sehen', 'kursiert', 'geteilt', 'hätten',
                 'sollen', 'zeigen', 'derzeit', 'seit', 'wurde', 'schon', 'mehr', 'zwei', 'gibt', 'dabei',
                 'steht', 'zeigt', 'sic', 'wegen', 'viele', 'netz', 'posting', 'video', 'gesagt', 'internet',
                 'artikel', 'nutzer', 'jahr', 'beitrag', 'macht', 'sharepic', 'gebe', 'zusammenhang', 'dafür',
                 'text', 'ab', 'jahren', 'kursieren', 'mann', 'frau', 'überschrift', 'laut', 'seite', 'de',
                 'zeige', 'wer', 'demnach', 'ende', 'prozent', 'wurden', 'mehrere', 'zudem', 'darin',
                 'suggeriert', 'zahlen', 'beleg', 'millionen', 'denen', 'beim', 'müssen', 'bereits', 'drei',
                 'darauf', 'online', 'jahre', 'geht', 'august', 'mehreren', 'beispiel', 'bekommen', 'welt',
                 'behauptungen', 'neue', 'land', 'stadt', 'oktober', 'erklärt', 'gefährlich', 'sogar',
                 'belegen', 'gar', 'heute', 'webseite', 'könne', 'schreibt', 'angebliches', 'mal', 'aktuell',
                 'angeblichen', 'behaupten', 'eindämmung', 'zufolge', 'jedoch', 'aussage', 'zugeschrieben',
                 'geld', 'eindruck', 'positiv', 'daten', 'zahl', 'berichtet', 'märz', 'davon', 'november',
                 'neben', 'bestätigt', 'leben', 'weniger', 'http', 'neuen', 'schutz', 'aktuellen', 'gab',
                 'halten', 'oft', 'vermeintliche', 'ganz', 'anfang', 'tag', 'aussagen', 'könnten', 'darunter',
                 'dezember', 'grund', 'erhalten', 'kommt', 'logo', 'unterstellt', 'erweckt', 'erst', 'wochen',
                 'gegeben', 'daher', 'zeit', 'gut', 'tage', 'sowie', 'rund', 'gestellt', 'screenshot',
                 'mitarbeiter', 'user', 'zweiten', 'april', 'geben', 'grafik', 'videos', 'fordert', 'häufig',
                 'außerdem', 'lautet', 'beiträgen', 'vermeintlichen', 'finden', 'gemacht', 'stellt', 'posts',
                 'personen', 'berichten', 'angegeben', 'verbreiten', 'arzt', 'präsident', 'bevölkerung',
                 'infektion', 'com', 'ländern', 'präsidenten', 'krise', 'bürger', 'rede', 'fall', 'dpaq',
                 'runde', 'soziale', 'gebracht', 'worte', 'quelle', 'bringen', 'lesen', 'lange', 'tatsächlich',
                 'erneut', 'statt', 'september', 'weltweit', 'vielen', 'januar', 'nachdem', 'warnt', 'große',
                 'versucht', 'beweise', 'teilen', 'hingegen', 'juli', 'zusammen', 'luft', 'schreiben',
                 'wissen', 'per', 'monaten', 'beweis', 'anhand', 'dürfen', 'vermeintlich', 'twitter', 'blog',
                 'falsch', 'mitte', 'aufschrift', 'februar', 'trägt', 'kurz', 'cookies', 'browser']

    # Do some inference call to engine here and return output
    if self.model_type == "TopicalPageRank":
        pos = {'NOUN', 'PROPN', 'ADJ'}  # the valid parts of speech to occur in the graph
        grammar = self.model_params['grammar']  # grammar for selecting the keyphrase candidates, e.g. "NP: {<ADJ>*<NOUN|PROPN>}"
        language = self.model_params['language']  # e.g. 'de'
        normalization = self.model_params['normalization']  # word normalization method, e.g. 'stemming'
        window = self.model_params['window']  # edges connecting two words occurring in a window are weighted by co-occurrence counts, e.g. 10
        max_count = self.model_params['max_count']  # maximal count of highest-scored keyphrases to return
        logging.info("TopicalPageRank model_input: {}".format(model_input))

        # 1. create a TopicalPageRank extractor
        extractor = pke.unsupervised.TopicalPageRank()
        phrases_list = []
        for text_input in model_input:
            # 2. load the input text
            extractor.load_document(input=text_input, language=language, normalization=normalization)
            # 3. select the noun phrases as keyphrase candidates
            extractor.candidate_selection(grammar=grammar)
            # 4. weight the keyphrase candidates using Single Topical PageRank:
            #    builds a word graph in which edges connecting two words occurring
            #    in a window are weighted by co-occurrence counts
            extractor.candidate_weighting(window=window, pos=pos, lda_model=self.model, stoplist=stoplist)
            # 5. get the highest scored candidates as keyphrases
            keyphrases = extractor.get_n_best(n=max_count)
            logging.info("text_input: {}. keyphrases: {}".format(text_input, keyphrases))
            phrases_list.append(keyphrases)
        return phrases_list

    elif self.model_type == "DocSim":
        # load model
        # with open(self.model, 'rb') as inp:
        #     model = pickle.load(inp)
        # load word vectors
        model_wv = KeyedVectors.load(self.model)
        inference = []
        logging.info("DocSim model_input: {}".format(model_input))
        for text_input in model_input:
            logging.info("text_input: {}".format(text_input))
            # read string into dataframe
            df = pd.read_csv(StringIO(text_input), header=None)
            similarities = []
            for i, row in df.iterrows():
                logging.info("{}. row: {}".format(i, row))
                # prepare first document
                logging.info("row.iloc[0]: {}".format(row.iloc[0]))
                tokens = self.text_preprocess(row.iloc[0])
                # remove stop words
                words1 = [w for w in tokens if w not in stoplist and w in model_wv.key_to_index]
                logging.info("words1: {}".format(words1))
                # prepare second document
                logging.info("row.iloc[1]: {}".format(row.iloc[1]))
                tokens = gensim.utils.simple_preprocess(row.iloc[1])
                # remove stop words
                words2 = [w for w in tokens if w not in stoplist and w in model_wv.key_to_index]
                logging.info("words2: {}".format(words2))
                if (len(words1) == 0) or (len(words2) == 0):
                    similarities.append("0.00")
                    logging.warning("Word list is empty!")
                else:
                    similarities.append(str(model_wv.n_similarity(words1, words2)))
            logging.info("similarities: {}".format(similarities))
            inference.append(similarities)
        logging.info("inference: {}".format(inference))
        return inference

    else:
        logging.error("Model {} not supported!".format(self.model_type))
        raise RuntimeError("Model {} not supported!".format(self.model_type))
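n_similarity returns the cosine similarity between the unweighted means of two sets of word vectors, which is what the DocSim branch appends per row. A minimal hedged example (the German tokens are hypothetical and must be in the model's vocabulary):

score = model_wv.n_similarity(['virus', 'impfung'], ['pandemie'])  # hypothetical tokens
print(score)  # a float in [-1, 1]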
def process(self):
    """
    Loads the models and outputs the n nearest neighbours
    """

    # Extract text input
    check_words = self.parameters.get("words")
    if not check_words:
        self.dataset.update_status("No words to find nearest neighbours of were provided")
        self.dataset.finish(-1)
        return
    check_words = [word.strip() for word in check_words.split(",")]

    # Extract cosine threshold
    try:
        cosine_threshold = float(self.parameters.get("cosine_threshold"))
        if cosine_threshold > 1:
            cosine_threshold = 1
        if cosine_threshold < -1:
            cosine_threshold = -1
    except ValueError:
        self.dataset.update_status("Invalid number provided. Insert a number between -1 and 1, like 0.75")
        self.dataset.finish(-1)
        return

    # Extract top n
    try:
        top_n = int(self.parameters.get("top_n"))
        if top_n == 0:
            top_n = 10
        if top_n > 100:  # Can't be more than a hundred
            top_n = 100
    except ValueError:
        self.dataset.update_status("Invalid number of nearest neighbours provided")
        self.dataset.finish(-1)
        return

    # Extract crawl depth
    crawl_depth = int(self.parameters.get("crawl_depth") or 1)
    if crawl_depth < 1 or crawl_depth > 3:
        crawl_depth = 1

    results = []
    results_path = self.dataset.get_results_path()
    tmp_dir = self.dataset.get_temporary_path()

    # Go through all archived token sets and generate collocations for each
    with zipfile.ZipFile(str(self.source_file), "r") as model_archive:
        # Get the filenames and only keep those containing the model (so e.g. no vectors.npy files)
        model_files = model_archive.namelist()
        model_names = [model_name for model_name in model_files if model_name.endswith(".model")]
        if not model_names:
            return

        # Extract the models and output nearest neighbour(s)
        for model_name in model_names:
            if self.interrupted:
                raise ProcessorInterruptedException("Interrupted while loading token sets")

            # Get the date
            date_string = model_name.split('.')[0]

            # Words to crawl
            crawl_words = check_words
            words_crawled = []

            # Temporarily extract file (we cannot use ZipFile.open() as it doesn't support binary modes)
            tmp_file_path = tmp_dir.joinpath(model_name)
            model_archive.extract(model_name, tmp_dir)
            # Check if there's also a vectors.npy file (for large models) in the folder, and if so, extract it
            if model_name + ".vectors.npy" in model_files:
                model_archive.extract(model_name + ".vectors.npy", tmp_dir)

            model = KeyedVectors.load(str(tmp_file_path), mmap="r")

            # Keep this loop going as long as we haven't reached the crawl limit
            for i in range(crawl_depth):
                new_crawl_words = []

                # Check certain words in this model
                for word in crawl_words:
                    # Get the nearest neighbours
                    try:
                        nearest_neighbours = model.wv.most_similar(positive=[word], topn=top_n)
                    # If not in vocabulary
                    except KeyError as e:
                        results.append({
                            "source_word": word,
                            "nearest_neighbour": "ERROR: input word not in this model's vocabulary, be sure to insert lemmatized or stemmed versions",
                            "cosine_similarity": 0,
                            "source_occurrences": 0,
                            "target_occurrences": 0,
                            "model": model_name,
                            "date": date_string
                        })
                        continue

                    for nearest_neighbour in nearest_neighbours:
                        if nearest_neighbour[1] >= cosine_threshold:  # Cosine similarity threshold check
                            results.append({
                                "source_word": word,
                                "nearest_neighbour": nearest_neighbour[0],
                                "cosine_similarity": nearest_neighbour[1],
                                "source_occurrences": model.vocab[word].count,  # How often the source word appears in the model
                                "target_occurrences": model.vocab[nearest_neighbour[0]].count,  # How often the target word appears in the model
                                "model": model_name,
                                "date": date_string
                            })

                            # To check in possible next crawl
                            if nearest_neighbour[0] not in words_crawled:
                                new_crawl_words.append(nearest_neighbour[0])

                # After first crawl, prepare new words to check
                crawl_words = new_crawl_words

    # Delete the temporary folder
    shutil.rmtree(tmp_dir)

    if not results:
        return

    # Generate csv and finish
    self.dataset.update_status("Writing to csv and finishing")
    self.write_csv_items_and_finish(results)
DEBUG = "FILTER_PRODUCTION" not in environ
data_dir = "data" if DEBUG else "/data"

if DEBUG:
    app.config["CACHE_TYPE"] = "null"
else:
    app.config["CACHE_TYPE"] = "redis"
    app.config["CACHE_REDIS_URL"] = environ["REDIS_URL"]
    app.config["CACHE_DEFAULT_TIMEOUT"] = 60 * 60 * 24 * 14  # 2 weeks

cache = Cache(app)

vecs = {}
for m in Path(data_dir).glob("*.model"):
    vecs[m.stem] = KeyedVectors.load(str(m), mmap="r")


@app.route("/typeahead/<vec_name>")
@cache.cached(query_string=True)
def typeahead(vec_name):
    q = request.args.get("q", type=str)
    if q == '':
        return jsonify({"tokens": []})
    v = vecs[vec_name]
    q = re.sub(r"\d+", "0", q)
    q = q.lower()
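The route is cut off after normalizing q; a plausible continuation returns vocabulary tokens sharing the prefix (a sketch under that assumption; index_to_key presumes gensim 4, and the real matching logic is not shown in the excerpt):

    tokens = [t for t in v.index_to_key if t.startswith(q)][:10]
    return jsonify({"tokens": tokens})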
    len_d = len(tokens)
    for word in tokens:
        if word not in inverse_dict.keys():
            continue
        X_train_bm25[ind, inverse_dict[word][-1]] = bm25_vectorizer(
            tf_values[word], len_d, corpus_len, len(inverse_dict[word]) - 1)

with open('bm25_inverse_dict.json', 'w') as fp:
    json.dump(inverse_dict, fp)
np.save('X_train_bm25.npy', X_train_bm25)

# -------------------------------------------------------------------------- w2v block
model_file = 'araneum_none_fasttextcbow_300_5_2018.model'
model = KeyedVectors.load(model_file)

def normalize_vec(vec):
    return vec / np.linalg.norm(vec)

def create_doc_vector(text):
    # create the vector mask
    lemmas = text.split()
    lemmas_vectors = np.zeros((len(lemmas), model.vector_size))
    # if the word is in the model, take its vector
    for idx, lemma in enumerate(lemmas):
        if lemma in model:
            lemmas_vectors[idx] = normalize_vec(model[lemma])
    # guard against the case where an empty array was passed in
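create_doc_vector is cut off before it combines the per-lemma rows; a common completion is the mean of lemmas_vectors with an empty-input guard (a sketch with a hypothetical helper, not the original code):

def mean_doc_vector(lemmas_vectors, dim):
    # Hypothetical helper: average the per-lemma vectors; zeros for empty input.
    if len(lemmas_vectors) == 0:
        return np.zeros(dim)
    return lemmas_vectors.mean(axis=0)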