def get_loaded_model(force_gpu=False, k_most_frequent_words=1000000):
    model_path = "infersent/encoder/infersent{}.pkl".format(model_version)
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
    model = InferSent(params_model)
    model.load_state_dict(torch.load(model_path))

    if (not torch.cuda.is_available()) and force_gpu:
        raise GPUNotFoundException()
    if torch.cuda.is_available():
        model = model.cuda()

    # If infersent1 -> use GloVe embeddings.
    # If infersent2 -> use fastText embeddings.
    W2V_PATH = ('infersent/dataset/GloVe/glove.840B.300d.txt' if model_version == 1
                else 'infersent/dataset/fastText/crawl-300d-2M.vec')  # noqa
    model.set_w2v_path(W2V_PATH)

    # Load embeddings of the K most frequent words.
    model.build_vocab_k_words(K=k_most_frequent_words)
    return model

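# Usage sketch (added for illustration, not from the original snippet): assumes a
# module-level `model_version` and the vector files hard-coded in get_loaded_model().
# encode() returns one 4096-d vector per sentence (2 x 2048 BiLSTM states, max-pooled).
model = get_loaded_model(force_gpu=False, k_most_frequent_words=100000)
embeddings = model.encode(["A man is playing guitar.",
                           "Someone plays an instrument."], tokenize=True)
print(embeddings.shape)  # (2, 4096)
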
def prepare(model_path: str, word_vecs: str, out_path: str,
            sentences: Union[str, List[str]] = None, max_vocab: int = 0):
    """Adapt the model vocabulary and save the resulting state.

    :param model_path: unadapted model state
    :param word_vecs: word vectors
    :param out_path: where to store the adapted state
    :param sentences: training sentences for scanning the vocabulary
    :param max_vocab: maximum vocabulary size (optional)
    :return:
    """
    assert bool(sentences) != bool(max_vocab), \
        'Either sentences or max_vocab should be given'
    model = InferSent(config=MODEL_CONF)
    log.info(f"Loading state from {model_path}")
    model.load_state_dict(torch.load(model_path))
    log.info(f"Loading word vecs from {word_vecs}")
    model.set_w2v_path(word_vecs)
    if sentences:
        if type(sentences) is not list:
            sentences = list(read_lines(sentences))
        log.info("Building vocabulary from sentences")
        model.build_vocab(sentences, tokenize=True)
    if max_vocab:
        log.info(f"Pruning vocabulary to top {max_vocab} types")
        model.build_vocab_k_words(K=max_vocab)
    log.info(f"Saving at {out_path}")
    state = SentenceEncoder._get_state(model)
    torch.save(state, out_path)

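# Hypothetical calls to prepare() above (paths are placeholders, not from the original
# source): the assert makes the two modes mutually exclusive, so pass exactly one of
# `sentences` or `max_vocab`.
prepare(model_path='infersent2.pkl', word_vecs='crawl-300d-2M.vec',
        out_path='infersent2.adapted.pkl', sentences='train.txt')  # scan a corpus file
prepare(model_path='infersent2.pkl', word_vecs='crawl-300d-2M.vec',
        out_path='infersent2.topk.pkl', max_vocab=100000)  # or keep the top-K types
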
def load_infersent():
    V = 2
    MODEL_PATH = 'encoder/infersent%s.pkl' % V
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    W2V_PATH = 'fastText/crawl-300d-2M.vec'
    infersent.set_w2v_path(W2V_PATH)
    infersent.build_vocab_k_words(K=100000)
    return infersent

def init_models(vocab_size: int = VOCAB_SIZE):
    model = InferSent({'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                       'pool_type': 'max', 'dpout_model': 0.0, 'version': VERSION})
    model.load_state_dict(torch.load(MODEL_PATH))
    model = model.cuda() if USE_CUDA else model
    model.set_w2v_path(VECTOR_PATH)
    model.build_vocab_k_words(K=vocab_size)
    return model

def load_model():
    model_version = 1
    MODEL_PATH = "encoder/infersent%s.pkl" % model_version
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    W2V_PATH = 'GloVe/glove.840B.300d.txt'
    model.set_w2v_path(W2V_PATH)
    print('building vocab')
    model.build_vocab_k_words(K=100000)
    print('done building vocab')
    return model

def embed_dataset(dataset_path, infersent_path, force_cpu=False):
    """To make this work, first run ./get_infersent.sh"""
    MODEL_PATH = infersent_path / "encoder/infersent1.pkl"
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}
    model = InferSent(params_model)
    if force_cpu:
        model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))
    else:
        model.load_state_dict(torch.load(MODEL_PATH))
        model.cuda()

    W2V_PATH = infersent_path / 'GloVe/glove.840B.300d.txt'
    model.set_w2v_path(W2V_PATH)
    model.build_vocab_k_words(K=100000)

    csv_data = read_csv(dataset_path / 'train.csv')
    csv_data = csv_data[1:]  # skip header
    data = defaultdict(list)
    for irow, row in enumerate(csv_data):
        if 'snips' in str(dataset_path):
            utterance, labels, delexicalised, intent = row
        else:
            raise TypeError("Unknown dataset type. Implement your own first. "
                            "See the README")
        data[intent].append(utterance)

    vectors = {}
    for i, (intent, sentences) in enumerate(data.items()):
        print('{}/{} done'.format(i, len(data)))
        embeddings = model.encode(sentences)
        avg_embedding = np.mean(embeddings, axis=0)
        vectors[intent] = avg_embedding
    return vectors

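# One way the per-intent average vectors returned by embed_dataset() could be used
# (a sketch, not from the original code): nearest-intent classification by cosine
# similarity against the averaged embeddings. Assumes numpy is imported as np.
def nearest_intent(model, vectors, utterance):
    emb = model.encode([utterance])[0]
    sims = {intent: float(np.dot(emb, v) / (np.linalg.norm(emb) * np.linalg.norm(v)))
            for intent, v in vectors.items()}
    return max(sims, key=sims.get)
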
def infersent_embeddings():
    train_data_list = []
    test_data_list = []
    sys.path.append('/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master')

    # Load model
    from models import InferSent
    model_version = 1
    MODEL_PATH = "/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/encoder/infersent%s.pkl" % model_version
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))

    # Keep it on CPU or put it on GPU
    use_cuda = False
    model = model.cuda() if use_cuda else model

    # If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
    W2V_PATH = ('/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/glove.840B.300d-003.txt'
                if model_version == 1 else
                '/opt/notebooks/OCSVM_ISF_LOF_USE_Baselines/InferSent-master/fastText/crawl-300d-2M.vec')
    model.set_w2v_path(W2V_PATH)

    # Load embeddings of K most frequent words
    model.build_vocab_k_words(K=100000)

    train_data_list = model.encode(final_train['text'].tolist(), bsize=128,
                                   tokenize=False, verbose=True)
    print('nb sentences encoded : {0}'.format(len(train_data_list)))
    test_data_list = model.encode(final_test['text'].tolist(), bsize=128,
                                  tokenize=False, verbose=True)
    print('nb sentences encoded : {0}'.format(len(test_data_list)))
    return train_data_list, test_data_list

def init_infersent_model(self):
    model_version = 1
    MODEL_PATH = "encoder/infersent%s.pkl" % model_version
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))

    # Keep it on CPU or put it on GPU
    use_cuda = False
    model = model.cuda() if use_cuda else model

    # If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
    W2V_PATH = 'GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'
    model.set_w2v_path(W2V_PATH)

    # Load embeddings of K most frequent words
    model.build_vocab_k_words(K=100000)
    self.model = model

def infersent_glove():
    # Set up the model for InferSent+GloVe
    V = 1
    MODEL_PATH = '/tmp/GloVe/encoder/infersent%s.pkl' % V
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
    modelg = InferSent(params_model)
    modelg.load_state_dict(torch.load(MODEL_PATH))

    # Keep it on CPU or put it on GPU
    use_cuda = True
    modelg = modelg.cuda() if use_cuda else modelg

    # If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
    W2V_PATH = '/tmp/GloVe/glove.840B.300d.txt' if V == 1 else '/home/ganesh/Quora_dev/tmp/GloVe/glove.840B.300d.txt'
    modelg.set_w2v_path(W2V_PATH)

    # Load embeddings of K most frequent words
    modelg.build_vocab_k_words(K=100000)
    return modelg

utils.extract_list_from_string(rows[1]), book_id_map)

with open(os.path.join(this_dir, "data/description.csv"), mode='r') as infile:
    reader = csv.reader(infile)
    for rows in reader:
        if len(rows[1]) > 20:
            book_id_descriptions[rows[0]] = rows[1]

with open(os.path.join(this_dir, "data/title.csv"), mode='r') as infile:
    reader = csv.reader(infile)
    next(reader)
    book_id_titles = {rows[0]: rows[1] for rows in reader}

model_path = "encoder/infersent2.pkl"
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
model = InferSent(params_model)
model.load_state_dict(torch.load(model_path))
w2v_path = 'fastText/crawl-300d-2M.vec'
model.set_w2v_path(w2v_path)

# Load embeddings of K most frequent words
model.build_vocab_k_words(K=100000)

# In[3]:

# Keep it on CPU or put it on GPU
use_cuda = False
model = model.cuda() if use_cuda else model

# In[4]:

# If infersent1 -> use GloVe embeddings. If infersent2 -> use fastText embeddings.
W2V_PATH = 'GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

# In[5]:

# Load embeddings of K most frequent words
model.build_vocab_k_words(K=100000)

# ## Load sentences

# In[6]:

# Load some sentences
sentences = []
with open('samples.txt') as f:
    for line in f:
        sentences.append(line.strip())
print(len(sentences))

# In[7]:

sentences[:5]

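# In[8]:

# A plausible next cell (added for illustration, not among the extracted cells):
# encode the loaded sentences in batches, as the other snippets in this section do.
embeddings = model.encode(sentences, bsize=128, tokenize=False, verbose=True)
print('nb sentences encoded : {0}'.format(len(embeddings)))
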
with open(ORI_PATH) as f:
    ori = f.read()
ori = ori.replace('[[[[Premise]]]]: ', '').replace('>>>>[[[[Hypothesis]]]]:', '')
ori = ori.replace('[[', '').replace(']]', '')
ori = ori.splitlines()

params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
infersent.set_w2v_path(W2V_PATH)
infersent.build_vocab_k_words(K)

adv_emb = infersent.encode(adv, tokenize=True)
ori_emb = infersent.encode(ori, tokenize=True)

result = [cos_sim(i, j) for i, j in zip(adv_emb, ori_emb)]
with open('../results/InferSent.txt', 'w') as f:
    f.write('\n'.join([str(i) for i in result]))

result = [distance(i, j) for i, j in zip(adv_emb, ori_emb)]
with open('../results/InferSent_distance.txt', 'w') as f:
    f.write('\n'.join([str(i) for i in result]))

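# The snippet above calls cos_sim() and distance() without defining them; a minimal
# sketch of what they presumably compute (cosine similarity and Euclidean distance):
import numpy as np

def cos_sim(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

def distance(u, v):
    return float(np.linalg.norm(np.asarray(u) - np.asarray(v)))
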
def extract_answer_IFST(story_data, question_and_ans_data, story_ids,
                        model_version, Vocab_Size):
    """(1) Get the answer, then modify question_and_ans_data by adding the answer
    to it. (2) For each story id, extract its questions, look them up in
    story_data, and pick the best-matching sentence."""
    import re
    import pandas as pd
    import torch
    import numpy as np
    from models import InferSent

    # sentence_list = build_vocabulary(story_data)
    W2V_PATH = 'dataset/GloVe/glove.840B.300d.txt' if model_version == 1 else 'dataset/fastText/crawl-300d-2M.vec'
    MODEL_PATH = 'encoder/infersent%s.pkl' % model_version
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
    model = InferSent(params_model)
    model.load_state_dict(torch.load(MODEL_PATH))
    model.set_w2v_path(W2V_PATH)
    if model_version == 3:
        sentence_list = build_vocabulary(story_data)
        model.build_vocab(sentence_list)
    else:
        model.build_vocab_k_words(K=Vocab_Size)

    for story_id in story_ids:
        story = story_data.loc[lambda df: df.story_id == story_id, 'story'].values[0]
        question_ids = question_and_ans_data.loc[
            lambda df: df.story_id == story_id, 'question_id']
        for question_id in question_ids:
            # Get the question (and, if available, the gold answer)
            question = question_and_ans_data.loc[
                lambda df: df.question_id == question_id, 'question'].values[0]
            if 'answer' in question_and_ans_data:
                answer = question_and_ans_data.loc[
                    lambda df: df.question_id == question_id, 'answer'].values[0]
            # encode() expects a list of sentences, so wrap the question in a list
            question_encoded = model.encode(
                [str(question_and_ans_data.loc[question_and_ans_data.index[
                    question_and_ans_data['question_id'] == question_id][0],
                    'question'])])[0]
            ans = []
            for sent in story.sents:
                # sim = sent.similarity(question)
                sim = cosine(question_encoded, model.encode([str(sent)])[0])
                ans.append({'question_id': question_id,
                            'answer_pred': sent,
                            'similarity': sim})
            ans = pd.DataFrame(ans).reindex(
                ['question_id', 'answer_pred', 'similarity'], axis=1)
            ans.sort_values(by=['similarity'], ascending=False, inplace=True)
            question_and_ans_data.loc[
                lambda df: df.question_id == question_id, 'answer_pred'] = \
                str(ans.iloc[0]['answer_pred']).replace('\n', ' ')  # .text
    # question_and_ans_data['answer_pred'] = question_and_ans_data['answer_pred'].apply(TextBlob)
    return question_and_ans_data

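# extract_answer_IFST() calls cosine() without defining it. Because candidates are
# sorted by `similarity` in descending order, cosine() here must return a similarity
# (not scipy's cosine *distance*); a minimal sketch under that assumption:
import numpy as np

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))
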
assert params.encoder_type in encoder_types, \
    "encoder_type must be in " + str(encoder_types)

# Load encoder
encoder = None
if params.encoder_path and params.encoder_type == 'InferSent':
    params_model = {'bsize': params.batch_size,
                    'word_emb_dim': params.word_emb_dim,
                    'enc_lstm_dim': params.enc_lstm_dim,
                    'pool_type': params.pool_type,
                    'dpout_model': params.dpout_model,
                    'version': params.model_version}
    encoder = InferSent(params_model)
    encoder.load_state_dict(torch.load(params.encoder_path))
    encoder.set_w2v_path(params.vector_rep)
    if params.vocab_samples.isdigit():
        print("Build vocab from K samples")
        encoder.build_vocab_k_words(K=int(params.vocab_samples))
    else:
        print("Build vocab from full file")
        with open(params.vocab_samples) as f:
            encoder.build_vocab([line.strip() for line in f], tokenize=True)
    print("========TEST encoder=======")
    print(encoder.encode(['the cat eats.']))
    encoder.to(device)

# model config
config_nli_model = {
    'n_words': len(word_vec),

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataname', default='t6',
                        help='dataset name', choices=['t6', 't26', '2C'])
    parser.add_argument('-c', '--classifiername', default='RF',
                        help='which classifier to use',
                        choices=['GaussianNB', 'RF', 'SVM', 'KNN'])
    args = parser.parse_args()
    data_name = args.dataname  # t6 or t26, 2C, 4C
    clf_name = args.classifiername  # classifier

    GLOVE_PATH = 'GloVe/glove.840B.300d.txt'
    dataset = '../data/'
    disasters = []
    train_list = []
    test_list = []
    if data_name == "t6":
        file_path = dataset + 'CrisisLexT6_cleaned/'
        disasters = ["sandy", "queensland", "boston", "west_texas",
                     "oklahoma", "alberta"]
        test_list = ["{}_glove_token.csv.unique.csv".format(disaster)
                     for disaster in disasters]
        train_list = ["{}_training.csv".format(disaster)
                      for disaster in disasters]
    if data_name == "t26":
        file_path = dataset + 'CrisisLexT26_cleaned/'
        disasters = ["2012_Colorado_wildfires", "2013_Queensland_floods",
                     "2013_Boston_bombings", "2013_West_Texas_explosion",
                     "2013_Alberta_floods", "2013_Colorado_floods",
                     "2013_NY_train_crash"]
        test_list = ["{}-tweets_labeled.csv.unique.csv".format(disaster)
                     for disaster in disasters]
        train_list = ["{}_training.csv".format(disaster)
                      for disaster in disasters]
    if data_name == "2C":
        file_path = dataset + '2CTweets_cleaned/'
        disasters = ["Memphis", "Seattle", "NYC", "Chicago", "SanFrancisco",
                     "Boston", "Brisbane", "Dublin", "London", "Sydney"]
        test_list = ["{}2C.csv.token.csv.unique.csv".format(disaster)
                     for disaster in disasters]
        train_list = ["{}2C_training.csv".format(disaster)
                      for disaster in disasters]

    accu_list = []
    roc_list = []
    precision_list = []
    recall_list = []
    f1_list = []
    output_dir = ''
    for disaster, train, test in zip(disasters, train_list, test_list):
        train_file = os.path.join(file_path, train)
        test_file = os.path.join(file_path, test)
        xtrain, ytrain = load_data(data_name, train_file)
        xtest, ytest = load_data(data_name, test_file)
        train_output = "{}{}.train.npy".format(output_dir, disaster)
        test_output = "{}{}.test.npy".format(output_dir, disaster)
        if not os.path.isfile(train_output):
            # Load our pre-trained model (in encoder/):
            V = 1
            MODEL_PATH = 'encoder/infersent%s.pkl' % V
            params_model = {'bsize': 64, 'word_emb_dim': 300,
                            'enc_lstm_dim': 2048, 'pool_type': 'max',
                            'dpout_model': 0.0, 'version': V}
            infersent = InferSent(params_model)
            infersent.load_state_dict(torch.load(MODEL_PATH))
            # Set word vector path for the model:
            W2V_PATH = './GloVe/glove.840B.300d.txt'
            infersent.set_w2v_path(W2V_PATH)
            # Build the vocabulary of word vectors (i.e. keep only those needed):
            # infersent.build_vocab(sentences, tokenize=True)
            infersent.build_vocab_k_words(K=100000)
            # Encode your sentences (list of n sentences):
            train_embed = infersent.encode(xtrain, bsize=128, tokenize=True,
                                           verbose=True)
            np.save(train_output, train_embed)
            test_embed = infersent.encode(xtest, bsize=128, tokenize=True,
                                          verbose=True)
            np.save(test_output, test_embed)
            print('file saved')
        else:
            train_embed = np.load(train_output)
            test_embed = np.load(test_output)

        print(test)
        accu, roc, precision, recall, f1 = run_classifier(
            train_embed, ytrain, test_embed, ytest, clf_name, 100)
        # print accu, roc
        accu_list.append(accu)
        roc_list.append(roc)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)

    print("{}_InferSent_{}_LOO_accuracy {}".format(data_name, clf_name, accu_list))
    print("{}_InferSent_{}_LOO_roc {}".format(data_name, clf_name, roc_list))
print("{}_InferSent_{}_LOO_percision {}".format(data_name, clf_name, precision_list)) print("{}_InferSent_{}_LOO_recall {}".format(data_name, clf_name, recall_list)) print("{}_InferSent_{}_LOO_f1 {}".format(data_name, clf_name, f1_list)) print( "{0}_InferSent_LOO_{1} {2:.4f} + {3:.4f} {4:.4f} + {5:.4f} {6:.4f} + {7:.4f} {8:.4f} + {9:.4f} {10:.4f} + {11:.4f} " .format(data_name, clf_name, np.mean(accu_list), np.std(accu_list), np.mean(roc_list), np.std(roc_list), np.mean(f1_list), np.std(f1_list), np.mean(precision_list), np.std(precision_list), np.mean(recall_list), np.std(recall_list)))
V = 1
MODEL_PATH = '../encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = '../GloVe/glove.840B.300d.txt'
infersent.set_w2v_path(W2V_PATH)
infersent.build_vocab_k_words(K=100000)  # 100k most common words loaded up in model vocab

PORT = "5000"
app = Flask(__name__)
cors = CORS(app)
app.config['CORS_HEADERS'] = 'Content-Type'


def adapt_array(arr):
    """http://stackoverflow.com/a/31312102/190597 (SoulNibbler)"""
    out = io.BytesIO()
    np.save(out, arr)
    out.seek(0)
    return sqlite3.Binary(out.read())

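# adapt_array() has a natural counterpart in the Stack Overflow answer it cites; a
# sketch of the reverse conversion plus the sqlite3 registrations, assuming the
# embeddings are stored in a column declared with the "array" type (and the connection
# is opened with detect_types=sqlite3.PARSE_DECLTYPES):
def convert_array(text):
    out = io.BytesIO(text)
    out.seek(0)
    return np.load(out)

sqlite3.register_adapter(np.ndarray, adapt_array)
sqlite3.register_converter("array", convert_array)
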
if args.download:
    nltk.download('punkt')

model_version = args.model_version
MODEL_PATH = "/home1/InferSent/encoder/infersent%s.pickle" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

# Keep it on CPU or put it on GPU
use_cuda = True
model = model.cuda() if use_cuda else model

W2V_PATH = '/home1/InferSent/oov_train_model.vec'
model.set_w2v_path(W2V_PATH)

# Load embeddings of K most frequent words
# model.build_vocab_k_words(K=100000)
model.build_vocab_k_words(K=2051129)

# Extract word embeddings.
# Load test sentences
train_test = pd.read_csv('/home1/InferSent/testset.csv', header=None,
                         delimiter=",", encoding='UTF-8')
source_s = train_test[0][1:]
target_s = train_test[1][1:]

embeddings_source = model.encode(source_s, bsize=128, tokenize=False, verbose=True)
print('nb source_s encoded : {0}'.format(len(embeddings_source)))
embeddings_target = model.encode(target_s, bsize=128, tokenize=False, verbose=True)
print('nb target_s encoded : {0}'.format(len(embeddings_target)))
np.save('embeddings_source.npy', embeddings_source)
np.save('embeddings_target.npy', embeddings_target)

if args.cosine:
    source_np = np.load('embeddings_source.npy')

from random import randint

import numpy as np
import torch

from models import InferSent

model_version = 1
MODEL_PATH = "/home/anuja/Desktop/BE project/Models/InferSent/infersent1.pkl"
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
infermodel = InferSent(params_model)
infermodel.load_state_dict(torch.load(MODEL_PATH))

use_cuda = False
infermodel = infermodel.cuda() if use_cuda else infermodel

W2V_PATH = '/home/anuja/Desktop/BE project/glove.6B/glove.840B.300d.txt'  # replace with glove.840B.300d.txt
infermodel.set_w2v_path(W2V_PATH)
infermodel.build_vocab_k_words(K=100000)

# In[114]:

df = pd.DataFrame(columns=['body', 'replier', 'thread_no', 'embeddings'])
folder = glob.glob(folder_path)
th_no = 0
obj = preprocessing.preprocess()
cnt = 0
count_file = 0
thread_list = []
try:
    for fol in tqdm_notebook(folder):
        files = glob.glob(fol + '/*.txt')

def main():
    init_output_dir(output_dir)

    # prepare dataset
    task = get_task(task_name, dataset_path)
    label_list = task.get_labels()
    label_map = {v: i for i, v in enumerate(label_list)}
    print("loading raw data ... ")
    train_examples = task.get_train_examples()
    val_examples = task.get_dev_examples()
    test_examples = task.get_test_examples()
    print("converting to data loader ... ")
    train_loader = get_dataloader(train_examples, label_map)
    val_loader = get_dataloader(val_examples, label_map)
    test_loader = get_dataloader(test_examples, label_map)

    # load model
    print("loading model ... ")
    model = InferSent(config)
    model.load_state_dict(torch.load(model_path))
    model = model.cuda() if config['use_cuda'] else model
    model.set_w2v_path(word_emb_path)
    print("building model vocabs ... ")
    model.build_vocab_k_words(K=100000, verbose=True)

    # run embedding for train set
    print("Run embedding for train set")
    for _ in trange(1, desc="Epoch"):
        run_encoding(loader=train_loader, model=model, mode='train')
    print("Run embedding for dev set")
    for _ in trange(1, desc="Epoch"):
        run_encoding(loader=val_loader, model=model, mode='dev')
    print("Run embedding for test set")
    for _ in trange(1, desc="Epoch"):
        run_encoding(loader=test_loader, model=model, mode='test')

    # HACK FOR MNLI mis-matched
    if task_name == 'mnli':
        print("Run Embedding for MNLI Mis-Matched Datasets")
        print("loading raw data ... ")
        mm_val_example = MnliMismatchedProcessor().get_dev_examples(dataset_path)
        mm_test_examples = MnliMismatchedProcessor().get_test_examples(dataset_path)
        print("converting to data loader ... ")
        mm_val_loader = get_dataloader(mm_val_example, label_map)
        mm_test_loader = get_dataloader(mm_test_examples, label_map)
        print("Run embedding for mm_dev set")
        for _ in trange(1, desc="Epoch"):
            run_encoding(loader=mm_val_loader, model=model, mode='mm_dev')
        print("Run embedding for mm_test set")
        for _ in trange(1, desc="Epoch"):
            run_encoding(loader=mm_test_loader, model=model, mode='mm_test')

def main():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    print_args(args)

    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)

    use_cuda = False if args.no_cuda else True
    verbose = args.verbose

    # model config
    config = {
        'word_emb_dim': args.word_emb_dim,
        'enc_lstm_dim': args.enc_lstm_dim,
        'n_enc_layers': args.n_enc_layers,
        'dpout_model': args.dpout_model,
        'dpout_fc': args.dpout_fc,
        'fc_dim': args.fc_dim,
        'bsize': args.batch_size,
        'n_classes': args.n_classes,
        'pool_type': args.pool_type,
        'nonlinear_fc': args.nonlinear_fc,
        'use_cuda': use_cuda,
        'version': args.model_version,
        'dropout_prob': args.dropout_prob,
    }

    # load model
    if verbose:
        print('loading model...')
    model = InferSent(config)
    model.load_state_dict(torch.load(args.model_path))
    model = model.cuda() if not args.no_cuda else model
    model.set_w2v_path(args.word_emb_path)
    model.build_vocab_k_words(K=args.k_freq_words, verbose=verbose)

    # load classifier
    classifier = SimpleClassifier(config)
    classifier = classifier.cuda() if not args.no_cuda else classifier

    # get train examples
    train_examples = task.get_train_examples()
    # calculate t_total
    t_total = initialization.get_opt_train_steps(len(train_examples), args)

    # build optimizer
    optimizer = optim.SGD(classifier.parameters(), lr=0.001, momentum=0.9)

    # create running parameters
    r_params = RunnerParameters(
        local_rank=args.local_rank,
        n_gpu=n_gpu,
        learning_rate=5e-5,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        t_total=t_total,
        warmup_proportion=args.warmup_proportion,
        num_train_epochs=args.num_train_epochs,
        train_batch_size=args.train_batch_size,
        eval_batch_size=args.eval_batch_size,
        verbose=verbose)

    # create runner class for training and evaluation tasks
    runner = GlueTaskClassifierRunner(
        encoder_model=model,
        classifier_model=classifier,
        optimizer=optimizer,
        label_list=task.get_labels(),
        device=device,
        rparams=r_params)

    if args.do_train:
        runner.run_train_classifier(train_examples)

    if args.do_val:
        val_examples = task.get_dev_examples()
        results = runner.run_val(val_examples, task_name=task.name, verbose=verbose)
        df = pd.DataFrame(results["logits"])
        df.to_csv(os.path.join(args.output_dir, "val_preds.csv"),
                  header=False, index=False)
        metrics_str = json.dumps(
            {"loss": results["loss"], "metrics": results["metrics"]}, indent=2)
        print(metrics_str)
        with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
            f.write(metrics_str)

        # HACK for MNLI-mismatched
        if task.name == "mnli":
            mm_val_example = MnliMismatchedProcessor().get_dev_examples(task.data_dir)
            mm_results = runner.run_val(mm_val_example, task_name=task.name,
                                        verbose=verbose)
            df = pd.DataFrame(mm_results["logits"])
            df.to_csv(os.path.join(args.output_dir, "mm_val_preds.csv"),
                      header=False, index=False)
            combined_metrics = {}
            for k, v in results["metrics"].items():
                combined_metrics[k] = v
            for k, v in mm_results["metrics"].items():
                combined_metrics["mm-" + k] = v
            combined_metrics_str = json.dumps(
                {"loss": results["loss"], "metrics": combined_metrics}, indent=2)
            print(combined_metrics_str)
            with open(os.path.join(args.output_dir, "val_metrics.json"), "w") as f:
                f.write(combined_metrics_str)

params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model_infersent = InferSent(params_model)
model_infersent.load_state_dict(torch.load(MODEL_PATH))
W2V_PATH = '/MAD/InferSent/dataset/crawl-300d-2M.vec'
model_infersent.set_w2v_path(W2V_PATH)

# Load embeddings of K most frequent words
model_infersent.build_vocab_k_words(K=1000000)
print("InferSent model loaded")

# input: src, tgt, tgt.translated (to src, being English).
src = open(args.path_src, "r").read().split("\n")
src = src[:-1]
tgt = open(args.path_tgt, "r").read().split("\n")
tgt = tgt[:-1]
Txt_target_2_cross = open(args.path_tgt_translated).read().split("\n")
Txt_target_2_cross = Txt_target_2_cross[:-1]
assert len(src) == len(tgt)
assert len(src) == len(Txt_target_2_cross)

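# A plausible continuation (a sketch, not the original code): encode the English
# source and the translated target, then score each aligned pair by cosine
# similarity. Assumes numpy is imported as np.
src_emb = model_infersent.encode(src, bsize=128, tokenize=True, verbose=True)
trans_emb = model_infersent.encode(Txt_target_2_cross, bsize=128, tokenize=True,
                                   verbose=True)
sims = [float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
        for a, b in zip(src_emb, trans_emb)]
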
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': model_version
}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

# Keep it on CPU or put it on GPU
use_cuda = False
infersent = infersent.cuda() if use_cuda else infersent

infersent.set_w2v_path(W2V_PATH)

# Load embeddings of K most frequent words
infersent.build_vocab_k_words(K=500000)
# or you can build your own vocabulary based on sentences in the data
# infersent.build_vocab(yoursentences, tokenize=True)

# 1- create sentence embeddings for all the sentences and questions using InferSent
# 2- calculate the distance between sentences & questions
#    based on Euclidean & cosine similarity using the sentence embeddings
embeddings_dic = {
    'Question': [],
    'Answer': [],
    'Question_Emb': [],
    'Answer_Emb': [],
    'Label': [],
    'Cosine_Dist': [],
    'Euclidean_Dist': [],
    'Predicted_label_Cos': [],
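# A sketch (not from the original snippet) of how the Cosine_Dist and Euclidean_Dist
# entries above could be filled for one question/answer pair, using scipy:
from scipy.spatial.distance import cosine, euclidean

q_emb = infersent.encode(['What is the capital of France?'])[0]
a_emb = infersent.encode(['Paris is the capital of France.'])[0]
embeddings_dic['Cosine_Dist'].append(cosine(q_emb, a_emb))  # 1 - cosine similarity
embeddings_dic['Euclidean_Dist'].append(euclidean(q_emb, a_emb))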