def test(config):
    """Evaluate a saved entailment-v6 Keras checkpoint on the test story set.

    Loads the sent2vec model named in the config, builds the test dataloader,
    and prints the model's metric names followed by the evaluation result.

    :param config: project config; must provide sent2vec.model, vocab_size,
        batch_size, n_epochs and debug.
    """
    import sent2vec
    import math
    assert config.sent2vec.model is not None, "Please add sent2vec_model config value."
    sent2vec_model = sent2vec.Sent2vecModel()
    sent2vec_model.load_model(config.sent2vec.model)
    output_fn_test = OutputFnTest(sent2vec_model, config)
    test_set = Dataloader(config, 'data/test_stories.csv', testing_data=True)
    test_set.load_dataset('data/test.bin')
    test_set.load_vocab('./data/default.voc', config.vocab_size)
    test_set.set_output_fn(output_fn_test)
    generator_testing = test_set.get_batch(config.batch_size, config.n_epochs,
                                           random=True)
    keras_model = keras.models.load_model(
        './builds/leonhard/2018-06-08 12:04:03-entailmentv6_checkpoint_epoch-85.hdf5'
    )
    verbose = 0 if not config.debug else 1
    print(keras_model.metrics_names)
    # BUG FIX: evaluate_generator expects an integer step count; the original
    # passed the float len(test_set) / batch_size, which newer Keras rejects.
    steps = int(math.ceil(len(test_set) / config.batch_size))
    loss = keras_model.evaluate_generator(generator_testing,
                                          steps=steps,
                                          verbose=verbose)
    print(loss)
def __init__(self, embedding_path=None):
    """Initialise EmbedRank with a sent2vec embedding model.

    The loaded model and its path are cached on the class
    (EmbedRank._embedding_model / EmbedRank._embedding_path) so that creating
    several instances with the same path loads the heavy model only once.

    :param embedding_path: path to a sent2vec .bin model; when None,
        'wiki_bigrams.bin' inside self._models is used.
    """
    try:
        import sent2vec  # See https://github.com/epfml/sent2vec
    except ImportError:
        logging.warning('Module sent2vec was not found.')
        logging.warning('Please install using `python -m pip install cython;'
                        'python -m pip install git+https://github.com/epfml/sent2vec` '
                        'to use EmbedRank')
        # NOTE(review): bailing out here skips super().__init__() and leaves
        # the instance without an embedding model — confirm this is intended.
        return
    super(EmbedRank, self).__init__()
    # Resolve the embedding path (default: bundled wiki_bigrams.bin).
    if embedding_path is None:
        model_name = 'wiki_bigrams.bin'
        self._embedding_path = os.path.join(self._models, model_name)
    else:
        self._embedding_path = embedding_path
    # Missing model file: report how to obtain it (execution still continues
    # and load_model below will fail — presumably intended as a loud warning).
    if not os.path.exists(self._embedding_path):
        logging.error('Could not find {}'.format(self._embedding_path))
        logging.error('Please download "sent2vec_wiki_bigrams" model from '
                      'https://github.com/epfml/sent2vec#downloading-sent2vec-pre-trained-models.')
        logging.error('And place it in {}.'.format(self._models))
        logging.error('Or provide an embedding path.')
    # Only (re)load the model when no cached model exists or the path changed.
    if EmbedRank._embedding_path is None or EmbedRank._embedding_path != self._embedding_path:
        logging.info('Loading sent2vec model')
        EmbedRank._embedding_model = sent2vec.Sent2vecModel()
        EmbedRank._embedding_model.load_model(self._embedding_path)
    self._embedding_model = EmbedRank._embedding_model
    EmbedRank._embedding_path = self._embedding_path
    logging.info('Done loading sent2vec model')
    # Initialize _pos here, if another selection function is used.
    self._pos = {'NOUN', 'PROPN', 'ADJ'}
def _buildBioSentVecEmbedding(self):
    """Build (or load from the pickle cache) per-document BioSentVec vectors.

    Each abstract's sentences are preprocessed, embedded with sent2vec and
    summed into one document vector of size NUM_SENT_VEC_FEATURES, keyed by
    pmid. The result is cached as <sent_vec_embedding_filename>.pkl.

    :returns: dict mapping pmid -> np.ndarray of shape (NUM_SENT_VEC_FEATURES,)
    """
    pkl_path = self.sent_vec_embedding_filename + ".pkl"
    if os.path.exists(pkl_path):
        # BUG FIX: the original opened the pickle file without ever closing
        # it; a context manager releases the handle deterministically.
        with open(pkl_path, "rb") as pickle_in:
            sent_vec_embedding = pickle.load(pickle_in)
        self.sent_vec_embedding = sent_vec_embedding
        return sent_vec_embedding

    model_path = "BioSentVec_PubMed_MIMICIII-bigram_d700.bin"
    model = sent2vec.Sent2vecModel()
    model.load_model(model_path)
    abstracts_dict = self.data.getAbstractsDict()

    sent_vec_embedding = {}
    for pmid in self.pmids:
        text = abstracts_dict[pmid]
        document_vector = np.zeros((NUM_SENT_VEC_FEATURES, ))
        # Sum sentence embeddings into a single document vector.
        for sentence in nltk.sent_tokenize(text):
            processed_sentence = self.__sentVecPreprocessSentence(sentence)
            document_vector += model.embed_sentence(
                processed_sentence).reshape((NUM_SENT_VEC_FEATURES, ))
        sent_vec_embedding[pmid] = document_vector

    # Saving the embedding dictionary in a pickle file
    with open(pkl_path, "wb") as f:
        pickle.dump(sent_vec_embedding, f)
    self.sent_vec_embedding = sent_vec_embedding
    return sent_vec_embedding
def getCategory(headline):
    """Return the category whose sent2vec embedding is closest to `headline`.

    Categories are read from sent2vec/categories.txt (one per line); each is
    repeated three times before embedding, and cosine distance selects the
    nearest one.

    :param headline: headline text (lower-cased internally).
    :returns: the closest category string.
    :raises ValueError: if the category file yields no categories.
    """
    model = sent2vec.Sent2vecModel()
    model.load_model('sent2vec/wiki_unigrams.bin')
    headline = headline.lower()

    categoryList = []
    with open('sent2vec/categories.txt', 'r') as categories:
        for line in categories:
            if not line[0] == '\n':
                categoryList.append(line.split("\n")[0])

    # embed headline
    emb = model.embed_sentence(headline)

    minDistCategory = None
    minDist = float('inf')  # was an arbitrary 1000000 sentinel
    # embed categories and compare
    for category in categoryList:
        times3category = (category + " ") * 3
        categoryEmb = model.embed_sentence(times3category)
        dist = distance.cosine(categoryEmb, emb)
        if dist < minDist:
            minDist = dist
            minDistCategory = category

    if minDistCategory is None:
        # BUG FIX: the original crashed concatenating None into the print
        # below when the category file was empty.
        raise ValueError("No categories found in sent2vec/categories.txt")
    print("Category of headline: " + headline + " : " + minDistCategory)
    return minDistCategory
def __init__(self, trainedModel='torontobooks_unigrams.bin'):
    """Load a pre-trained sent2vec model and keep it in memory.

    :param trainedModel: path to the sent2vec .bin model file.
    """
    # BUG FIX: the original used Python 2 `print` statements, which are
    # syntax errors under Python 3 (the dialect used elsewhere in this code).
    print('Creating empty sent2vec model...')
    self.model = sent2vec.Sent2vecModel()  # keeps the trained model in memory
    print('Loading trained model. This might take a while...')
    self.model.load_model(trainedModel)
    print('Model %s loaded' % trainedModel)
def test(config, testing_set):
    """Evaluate a saved entailment-v2 Keras checkpoint on `testing_set`.

    :param config: project config; must provide sent2vec.model, batch_size,
        n_epochs and debug.
    :param testing_set: dataloader exposing set_preprocess_fn/set_output_fn,
        get_batch and __len__.
    """
    import sent2vec
    import math
    assert config.sent2vec.model is not None, "Please add sent2vec_model config value."
    sent2vec_model = sent2vec.Sent2vecModel()
    sent2vec_model.load_model(config.sent2vec.model)

    preprocess_fn = PreprocessTest(sent2vec_model)
    testing_set.set_preprocess_fn(preprocess_fn)
    output_fn_test = OutputFnTest(sent2vec_model, config)
    testing_set.set_output_fn(output_fn_test)

    generator_testing = testing_set.get_batch(config.batch_size,
                                              config.n_epochs, random=True)
    keras_model = keras.models.load_model(
        './builds/leonhard/2018-05-19 22:33:08-entailmentv2_checkpoint_epoch-1810.hdf5'
    )
    verbose = 0 if not config.debug else 1
    # BUG FIX: evaluate_generator expects an integer step count; the original
    # passed the float len(testing_set) / batch_size, which newer Keras rejects.
    steps = int(math.ceil(len(testing_set) / config.batch_size))
    loss = keras_model.evaluate_generator(generator_testing,
                                          steps=steps,
                                          verbose=verbose)
    print(loss)
def embeding(modelfile, data, matrix_file):
    """Embed every sentence in `data` with sent2vec, caching to `matrix_file`.

    If `matrix_file` exists, the cached matrix is returned without loading
    the model.

    :param modelfile: path to a sent2vec .bin model.
    :param data: iterable of sentences (non-str entries are skipped).
    :param matrix_file: .npy cache path for the embedding matrix.
    :returns: np.ndarray of sentence embeddings.
    """
    if os.path.exists(matrix_file):
        return np.load(matrix_file)
    print("can not find pre-processed embedded file, start to embeding...")
    model = sent2vec.Sent2vecModel()
    model.load_model(modelfile)

    embs = []
    for sentence in data:
        if isinstance(sentence, str):
            print(sentence)
            embs.append(model.embed_sentence(sentence))
        else:
            # BUG FIX: the original unconditionally appended `emb`, re-using
            # the previous iteration's embedding for non-str entries (and
            # raising NameError if the very first entry was not a str).
            print("not str:", sentence)

    embs = np.squeeze(np.array(embs))
    np.save(matrix_file, embs)
    print("save to file: ", matrix_file)
    return embs
def __init__(self, train_file, eval_file, output_dir, model_path,
             force_create_emp, run_type, epochs):
    """Set up the BioASQ trainer: paths, hyperparameters and BioSentVec model.

    :param train_file: path to the training data file.
    :param eval_file: path to the evaluation data file.
    :param output_dir: directory for outputs.
    :param model_path: path to the downstream model.
    :param force_create_emp: whether to force re-creating embeddings
        (stored as self.force_create_emb).
    :param run_type: NOTE(review): accepted but never stored — confirm.
    :param epochs: number of training epochs.
    """
    self.train_file = train_file
    self.eval_file = eval_file
    self.output_dir = output_dir
    self.model_path = model_path
    self.force_create_emb = force_create_emp
    self.biosentvec_model_path = "/mnt/nfs/work1/696ds-s20/kgunasekaran/sentvec/BioSentVec_PubMed_MIMICIII-bigram_d700.bin"
    self.biosentvec_model = sent2vec.Sent2vecModel()
    # hyparameters to be used and played around with
    self.batch_size = 12
    self.learning_rate = 5e-4
    self.input_size = 1400
    self.hidden_size = 400
    self.output_size = 2
    self.unbalanced = 500
    self.epochs = epochs
    #end hyperparameters
    try:
        print("loading biosentvec model..")
        self.biosentvec_model.load_model(self.biosentvec_model_path)
    except Exception as e:
        print("EXCEPTION:", e)
    else:
        # BUG FIX: the original printed this even when load_model raised;
        # only announce success on the no-exception path.
        print('model successfully loaded')
    self.stop_words = set(stopwords.words('english'))
    self.bio_asq_data_train = []
    self.bio_asq_data_eval = []
    self.qid_pred = {}
    self.qid_pred_prob = {}
    self.qid_target = {}
def __init__(self, env, config):
    """Embedding module backed by sent2vec, with trainable projection heads.

    :param env: the environment object this module operates in.
    :param config: dict with 'embeddings' (sent2vec model path),
        'train_symbol' / 'train_predicate' / 'train_entity' flags, and
        'pred_dim' / 'ent_dim' projection sizes.
    """
    super().__init__()
    self.env = env
    self.config = config

    # Vocabularies and embedding tables are populated later.
    self.entity_vocab = None
    self.symbol_vocab = None
    self.predicate_vocab = None
    self.symbol_embedding = None
    self.predicate_embedding = None
    self.entity_embedding = None

    # Sentence-embedding backend.
    self.sent2vec = sent2vec.Sent2vecModel()
    self.sent2vec.load_model(config['embeddings'])

    # Which embedding groups are trainable.
    self.train_symbol = config['train_symbol']
    self.train_predicate = config['train_predicate']
    self.train_entity = config['train_entity']

    # Projection heads into the shared spaces.
    self.proj_sym = Projection(config['pred_dim'])
    self.proj_ent = Projection(config['ent_dim'])
    self.proj_pred = Projection(config['pred_dim'])

    self.lambda_ = 1.0
    self.init_embeddings()
def __init__(self, model_name, is_static=False):
    """Wrap either a static sent2vec model or a SentenceTransformer.

    :param model_name: path to a sent2vec .bin file when `is_static`,
        otherwise a SentenceTransformer model name.
    :param is_static: select the sent2vec backend instead of the
        transformer one.
    """
    self.is_static = is_static
    if not is_static:
        self.model = SentenceTransformer(model_name)
    else:
        backend = sent2vec.Sent2vecModel()
        backend.load_model(model_name)
        self.model = backend
def sentence_embedding(doc, sent_model=None):
    """Tokenise `doc`, lower-case and strip punctuation tokens.

    :param doc: input text.
    :param sent_model: sent2vec model; a fresh Sent2vecModel is created when
        omitted.  BUG FIX: the original used `sent_model=sent2vec.Sent2vecModel()`
        as the default — a mutable default evaluated once at import time and
        shared across all calls.
    """
    if sent_model is None:
        sent_model = sent2vec.Sent2vecModel()
    sentences = nltk.word_tokenize(doc)
    #re.sub("'t", 'ot', "n't, doesn't, can't, don't")
    res_sent = " ".join(item.lower() for item in sentences
                        if item not in string.punctuation)
    # NOTE(review): `res_sent` is built but never returned and `sent_model`
    # is never used — the function looks unfinished; confirm before relying
    # on its (None) return value.
def __init__(self, dataset_dir, word2vec_dict, word2vec_dim):
    """Build the dataset from STEP-formatted text files.

    For every file in `dataset_dir`, the STEP sentences are collected in
    order as a positive sample (label [1.0]) and in reversed order as a
    negative sample (label [0.0]).

    :param dataset_dir: directory containing the step text files.
    :param word2vec_dict: NOTE(review): accepted but never read here.
    :param word2vec_dim: NOTE(review): accepted but never read here.
    """
    self.spacy_en = spacy.load('en')
    self.X = []
    self.Y = []
    self.max_sentences_in_text = 0
    self.model = sent2vec.Sent2vecModel()
    self.model.load_model(
        '/home/mauricio/repo/datasets/word_vectors/enwiki_sent2vec_100.bin'
    )
    self.files_list = os.listdir(os.path.abspath(dataset_dir))
    for idx, file in enumerate(self.files_list):
        print("{} Processing file {}".format(idx, file))
        sample = []  # NOTE(review): never used — confirm it can be removed.
        with open(os.path.join(os.path.abspath(dataset_dir), file), 'r') as f:
            lines = f.read().split("\n")
            # Keep the text after the step number on lines starting with STEP.
            text = [
                l.split(".")[1].rstrip().lstrip().lower() for l in lines
                if re.match('^STEP.*', l)
            ]
            sentences_count = len(text)
            if sentences_count > self.max_sentences_in_text:
                self.max_sentences_in_text = sentences_count
            # Ordered steps -> positive sample, reversed steps -> negative.
            self.X.append(text)
            self.Y.append([1.0])
            self.X.append(text[::-1])
            self.Y.append([0.0])
    print("Maximum sentences in text: {}".format(
        self.max_sentences_in_text))
def __init__(self, modelPath):
    """Load the Sent2Vec encoder from disk.

    Arguments:
        modelPath {str} -- the path to model
    """
    encoder = sent2vec.Sent2vecModel()
    encoder.load_model(modelPath)
    self.encoder = encoder
def getSentenceVector(doc,
                      model_params: dict = {},
                      encoder="distilbert",
                      model_name='distilbert-base-nli-mean-tokens'):
    """Split `doc` into sentences with spaCy and embed each with `encoder`.

    Supported encoders: transformer families via sentence-transformers
    ('bert', 'xlnet', 'longformer', 'reformer', 'distilbert', 'roberta',
    'bart'), plus 'use', 'infersent', 'sent2vec' and 'laser'.

    NOTE(review): `model_params={}` is a mutable default argument — it is
    only read here, but confirm it is never mutated by callers.

    :param doc: input document text.
    :param model_params: optional dict; only 'tokenizer_args' is used, and
        only for the transformer encoders.
    :param encoder: which embedding backend to use.
    :param model_name: transformer model name (transformer encoders only).
    :returns: list of (sentence, embedding) pairs.
    :raises ValueError: for an unknown encoder.
    """
    sp = spacy.load('en_core_web_sm')
    tokenized = sp(doc)
    sentences = []
    for token in tokenized.sents:
        sentences.append(token.text)

    if encoder in ['bert', 'xlnet', 'longformer', 'reformer', 'distilbert',
                   'roberta', 'bart']:
        # Use encoder for mapping tokens to embeddings
        word_embedding_model = models.Transformer(
            model_name,
            tokenizer_args=model_params['tokenizer_args']
            if 'tokenizer_args' in model_params else {})
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
        sentence_embeddings = model.encode(sentences)
    elif encoder == 'use':
        #!pip install embedding-as-service
        from embedding_as_service.text.encode import Encoder
        en = Encoder(embedding='use', model='use_dan', max_seq_length=256)
        sentence_embeddings = en.encode(texts=sentences)
    elif encoder == 'infersent':
        import nltk
        nltk.download('punkt')
        from models import InferSent
        params_model = {'bsize': 64, 'word_emb_dim': 300,
                        'enc_lstm_dim': 2048, 'pool_type': 'max',
                        'dpout_model': 0.0, 'version': 2}
        infersent = InferSent(params_model)
        W2V_PATH = 'drive/My Drive/wiki-news-300d-1M.vec'
        infersent.set_w2v_path(W2V_PATH)
        infersent.build_vocab(sentences, tokenize=True)
        sentence_embeddings = infersent.encode(sentences, tokenize=True)
    elif encoder == 'sent2vec':
        import sent2vec
        model = sent2vec.Sent2vecModel()
        model.load_model('drive/My Drive/torontobooks_unigram.bin')
        sentence_embeddings = model.embed_sentences(sentences)
    elif encoder == 'laser':
        from laserembeddings import Laser
        laser = Laser()
        ## Also used for multilingual sentence embeddings
        sentence_embeddings = laser.embed_sentences(sentences, lang='en')
    else:
        # NOTE(review): the extracted source had this message split across a
        # raw line break inside the quotes; reconstructed onto one line.
        raise ValueError('Invalid encoder {} or encoder Unavailable.'.format(encoder))
    return list(zip(sentences, sentence_embeddings))
def compute_sentence_embeddings_representation(data):
    """Attach a sent2vec 'sentence_embeddings' vector to every sample.

    :param data: list of dicts, each with a 'tokens' list; modified in place.
    :returns: the same `data` list with embeddings attached.
    """
    import sent2vec
    model = sent2vec.Sent2vecModel()
    model.load_model('../DocAgg/lib/sent2vec/wiki_bigrams.bin')
    # One joined sentence per sample, embedded in a single batched call.
    joined = [' '.join(sample['tokens']) for sample in data]
    for sample, vector in zip(data, model.embed_sentences(joined)):
        sample['sentence_embeddings'] = vector
    return data
def load_word_vec_model(self):
    """Load the sent2vec word-vector model configured in self.config.

    Exits the process when no model path is configured; otherwise stores the
    loaded model in self.model.
    """
    # Idiom fix: compare to None with `is`, not `==` (PEP 8).
    if self.config.word_vector_model is None:
        print("No word vector model provided! Aborting ...")
        exit()
    print("Loading word vector model ...")
    self.model = sent2vec.Sent2vecModel()
    self.model.load_model(self.config.word_vector_model)
    print("Word vector loaded!")
def prepare(self, unique_texts):
    """Embed all unique texts with sent2vec and cache the vectors.

    The sent2vec model is loaded lazily on first use from self.model_path.

    :param unique_texts: iterable of texts to preprocess and embed.
    """
    if self.model is None:
        import sent2vec
        loaded = sent2vec.Sent2vecModel()
        loaded.load_model(self.model_path)
        self.model = loaded
    preprocessed = self.preprocess_all(unique_texts)
    self.set_sen2vec(unique_texts, self.model.embed_sentences(preprocessed))
def __init__(self, hparams):
    """Set up STS evaluation: load data, text preparation, sent2vec model.

    :param hparams: hyperparameters; must provide `model` (sent2vec .bin
        path) and `dim_subspace` (subspace projection size, 0 disables it).
    """
    self.hparams = hparams
    self.load_sts_data()
    self.define_prepare_text()
    self.model = sent2vec.Sent2vecModel()
    self.model.load_model(hparams.model)
    # Optionally project embeddings onto a low-dimensional subspace.
    if hparams.dim_subspace > 0:
        self.get_projection()
def main(config):
    """Train the entailment-v5 model on SNLI pairs, validating on the
    story-cloze test set.

    Loads the configured sent2vec model, wires the SNLI training dataloader
    and story test dataloader, then runs fit_generator with TensorBoard and
    best-only checkpoint callbacks.

    :param config: project config; must provide sent2vec.model, vocab_size,
        batch_size, n_epochs and debug.
    """
    import sent2vec
    assert config.sent2vec.model is not None, "Please add sent2vec_model config value."
    sent2vec_model = sent2vec.Sent2vecModel()
    sent2vec_model.load_model(config.sent2vec.model)
    preprocess_fn = Preprocess(sent2vec_model)
    output_fn_test = OutputFnTest(sent2vec_model, config)
    train_set = SNLIDataloaderPairs('data/snli_1.0/snli_1.0_train.jsonl')
    train_set.set_preprocess_fn(preprocess_fn)
    train_set.set_output_fn(output_fn)
    test_set = Dataloader(config, 'data/test_stories.csv', testing_data=True)
    test_set.load_dataset('data/test.bin')
    test_set.load_vocab('./data/default.voc', config.vocab_size)
    test_set.set_output_fn(output_fn_test)
    # dev_set = SNLIDataloader('data/snli_1.0/snli_1.0_dev.jsonl')
    # dev_set.set_preprocess_fn(preprocess_fn)
    # dev_set.set_output_fn(output_fn)
    # test_set = SNLIDataloader('data/snli_1.0/snli_1.0_test.jsonl')
    generator_training = train_set.get_batch(config.batch_size, config.n_epochs)
    generator_dev = test_set.get_batch(config.batch_size, config.n_epochs)
    keras_model = model(config)
    verbose = 0 if not config.debug else 1
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Callbacks
    tensorboard = keras.callbacks.TensorBoard(log_dir='./logs/' + timestamp +
                                              '-entailmentv5/',
                                              histogram_freq=0,
                                              batch_size=config.batch_size,
                                              write_graph=False,
                                              write_grads=True)
    model_path = os.path.abspath(
        os.path.join(os.curdir, './builds/' + timestamp))
    model_path += '-entailmentv5_checkpoint_epoch-{epoch:02d}.hdf5'
    # Save only when validation loss improves.
    saver = keras.callbacks.ModelCheckpoint(model_path,
                                            monitor='val_loss',
                                            verbose=verbose,
                                            save_best_only=True)
    # NOTE(review): validation_steps is a float here (len/batch_size);
    # newer Keras versions require an int — confirm against the Keras
    # version in use.
    keras_model.fit_generator(generator_training,
                              steps_per_epoch=300,
                              epochs=config.n_epochs,
                              verbose=verbose,
                              validation_data=generator_dev,
                              validation_steps=len(test_set) / config.batch_size,
                              callbacks=[tensorboard, saver])
def senc_vector_model():
    """Embed three module-level sentences and print cosine distances of
    sentence_1 vs sentence_2 and sentence_1 vs sentence_3."""
    model = sent2vec.Sent2vecModel()
    model.load_model(model_path)
    first, second, third = (model.embed_sentence(s[0])
                            for s in (sentence_1, sentence_2, sentence_3))
    cosine_dis(first, second)
    cosine_dis(first, third)
def __init__(self, model='sent2vec', pre_trained_model_path=''):
    """Optionally load a pre-trained sent2vec model.

    :param model: backend selector; only 'sent2vec' is handled.
    :param pre_trained_model_path: path to the sent2vec .bin model.
    """
    self.s2v_model = None
    if model == 'sent2vec':
        try:
            self.s2v_model = sent2vec.Sent2vecModel()
            self.s2v_model.load_model(pre_trained_model_path)
        except Exception as e:
            print(e)
        else:
            # BUG FIX: the original printed the success message even when
            # load_model raised; announce success only when it did not.
            print('pretrained model {} successfully loaded'.format(
                pre_trained_model_path))
def encode(all_sections, model_path=None, chunk=0, chunk_size=2500, model=None):
    """Embed the sentences of one chunk of `all_sections` with sent2vec.

    The section keys are sorted, sliced into chunks of `chunk_size`, and the
    sentences of chunk number `chunk` are preprocessed (lower-cased, spaced
    punctuation, stop words removed) and embedded one by one.

    :param all_sections: mapping of section key -> section content.
    :param model_path: sent2vec .bin path, used only when `model` is None.
    :param chunk: which chunk of sorted keys to process.
    :param chunk_size: number of keys per chunk.
    :param model: an already-loaded Sent2vecModel, to skip loading.
    :returns: (chunk_vecs, chunk_meta) — one vector matrix per key, plus the
        flat list of (doc_id, sec_id, sentence_id, sentence) tuples.
    """
    logging.info('loading model...')
    if model is None:
        model = sent2vec.Sent2vecModel()
        try:
            model.load_model(model_path)
        except Exception as e:
            print(e)
        logging.info('model successfully loaded')
    stop_words = set(stopwords.words('english'))

    chunk_meta = []
    chunk_vecs = []
    # BUG FIX: the original called sorted(sorted_keys) and discarded the
    # result, so the keys were never actually sorted; sort in place instead.
    sorted_keys = list(all_sections.keys())
    sorted_keys.sort()
    chunk_keys = sorted_keys[(chunk * chunk_size):((chunk + 1) * chunk_size)]
    logging.info('Running on keys %s...', str(chunk_keys[0:5]))

    def preprocess_sentence(text):
        # Space out separators so tokenisation splits them, then drop
        # punctuation and stop-word tokens.
        text = text.replace('/', ' / ')
        text = text.replace('.-', ' .- ')
        text = text.replace('.', ' . ')
        text = text.replace('\'', ' \' ')
        text = text.lower()
        tokens = [token for token in word_tokenize(text)
                  if token not in punctuation and token not in stop_words]
        return ' '.join(tokens)

    for k_idx, k in enumerate(chunk_keys):
        s_doc = time.time()
        logging.info('key %s (%s of %s) ', k, k_idx, len(chunk_keys))
        sentences = load_sents(all_sections, k)
        dim = model.get_emb_size()
        vectors = np.zeros((len(sentences), dim))
        gt = time.time
        t = gt()
        counter = 0
        for doc_id, sec_id, sentence_id, s in sentences:
            vectors[counter] = model.embed_sentence(preprocess_sentence(s))
            logging.log_every_n(logging.INFO,
                                'Processed %s sentences | %s seconds', 10,
                                sentence_id, str(gt() - t))
            counter += 1
        e_t = gt()
        logging.info('Done! Processed %s Sentences | %s seconds',
                     len(sentences), str(e_t - t))
        chunk_meta.extend(sentences)
        chunk_vecs.append(vectors)
        e_doc = time.time()
        logging.info('key %s (%s of %s)... %s seconds ', k, k_idx,
                     len(chunk_keys), e_doc - s_doc)
    return chunk_vecs, chunk_meta
def _sentvec_transform(sample, **kwargs): X, y = sample import sent2vec sentvec_model = kwargs[ 'sentvec_model'] if 'sentvec_model' in kwargs and kwargs[ 'sentvec_model'] else sent2vec.Sent2vecModel() X = sentvec_model.embed_sentences([' '.join(x) for x in X]) if type( X[0]) is not str else sentvec_model.embed_sentences([' '.join(X)])[0] mask = [1] * len(X) if type(X[0]) is not str else [[1] * len(X[x]) for x in range(len(X))] return [X, mask], y
def loadBioSent2VecModel(model_path):
    """Load bioSent2VecModel, which is ~ 20 GB.

    Input: model_path
    Returns: model object (load errors are printed, not raised).
    """
    model = sent2vec.Sent2vecModel()
    try:
        model.load_model(model_path)
        print('Model successfully loaded!')
    except Exception as err:
        print(err)
    return model
def __init__(self, cfg):
    """BioSentVec wrapper: ensure the checkpoint exists (downloading it when
    allowed), then load it into a sent2vec model.

    :param cfg: config with model.checkpoint (path) and
        model.checkpoint_download (bool).
    :raises FileNotFoundError: checkpoint missing and download disabled.
    """
    super(BioSentVec, self).__init__()
    self.cfg = cfg
    checkpoint = cfg.model.checkpoint
    if not os.path.exists(checkpoint):
        if not cfg.model.checkpoint_download:
            raise FileNotFoundError(checkpoint)
        # Fetch the published BioSentVec checkpoint.
        download_ressource(
            checkpoint,
            'https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/BioSentVec/BioSentVec_PubMed_MIMICIII-bigram_d700.bin'
        )
    self.model = sent2vec.Sent2vecModel()
    self.model.load_model(checkpoint)
def main(config, training_set, testing_set):
    """Train the scheduler model, validating on `testing_set`.

    Loads the configured sent2vec model, wires preprocessing/output functions
    into both dataloaders, then runs fit_generator with TensorBoard and
    best-only checkpoint callbacks.

    :param config: project config; must provide sent2vec.model, batch_size,
        n_epochs and debug.
    :param training_set: dataloader for training batches.
    :param testing_set: dataloader used as validation data.
    """
    import sent2vec
    assert config.sent2vec.model is not None, "Please add sent2vec_model config value."
    sent2vec_model = sent2vec.Sent2vecModel()
    sent2vec_model.load_model(config.sent2vec.model)
    preprocess_fn = Preprocess(sent2vec_model)
    training_set.set_preprocess_fn(preprocess_fn)
    testing_set.set_preprocess_fn(preprocess_fn)
    training_set.set_output_fn(output_fn_train)
    testing_set.set_output_fn(output_fn_test)
    generator_training = training_set.get_batch(config.batch_size,
                                                config.n_epochs, random=True)
    generator_testing = testing_set.get_batch(config.batch_size,
                                              config.n_epochs, random=True)
    cloze_model = keras_model(config)
    verbose = 0 if not config.debug else 1
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Callbacks
    tensorboard = keras.callbacks.TensorBoard(log_dir='./logs/scheduler-' +
                                              timestamp + '/',
                                              histogram_freq=0,
                                              batch_size=config.batch_size,
                                              write_graph=False,
                                              write_grads=True)
    model_path = os.path.abspath(
        os.path.join(os.curdir, './builds/' + timestamp))
    model_path += '-scheduler_checkpoint_epoch-{epoch:02d}.hdf5'
    # Save only when validation loss improves.
    saver = keras.callbacks.ModelCheckpoint(model_path,
                                            monitor='val_loss',
                                            verbose=verbose,
                                            save_best_only=True)
    # NOTE(review): steps_per_epoch / validation_steps are floats here
    # (len/batch_size); newer Keras versions require ints — confirm against
    # the Keras version in use.
    cloze_model.fit_generator(
        generator_training,
        steps_per_epoch=len(training_set) / config.batch_size,
        epochs=config.n_epochs,
        verbose=verbose,
        validation_data=generator_testing,
        validation_steps=len(testing_set) / config.batch_size,
        callbacks=[tensorboard, saver])
def __init__(self, file_name, sim_thresh, sim_type):
    """Set up the similarity runner with a BioSentVec model.

    :param file_name: input data file.
    :param sim_thresh: NOTE(review): accepted but never stored — confirm.
    :param sim_type: similarity type selector.
    """
    self.file_name = file_name
    self.sim_type = sim_type
    #self.model_path = "/mnt/nfs/work1/696ds-s20/kgunasekaran/sentvec/BioSentVec_PubMed_MIMICIII-bigram_d700.bin"
    self.model = sent2vec.Sent2vecModel()
    self.batch_size = 12
    # NOTE(review): self.model_path is never assigned (the assignment above
    # is commented out), so load_model raises AttributeError, which is
    # swallowed below — confirm the intended model path.
    try:
        self.model.load_model(self.model_path)
    except Exception as e:
        print("EXCEPTION:", e)
    else:
        # BUG FIX: the original printed this even when load_model raised;
        # only announce success on the no-exception path.
        print('model successfully loaded')
    self.stop_words = set(stopwords.words('english'))
    self.bio_asq_data = []
def sent_embedding(text):
    """Sentence-tokenise `text` and embed it with a language-specific
    sent2vec model (English or Vietnamese).

    :param text: input text; language is auto-detected.
    :returns: (embeddings, sentences) on success, or 0 for an unsupported
        language.
    """
    lang = detect(text)
    sentences = sent_tokenize(preprocessText(text))
    model = sent2vec.Sent2vecModel()
    _os_path = "/home/thangnd/git/python/NLP_20182/text-summarizer-demo/web/models/"
    # One model file per supported language.
    model_files = {'en': "wiki_unigrams.bin", 'vi': "my_model.bin"}
    if lang not in model_files:
        return 0
    model.load_model(_os_path + model_files[lang])
    return model.embed_sentences(sentences), sentences
def create_chitchat_bot(self):
    """Initializes self.chitchat_bot with some conversational model.

    Loads the sent2vec conversation model from self.conv_model and
    returns it.
    """
    # Hint: you might want to create and train chatterbot.ChatBot here.
    # It could be done by creating ChatBot with the *trainer* parameter equals
    # "chatterbot.trainers.ChatterBotCorpusTrainer"
    # and then calling *train* function with "chatterbot.corpus.english" param
    ################# RHT_Conv_Bot #####################
    conv_model = sent2vec.Sent2vecModel()
    conv_model.load_model(self.conv_model)
    print("chitchat_bot created.")
    return conv_model
def fit(self, *_):
    """Load the configured pretrained sent2vec model into self.model.

    If the sent2vec package is missing it is installed via the wellcomeml
    downloader and imported again.  Positional arguments are ignored.

    :raises NotImplementedError: when no pretrained model is configured,
        since custom training is not implemented.
    :returns: self.
    """
    try:
        import sent2vec
    except ImportError:
        from wellcomeml.__main__ import download
        download("non_pypi_packages")
        import sent2vec
    if not self.pretrained:
        # Custom training not yet implemented
        raise NotImplementedError(
            "Fit only implemented for loading pretrained models")
    model_path = check_cache_and_download(self.pretrained)
    self.model = sent2vec.Sent2vecModel()
    self.model.load_model(model_path)
    return self