def train(sentences):
    print("starting to train!")
    # Scale the minimum word count according to which corpora are included.
    if args.train_pairs and args.relevant_selects:
        min_count = args.min_count * 5
    elif args.train_pairs:
        min_count = args.min_count * 10
    else:
        min_count = args.min_count
    # Train the requested model type.
    if "word2vec" in args.gensim_model_name:
        model = Word2Vec(sentences, size=args.embedding_size, window=20,
                         sg=args.skipgram, workers=16, min_count=min_count)
    elif "fast" in args.gensim_model_name:
        model = FastText(sentences, size=args.embedding_size, window=20,
                         sg=args.skipgram, workers=16, min_count=min_count)
    # Summarize the trained model.
    print(model)
    # Trim unneeded model state to use (much) less RAM, then save.
    model.init_sims(replace=True)
    model.save(args.data_dir + args.model_name)
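# A minimal usage sketch for train(), assuming the module-level `args`
# namespace it reads; all values below are illustrative, not from the
# original script.
import argparse
from gensim.models import Word2Vec, FastText

args = argparse.Namespace(
    gensim_model_name="fasttext", embedding_size=100, skipgram=1,
    min_count=1, train_pairs=False, relevant_selects=False,
    data_dir="models/", model_name="ft.model")
sentences = [["hello", "world"], ["fasttext", "handles", "subwords"]]
train(sentences)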
def test_fasttext_similar_ir():
    model = FastText([DEFAULT_ANALYZER(doc) for doc in DOCUMENTS], min_count=1)
    model.save('model_ft')
    # Reload with FastText.load (Word2Vec.load cannot restore a FastText
    # model), then trim to the memory-efficient form.
    model = FastText.load('model_ft')
    model.init_sims(replace=True)
    match_op = Matching()
    wcr = Word2VecRetrieval(model.wv, analyzer=DEFAULT_ANALYZER)
    retrieval = Retrieval(wcr, matching=match_op)
    # retrieval = Retrieval(wcr, matching=match_op,
    #                       labels=['1번', '2번', '3번', '4번', '5번', '6번', '7번', '8번'])
    retrieval.fit(DOCUMENTS)
    start = time.time()  # record the start time
    result, score = retrieval.query("안냥")
    print("time :", time.time() - start)  # elapsed time = now - start
    print(result)
    print(score)
def train_fasttext(tokens):
    ft_model = FastText(min_count=10, window=5, size=150, negative=10,
                        alpha=0.03, min_alpha=0.0007, sample=6e-5, sg=0)
    ft_model.build_vocab(tokens)
    print(ft_model.corpus_count)
    ft_model.train(tokens, total_examples=ft_model.corpus_count,
                   epochs=300, report_delay=1)
    ft_model.init_sims(replace=True)
    write_pickle(ft_model, 'ft_model2')
    return ft_model
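# A short sketch of the FastText property this training relies on: character
# n-grams let the trained model produce vectors even for out-of-vocabulary
# words. The corpus and the misspelled query below are illustrative.
tokens = [["deep", "learning", "models"],
          ["misspelling", "words", "happens"]] * 50
ft_model = train_fasttext(tokens)
print("misspellling" in ft_model.wv.vocab)  # False: never seen in the corpus
print(ft_model.wv["misspellling"][:5])      # still gets a subword-based vector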
def testFastText(self):
    class LeeReader(object):
        def __init__(self, fn):
            self.fn = fn

        def __iter__(self):
            with smart_open(self.fn, 'r', encoding="latin_1") as infile:
                for line in infile:
                    yield line.lower().strip().split()

    model = FastText(LeeReader(datapath('lee.cor')))
    model.init_sims()
    index = self.indexer(model, 10)
    self.assertVectorIsSimilarToItself(model.wv, index)
    self.assertApproxNeighborsMatchExact(model, model.wv, index)
    self.assertIndexSaved(index)
    self.assertLoadedIndexEqual(index, model)
def main():
    parser = argparse.ArgumentParser(description='Trains word embeddings')
    parser.add_argument('--config_file', type=str,
                        default='configs/echoes_local.config',
                        help='location of the configuration file')
    args = parser.parse_args()

    config = configparser.ConfigParser()
    config.read(args.config_file)
    print(config['word']['model_dir'])

    sentences = Sentences(input_file=config['general']['corpus_file'])

    # Recreate the model directory from scratch.
    try:
        shutil.rmtree(config['word']['model_dir'])
    except FileNotFoundError:
        pass
    os.mkdir(config['word']['model_dir'])

    logging.info('Building fasttext model...')
    model = FastText(sentences,
                     size=int(config['word']['size']),
                     window=int(config['word']['window']),
                     min_count=int(config['word']['min_count']),
                     iter=int(config['word']['epochs']),
                     workers=int(config['word']['workers']))
    model.init_sims()
    model.save(f"{config['word']['model_dir']}/ft_model")
    logging.info(f"Saved fasttext model under {config['word']['model_dir']}")

    logging.info('Building word2vec model...')
    model = Word2Vec(sentences,
                     size=int(config['word']['size']),
                     window=int(config['word']['window']),
                     min_count=int(config['word']['min_count']),
                     iter=int(config['word']['epochs']),
                     workers=int(config['word']['workers']))
    model.init_sims()
    annoy_index = AnnoyIndexer(model, 100)
    annoy_index.save(f"{config['word']['model_dir']}/annoy_model")
    model.save(f"{config['word']['model_dir']}/w2v_model")
    logging.info(f"Saved word2vec model under {config['word']['model_dir']}")
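# A minimal sketch of querying the saved Annoy index for approximate nearest
# neighbours (the standard gensim 3.x pattern); the "models/" paths and the
# query word are illustrative stand-ins for the config values.
from gensim.models import Word2Vec
from gensim.similarities.index import AnnoyIndexer

model = Word2Vec.load("models/w2v_model")
annoy_index = AnnoyIndexer()
annoy_index.load("models/annoy_model")
annoy_index.model = model
print(model.wv.most_similar("house", topn=5, indexer=annoy_index))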
# Split each review into parsed sentences. (The loop header is reconstructed;
# `reviews` stands for the raw review iterable from earlier in the script.)
for review in reviews:
    try:
        sentences += KaggleWord2VecUtility.review_to_sentences(
            review, tokenizer, remove_stopwords=True)
    except Exception:
        continue

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

num_features = int(sys.argv[1])  # Word vector dimensionality
min_word_count = 20              # Minimum word count
num_workers = 40                 # Number of threads to run in parallel
context = 10                     # Context window size
downsampling = 1e-3              # Downsample setting for frequent words

print("Training FastText model...")
# Train the FastText model.
model = FastText(sentences, workers=num_workers, hs=0, sg=1, negative=10,
                 iter=25, size=num_features, min_count=min_word_count,
                 window=context, sample=downsampling, seed=1)

model_name = (str(num_features) + "features_" + str(min_word_count) +
              "minwords_" + str(context) + "context_len2alldata")
model.init_sims(replace=True)

# Save the FastText model.
print("Saving FastText model...")
model.save(model_name)
endmodeltime = time.time()
print("time : ", endmodeltime - start)  # `start` is set earlier in the script
class Char2VecFeatureExtractor(VectorFeatureExtractor):

    def __init__(self):
        super().__init__()
        self.key = KEY
        self.config = Char2VecModelConfiguration()

    def fit(self, X, y=None, size=100, min_count=5, workers=1, window=5,
            sample=1e-3, skipgram=False, min_n=3, max_n=6):
        """ Trains a FastText model on the given documents. Each document
        should represent a sentence.

        Args:
            X: list(Document | AnnotatedDocument | list(str))
            y: optional labels
            size: Size of embeddings to be learnt (Default 100), i.e.
                word vector dimensionality
            min_count: Minimum word count. Ignore words with fewer
                occurrences than this (Default 5).
            workers: Number of threads to run in parallel
            window: Context window size
            sample: Threshold for downsampling higher-frequency words
                (Default 0.001)
            skipgram: Use skip-gram if True and CBOW otherwise
            min_n: Minimum length of char n-grams (Default 3)
            max_n: Maximum length of char n-grams (Default 6)
        """
        log.info("Checking parameters...")
        self.config.set_parameters({
            "size": size,
            "min_count": min_count,
            "workers": workers,
            "window": window,
            "sample": sample,
            "min_n": min_n,
            "max_n": max_n
        })
        self.config.validate()

        # Get sentences as lists of tokens.
        log.info("Tokenizing {} documents...".format(len(X)))
        sentences = []
        for idx, doc in enumerate(X):
            sentences.append(document_to_tokens(doc))
            log_progress(log, idx, len(X))

        # Initialize and train the model (this will take some time).
        log.info("Training FastText on {} sentences...".format(len(X)))
        self.model = FastText(
            sentences,
            workers=self.config.get_parameter("workers"),
            size=self.config.get_parameter("size"),
            min_count=self.config.get_parameter("min_count"),
            window=self.config.get_parameter("window"),
            sample=self.config.get_parameter("sample"),
            sg=1 if skipgram else 0,
            min_n=self.config.get_parameter("min_n"),
            max_n=self.config.get_parameter("max_n"))

        # If you don't plan to train the model any further, calling
        # init_sims() will make the model much more memory-efficient.
        self.model.init_sims(replace=True)
        return self

    def transform(self, X, y=None):
        """ Transforms the list of documents and returns tokens with their
        features. Each document should represent a sentence.
        """
        log.info("Generating features for {} documents...".format(len(X)))
        features = []
        for doc in X:
            doc_features = []
            for token in document_to_tokens(doc):
                if token in self.model.wv:
                    doc_features.append((token, self.model.wv[token]))
            features.append(doc_features)
        return features

    def save(self, file_path):
        save_path = Path(file_path)
        mkdir(save_path)
        model_save_path = save_path.joinpath("char2vec.model")
        config_save_path = save_path.joinpath("char2vec.config")
        self.model.save(str(model_save_path))
        self.config.save(config_save_path)

    def load(self, file_path):
        load_path = Path(file_path)
        model_load_path = load_path.joinpath("char2vec.model")
        config_load_path = load_path.joinpath("char2vec.config")
        self.model = FastText.load(str(model_load_path))
        self.config.load(config_load_path)
        return self
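# A minimal usage sketch, assuming tokenized sentences as input and that the
# project's Char2VecModelConfiguration accepts these parameters; the sample
# documents and the "extractor_dir" path are illustrative.
docs = [["the", "cat", "sat"], ["the", "dog", "barked"]]
extractor = Char2VecFeatureExtractor().fit(docs, min_count=1)
features = extractor.transform(docs)  # [(token, vector), ...] per document
extractor.save("extractor_dir")       # writes char2vec.model + char2vec.config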
class WordEmbedding():

    def __init__(self, embedding_type="w2v", embedding_size=100, ngram=(3, 6),
                 window_size=5, architecture="sg"):
        self.embedding_type = embedding_type
        self.window = window_size
        self.size = embedding_size
        self.model = None
        self.skip_gram = architecture == "sg"
        if ngram is None:
            ngram = (3, 6)
        self.min_gram = ngram[0]
        self.max_gram = ngram[1]

    def train_embedding(self, sentences, n_iter=100, workers=1,
                        min_count=3, negative_sample=1):
        train_corpus = sentences
        if self.embedding_type == "w2v":
            if self.model is None:
                self.model = Word2Vec(size=self.size, window=self.window,
                                      min_count=min_count,
                                      negative=negative_sample,
                                      workers=workers,
                                      sg=int(self.skip_gram))
                self.model.build_vocab(train_corpus)
            else:
                self.model.build_vocab(train_corpus, update=True)
        elif self.embedding_type == "ft":
            if self.model is None:
                self.model = FastText(sg=int(self.skip_gram), size=self.size,
                                      window=self.window, min_count=min_count,
                                      min_n=self.min_gram, max_n=self.max_gram,
                                      workers=workers,
                                      negative=negative_sample)
                self.model.build_vocab(train_corpus)
            else:
                self.model.build_vocab(train_corpus, update=True)
        elif self.embedding_type == "glove":
            raise ValueError("GloVe training is not supported; use the official repo")
        else:
            raise ValueError("Invalid embedding type")
        self.model.train(train_corpus, epochs=n_iter,
                         total_examples=self.model.corpus_count)

    def retrieve_vector(self, word):
        try:
            return self.model.wv[word]
        except KeyError:
            # Fall back to a random vector for out-of-vocabulary words.
            return np.random.random(self.size)

    def find_similar_word(self, word, n=10):
        try:
            # Query through .wv; calling most_similar on the model directly
            # is deprecated.
            return self.model.wv.most_similar(positive=[word], topn=n)
        except KeyError:
            return []

    def save_model(self, file_name):
        self.model.save("{}.model".format(file_name))
        we_model_files = glob("{}.model*".format(file_name))
        with ZipFile(file_name, "w") as zipf:
            for we_file in we_model_files:
                zipf.write(we_file)
                os.remove(we_file)

    def load_model(self, file_name):
        try:
            with ZipFile(file_name, "r") as zipf:
                zipf.extractall("/tmp/")
                nl = zipf.namelist()
                fn = [name for name in nl if name.endswith(".model")][0]
                path = "/tmp/" + fn
        except BadZipFile:
            path = file_name
        if self.embedding_type == "w2v":
            self.model = KeyedVectors.load_word2vec_format(path)
        elif self.embedding_type == "ft":
            self.model = FastText.load_fasttext_format(path)
        elif self.embedding_type == "glove":
            # `path` should point to a GloVe .txt file.
            try:
                glove_file = datapath(os.path.abspath(path))
                tmp_file = get_tmpfile("/tmp/g2w2v.txt")
                glove2word2vec(glove_file, tmp_file)
                self.model = KeyedVectors.load_word2vec_format(tmp_file)
            except UnicodeDecodeError:
                self.model = KeyedVectors.load(os.path.abspath(path))
        self.size = self.model.wv.vector_size

    def remove_from_vocab(self, word_list):
        new_vectors = []
        new_vocab = {}
        new_index2entity = []
        new_vectors_norm = []
        if self.embedding_type == "ft":
            self.model.wv.init_sims()
            for i in range(len(self.model.wv.vocab)):
                word = self.model.wv.index2entity[i]
                vec = self.model.wv.vectors[i]
                vocab = self.model.wv.vocab[word]
                vec_norm = self.model.wv.vectors_norm[i]
                if word not in word_list:
                    vocab.index = len(new_index2entity)
                    new_index2entity.append(word)
                    new_vocab[word] = vocab
                    new_vectors.append(vec)
                    new_vectors_norm.append(vec_norm)
            self.model.wv.vocab = new_vocab
            self.model.wv.vectors = np.array(new_vectors)
            self.model.wv.index2entity = new_index2entity
            self.model.wv.index2word = new_index2entity
            self.model.wv.vectors_norm = new_vectors_norm
        else:
            self.model.init_sims()
            for i in range(len(self.model.vocab)):
                word = self.model.index2entity[i]
                vec = self.model.vectors[i]
                vocab = self.model.vocab[word]
                vec_norm = self.model.vectors_norm[i]
                if word not in word_list:
                    vocab.index = len(new_index2entity)
                    new_index2entity.append(word)
                    new_vocab[word] = vocab
                    new_vectors.append(vec)
                    new_vectors_norm.append(vec_norm)
            self.model.vocab = new_vocab
            self.model.vectors = np.array(new_vectors)
            self.model.index2entity = new_index2entity
            self.model.index2word = new_index2entity
            self.model.vectors_norm = new_vectors_norm
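# A minimal usage sketch for WordEmbedding; the corpus and file name are
# illustrative.
corpus = [["machine", "learning", "is", "fun"],
          ["fasttext", "uses", "character", "ngrams"]]
we = WordEmbedding(embedding_type="ft", embedding_size=50, architecture="sg")
we.train_embedding(corpus, n_iter=10, min_count=1)
print(we.find_similar_word("fasttext", n=3))
we.save_model("ft_embeddings.zip")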
def _fasttext(table, input_col, sg=1, size=100, window=5, min_count=1,
              max_vocab_size=None, train_epoch=100, workers=1, alpha=0.025,
              min_alpha=0.025, seed=None, hs=1, negative=5, ns_exponent=0.75,
              topn=30, hashfxn=hash, min_n=3, max_n=6, bucket=2000000):
    if isinstance(sg, str):
        sg = int(sg)
    algo = {1: 'Skip-gram', 0: 'CBOW'}[sg]
    tagged_sents = table[input_col].apply(list).tolist()
    ft = FastText(sentences=tagged_sents, sg=sg, size=size, window=window,
                  alpha=alpha, min_alpha=min_alpha, seed=seed,
                  min_count=min_count, max_vocab_size=max_vocab_size,
                  workers=workers, iter=train_epoch, hs=hs, negative=negative,
                  ns_exponent=ns_exponent, hashfxn=hashfxn, min_n=min_n,
                  max_n=max_n, bucket=bucket)
    ft.init_sims(replace=True)
    vocab = ft.wv.vocab

    # Intrinsic evaluation on standard analogy and word-similarity benchmarks.
    analogies_score, sections = ft.wv.evaluate_word_analogies(
        'brightics/function/textanalytics/data/word2vec_questions_words.txt')
    pearson_1, spearman_1, oov_ratio_1 = ft.wv.evaluate_word_pairs(
        'brightics/function/textanalytics/data/word2vec_wordsim353.tsv')
    pearson_2, spearman_2, oov_ratio_2 = ft.wv.evaluate_word_pairs(
        'brightics/function/textanalytics/data/word2vec_simlex999.tsv')

    params = {'Input column': input_col,
              'Training algorithm': algo,
              'Word vector dimensionality': size,
              'Window': window,
              'Minimum word count': min_count,
              'Max vocabulary size': max_vocab_size,
              'Train epoch': train_epoch,
              'Number of workers': workers,
              'Alpha': alpha,
              'Minimum alpha': min_alpha,
              'Seed': seed,
              'Hierarchical softmax': hs,
              'Negative': negative,
              'Negative sampling exponent': ns_exponent}

    # t-SNE visualization of the most frequent words.
    length = len(vocab)
    if length < topn:
        topn = length
    topn_words = sorted(vocab, key=vocab.get, reverse=True)[:topn]
    X = ft.wv[topn_words]  # index through .wv; ft[...] is deprecated
    tsne = TSNE(n_components=min(2, topn), random_state=seed)
    X_tsne = tsne.fit_transform(X)
    df = pd.DataFrame(X_tsne, index=topn_words, columns=['x', 'y'])

    fig = plt.figure()
    fig.set_size_inches(50, 40)
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(df['x'], df['y'], s=1000)
    ax.tick_params(axis='both', which='major', labelsize=50)
    for word, pos in df.iterrows():
        ax.annotate(word, pos, fontsize=80)
    plt.show()
    fig = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## FastText Result
    |
    | ### Total Number of words
    | {length}
    |
    | ### Top {topn} Words
    | {topn_words}
    | {fig}
    |
    | ### Word analogy score
    | {analogies_score}
    |
    | ### Word correlation scores
    | #### Pearson correlation coefficient with 2-tailed p-value (WordSim353)
    | {pearson_1_1}, {pearson_1_2}
    | #### Spearman rank-order correlation coefficient with 2-tailed p-value (WordSim353)
    | {spearman_1_1}, {spearman_1_2}
    | #### The ratio of pairs with unknown words (WordSim353)
    | {oov_ratio_1}
    | #### Pearson correlation coefficient with 2-tailed p-value (SimLex999)
    | {pearson_2_1}, {pearson_2_2}
    | #### Spearman rank-order correlation coefficient with 2-tailed p-value (SimLex999)
    | {spearman_2_1}, {spearman_2_2}
    | #### The ratio of pairs with unknown words (SimLex999)
    | {oov_ratio_2}
    |
    | ### Parameters
    | {params}
    """.format(length=length, analogies_score=analogies_score,
               pearson_1_1=pearson_1[0], pearson_1_2=pearson_1[1],
               spearman_1_1=spearman_1[0], spearman_1_2=spearman_1[1],
               oov_ratio_1=oov_ratio_1,
               pearson_2_1=pearson_2[0], pearson_2_2=pearson_2[1],
               spearman_2_1=spearman_2[0], spearman_2_2=spearman_2[1],
               oov_ratio_2=oov_ratio_2, topn=topn, topn_words=topn_words,
               params=dict2MD(params), fig=fig)))

    vocab = list(ft.wv.vocab)
    model = _model_dict('fasttext_model')
    model['params'] = params
    model['vocab'] = vocab
    model['ft'] = ft.wv
    model['_repr_brtc_'] = rb.get()

    out_table = pd.DataFrame({'words': vocab,
                              'word_vectors': ft.wv[vocab].tolist()})
    return {'model': model, 'out_table': out_table}
# ## preprocessing
topics = [[] for i in range(len(df_combined))]
para = [[] for i in range(len(df_combined))]

for i in range(len(df_combined)):
    topics[i] = preprocess_text(str(df_combined.iloc[i][0]))
    para[i] = preprocess_text(str(df_combined.iloc[i][1]))

model_para = FastText(para, min_count=1)
model_para.init_sims(replace=True)
model_topic = FastText(topics, min_count=1)
model_topic.init_sims(replace=True)

# ## applying model
def get_answers(df_combined, query1):
    query = Answer_Pre_Processing(query1)
    q = preprocess_text(query)
    min1 = 1000
    result = ""
    for i in range(len(df_combined)):
        # Word Mover's Distance between the query and each paragraph
        # (queried through .wv; calling wmdistance on the model is deprecated).
        distance = model_para.wv.wmdistance(q, para[i])
        # (Assumed continuation: the original is truncated here; keep the
        # closest paragraph as the answer.)
        if distance < min1:
            min1 = distance
            result = str(df_combined.iloc[i][1])
    return result
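# A minimal usage sketch; the question string is illustrative and
# `df_combined` must hold (topic, paragraph) columns as above.
answer = get_answers(df_combined, "What is machine learning?")
print(answer)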