def loadDevblogModel(self, embedding_dim, epochs, window, min_count):
    """Return a FastText word-embedding model for the Devblog data.

    When a file exists at ``CONST.devblog_model_path`` the saved model is
    loaded from it.  Otherwise a new model is trained on all documents
    (``labeled_only=False``) and saved to that path.

    Args:
        embedding_dim (int): number of dimensions of the word vectors.
        epochs (int): number of training epochs.
        window (int): value passed to FastText as ``window``.
        min_count (int): minimum number of occurrences for a word to be used.

    Returns:
        The loaded or freshly trained FastText model.
    """
    model_path = CONST.devblog_model_path
    if os.path.isfile(model_path):
        return FastText.load(model_path)

    print('🐈 학습된 단어 임베딩 모델이 없습니다.')
    corpus = Document().getDocs(labeled_only=False)  # fetch the whole data set

    print('🐈 단어 임베딩 모델 학습을 시작합니다.')
    # Every document is split on single spaces and each token goes through han2Jamo.
    sentences = corpus.text.apply(
        lambda doc: [han2Jamo(token) for token in doc.split(' ')])
    we_model = FastText(size=embedding_dim, window=window, min_count=min_count)
    we_model.build_vocab(sentences=sentences)
    we_model.train(sentences=sentences,
                   total_examples=len(sentences),
                   epochs=epochs)

    print('🐈 단어 임베딩 모델을 저장합니다.')
    we_model.save(model_path)
    return we_model
def FastText_Save(files_in, models_out, min_count, size, iters):
    """Train a FastText model on a tokenized text file and save it.

    Each line of ``files_in`` is stripped and split on single spaces; the
    resulting token lists form the training corpus
    (``[[tok, tok, ...], [tok, ...], ...]``).

    Args:
        files_in: path of the tokenized text file.
        models_out: path the trained model is saved to.
        min_count: frequency threshold; passed to FastText as ``min_count``.
        size: size of the output word vectors.
        iters: number of training iterations.

    Returns:
        None. The vocabulary of a saved model can be inspected with
        ``model.wv.vocab.keys()``.
    """
    # The context manager guarantees the corpus file is closed; the previous
    # version opened it and never released the handle.
    with open(files_in, 'r') as txt_file:
        sentences = [line.strip().split(' ') for line in txt_file]

    # Train the network and persist it.
    model = FastText(sentences, min_count=min_count, size=size, iter=iters,
                     window=5)
    model.save(models_out)
def classType_fasttext_train(self, classType):
    """Train and store a FastText feature encoder for one class type.

    For every word in ``self.train`` a one-or-more-element sentence is built
    containing the word once per mapping in ``self.word_mapping[word]`` that
    equals ``classType``.  The model trained on those sentences is saved to
    ``./models/<classType>_fasttext.model`` and stored on
    ``self.<classType>_feature_encoder``.

    Args:
        classType: one of 'company', 'location' or 'goods'.

    Raises:
        ValueError: if ``classType`` is not one of the allowed values.  The
            check runs before any training so an invalid argument no longer
            costs a full training run and a stray model file.
    """
    allowed_types = ('company', 'location', 'goods')
    if classType not in allowed_types:
        raise ValueError('Allowed arguments are company, location and goods')

    train_sentences = []
    for word in self.train:
        sentence = [word for mapping in self.word_mapping[word]
                    if mapping == classType]
        if sentence:
            train_sentences.append(sentence)

    feature_encoder = FastText(size=50, window=2, min_count=1, min_n=2,
                               max_n=6)
    feature_encoder.build_vocab(sentences=train_sentences)
    feature_encoder.train(sentences=train_sentences,
                          total_examples=feature_encoder.corpus_count,
                          epochs=1000)
    feature_encoder.save('./models/' + classType + '_fasttext.model')

    # classType was validated above, so this resolves to one of
    # company_feature_encoder / location_feature_encoder / goods_feature_encoder.
    setattr(self, classType + '_feature_encoder', feature_encoder)
def main(self):
    """Train a FastText model (``sg=1``) on ``self.data`` and save it.

    The trained model is written to the path held in ``self.fasttext_model``.
    """
    print('使用fasttext 方式进行训练, start train...')
    hyper_params = dict(sg=1, size=150, window=2, min_count=1)
    trained = FastText(sentences=self.data, **hyper_params)
    trained.save(self.fasttext_model)
    print('模型训练完成...')
def fasttext_model(model_name, dir_model):
    """Train a FastText model on the wakati text files and save it.

    Every ``*.txt`` file found recursively under ``<cwd>/wakati`` becomes one
    training sentence.  A label token built from characters 20..25 of the
    file's base name (``__label__XXXXXX``) followed by a ``,`` token is
    prepended to the file's tokens.

    Args:
        model_name: file name of the saved model.
        dir_model: directory prefix; the model is saved to
            ``dir_model + model_name`` (plain string concatenation).
    """
    print('Creating fasttext model')
    trainings = []
    # Joining with pathlib instead of a hard-coded backslash keeps the lookup
    # working on non-Windows systems too.
    corpus_dir = Path(os.getcwd()) / 'wakati'
    for file_path in corpus_dir.glob('**/*.txt'):
        with open(file_path, 'r', encoding='utf-8') as wafile:
            # Clean up the wakati text: drop newlines first.
            text = wafile.read().replace('\n', '')
        # Collapse whitespace runs into one space.  The previous pattern
        # r"\text+" matched a tab followed by "ext" and so never did this.
        text = re.sub(r"\s+", " ", text)
        # Attach the label.
        label = '__label__' + os.path.basename(file_path)[20:26] + ' , '
        text = label + text
        text = text.replace('\u3000', '').replace('\xa0', '')
        # Drop the empty items produced by consecutive spaces.
        trainings.append([token for token in text.split(' ') if token])

    print('fasttext modeling')
    model_ft = FastText(trainings, size=300, window=15, min_count=5, iter=10,
                        workers=10, sg=1)
    model_ft.save(dir_model + model_name)
def main():
    """Script entry point: train a fastText word embedding model.

    Command-line options:
        -i / --input-file:     corpus file to train on (default ``config.DATA_FILE``).
        -m / --model-file:     where the model is saved (default ``config.MODEL_FILE``).
        -s / --embedding-size: dimensionality of the word vectors (default 100).

    The model is trained with ``sg=1`` for 5 epochs directly from the corpus
    file and then saved.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input-file', required=False,
                        default=config.DATA_FILE,
                        help='input data file for training')
    parser.add_argument('-m', '--model-file', required=False,
                        default=config.MODEL_FILE,
                        help='model output name')
    # The default used to be config.MODEL_FILE (copy/paste from the option
    # above), which is not a usable integer size.  100 is gensim's own
    # default vector size.
    parser.add_argument('-s', '--embedding-size', required=False, type=int,
                        default=100,
                        help='dimensionality of the word vectors')
    args = parser.parse_args()

    model = FastText(size=args.embedding_size, sg=1)
    model.build_vocab(corpus_file=args.input_file)
    total_words = model.corpus_total_words
    model.train(corpus_file=args.input_file, total_words=total_words,
                epochs=5)
    model.save(args.model_file)
def train_vector_model(train_data_list, mode):
    """Train (``mode == 'train'``) or load the FastText model at './fasttext/model'.

    In training mode the strings in ``train_data_list['encode']`` are joined,
    POS-tagged with Okt, cut into chunks at every 'Punctuation' token and
    each chunk is tokenized into morphemes; the morpheme lists are the
    training sentences.  The vector size comes from the module-level
    ``vector_size``.

    Returns:
        The trained or loaded FastText model.
    """
    if mode != 'train':
        return FastText.load('./fasttext/model')

    tagger = Okt()
    joined_text = ' '.join(train_data_list['encode'])
    tagged = tagger.pos(joined_text)
    # Punctuation tokens are swapped for newlines, which then serve as the
    # split points between chunks.
    chunks = ' '.join(
        '\n' if item[1] == 'Punctuation' else item[0] for item in tagged
    ).split('\n')
    morphs = [tagger.morphs(chunk) for chunk in chunks]

    print("BUILD MODEL")
    ft = FastText(size=vector_size, window=3, workers=8, min_count=1, sg=1,
                  iter=1000)
    ft.build_vocab(morphs)
    print("BUILD COMPLETE")

    print("TRAIN START")
    ft.train(morphs, total_examples=ft.corpus_count, epochs=ft.epochs,
             compute_loss=True)
    if not os.path.exists('./fasttext'):
        os.makedirs('./fasttext')
    ft.save('./fasttext/model')
    print("TRAIN COMPLETE")
    return ft
def load_ft_model(self, fname):
    """Load the FastText model saved at ``fname``, training it if loading fails.

    When ``FastText.load`` raises ``IOError`` a new model is trained on
    ``config.WikiCorpus()`` with the keyword arguments in
    ``DEFAULT_ARGUMENTS_FT`` and saved to ``fname``.  Progress messages
    report the seconds elapsed since the module-level ``start_time``.

    Args:
        fname: path of the saved model.

    Returns:
        The loaded or newly trained FastText model.
    """

    def elapsed():
        # Seconds since the module-level start_time.
        return time.time() - start_time

    print('Loading Fasttext Model... in {0:.2f} seconds'.format(elapsed()))
    try:
        ft = FastText.load(fname)
        print(ft)
    except IOError:
        print('No existed model. Training Ft Model... in {0:.2f} seconds'.
              format(elapsed()))
        corpus = config.WikiCorpus()
        ft = FastText(corpus, **DEFAULT_ARGUMENTS_FT)
        ft.save(fname)
    print('Success to load Fasttext Model... in {0:.2f} seconds'.format(
        elapsed()))
    return ft
def getWordVec(corpus: list, type=1) -> object:
    """Train a word-embedding model on ``corpus`` and save it.

    Args:
        corpus: list[list[str]], each sublist is one sentence.
        type: 1 trains Word2Vec (saved as 'word2vec.model'); any other value
            trains FastText with ``min_count=1`` (saved as 'FastText.model').

    Returns:
        The trained model.
    """
    # The three Callback(...) locals that used to be created here were never
    # used (and their names were swapped), so they have been removed.
    if type == 1:
        model = Word2Vec(corpus)
        model.save('word2vec.model')
        return model

    model = FastText(min_count=1)
    logging.info('Starting building vocabulary table')
    model.build_vocab(corpus)
    logging.info('Starting training')
    # NOTE(review): epochLogger is not defined in this function; presumably a
    # module-level callback instance — confirm it exists.
    model.train(corpus, total_examples=model.corpus_count,
                epochs=model.epochs, callbacks=[epochLogger])
    model.save('FastText.model')
    return model
def train_vector_model(train_data_list, train):
    """Train (``train`` truthy) or load the FastText model 'model_test'.

    The model file is ``path.model_path + 'model_test'``.  In training mode
    the strings in ``train_data_list['encode']`` are joined, tagged with
    ``kiwi_f.k_pos``, cut into chunks at every 'Punctuation' token and each
    chunk is tokenized with ``kiwi_f.k_morphs``.

    Returns:
        The trained or loaded FastText model.
    """
    model_file = path.model_path + 'model_test'
    if not train:
        return FastText.load(model_file)

    joined_text = ' '.join(train_data_list['encode'])
    tagged = kiwi_f.k_pos(joined_text)
    # Punctuation tokens are swapped for newlines, which then serve as the
    # split points between chunks.
    chunks = ' '.join(
        '\n' if item[1] == 'Punctuation' else item[0] for item in tagged
    ).split('\n')
    morphs = [kiwi_f.k_morphs(chunk) for chunk in chunks]

    print("BUILD MODEL")
    ft = FastText(
        size=300,
        window=3,
        workers=8,
        min_count=1,
        sg=1,  # the skip-gram model is reported to perform better
        iter=1000)
    ft.build_vocab(morphs)
    print("BUILD COMPLETE")

    print("TRAIN START")
    ft.train(morphs, total_examples=ft.corpus_count, epochs=ft.epochs,
             compute_loss=True)
    if not os.path.exists(path.FASTTEXT_DIR):
        os.makedirs(path.FASTTEXT_DIR)
    ft.save(model_file)
    print("TRAIN COMPLETE")
    return ft
class EmbeddingModel:
    """FastText embedding that is loaded from ``dir_embedding`` or built on demand.

    The model lives in ``self.get_embedding``; its file name is
    ``embedding_<name>.model``.
    """

    def __init__(self, name="default", phraser=None):
        """Load the named embedding if its file exists, otherwise build and save it.

        Args:
            name: suffix used to form the model file name.
            phraser: passed through to ``tokenizer`` when building.
        """
        self.name = "embedding_" + name + ".model"
        self.phraser = phraser
        if self.name not in os.listdir(dir_embedding):
            print("embedding not exists")
            print("start building...")
            self.build_embedding()
            self.save()
        else:
            self.get_embedding = FastText.load(dir_embedding + self.name)
            print("Embedding {} loaded".format(name))

    def build_embedding(self):
        """Train FastText (``sg=1, hs=1``) on the 'content' column of every CSV in ``dir_cleaned_news``."""
        csv_names = [
            entry for entry in os.listdir(dir_cleaned_news)
            if entry.endswith(".csv")
        ]
        tokenized_docs = []
        started = time.time()
        for csv_name in csv_names:
            frame = pd.read_csv(dir_cleaned_news + csv_name, index_col=0)
            tokenized_docs.extend(tokenizer(frame['content'], self.phraser))
        self.get_embedding = FastText(tokenized_docs, sg=1, hs=1)
        finished = time.time()
        print("train finished! ", finished - started, " seconds")

    # save
    def save(self):
        """Write the current embedding to ``dir_embedding + self.name``."""
        self.get_embedding.save(dir_embedding + self.name)
        print("saved!")
def make_title_model(self, title_list_detach):
    """Make sure the title FastText model exists on disk and load it.

    If the model file is missing, a model is trained on
    ``title_list_detach`` and saved.  The model is then loaded from the file
    into ``self.FT_title_model``.

    Args:
        title_list_detach: training sentences passed to ``FT_gensim``.

    Raises:
        OSError: re-raised after printing a message.
    """
    model_file = "C:/Users/hwang in beom/Desktop/final/full/FT_title_model.model"
    try:
        print("make_title_model 실행")
        if not os.path.isfile(model_file):
            print("make_title_model 모델 학습 시작")
            trained = FT_gensim(title_list_detach,
                                size=300,
                                window=100,
                                min_count=1,
                                sg=1,
                                iter=2000)
            print("make_title_model2 모델 학습 완료")
            self.FT_title_model = trained
            trained.save(model_file)
        self.FT_title_model = FT_gensim.load(model_file)
        print("make_title_model 모델 로드됨")
    except OSError:
        print("failed to create directory!")
        raise
def train(sentences):
    """Train a Word2Vec or FastText model on ``sentences`` and save it.

    All settings come from the module-level ``args``:
    ``gensim_model_name`` selects the model class ("word2vec" or "fast"
    must appear in it), ``min_count`` is multiplied by 5 when both
    ``train_pairs`` and ``relevant_selects`` are set and by 10 when only
    ``train_pairs`` is set.  The model is saved to
    ``args.data_dir + args.model_name``.

    Args:
        sentences: training corpus handed to the gensim model.

    Raises:
        ValueError: if ``args.gensim_model_name`` names neither model type
            (previously this surfaced as an unbound-local error on ``model``).
    """
    print("starting to train!")
    # train model
    if args.train_pairs and args.relevant_selects:
        min_count = args.min_count * 5
    elif args.train_pairs:
        min_count = args.min_count * 10
    else:
        min_count = args.min_count

    if "word2vec" in args.gensim_model_name:
        model_cls = Word2Vec
    elif "fast" in args.gensim_model_name:
        model_cls = FastText
    else:
        raise ValueError(
            "unsupported gensim_model_name: %r (expected it to contain "
            "'word2vec' or 'fast')" % (args.gensim_model_name,))

    model = model_cls(sentences,
                      size=args.embedding_size,
                      window=20,
                      sg=args.skipgram,
                      workers=16,
                      min_count=min_count)
    # summarize the loaded model
    print(model)
    # trim unneeded model memory = use (much) less RAM
    model.init_sims(replace=True)
    # save model
    model.save(args.data_dir + args.model_name)
def train_fasttext(infile, outfile, skipgram, loss, size, epochs):
    """Train a FastText model on a line-per-sentence file and save it.

    Arguments
    ---------
    infile : input pre-processed wiki dump, read through ``LineSentence``
    outfile : path the trained model is saved to
    skipgram : passed as ``sg`` (0 - CBOW, 1 - skip-gram)
    loss : passed as ``hs`` (0 - negative sampling, 1 - hierarchical softmax)
    size : embedding size
    epochs : number of training iterations
    """
    corpus = LineSentence(infile)
    hyper_params = dict(sg=skipgram,
                        hs=loss,
                        size=size,
                        alpha=0.05,
                        window=5,
                        min_count=5,
                        min_n=2,
                        max_n=5,
                        workers=3,
                        iter=epochs)
    trained = FastText(corpus, **hyper_params)
    trained.save(outfile)
def trainmodel(paragraphset, fs, fw, fc):
    """Train a FastText model (``sg=1``) and save it as '<fs>_<fw>_<fc>.model'.

    Args:
        paragraphset: training sentences.
        fs: vector size.
        fw: window size.
        fc: minimum word count.

    Returns:
        The trained model.
    """
    hyper_params = dict(size=fs, window=fw, min_count=fc, workers=4, sg=1)
    embedding_model = FastText(paragraphset, **hyper_params)
    mname = "_".join([str(fs), str(fw), str(fc)]) + ".model"
    embedding_model.save(mname)
    print(mname+"save done")
    return embedding_model
def text_setup_for_feature_representation(dataset,embedding_case):
    """Train and save one embedding model on the 'text' column of ``dataset``.

    The data set is first passed through
    ``get_equal_for_each_cat(dataset, 1000000)``.

    Args:
        dataset: object whose ``['text']`` yields the raw documents.
        embedding_case: 1 trains Word2Vec (saved as 'word2vec_yelp_800'),
            2 trains FastText (saved as 'FastText_yelp_all'),
            3 trains Doc2Vec (saved as 'doc2vec_yelp_all_latest').
            Any other value does nothing beyond the preprocessing.
    """
    dataset = get_equal_for_each_cat(dataset,1000000)
    x = [k for k in dataset['text']]
    if embedding_case == 1:
        print("text to word sequence...")
        x_list = [text_to_word_sequence(k) for k in dataset['text']]
        print('text prepared for word2vec...')
        print(len(x_list))
        model = Word2Vec(x_list, size = 800, window = 5, min_count=3, workers=3)
        print("saving model...")
        model.save("word2vec_yelp_800")
    elif embedding_case == 2:
        x_1 = [i.split() for i in x]
        print('text prepared for FastText...')
        # Train on the tokens prepared in this branch; the previous code
        # referenced x_list, which only exists in case 1.
        model = FastText(x_1, size = 300, window = 5, min_count = 3, workers=3)
        print("saving model...")
        model.save("FastText_yelp_all")
    elif embedding_case == 3:
        taggedDocs = nt('taggedDocs','words tags')
        # One tagged document per text, tagged with its position.
        docs = [taggedDocs(text.split(), [idx]) for idx, text in enumerate(x)]
        print('Text prepared for doc2vec...')
        # Train on the tagged documents built above (was the undefined x_list).
        model = Doc2Vec(docs, size = 300, window = 8, min_count = 3, workers = 3)
        print("saving model...")
        model.save('doc2vec_yelp_all_latest')
def train_vector_model(datas, train):
    """Train (``train`` truthy) or load the FastText model 'model_v2'.

    The model file is ``configs.fasttext_path + 'model_v2'``.  In training
    mode the strings in ``datas['encode']`` are joined, POS-tagged with Okt,
    cut into chunks at every 'Punctuation' token and each chunk is tokenized
    into morphemes.  The vector size comes from the module-level
    ``vector_size``.

    Returns:
        The trained or loaded FastText model.
    """
    path = configs.fasttext_path
    if not train:
        print("LOAD SAVED MODEL")
        return FastText.load(path + 'model_v2')

    tagger = Okt()
    joined_text = ' '.join(datas['encode'])
    tagged = tagger.pos(joined_text)
    # Punctuation tokens are swapped for newlines, which then serve as the
    # split points between chunks.
    chunks = ' '.join(
        '\n' if item[1] == 'Punctuation' else item[0] for item in tagged
    ).split('\n')
    morphs = [tagger.morphs(chunk) for chunk in chunks]

    print("BUILD MODEL")
    ft = FastText(size=vector_size, window=3, workers=8, min_count=2, sg=1,
                  iter=1500)
    ft.build_vocab(morphs)
    print("BUILD COMPLETE")

    print("TRAIN START")
    ft.train(morphs, total_examples=ft.corpus_count, epochs=ft.epochs,
             compute_loss=True)
    if not os.path.exists(path):
        os.makedirs(path)
    ft.save(path + 'model_v2')
    print("TRAIN COMPLETE")
    return ft
def generate_outer_feature(self):
    """Build FastText vector features for ``self.df_outer['kiji_id_raw']``.

    One "document" per user is formed from that user's ``kiji_id`` values in
    the concatenated train and test data.  ``kiji_id`` values occurring fewer
    than 5 times are replaced by the token 'None'.  The model is loaded from
    ``self.fast_model_path`` when that path exists, otherwise trained and
    saved there.

    Returns:
        DataFrame with one 'kiji_wv_<i>' column per vector component plus
        ``self.merge_key`` holding the raw kiji ids.
    """
    train_df, _ = read_data()
    test_df = read_data(test=True)
    all_df = pd.concat([train_df, test_df], ignore_index=True)

    docs = [
        all_df[all_df['user_id'] == user]['kiji_id'].values
        for user in all_df['user_id'].unique()
    ]

    counts = all_df['kiji_id'].value_counts()
    to_none_ids = counts[counts < 5].index

    def to_word(d):
        # Rare ids collapse into a single shared token.
        return 'None' if d in to_none_ids else d

    if not os.path.exists(self.fast_model_path):
        docs = [[to_word(w) for w in doc] for doc in docs]
        with timer(logger,
                   format_str='create kiji_id fast_model' + ' {:.3f}[s]'):
            model = FastText(docs, workers=6, size=64)
            model.save(self.fast_model_path)
    else:
        model = FastText.load(self.fast_model_path)

    words = self.df_outer['kiji_id_raw'].map(to_word)
    z = words.map(lambda x: model.wv[x])
    df = pd.DataFrame(np.array(z.values.tolist())).add_prefix('kiji_wv_')
    df[self.merge_key] = self.df_outer['kiji_id_raw']
    return df
def constructModelFromCiteseer():
    """Train a FastText model on the citeseer data and save it.

    Every non-hidden file under ``../citeseerdata`` (relative to this
    module) is read.  Each known term found in the text has its internal
    spaces removed so that it becomes a single token, and its counter in
    ``termparser.termdict`` is incremented.  The text is split on ". " and
    every non-empty result of ``cleanuptext`` becomes a training sentence.
    The model is written to './intelligentciteseerfasttext.model'.
    """
    cwd = os.path.dirname(os.path.realpath(__file__))
    dataDirPath = os.path.join(cwd, os.path.pardir, "citeseerdata")

    termparser = TermParser()
    termparser.labautopedia()
    termparser.webopedia()
    termparser.constructTermCountDict()
    compsciterms = termparser.allterms

    sentences = []
    for entry in scantree(dataDirPath):
        if entry.name.startswith('.') or not entry.is_file():
            continue
        with open(entry.path, "r", encoding="utf-8") as f:
            textcontent = f.read()
        for term in compsciterms:
            if term in textcontent:
                # Glue multi-word terms together so they train as one token.
                textcontent = textcontent.replace(term,
                                                  ''.join(term.split(" ")))
                termparser.termdict[term] += 1
        for line in textcontent.split(". "):
            sentence = cleanuptext(line)
            # Value comparison; `is not 0` tested object identity.
            if len(sentence) != 0:
                sentences.append(sentence)

    termparser.wordOccurenceGraph()
    modelF = FastText(sentences, size=4, window=4, min_count=1, iter=10)
    # The output file is opened only once there is a model to write and is
    # closed even if saving fails; it used to be opened (and truncated)
    # before any work was done and leaked on errors.
    with open("./intelligentciteseerfasttext.model", "wb") as fname:
        modelF.save(fname)
def train_wordvectors(in_file, out_file):
    """Train FastText vectors on the 'content' column of a CSV and save them.

    The CSV is read with the gb18030 encoding, missing values become empty
    strings and every row is tokenized with ``jieba.cut``.

    Args:
        in_file: path of the input CSV.
        out_file: path the model is saved to.
    """
    frame = pd.read_csv(in_file, encoding='gb18030', usecols=['content'])
    frame = frame.fillna('')
    frame['tokens'] = frame['content'].apply(
        lambda text: list(jieba.cut(text)))
    corpus = frame['tokens'].tolist()
    hyper_params = dict(window=5, size=35, iter=10, min_count=1)
    trained = FastText(corpus, **hyper_params)
    trained.save(out_file)
def train_model(sentences: Collection[str], save_path=MODEL_PATH):
    """Train a FastText model for 50 epochs and save it.

    Args:
        sentences: training corpus used for both vocabulary building and
            training.
        save_path: destination of the saved model (defaults to MODEL_PATH).

    Returns:
        The trained model.
    """
    ft = FastText(size=VECTOR_SIZE)
    ft.build_vocab(sentences=sentences)
    example_count = ft.corpus_count
    ft.train(sentences=sentences, total_examples=example_count, epochs=50)
    ft.save(save_path)
    return ft
def trainModel(fileName):
    """Train a FastText model (``sg=0``) on ``createCorpus(fileName)`` and save it.

    Args:
        fileName: passed to ``createCorpus`` to obtain the training corpus.
    """
    output_path = 'C:/Users/Lenovo/Desktop/Bitirme/Word2Vec.v3/models/fasttext.model.bin'
    print("training ")
    hyper_params = dict(size=300, window=5, min_count=5, sg=0, iter=4)
    trained = FastText(createCorpus(fileName), **hyper_params)
    trained.save(output_path)
    print("done")
def create_model(skip_gram, tokenized_sentences, model_path):
    """Train a FastText model for 100 epochs and save it.

    Args:
        skip_gram: passed to FastText as ``sg``.
        tokenized_sentences: training corpus; its ``len`` is used as the
            example count.
        model_path: destination of the saved model.

    Returns:
        The trained model.
    """
    ft = FastText(min_count=1, window=5, sg=skip_gram)
    ft.build_vocab(sentences=tokenized_sentences)
    # NOTE(review): vector_size is handed to train() rather than to the
    # constructor; it presumably has no effect on the vector dimensionality
    # here — confirm against the installed gensim version.
    train_options = dict(total_examples=len(tokenized_sentences),
                         vector_size=5,
                         epochs=100)
    ft.train(sentences=tokenized_sentences, **train_options)
    ft.save(model_path)
    return ft
def model(self, minCnt, size, window):
    """Train FastText on ``self.tokensListSet``, save it and print a summary.

    Args:
        minCnt: minimum word count (``min_count``).
        size: dimensionality of the word vectors.
        window: context window size.
    """
    hyper_params = dict(min_count=minCnt, size=size, window=window)
    ft_model = FastText(self.tokensListSet, **hyper_params)
    ft_model.save('model/fastText.bin')
    print(ft_model)
class Word2Vector(object):
    """Train, extend, load and query a gensim FastText model.

    Args:
        src_file: corpus file (or directory, for ``train_dir_model``).
        dst_file: path the trained model is saved to.
        size: dimensionality of the word vectors.
        window: context window size.
        min_count: minimum word frequency.
        hs: 1 = hierarchical softmax, 0 = no hierarchical softmax.
        sg: 1 = skip-gram, 0 = CBOW.
        learning_rate: initial learning rate (``alpha``).
    """

    def __init__(self,
                 src_file,
                 dst_file,
                 size=300,
                 window=5,
                 min_count=10,
                 hs=0,
                 sg=0,
                 learning_rate=0.025):
        self.src_file = src_file
        self.model_file = dst_file
        self.size = size
        self.window = window
        self.min_count = min_count
        self.hs = hs  # 1: hierarchical softmax, 0: no hierarchical softmax
        self.sg = sg  # 1: skip-gram, 0: CBOW
        self.alpha = learning_rate
        self.workers = multiprocessing.cpu_count()

    def train(self, sentences):
        """Train on ``sentences`` and save both the model and its word2vec-format vectors."""
        # hs and alpha are now forwarded; they used to be stored on the
        # instance but silently ignored during training.
        self.model = FastText(sentences,
                              size=self.size,
                              window=self.window,
                              min_count=self.min_count,
                              sg=self.sg,
                              hs=self.hs,
                              alpha=self.alpha,
                              workers=self.workers)
        self.model.save(self.model_file)
        # Export through the keyed vectors, as the query methods below do.
        self.model.wv.save_word2vec_format(self.model_file + '.bin',
                                           binary=True)

    def train_model(self):
        """Train from ``src_file`` treated as a single line-per-sentence file."""
        self.train(LineSentence(self.src_file))

    def online_train_model(self, file_name, isdir=True):
        """Continue training the current model on new data (online training).

        The class used to define this method twice; the surviving definition
        called itself without end and ignored ``file_name``.

        Args:
            file_name: directory (``isdir=True``) or file with one sentence
                per line.
            isdir: whether ``file_name`` is a directory of corpus files.
        """
        if isdir:
            sentences = PathLineSentences(file_name)
        else:
            sentences = LineSentence(file_name)
        # update=True adds new words to the existing vocabulary instead of
        # rebuilding it from scratch.
        self.model.build_vocab(sentences, update=True)
        self.model.train(sentences,
                         total_examples=self.model.corpus_count,
                         epochs=self.model.iter)

    def train_dir_model(self):
        """Train from ``src_file`` treated as a directory of corpus files."""
        self.train(PathLineSentences(self.src_file))

    def load_model(self, model_name):
        """Load a previously saved model from ``model_name``."""
        self.model = FastText.load(model_name)

    def show_similarity(self, word1, word2):
        """Return the similarity of the two words according to the model."""
        return self.model.wv.similarity(word1, word2)

    def show_word_vector(self, word):
        """Return the vector of ``word``."""
        return self.model.wv[word]
def train(self):
    """Train FastText on ``self.sentences`` and save it in the working directory.

    The model is written to '<current working directory>/mymodel.bin'.

    Returns:
        The trained model.
    """
    hyper_params = dict(size=200, window=3, min_count=1, iter=70)
    trained = FastText(self.sentences, **hyper_params)
    target = os.getcwd() + '/mymodel.bin'
    trained.save(target)
    return trained
def bible_embeddings(processed_bible):
    """Train a default FastText model for 10 epochs and write it to 'bible_ft.bin'.

    Args:
        processed_bible: the processed bible corpus; its ``len`` is used as
            the number of training examples.
    """
    ft = FastText()
    ft.build_vocab(sentences=processed_bible)
    example_count = len(processed_bible)
    ft.train(sentences=processed_bible,
             total_examples=example_count,
             epochs=10)
    ft.save("bible_ft.bin")
def _train_and_save_model_ft(sents, model_path):
    """Train a FastText model (``sg=1``) on ``sents``, save it and return it.

    Args:
        sents: training sentences.
        model_path: destination of the saved model.
    """
    hyper_params = dict(size=128,
                        window=32,
                        min_count=5,
                        sample=1e-2,
                        sg=1,
                        iter=50)
    trained = FastText(sents, **hyper_params)
    trained.save(model_path)
    return trained
def fasttext(model_path, sentences):
    """Train a FastText model with ``min_count=1`` and save it.

    See https://radimrehurek.com/gensim/models/fasttext.html

    Args:
        model_path: destination of the saved model; should have a .model
            extension.
        sentences: list of lists of strings (tokens).

    Returns:
        ``model_path``, unchanged.
    """
    # The unused `word_vectors = model.wv` local was dropped.
    model = FastText(sentences, min_count=1)
    model.save(model_path)
    return model_path
def fasttext_train(corpus_path, save_path):
    """Train FastText from an already tokenized txt file, one text per line.

    Args:
        corpus_path: path of the tokenized corpus file.
        save_path: destination of the saved model.
    """
    hyper_params = dict(window=5, size=200, min_count=1, workers=2)
    ft = FastText(**hyper_params)
    # Scan over the corpus to build the vocabulary.
    ft.build_vocab(corpus_file=corpus_path)
    # Number of words in the corpus, as counted while building the vocabulary.
    word_total = ft.corpus_total_words
    ft.train(corpus_file=corpus_path, total_words=word_total, epochs=5)
    ft.save(save_path)
class NP2vec:
    """
    Initialize the np2vec model, train it, save it and load it.
    """

    def is_marked(self, s):
        """
        Check if a string is marked.

        Args:
            s (str): string to check

        Returns:
            bool: True when ``s`` is non-empty and ends with the mark character.
        """
        return len(s) > 0 and s[-1] == self.mark_char

    def __init__(
            self,
            corpus,
            corpus_format='txt',
            mark_char='_',
            word_embedding_type='word2vec',
            sg=0,
            size=100,
            window=10,
            alpha=0.025,
            min_alpha=0.0001,
            min_count=5,
            sample=1e-5,
            workers=20,
            hs=0,
            negative=25,
            cbow_mean=1,
            iter=15,
            min_n=3,
            max_n=6,
            word_ngrams=1):
        """
        Initialize np2vec model and train it.

        Args:
            corpus (str): path to the corpus.
            corpus_format (str {json,txt,conll2000}): format of the input marked corpus.
                For json format, the file should contain an iterable of sentences. Each
                sentence is a list of terms (unicode strings) that will be used for training.
            mark_char (char): special character that marks NP's suffix.
            word_embedding_type (str {word2vec,fasttext}): word embedding model type;
                word2vec and fasttext are supported.
            sg (int {0,1}): model training hyperparameter, skip-gram. Defines the training
                algorithm. If 1, skip-gram is used, otherwise CBOW is employed.
            size (int): model training hyperparameter, size of the feature vectors.
            window (int): model training hyperparameter, maximum distance between the
                current and predicted word within a sentence.
            alpha (float): model training hyperparameter. The initial learning rate.
            min_alpha (float): model training hyperparameter. Learning rate will linearly
                drop to `min_alpha` as training progresses.
            min_count (int): model training hyperparameter, ignore all words with total
                frequency lower than this.
            sample (float): model training hyperparameter, threshold for configuring which
                higher-frequency words are randomly downsampled, useful range is (0, 1e-5)
            workers (int): model training hyperparameter, number of worker threads.
            hs (int {0,1}): model training hyperparameter, hierarchical softmax. If set to
                1, hierarchical softmax will be used for model training. If set to 0, and
                `negative` is non-zero, negative sampling will be used.
            negative (int): model training hyperparameter, negative sampling. If > 0,
                negative sampling will be used, the int for negative specifies how many
                "noise words" should be drawn (usually between 5-20). If set to 0, no
                negative sampling is used.
            cbow_mean (int {0,1}): model training hyperparameter. If 0, use the sum of the
                context word vectors. If 1, use the mean, only applies when cbow is used.
            iter (int): model training hyperparameter, number of iterations.
            min_n (int): fasttext training hyperparameter. Min length of char ngrams to be
                used for training word representations.
            max_n (int): fasttext training hyperparameter. Max length of char ngrams to be
                used for training word representations. Set `max_n` to be lesser than
                `min_n` to avoid char ngrams being used.
            word_ngrams (int {0,1}): fasttext training hyperparameter. If 1, uses enrich
                word vectors with subword (ngrams) information. If 0, this is equivalent
                to word2vec training.
        """
        self.mark_char = mark_char
        self.word_embedding_type = word_embedding_type
        self.sg = sg
        self.size = size
        self.window = window
        self.alpha = alpha
        self.min_alpha = min_alpha
        self.min_count = min_count
        self.sample = sample
        self.workers = workers
        self.hs = hs
        self.negative = negative
        self.cbow_mean = cbow_mean
        self.iter = iter
        self.min_n = min_n
        self.max_n = max_n
        self.word_ngrams = word_ngrams

        if corpus_format == 'txt':
            self._sentences = LineSentence(corpus)
        elif corpus_format == 'json':
            with open(corpus) as json_data:
                self._sentences = json.load(json_data)
        elif corpus_format == 'conll2000':
            try:
                self._sentences = list()
                for chunked_sent in conll2000.chunked_sents(corpus):
                    tokens = list()
                    for chunk in chunked_sent:
                        if hasattr(chunk, '_label') and chunk._label == 'NP':
                            # Join the NP's words into one token, each word
                            # followed by the mark character.
                            tokens.append(
                                ''.join(w[0] + self.mark_char for w in chunk))
                        elif isinstance(chunk, nltk.Tree):
                            tokens.extend(w[0] for w in chunk)
                        else:
                            tokens.append(chunk[0])
                    self._sentences.append(tokens)
            except Exception:
                print('Conll2000 dataset is missing from NLTK. See downloading details in the '
                      'README file')
        else:
            logger.error('invalid corpus format: ' + corpus_format)
            sys.exit(0)

        if word_embedding_type == 'fasttext' and word_ngrams == 1:
            # Remove the marking character at the end for subword fasttext
            # model training.  A new list is built because the 'txt' corpus
            # is a LineSentence stream, which cannot be assigned into by
            # index; this loads the whole corpus into memory.
            self._sentences = [
                [w[:-1] if self.is_marked(w) else w for w in sentence]
                for sentence in self._sentences
            ]

        logger.info('training np2vec model')
        self._train()

    def _train(self):
        """
        Train the np2vec model.
        """
        # Hyperparameters shared by both model types.
        common_params = dict(
            sg=self.sg,
            size=self.size,
            window=self.window,
            alpha=self.alpha,
            min_alpha=self.min_alpha,
            min_count=self.min_count,
            sample=self.sample,
            workers=self.workers,
            hs=self.hs,
            negative=self.negative,
            cbow_mean=self.cbow_mean,
            # Must be self.iter: a bare `iter` here is the builtin function,
            # which is what the fasttext branch used to pass.
            iter=self.iter)
        if self.word_embedding_type == 'word2vec':
            self.model = Word2Vec(self._sentences, **common_params)
        elif self.word_embedding_type == 'fasttext':
            self.model = FastText(
                self._sentences,
                min_n=self.min_n,
                max_n=self.max_n,
                word_ngrams=self.word_ngrams,
                **common_params)
        else:
            logger.error(
                'invalid word embedding type: ' +
                self.word_embedding_type)
            sys.exit(0)

    def save(self, np2vec_model_file='np2vec.model', binary=False):
        """
        Save the np2vec model.

        For a fasttext model with subword information the full model is
        saved.  Otherwise only the vectors of marked (NP) terms are written,
        in word2vec text or binary format, most frequent first.

        Args:
            np2vec_model_file (str): the file the np2vec model is written to
            binary (bool): whether to store the model in binary format; must be
                True when word_embedding_type is fasttext and word_ngrams is 1
        """
        if self.word_embedding_type == 'fasttext' and self.word_ngrams == 1:
            if not binary:
                logger.error(
                    "if word_embedding_type is fasttext and word_ngrams is 1, "
                    "binary should be set to True.")
                sys.exit(0)
            # not relevant to prune fasttext subword model
            self.model.save(np2vec_model_file)
            return

        # prune non NP terms
        logger.info('pruning np2vec model')
        vector_size = self.model.vector_size
        total_vec = sum(
            1 for word in self.model.wv.vocab.keys() if self.is_marked(word))
        logger.info(
            "storing %sx%s projection weights for NP's into %s" %
            (total_vec, vector_size, np2vec_model_file))
        with utils.smart_open(np2vec_model_file, 'wb') as fout:
            fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
            # store NP vectors in sorted order: most frequent NP's at the top
            by_frequency = sorted(
                iteritems(self.model.wv.vocab),
                key=lambda item: -item[1].count)
            for word, vocab in by_frequency:
                if not self.is_marked(word):
                    continue
                embedding_vec = self.model.wv.syn0[vocab.index]
                if binary:
                    # tobytes() is the current name of the deprecated
                    # tostring() alias and yields the same bytes.
                    fout.write(
                        utils.to_utf8(word) + b" " + embedding_vec.tobytes())
                else:
                    fout.write(
                        utils.to_utf8(
                            "%s %s\n" %
                            (word, ' '.join(
                                "%f" %
                                val for val in embedding_vec))))

    @classmethod
    def load(cls, np2vec_model_file, binary=False, word_ngrams=0):
        """
        Load the np2vec model.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load
            binary (bool): boolean indicating whether the np2vec model to load is in binary
                format
            word_ngrams (int {1,0}): If 1, np2vec model to load uses word vectors with
                subword (ngrams) information.

        Returns:
            np2vec model to load; None (after logging an error) when
            ``word_ngrams`` is neither 0 nor 1.
        """
        if word_ngrams == 0:
            return KeyedVectors.load_word2vec_format(
                np2vec_model_file, binary=binary)
        elif word_ngrams == 1:
            return FastText.load(np2vec_model_file)
        else:
            logger.error('invalid value for \'word_ngrams\'')