def testFastText(self):
    class LeeReader(object):
        def __init__(self, fn):
            self.fn = fn

        def __iter__(self):
            with smart_open(self.fn, 'r', encoding="latin_1") as infile:
                for line in infile:
                    yield line.lower().strip().split()

    model = FastText(LeeReader(datapath('lee.cor')))
    model.init_sims()
    index = self.indexer(model, 10)
    self.assertVectorIsSimilarToItself(model.wv, index)
    self.assertApproxNeighborsMatchExact(model, model.wv, index)
    self.assertIndexSaved(index)
    self.assertLoadedIndexEqual(index, model)
def get_fasttext_embedding_matrix(word_index, max_nb_words):
    model = fText.load_fasttext_format(FASTTEXT_FILE)
    nb_words = max_nb_words
    word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i > max_nb_words:
            continue
        embedding_vector = model.wv[word]
        if embedding_vector is not None:
            word_embedding_matrix[i] = embedding_vector
    print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))
    return word_embedding_matrix, nb_words
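# Note: FastText composes vectors for out-of-vocabulary words from character n-grams, so
# model.wv[word] essentially never returns None; a truly unrepresentable token raises KeyError
# instead. A minimal defensive sketch of the same matrix-building step, assuming the
# Facebook-format file path and embedding dimension are supplied by the caller (both
# hypothetical here, mirroring FASTTEXT_FILE and EMBEDDING_DIM above):
import numpy as np
from gensim.models import FastText as fText


def get_embedding_matrix_safe(word_index, max_nb_words, fasttext_file, embedding_dim):
    model = fText.load_fasttext_format(fasttext_file)
    matrix = np.zeros((max_nb_words + 1, embedding_dim))
    for word, i in word_index.items():
        if i > max_nb_words:
            continue
        try:
            matrix[i] = model.wv[word]  # OOV words are built from char n-grams
        except KeyError:
            pass  # no usable n-grams at all: leave the zero row
    return matrix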
def _train(self):
    """Train the np2vec model."""
    if self.word_embedding_type == 'word2vec':
        self.model = Word2Vec(
            self._sentences,
            sg=self.sg,
            size=self.size,
            window=self.window,
            alpha=self.alpha,
            min_alpha=self.min_alpha,
            min_count=self.min_count,
            sample=self.sample,
            workers=self.workers,
            hs=self.hs,
            negative=self.negative,
            cbow_mean=self.cbow_mean,
            iter=self.iter)
    elif self.word_embedding_type == 'fasttext':
        self.model = FastText(
            self._sentences,
            sg=self.sg,
            size=self.size,
            window=self.window,
            alpha=self.alpha,
            min_alpha=self.min_alpha,
            min_count=self.min_count,
            sample=self.sample,
            workers=self.workers,
            hs=self.hs,
            negative=self.negative,
            cbow_mean=self.cbow_mean,
            iter=self.iter,  # was `iter=iter`, which passed the builtin instead of the configured value
            min_n=self.min_n,
            max_n=self.max_n,
            word_ngrams=self.word_ngrams)
    else:
        logger.error('invalid word embedding type: ' + self.word_embedding_type)
        sys.exit(0)
def load(cls, np2vec_model_file, binary=False, word_ngrams=0):
    """
    Load the np2vec model.

    Args:
        np2vec_model_file (str): the file containing the np2vec model to load.
        binary (bool): whether the np2vec model to load is in binary format.
        word_ngrams (int {0,1}): if 1, the np2vec model to load uses word vectors with
            subword (ngrams) information.

    Returns:
        np2vec model to load
    """
    if word_ngrams == 0:
        return KeyedVectors.load_word2vec_format(np2vec_model_file, binary=binary)
    elif word_ngrams == 1:
        return FastText.load(np2vec_model_file)
    else:
        logger.error('invalid value for \'word_ngrams\'')
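# The two branches above return different object types: a full FastText model when the
# np2vec model was trained with subword information (word_ngrams=1), and plain KeyedVectors
# for the pruned word2vec-format export (word_ngrams=0). A usage sketch, assuming this
# `load` lives on the NP2vec class shown later in this section; the file names and query
# terms are hypothetical:
ft_model = NP2vec.load('np2vec.model', word_ngrams=1)        # full FastText model
print(ft_model.wv.most_similar('noun_phrase', topn=5))       # placeholder query term

kv = NP2vec.load('np2vec.txt', binary=False, word_ngrams=0)  # pruned NP KeyedVectors
print(kv.most_similar('noun_phrase_', topn=5))               # NP terms keep the mark char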
def word_embedding(self):
    return FastText(
        self.tokens,
        size=100,
        min_count=5,
        workers=multiprocessing.cpu_count(),
        sg=1)
def main(): class MyIter(object): def __iter__(self): # absolute_path = r'C:\Users\NA\Desktop\Workspace\GJAI_WarmingUpProject\AIJOA_Project\wiki.ko' file_name = 'voice.txt' path = datapath(absolute_path + '/' + file_name) # 학습시킬 데이터의 절대경로 with utils.open(path, 'r', encoding='UTF-8') as fin: for line in fin: yield list(tokenize(line)) # 데이터 불러오기 # absolute_path = r'C:\Users\NA\Desktop\Workspace\GJAI_WarmingUpProject\AIJOA_Project\wiki.ko' file_name = 'wiki.ko.bin' # file_name = 'cc.ko.300.bin' # wiki.ko.bin corpus_file = datapath(absolute_path + '/' + file_name) # 데이터의 절대경로 상대경로는 안되나? 응 안돼 # 모델 구축 # model 객체화 # 우측 링크의 매개변수 참조 : https://radimrehurek.com/gensim/models/fasttext.html print("모델 객체 생성") # model = FastText(size=1000, window=3, min_count=3, workers=4, sg=1) model = models.fasttext.load_facebook_model(corpus_file) # 불러온 모델에 이어서 학습하기 # MyIter()로 불러온 데이터 new_sentences 리스트에 담기 new_sentences = [] for i in MyIter(): new_sentences.append(i) print(new_sentences) # MyIter()로 불러온 데이터양 늘리기 new_sentences = new_sentences * 10000 print(len(new_sentences)) # corpus_total_words (및 corpus_count) 모델 속성을 추가 설정 # 이어서 학습시킬때에는 update=True로 설정해야 기존 학습한 어휘가 추가된체로 학습함. print("추가 어휘구성") model.build_vocab(new_sentences, update=True) # Update the vocabulary print("추가 어휘구성 끝") # 추가 학습 print("추가 학습") model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs) print("추가 학습 끝") # 모델 저장 : 저장경로는 절대 절대 절대 경로로 설정 안그러면 어만데 저장됨. save_name = "wiki_ko_v3.model" fname = get_tmpfile(absolute_path + '/' + save_name) print("model save start") model.save(fname) print("model save end") # 저장했던 모델 불러오기 : 절대 경로임. # absolute_path = r'C:\Users\NA\Desktop\Workspace\GJAI_WarmingUpProject\AIJOA_Project\wiki.ko' file_name = "wiki_ko_v3.model" fname = get_tmpfile(absolute_path + '/' + file_name) print("model load start") model = FastText.load(fname) print("model load end") menu_list = { '폴더버거 핫치킨': [ '골드버거 치킨', '오늘도 봐봐 치킨', '오늘도 보고 와 치킨', '불도 먹었어 치킨', '골드버거 핫치킨', '골드버거 치킨', '월드 보고 아침에', '오늘도 보고 와 치킨', '폴더 버거 킹', '홀더 버거 치킨', '뭘 더 먹어 치킨', '너 먹어 치킨', '뭐 먹어 치킨' ], '폴더버거 비프': [ '골드버그 비프', '올더 버거 비프', '폴더 버거 비프', '골드버그 비프 세트', '올더 버거 비프 세트', '어디서 먹어 핑크색', '물 더 먹어 비트 세트', '골드버그 비프 세트', '올 더 버거 비틀 세트', '홀더 버거 비프', '뭘 더 먹어 비프', '너 먹어 피프 세트', '뭐 먹어 비프' ], '리아미라클버거': ['리아미라클버거', '미아 미라클버거', '리아미라클버거 세트', '미라클버거 세트', '리아 미라클 버거 세트'], '와규 에디션 투': [ '외규에디션 2', '마귀 에디션 2', '와규에디션 2', '와 귀신 전화', '월요일 좀 주세요 전화', '와규에디션 2 세트', '와규에디션 2 세트', '목요일 샘플 세트' ], '더블엑스투': [ '브렉시트', '더블엑스 2', '더블 X2 세트', '저번에 지출 세트', '버그 렉스필드 전화', '노래 치킨 세트', '더블 X2 세트', '더블엑스 풀세트', '더블엑스 두 세트' ], '티렉스': [ '티렉스 버거', '티렉스 버거 세트', '티렉스버거세트', '티렉스버거 찾아', '티렉스 버거 세트 두 개', '티렉스버거세트 두 개' ], '클래식 치즈버거': [ '클래식 치즈버거', '클래식 치즈버거 세트 하나', '클래식 치즈버거 틀어', '클래식 치즈버거 세트', '클래식 치즈버거 세트 두 개' ], '한우불고기': [ '한우 불고기', '한우불고기 세트 하나', '한우 불고기 집 전화', '한우 불고기 제주 TV', '한우불고기 두 개', '한우불고기 세트 두 개' ], '모짜렐라 인 더 버거 베이컨': [ '모짜렐라인 더 버거', '모짜렐라인 더 버거 세트 하나', '모짜렐라 인더버거 베트남', '모짜렐라인 더 버거 세트 두 개', '모짜렐라인 더 버거 세트' ], # '모짜렐라 in the 버거':['모짜렐라인 더 버거', '모짜렐라인 더 버거 세트 하나', # '모짜렐라 인더버거 베트남', '모짜렐라인 더 버거 세트 두 개', # '모짜렐라인 더 버거 세트'], # '모짜렐라 인 더 버거':['모짜렐라인 더 버거', '모짜렐라인 더 버거 세트 하나', # '모짜렐라 인더버거 베트남', '모짜렐라인 더 버거 세트 두 개', # '모짜렐라인 더 버거 세트'], '에이지버거': [ 'az버거', 'az버거 세트', '에이지버거', '에이지버거 세트', '아재버거', '아재버거 세트 하나', '거제도 가서 찾아', '아재 동생 두 개', '아재버거세트 두 개', '아재버거 세트 두 개' ], 'az버거': [ 'az버거', 'az버거 세트', '에이지버거', '에이지버거 세트', '아재버거', '아재버거 세트 하나', '거제도 가서 찾아', '아재 동생 두 개', '아재버거세트 두 개', '아재버거 세트 두 개' ], '에이제트버거': [ 'az버거', 'az버거 세트', '에이제트버거', '에이제트버거 세트', '아재버거', '아재버거 세트 하나', '거제도 가서 찾아', '아재 동생 두 
개', '아재버거세트 두 개', '아재버거 세트 두 개' ], '원조 빅불': [ '원조빅불', '원조빅불 세트', '원조빅불세트', '언제 도착하나', '물제 를 풀 세트', '원조 빅불 세트', '오늘 이불세트' ], '핫크리스피버거': [ '핫크리스피버거', '하트스피가방', '핫크리스피버거 세트', '크리스피 버거 세트', '핫 크리스피버거 세트', '하트 스트커 세트' ], '불고기버거': ['불고기 버거', '불고기 버거 세트 하나', '불고기 버거 세트', '불고기버거 세트 두 개'], '데리버거': [ '데리버거', '데리버거 세트 하나', '데리버거 찾아', '데리버거 두 개', '데리버거 세트 두 개', '데리버거세트 두 개' ], '치킨버거': ['치킨버거', '치킨 먹어', '치킨 버거 세트', '치킨 먹었다', '치킨 먹어서 두 개', '치킨 버거 세트 두 개'], '새우버거': ['새우버거', '재봉 설탕', '새우버거 세트', '일본어 태어나', '여보 가서 켜', '새우버거 속'] } # 메뉴오탈과 메들 사이의 유사도 print("메뉴오탈과 메들 사이의 유사도 비교 파일 생성 시작") for key1 in menu_list.keys(): file_name = f'{key1}메뉴의 오탈자와 전체메뉴들과의 유사도.txt' for value in menu_list[key1]: for key2 in menu_list.keys(): # print(f"메뉴'{key1}'의 오탈자 '{value}'와 메뉴명'{key2}'와의 유사도: {model.wv.similarity(value, key2)}") if file_name not in os.listdir(absolute_path): with open(absolute_path + '/' + file_name, 'w', encoding='utf-8') as file_data: file_data.write( f"'{key1}'의 오탈자 '{value}'와 메뉴명'{key2}'와의 유사도: {model.wv.similarity(value, key2)}\n" ) else: with open(absolute_path + '/' + file_name, 'a', encoding='utf-8') as file_data: file_data.write( f"'{key1}'의 오탈자 '{value}'와 메뉴명'{key2}'와의 유사도: {model.wv.similarity(value, key2)}\n" ) print("메뉴오탈과 메들 사이의 유사도 비교 파일 생성 종료")
def getResult(text): print("START READ AND PREPROCESSING", text) # kitab = np.load('dataset/numpy/kitabsave.npy', allow_pickle=True) kitab = np.load(resource_path('./data/dataset/kitabsave.npy'), allow_pickle=True) kitab = kitab.tolist() # sentence_clear = np.load('dataset/numpy/sentence_clearsave.npy', allow_pickle=True) sentence_clear = np.load(resource_path('./data/dataset/sentence_clearsave.npy'), allow_pickle=True) sentence_clear = sentence_clear.tolist() # kategori = np.load('dataset/numpy/kategori.npy', allow_pickle=True) kategori = np.load(resource_path('./data/dataset/kategori.npy'), allow_pickle=True) kategori = kategori.tolist() # namakitab = np.load('dataset/numpy/namakitab.npy', allow_pickle=True) namakitab = np.load(resource_path('./data/dataset/namakitab.npy'), allow_pickle=True) namakitab = namakitab.tolist() # kitab[0][1] # modelFT = ft.load('Model/modelFT.model') modelFT = ft.load(resource_path('./data/model/modelFT.model')) #TF-IDF tfidf_vectorizer = TfidfVectorizer() norm_tf=[] for isikitab in kitab: for ktb in isikitab: norm_tfidf = normalizeArabic(ktb) norm_tf.append(norm_tfidf) tfidf_doc = tfidf_vectorizer.fit_transform(norm_tf) tfidf_word=tfidf_vectorizer.get_feature_names() PIFQvectorizer = CountVectorizer() vectoreTF = PIFQvectorizer.fit_transform(norm_tf) featureTf = PIFQvectorizer.get_feature_names() cosimhasil = [] cosimhasilnilai = [] hasilqenilaidicosim = [] namakitabcosim = [] halamankitabcosim = [] isikitabcosim = [] inputandicosim = [] pifqhasil = [] pifqhasilnilai = [] hasilqenilaidipifq = [] namakitabpifq = [] halamankitabpifq = [] isikitabpifq = [] inputandipifq = [] gabunganhasil = [] gabunganhasilnilai = [] hasilqenilaidigabungan = [] namakitabgabungan= [] halamankitabgabungan = [] isikitabgabungan = [] inputandigabungan = [] # MOST SIMILAR WE hasilQE = modelFT.wv.most_similar(text) hasilQE = [(strip_tashkeel(''.join(c for c in hasilQE[i][0] if not ud.category(c).startswith('P'))), hasilQE[i][1]) for i in range(len(hasilQE))] # print(hasilQE) cosim = [] hasilpifq = [] hasilgabungan = [] nilaihasilgabungan = [] nilaihasilpifq = [] nilaicosim = [] QEpakai = hasilQE[0:3] for i in QEpakai: tes=i[0] tfidf_query = tfidf_vectorizer.transform([tes]) cos=0.0 #hitung kedekatan query pada masing masing dokumen cos=cosine_similarity(tfidf_doc,tfidf_query) # print(type(cos)) cosim.append(max(cos)) nilaicosim.append(cos) # print('tfidf') # ================ countTF = [] s = ''.join(c for c in tes if not ud.category(c).startswith('P')) s = strip_tashkeel(s) for k in range(len(featureTf)): if featureTf[k] == s: # print(k) for j in range(vectoreTF.shape[0]): countTF.append(vectoreTF[j,k]) #PIFQ nilaipifq = [] for k in countTF: if sum(countTF) == 0: nilaipifq.append(0) else: nilaipifq.append(1 + np.log10(1 + (k / sum(countTF))) + 0.5) nilaihasilpifq.append(nilaipifq) hasilpifq.append(max(nilaipifq)) # print('pifq') #gabungan nilaigabungan = [] for k in range(vectoreTF.shape[0]): nilaigabungan.append(nilaipifq[k] * cos[k][0]) nilaihasilgabungan.append(nilaigabungan) hasilgabungan.append(max(nilaigabungan)) # print("======= hasil Cosim ===========") angka = 0 for i in nilaicosim: # print(hasilQE[angka][0]) for j in range(len(i)): if i[j] == cosim[angka]: panjangkitab= 0; for iterkitab in range(len(kitab)): panjangkitab = panjangkitab + len(kitab[iterkitab]) if j <= panjangkitab: tessplit = kitab[iterkitab][0].split(',') # print('Nama Kitab {} halaman ke {}'.format(namakitab[iterkitab],tessplit[4])) # print('isi kitab : ', tessplit[5]) cosimhasil.append(hasilQE[angka][0]) 
hasilqenilaidicosim.append(hasilQE[angka][1]) cosimhasilnilai.append(cosim[angka]) namakitabcosim.append(namakitab[iterkitab]) halamankitabcosim.append(tessplit[4]) isikitabcosim.append(tessplit[5]) # inputandicosim.append(kata) break; angka += 1 # print("====== hasil pifq ==========") angka = 0 for i in nilaihasilpifq: # print(hasilQE[angka][0]) for j in range(len(i)): if i[j] == hasilpifq[angka]: panjangkitab= 0; for iterkitab in range(len(kitab)): panjangkitab = panjangkitab + len(kitab[iterkitab]) if j <= panjangkitab: tessplit = kitab[iterkitab][0].split(',') # print('Nama Kitab {} halaman ke {}'.format(namakitab[iterkitab],tessplit[4])) # print('isi kitab : ', tessplit[5]) pifqhasil.append(hasilQE[angka][0]) pifqhasilnilai.append(hasilpifq[angka]) hasilqenilaidipifq.append(hasilQE[angka][1]) namakitabpifq.append(namakitab[iterkitab]) halamankitabpifq.append(tessplit[4]) isikitabpifq.append(tessplit[5]) # inputandipifq.append(kata) break; angka += 1 # print("============== Hasil Gabungan ===============") print("============== selesai ===============") angka = 0 for i in nilaihasilgabungan: # print(hasilQE[angka][0]) for j in range(len(i)): if i[j] == hasilgabungan[angka]: panjangkitab= 0; for iterkitab in range(len(kitab)): panjangkitab = panjangkitab + len(kitab[iterkitab]) if j <= panjangkitab: tessplit = kitab[iterkitab][0].split(',') # print('Nama Kitab {} halaman ke {}'.format(namakitab[iterkitab],tessplit[4])) # print('isi kitab : ', tessplit[5]) gabunganhasil.append(hasilQE[angka][0]) gabunganhasilnilai.append(hasilgabungan[angka]) hasilqenilaidigabungan.append(hasilQE[angka][1]) namakitabgabungan.append(namakitab[iterkitab]) halamankitabgabungan.append(tessplit[4]) isikitabgabungan.append(tessplit[5]) # inputandigabungan.append(kata) break; angka += 1 nilaihasilcosim = [] for i in cosimhasilnilai: nilaihasilcosim.append(i[0])
def train_model(clxn):
    # Materialize the sentences: gensim needs a restartable iterable, not a one-shot
    # iterator, because it scans the corpus once for the vocabulary and again for training.
    sents = [sent for item in clxn for sent in tokenize(item.text)]
    model = FastText(sents, size=300)
    return model
x.append(ls[0])
temp = []
for j in tokenizer.tokenize(ls[0].decode('utf-8')):
    temp.append(j)
data.append(temp)
lent.append(len(temp))
y_test.append(int(ls[1]))

f.close()
pad_len = max(lent)

# model_FT = FastText(data, size=10, window=5, min_count=1, workers=5, sg=1, max_vocab_size=10000)
# model_FT.save("SG_fasttext.model")
model_FT = FastText.load("SG_fasttext.model")
print("SG FT model_done!")  # was a Python 2 print statement

voc = list(model_FT.wv.vocab)
print(len(voc))

XVAL = fit_transform(data)
print("Transformed!!")

x_train = XVAL[:m]
print(np.array(x_train).shape)
x_test = XVAL[m:]
print(np.array(x_test).shape)
    test_size=0.3, random_state=2018)
x_pos_train, x_pos_test, __, ___ = train_test_split(x_pos, y, test_size=0.3, random_state=2018)

print('==== training BiLSTM-CRF ====')

# %%
# from gensim.models import Word2Vec
# model_ted = Word2Vec(sentences=vocab, size=50, window=3, min_count=1, workers=4, sg=1)
# model_ted.save('word2vec.model')
from gensim.models import FastText, Word2Vec

model_ted = FastText(sentences=vocab, size=EMBED_DIM // 4, window=3, min_count=1, workers=4)
# model_ted = Word2Vec(sent, size=2 * (EMBED_DIM // 5), window=3, min_count=1, workers=4, sg=0)
# model_ted = Word2Vec(sent, size=2 * (EMBED_DIM // 5), window=3, min_count=1, workers=4, sg=1)


def get_weight_matrix(embedding, vocab):
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab)
    # define weight matrix dimensions with all 0
    weight_matrix = numpy.zeros((vocab_size, 50))
    # step vocab, store vectors using the Tokenizer's integer mapping
    # wordvectors = embedding.wv
    for i, word in enumerate(vocab):
        # weight_matrix[i] = wordvectors.word_vec(word)
sims = sorted(enumerate(sims), key=lambda item: -item[1])
df2 = pd.DataFrame(sims, columns=['attack_id', 'sim_score'])
for i in range(0, len(df2)):
    df2.text[i] = df.Description[df2.attack_id[i]]

from gensim.models import Word2Vec
model_hack = Word2Vec(sentences=texts, size=100, window=5, min_count=2, workers=4, sg=0)
model_hack.wv.most_similar('researchers')

from gensim.models import FastText
model_hack = FastText(texts, size=100, window=5, min_count=5, workers=4, sg=1)
model_hack.wv.most_similar('researchers')

from elasticsearch import Elasticsearch
es = Elasticsearch(['http://cloudweb01.isi.edu/es/'], http_auth=('effect', 'c@use!23'), port=80)
print(es.info())

# search for blogs/news since a certain timeframe.
results = es.search(index="effect/socialmedia", scroll='1d', size=20000, body={
    "query": {
        "range": {
def error_correction_fasttext_with_retrain_wiki(self, model_type, datasets_type, dataparam_1, dataparam_2): total_error = 0 total_error_to_repaired = 0 total_repaired = 0 if model_type == "Fasttext_All_Domain": #every time it will load the pretrained model to test new wiki table error_correction = self.prepare_testing_datasets_wiki( dataparam_1, dataparam_2 ) #dataparam_1 : json_list, dataparam_2: path of json_filelist model_fasttext = FastText.load("model/Fasttext_All_Domain.w2v") if model_type == "Fasttext_CV_Fold": model_fasttext = FastText.load("model/Fasttext_CV_Fold.w2v") if model_type == "Fasttext_Domain_Location": model_fasttext = FastText.load( "model/Fasttext_Location_Domain.w2v") error_correction = self.prepare_domain_testing_datasets_wiki( dataparam_1, dataparam_2, "location") total_error = self.calculate_total_error_wiki( dataparam_1, dataparam_2) if datasets_type == "wiki": train_data_rows = [] for rf in dataparam_1: if rf.endswith(".json"): try: revision_list = json.load( io.open(os.path.join(dataparam_2, rf), encoding="utf-8")) one_item = revision_list[-1] old_value = str(one_item[0]['old_value'].strip()) new_value = str(one_item[0]['new_value'].strip()) vicinity = one_item[0]['vicinity'] vicinity = remove_markup(str(vicinity)) vicinity = ast.literal_eval(vicinity) #print('Before: ',vicinity) train_vicinity_index = vicinity.index(old_value) del vicinity[train_vicinity_index] vicinity.append(new_value) vicinity = [ x for x in vicinity if not any(x1.isdigit() for x1 in x) ] vicinity = [x for x in vicinity if len(x) != 0 ] #remove empty item from list #vicinity=[re.sub('[^a-zA-Z0-9.-]+', ' ', _) for _ in vicinity] #print('After: ', vicinity) #row=list(filter(None, row)) dirty_table = one_item[0]['dirty_table'] for index, row in enumerate(dirty_table): if index == 0: continue shape = len(row) row = remove_markup(str(row)) row = ast.literal_eval(row) row = list(filter(None, row)) #remove all digit row = [ x for x in row if not any(x1.isdigit() for x1 in x) ] row = [x for x in row if len(x) != 0 ] #remove empty item from list if row: row = [ re.sub('[^a-zA-Z0-9.-]+', ' ', _) for _ in row ] train_data_rows.append(row) except Exception as e: print('Exception: ', str(e)) if train_data_rows: model_fasttext.build_vocab(train_data_rows, update=True) model_fasttext.train(sentences=train_data_rows, total_examples=len(train_data_rows), epochs=5) for error_value, actual_value in zip(error_correction['error'], error_correction['actual']): try: if model_type == "Fasttext_Domain_Location": pass else: total_error = total_error + 1 if not any(x1.isdigit() for x1 in error_value): total_error_to_repaired = total_error_to_repaired + 1 similar_value = model_fasttext.most_similar( error_value) #print('Actual value: ', actual_value,'Most similar value of : ',error_value, ' ' , similar_value) first, b = similar_value[0] #print('Error : ', error_value, ' Fixed: ', first, ' Actual: ', actual_value) first = first.strip() actual_value = actual_value.strip() if first == actual_value: print('Error : ', error_value, ' Fixed: ', first, ' Actual: ', actual_value) total_repaired = total_repaired + 1 except: continue print(total_error, total_error_to_repaired, total_repaired) model_type = model_type + ' retrain wiki ' self.evaluate_model(model_type, total_error, total_error_to_repaired, total_repaired)
def loadEmbedding(filename):
    print("Loading Word Embedding...")
    it_model = FastText.load_fasttext_format(filename, full_model=False)
    print("...Done!")
    print("Building Sqlite DB....")
    return it_model
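# In gensim 3.8+ load_fasttext_format is deprecated; load_facebook_vectors roughly covers
# the full_model=False case (vectors only, not trainable) and load_facebook_model loads the
# trainable model. A minimal sketch under that assumption, with a hypothetical file path:
from gensim.models.fasttext import load_facebook_model, load_facebook_vectors

wv = load_facebook_vectors('cc.it.300.bin')      # vectors only: smaller, cannot be retrained
print(wv.most_similar('ciao', topn=3))

model = load_facebook_model('cc.it.300.bin')     # full model: supports further training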
def main(argv): topic = argv[0] filelang = argv[1] mainlang = argv[2] path = "/home/oyku/embeddings/fasttext/wiki." + filelang + ".align.vec" dictionary = load_vec(path) mono_path = "/home/oyku/monolingual_fasttext/cc." + filelang + ".300" mono_wv = fText.load_fasttext_format(mono_path) file = "/home/oyku/myversion/oov_words/" + mainlang + "/" + topic + "_" + filelang + ".txt" f = open(file, 'r', encoding='utf8') content = f.readlines() cont = set() for el in content: if not el.strip().isdigit(): cont.add(el.strip()) print("The number of OOVs: " + str(len(content))) print("The number of word OOVs: " + str(len(cont))) ## Morphologic morphs = {} for blob in cont: if not blob.isdigit(): text = Text(blob) text.language = filelang morphemes = [] for morp in text.morphemes: if len(morp) > 3 and morp in dictionary: morphemes.append(morp) if len(morphemes) != 0: morphs[blob] = morphemes print("Morphologic check is over") left = cont.difference(morphs) ## Spelling spellex = {} for oov in left: if len(oov) > 2: possibles = [] for inv in dictionary: if stringdist.rdlevenshtein(oov, inv) == 1: possibles.append(inv) if len(possibles) == 1: spellex[oov] = possibles print("Spelling check is over") next_left = left.difference(spellex) fasttext_bin = {} for oov in next_left: try: similars = mono_wv.wv.most_similar(oov.strip()) most_sim = "" for sim in similars: if sim[0] in dictionary and sim[1] > 0.5: most_sim = sim[0] break if most_sim != "": fasttext_bin[oov.strip()] = [most_sim] except: continue print("Fasttext check is over") print("-----------------------------------------------") print("Identified with morphologic analysis: " + str(len(morphs))) print("Identified with spell analysis: " + str(len(spellex))) print("Identified with Fasttext: " + str(len(fasttext_bin))) union = union3(morphs, spellex, fasttext_bin) print("Total: " + str(len(union))) saved_path = "/home/oyku/myversion/oov_matches/" + mainlang + "/" + topic + "_" + filelang + ".p" pickle.dump(union, open(saved_path, "wb"))
def Updates(): try: print("updating Doc2Vec") print(updating) a = stem.snowball.ArabicStemmer() stopwords_list = stopwords.words('arabic') df = pd.read_csv('textc-Copy1.csv', encoding='utf-8') df["contenu"].fillna("محتوى فارغ", inplace=True) df["article"].fillna("محتوى فارغ", inplace=True) y = df['ToF'] df = df.drop('ToF', axis=1) text = [] for i in range(df.shape[0]): x = nltk.tokenize.wordpunct_tokenize(df.contenu[i]) text1 = [a.stem(word) for word in x] text.append(text1) titre = [ a.stem(word) for word in df.article if word not in stopwords_list ] #doc2vec docs = [] analyzedDocument = namedtuple('AnalyzedDocument', 'words tags') for i, te in enumerate(text): tags = [i] docs.append(analyzedDocument(te, tags)) model = doc2vec.Doc2Vec(docs, vector_size=300, non_negative=True, window=8, min_count=1, workers=4, dm=1) from gensim.test.utils import get_tmpfile fname = get_tmpfile("doc2vec.model") model.save(fname) model = doc2vec.Doc2Vec.load(fname) print("updating fastext") class MyItera(object): def __iter__(self): for line in Corpus.article: filtered_sentence = [] for w in tokenize(line): if w not in stop_words: filtered_sentence.append(w) yield filtered_sentence class MyIter(object): def __iter__(self): for line in Corpus.contenu: filtered_sentence = [] for w in tokenize(line): if w not in stop_words: filtered_sentence.append(w) yield filtered_sentence model = FastText(size=150, window=3, min_count=1) model.build_vocab(sentences=MyIter()) total_examples = model.corpus_count model.train(sentences=MyIter(), total_examples=total_examples, epochs=5) except: Update()
def loadfasttext():
    fname = get_tmpfile("fasttext.model")
    model_fasttext = FastText.load(fname)
    return model_fasttext
def make_model_fasttext(dataset, setting):
    pert_sent = list(perturbed_iterator(dataset, setting))
    model = FastText(pert_sent, workers=effective_n_jobs(-1))
    model.save(f'fasttext_models/{setting}.bin')
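# Note that model.save() writes gensim's native pickle-based format even though the file is
# named .bin, the extension usually associated with Facebook's binary format; it must be
# reopened with FastText.load, not load_facebook_model. A round-trip sketch with a
# hypothetical setting name and query word:
from gensim.models import FastText

reloaded = FastText.load('fasttext_models/default.bin')
print(reloaded.wv.most_similar('example', topn=5))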
from gensim.models import FastText

print('Starting to load fasttext embeddings...')
path_to_fasttext_emb = '/tmp/wiki.ru.bin'
ft_model = FastText.load_fasttext_format(path_to_fasttext_emb)
print('Done!')  # moved after the load so the message reflects reality
print(ft_model.wv['снег'])
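# Because the Facebook wiki.ru model carries subword n-grams, it can also build vectors for
# words outside its vocabulary; a short sketch of the difference (the OOV query below is a
# made-up word chosen only for illustration):
word = 'снег'
print(word in ft_model.wv.vocab)           # True for a frequent in-vocabulary word
print(ft_model.wv.most_similar(word)[:3])  # nearest neighbours

oov = 'снегопадище'                        # almost certainly not in the vocabulary
print(oov in ft_model.wv.vocab)            # False
print(ft_model.wv[oov].shape)              # still returns a 300-d vector built from n-grams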
from fse import IndexedList
from fse.models.average import FAST_VERSION, MAX_WORDS_IN_BATCH
from fse.models import SIF
from gensim.models import FastText
import logging

logging.basicConfig(
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
    level=logging.INFO)

w2v_model = "H:/Vietnamese word representations/Word_vector_data/VnNewsWord2Vec/VnNewsWord2Vec.bin"
lookup = FastText.load_fasttext_format(w2v_model, encoding='utf-8')

sentences = []
s = IndexedList(sentences)
print(len(s))

title_file = 'H:/Vietnamese word representations/News-titles-embedding/Data/tokenized_titles_cleaned'
with open(title_file, 'r', encoding='utf-8') as file:
    for line in file:
        sentences.append(line.split())

s = IndexedList(sentences)
model = SIF(lookup, workers=2)
model.train(s)
model.save('sent2vec')
def main(): # --- argument parsing --- ( model_name, epochs, min_count, cores, checkpoint_every, cache_in_memory, lowercase, fasttext, args ) = parse_args(default_model_name='w2v_default', default_epochs=100) # --- init logging --- logger = init_logging(name=model_name, basic=True, to_file=True, to_stdout=False) log_args(logger, args) input_dir = join(SMPL_PATH, 'dewiki') model_dir = join(EMB_PATH, model_name) if not exists(model_dir): makedirs(model_dir) logger.info('model dir: ' + model_dir) t0 = time() if cache_in_memory: # needs approx. 25GB of RAM logger.info('cache data in memory') sentences = [s for s in Sentences(input_dir, logger, lowercase=lowercase)] else: sentences = Sentences(input_dir, logger, use_file_cache=True, lowercase=lowercase) gc.collect() # Model initialization logger.info('Initializing new model') if fasttext: model = FastText( size=300, window=5, min_count=min_count, sample=1e-5, negative=5, sg=1, seed=42, iter=epochs, workers=cores, min_n=3, max_n=6, ) else: model = Word2Vec( size=300, window=5, min_count=min_count, sample=1e-5, negative=5, sg=1, seed=42, iter=epochs, workers=cores, ) logger.info('Building vocab') model.build_vocab(sentences, progress_per=100_000) # Model Training epoch_saver = EpochSaver(model_name, model_dir, checkpoint_every) epoch_logger = EpochLogger(logger) logger.info('Training {:d} epochs'.format(epochs)) model.train( sentences, total_examples=model.corpus_count, epochs=model.epochs, report_delay=60, callbacks=[epoch_logger, epoch_saver], ) # saving model file_path = join(model_dir, model_name) logger.info('Writing model to ' + file_path) model.callbacks = () model.save(file_path) t1 = int(time() - t0) logger.info("all done in {:02d}:{:02d}:{:02d}".format(t1//3600, (t1//60) % 60, t1 % 60))
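# The training loop above reports progress through project-specific EpochSaver/EpochLogger
# callbacks. Gensim's hook for this is gensim.models.callbacks.CallbackAny2Vec; the class
# below is only an illustrative stand-in for EpochLogger, not the project's implementation:
from gensim.models.callbacks import CallbackAny2Vec


class SimpleEpochLogger(CallbackAny2Vec):
    """Minimal per-epoch logger sketch."""

    def __init__(self):
        self.epoch = 0

    def on_epoch_begin(self, model):
        print('starting epoch {}'.format(self.epoch))

    def on_epoch_end(self, model):
        print('finished epoch {}'.format(self.epoch))
        self.epoch += 1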
def error_correction_fasttext_with_retrain_realworld( self, datasets_type, dataset_1, dataset_2, model_type): total_error = 0 total_error_to_repaired = 0 total_repaired = 0 try: if model_type == "Fasttext_All_Domain": if datasets_type == "real_world": error_correction = self.prepare_testing_datasets_real_world( dataset_1, dataset_2) #dataset_1 clean data for real world model_fasttext = FastText.load("model/Fasttext_All_Domain.w2v") if model_type == "Fasttext_Domain_Location": error_correction = self.prepare_domain_testing_datasets_realworld( dataset_1, dataset_2) #dataset_1 clean data for real world total_error = self.calculate_total_error_realworld( dataset_1, dataset_2) model_fasttext = FastText.load( "model/Fasttext_Location_Domain.w2v") except Exception as e: print('Model Error: ', str(e)) data_for_retrain = self.prepare_dataset_for_retrain_realworld( dataset_1, dataset_2) train_data_rows = [] try: data_for_retrain = data_for_retrain.values.tolist() for row in data_for_retrain: row = list(map(str, row)) row = list(filter(None, row)) train_data_rows.append(row) if train_data_rows: if train_data_rows: model_fasttext.build_vocab(train_data_rows, update=True) model_fasttext.train(sentences=train_data_rows, total_examples=len(train_data_rows), epochs=5) except Exception as e: print("Exception from spell model : ", str(e)) for error_value, actual_value in zip(error_correction['error'], error_correction['actual']): if model_type == "Fasttext_Domain_Location" and datasets_type == "real_world": pass else: total_error = total_error + 1 try: if not any(x1.isdigit() for x1 in error_value): total_error_to_repaired = total_error_to_repaired + 1 similar_value = model_fasttext.most_similar(error_value) first, b = similar_value[0] first = first.lower() actual_value = actual_value.lower() #print('Error : ', error_value, ' Fixed: ', first, ' Actual: ', actual_value) first = first.strip() actual_value = actual_value.strip() if first == actual_value: print('Error : ', error_value, ' Fixed: ', first, ' Actual: ', actual_value) total_repaired = total_repaired + 1 except Exception as e: print('Error correction model: ', str(e)) continue self.evaluate_model(model_type, total_error, total_error_to_repaired, total_repaired)
for ix in range(len(word_index)):
    word = word_index[ix]
    # word_encode = word_index[ix].encode('utf-8')
    vec = word_vectors[word]
    for j in range(int(dim)):
        E[ix][j] = vec[j]

window_size = 10000
embedding = Embedding(100, 32, weights=[E], input_length=window_size, trainable=False)

# https://gist.github.com/brandonrobertz/49424db4164edb0d8ab34f16a3b742d5
trained_model = FastText.load_fasttext_format('data_noun_token')

input_train = []
fin = open('data_noun_token.txt', 'r')
lines = fin.readlines()
print(len(lines))
for line in lines:
    word_vector = []
    for word in line.split():
        try:
            word_vector.append(trained_model[word])
        except KeyError:
            pass
    input_train.append(word_vector)
fin.close()
for word in tokens:
    if word not in diccionario:
        diccionario[word] = 1
    else:
        diccionario[word] += 1

print("number of sentences in the corpus: " + str(len(sentences)))
print("number of unique words: " + str(len(diccionario)))

num_features = [20, 50, 100]               # dimensionality of the resulting word vectors
min_word_count = 1                         # minimum word count threshold
num_workers = multiprocessing.cpu_count()  # number of threads to run in parallel
context_size = 5                           # context window length
seed = 1                                   # seed for the RNG, to make the result reproducible

for p in num_features:
    fasttext_model = FastText(
        sentences=sentences,
        size=p,
        window=context_size,
        min_count=min_word_count,
        workers=num_workers,
        sg=1  # skip-gram
    )
    fasttext_model.wv.save_word2vec_format(
        'model/fasttext_skip-gram_model_bioinfer_' + str(p) + '.txt',
        binary=False)
    del fasttext_model
import numpy as np
import pickle
import utils

# Settings
max_size, max_seq_len = 50, 30
(modelno_to_goodsnm, modelno_to_goodsnms) = utils.model_basic_dict()
# print("✱ the # of classes (catalogs): ", len(modelno_to_goodsnm))  # not-toy set

# Loading..
# ├ (1) toy dict (pre-stored by toyData.py)
# ├ (2) word embedding model (pre-trained by wordEmbedding.py)
# └ (3) LSTM model (pre-stored by lstm.py)
with open('dictionary/toyDict.pickle', 'rb') as handle:
    toy_dict = pickle.load(handle)
fastText = FastText.load('model/FastText.bin')
preLSTM = load_model('model/lstm.h5')

LSTM = MyLSTM(toy_dict=toy_dict, embedding_model=fastText)
(X_train, Y_train, X_val, Y_val, X_test, Y_test, toy_train_dict) = LSTM.split_train_test()
X_test = sequence.pad_sequences(np.array(X_test), maxlen=max_seq_len)
index_dict = LSTM.create_index_dict()

# --------------------- Prediction --------------------- #
X_new = X_test[:50]
Y_new = Y_test[:50]
Y_hat = list(preLSTM.predict_classes(X_new))
print("\nY_hat: {},\nY_new: {}\n".format(Y_hat, Y_new))
for i in range(len(Y_hat)):
def train_fasttext(corpus):
    model = FastText(size=9, window=2, min_count=1)
    model.build_vocab(sentences=corpus)
    model.train(sentences=corpus, total_examples=len(corpus), epochs=10)
    model.save(r'models\fasttext.model')
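# A matching loader for the model saved above; the 9-dimensional vector size comes from the
# training call, while the query word is only a placeholder:
from gensim.models import FastText

model = FastText.load(r'models\fasttext.model')
print(model.wv.vector_size)            # 9, per train_fasttext
print(model.wv.most_similar('word'))   # placeholder query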
def main(): try: with open("token_counts.json") as infile: token_counts = json.load(infile) except: print( "Failed to load token counts - did you `python train.py build-corpus $INFILE` yet?" ) sys.exit(1) if os.path.exists("vectors_out.sqlite"): os.unlink("vectors_out.sqlite") conn = sqlite3.connect("vectors_out.sqlite") conn.execute(""" CREATE TABLE vector_meta ( vector_float_bytes integer, embedding_dimensions integer, vocab_size integer, oov_token text, build_parameters text ) """) conn.execute( "INSERT INTO vector_meta VALUES (?, ?, ?, ?, ?)", ("float32", EMBED_DIM, VOCAB_SIZE, OOV_TOKEN, jsonify_build_params()), ) conn.execute( "CREATE TABLE vectors (token text primary key, vector_bytes blob);") conn.execute( "CREATE TABLE frequencies (token text primary key, count integer);") print(f"Loaded {sum(token_counts.values())} words. Training model...") ft_model = FastText( corpus_file="corpus.txt", size=EMBED_DIM, window=WINDOW_LEN, min_count=MIN_COUNT, max_vocab_size=VOCAB_SIZE, alpha=LEARNING_RATE, sg=SKIPGRAM, sorted_vocab=1, iter=EPOCHS, workers=NUM_WORKERS, negative=NEGATIVE_SAMPLES, ns_exponent=NEG_SAMP_DIST, ) vectors = {} vocab = {} sorted_counts = sorted(token_counts.items(), key=lambda p: -p[1]) idx = 2 progress = tqdm(desc="Writing Vectors", total=VOCAB_SIZE) oov_vector = ft_model.wv[OOV_TOKEN] conn.execute("INSERT INTO vectors VALUES (?, ?)", (OOV_TOKEN, oov_vector)) conn.execute("INSERT INTO frequencies VALUES (?, ?)", (OOV_TOKEN, 0)) for token, count in sorted_counts: try: vector = ft_model.wv.get_vector(token) except: continue vectors[token] = vector vocab[token] = idx vector_bytes = vector.astype("float32").tobytes() conn.execute("INSERT INTO vectors VALUES (?, ?)", (token, vector_bytes)) conn.execute("INSERT INTO frequencies VALUES (?, ?)", (token, count)) progress.update(1) idx += 1 if idx >= VOCAB_SIZE: break print("Writing vectors_out.sqlite...") conn.commit() progress.close() print("Writing vocab.json...") with open("vocab.json", "w") as outfile: json.dump(vocab, outfile) print("Done!")
if embedding == 'w2v':
    model = Word2Vec(sents, size=dimension, window=word_window, workers=3,
                     sg=skip_grams, hs=softmax, negative=negative_sample, iter=EPOCHS)
    model.wv.save_word2vec_format(model_filename, binary=False)
    model.vocabulary.save(vocab_filename)
else:
    model = FastText(sg=skip_grams, hs=softmax, size=dimension,
                     window=word_window, workers=3, negative=negative_sample)
    model.build_vocab(sentences=sents)
    model.train(sentences=sents, total_examples=len(sents), epochs=EPOCHS)
    model.save(model_filename)
    # model.vocabulary.save(vocab_filename)

print("Model saved at " + model_filename)

embedding_model = []
if embedding == 'w2v':
    embedding_model = Embedding.from_word2vec(model_filename)
else:
def generate_fasttext(messages: list):
    model = FastText(min_count=3, workers=8, window=3, min_n=1)
    model.build_vocab(sentences=messages)
    model.train(sentences=messages, total_examples=len(messages), epochs=10)
    return model
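# A minimal sketch of calling generate_fasttext on pre-tokenized messages; the toy corpus is
# invented and repeated so that min_count=3 is satisfied:
messages = [
    ["hello", "there"],
    ["hello", "world"],
    ["hi", "there", "friend"],
] * 10

model = generate_fasttext(messages)
print(model.wv.most_similar("helo", topn=3))  # misspelling still resolves via char n-grams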
                    help='corpus')
parser.add_argument('--base', '-b', dest='base', action='store', required=True, help='base')
args = parser.parse_args()

corpus_file = args.base + "/" + args.corpus
base = args.base

# modelSG = Word2Vec(sentencia, sg=1, window=1, min_count=1)
output = base + "/gSG_" + base + ".vec"
modelSG = Word2Vec(corpus_file=corpus_file, sg=1, window=1, min_count=1)
modelSG.wv.save_word2vec_format(output, binary=False)

# modelCB = Word2Vec(sentencia, sg=0, window=1, min_count=1)
output = base + "/gCB_" + base + ".vec"
modelCB = Word2Vec(corpus_file=corpus_file, sg=0, window=1, min_count=1)
modelCB.wv.save_word2vec_format(output, binary=False)

output = base + "/gFT_" + base + ".vec"
modelFT = FastText(window=1, min_count=1, sg=1)
modelFT.build_vocab(corpus_file=corpus_file)   # scan over the corpus to build the vocabulary
total_words = modelFT.corpus_total_words       # number of words in the corpus
modelFT.train(corpus_file=corpus_file, total_words=total_words, epochs=5)
modelFT.wv.save_word2vec_format(output, binary=False)
# characters
chars = set([w_i for w in words for w_i in w])
n_chars = len(chars)
print("Number of characters: ", n_chars)

tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
# Vocabulary Key: tag_index -> Value: Label/Tag
idx2tag = {i: w for w, i in tag2idx.items()}
# Char Key: char -> Value: token_index
char2idx = {c: i + 2 for i, c in enumerate(chars)}
char2idx["UNK"] = 1
char2idx["PAD"] = 0

# load pretrained word embedding
words_fast = FastText.load('model_fast30/model_fast.model')
embedding_matrix = np.ones((len(word2idx), 100), dtype='float32')
embedding_matrix[0] = np.zeros(100, dtype='float32')

# with open('wiki-news-300d-1M.vec') as f:
for i in range(2, len(idx2word) - 2):
    embedding_matrix[i] = words_fast[idx2word[i]]
    # ordered_words_ft.append(s[0])

print('Found %s word vectors.' % len(embedding_matrix))

# for word, i in word2idx.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # words not found in embedding index will be all-zeros.
#         embedding_matrix[i] = embedding_vector
""" `gensim.models.FastText` 使用示例 """ # gensim 示例 import gensim import numpy as np from gensim.test.utils import common_texts from gensim.models.keyedvectors import FastTextKeyedVectors from gensim.models._utils_any2vec import compute_ngrams, ft_hash from gensim.models import FastText # 构建 FastText 模型 sentences = [["Hello", "World", "!"], ["I", "am", "huay", "."]] min_ngrams, max_ngrams = 2, 4 # ngrams 范围 model = FastText(sentences, size=5, min_count=1, min_n=min_ngrams, max_n=max_ngrams) # 可以通过相同的方式获取每个单词以及任一个 n-gram 的向量 print(model.wv['hello']) print(model.wv['<h']) """ [-0.03481839 0.00606661 0.02581969 0.00188777 0.0325358 ] [ 0.04481247 -0.1784363 -0.03192253 0.07162753 0.16744071] """ print() # 词向量和 n-gram 向量是分开存储的 print(len(model.wv.vectors)) # 7 print(len(model.wv.vectors_ngrams)) # 57 # gensim 好像没有提供直接获取所有 ngrams tokens 的方法 print(model.wv.vocab.keys())
    word2id[i[0]] = int(i[1])
read_file.close()

# Hyperparameter settings
epcho = 1
batch_size = 256
num_to_ev = 400               # evaluate locally after this many training batches
vocab_size = len(word2id)     # vocabulary size
embedding_dim = 256           # word vector dimensionality
t_max_len = 22                # maximum title length
q_max_len = 11                # maximum query length
lr = 0.0001                   # learning rate

# Load the validation set
val_data = load_esim_data_and_labels(
    "/home/kesci/work/data/eval_data/19_eval.csv",
    word2id,
    q_max_len=q_max_len,
    t_max_len=t_max_len)

# Each word is represented by concatenating a 128-d word2vec vector and a 128-d fastText
# vector into a 256-d combined vector
ce = np.random.uniform(-1, 1, [vocab_size + 1, embedding_dim])
word2vec_model = Word2Vec.load("/home/kesci/word2vec.model")
fast_model = FastText.load("./data/fast_w2v.model")
ce[0] = np.zeros(embedding_dim)
for i in word2id:
    try:
        ce[word2id[i]] = np.concatenate((word2vec_model[i], fast_model[i]))
    except KeyError:
        print(i)
set_b = set(b["PaperTitle"].to_list() + b["abstract"].to_list())
list_ab = list(set_a) + list(set_b)

cores = multiprocessing.cpu_count()
ps = PorterStemmer()
with Pool(cores) as p:
    sentences = p.map(stem_lines, list_ab)
# sentences = [stem_lines(i) for i in list_ab]
# for s in tokenizer.pipe(list_ab):
#     sentences.append([t.text for t in s if not t.check_flag(IS_PUNCT)])

print("Train the FastText model")
# word2vec_model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=cores, sg=0)
# word2vec_model.wv.save_word2vec_format("word2vec_model", binary=True)  # binary format
print(sentences[0])

fastText_model = FastText(size=100, window=5, min_count=1, sentences=sentences, iter=5, workers=cores)
fastText_model.wv.save("fastText_model")
print("finish training")

word_vectors = fastText_model.wv  # KeyedVectors.load_word2vec_format("fastText_model")
b['embedding'] = b.apply(
    lambda x: wv_papers(x['abstract'], x['PaperTitle'], word_vectors), axis=1)
print(b.head())
print(len(b))
b.to_pickle("fasttext_embedding.pickle")
import os
from gensim.models import FastText

EXP_HOME = "F:/MyWorks/Thesis Works/Crowdsource_Knowledge_Base/DeepGenQR/experiment"
# EXP_HOME = "C:/My MSc/ThesisWorks/BigData_Code_Search/DeepGenQR/experiment"
model_file = EXP_HOME + '/pymodel/github-deepcs'
model = FastText.load(model_file)
# print(len(model.wv))
# quit()

word_file = EXP_HOME + '/w2vec-data/words.txt'
vec_file = EXP_HOME + '/w2vec-data/github-vector.txt'

vec_lines = list()
words = open(word_file, 'r')
for word in words:
    try:
        if word.strip() in model.wv:
            vector = model.wv[word.strip()]
            line = word.strip() + " " + ' '.join(str(x) for x in vector)
            vec_lines.append(line)
    except IOError:
        print("Could not find " + word)

output_file = open(vec_file, 'w')
for content in vec_lines:
    output_file.write("%s\n" % content)
output_file.close()
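# The exported file is word2vec text format minus the "<count> <dim>" header that
# KeyedVectors.load_word2vec_format expects; a sketch of prepending it so the file can be
# read back with gensim (header values taken from the variables above):
from gensim.models import KeyedVectors

with open(vec_file, 'r') as fin:
    body = fin.read()
with open(vec_file, 'w') as fout:
    fout.write("%d %d\n" % (len(vec_lines), model.vector_size))
    fout.write(body)

kv = KeyedVectors.load_word2vec_format(vec_file, binary=False)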
class NP2vec: """ Initialize the np2vec model, train it, save it and load it. """ def is_marked(self, s): """ Check if a string is marked. Args: s (str): string to check """ return len(s) > 0 and s[-1] == self.mark_char def __init__( self, corpus, corpus_format='txt', mark_char='_', word_embedding_type='word2vec', sg=0, size=100, window=10, alpha=0.025, min_alpha=0.0001, min_count=5, sample=1e-5, workers=20, hs=0, negative=25, cbow_mean=1, iter=15, min_n=3, max_n=6, word_ngrams=1): """ Initialize np2vec model and train it. Args: corpus (str): path to the corpus. corpus_format (str {json,txt,conll2000}): format of the input marked corpus; txt and json formats are supported. For json format, the file should contain an iterable of sentences. Each sentence is a list of terms (unicode strings) that will be used for training. mark_char (char): special character that marks NP's suffix. word_embedding_type (str {word2vec,fasttext}): word embedding model type; word2vec and fasttext are supported. np2vec_model_file (str): path to the file where the trained np2vec model has to be stored. binary (bool): boolean indicating whether the model is stored in binary format; if word_embedding_type is fasttext and word_ngrams is 1, binary should be set to True. sg (int {0,1}): model training hyperparameter, skip-gram. Defines the training algorithm. If 1, CBOW is used,otherwise, skip-gram is employed. size (int): model training hyperparameter, size of the feature vectors. window (int): model training hyperparameter, maximum distance between the current and predicted word within a sentence. alpha (float): model training hyperparameter. The initial learning rate. min_alpha (float): model training hyperparameter. Learning rate will linearly drop to `min_alpha` as training progresses. min_count (int): model training hyperparameter, ignore all words with total frequency lower than this. sample (float): model training hyperparameter, threshold for configuring which higher-frequency words are randomly downsampled, useful range is (0, 1e-5) workers (int): model training hyperparameter, number of worker threads. hs (int {0,1}): model training hyperparameter, hierarchical softmax. If set to 1, hierarchical softmax will be used for model training. If set to 0, and `negative` is non- zero, negative sampling will be used. negative (int): model training hyperparameter, negative sampling. If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn (usually between 5-20). If set to 0, no negative sampling is used. cbow_mean (int {0,1}): model training hyperparameter. If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. iter (int): model training hyperparameter, number of iterations. min_n (int): fasttext training hyperparameter. Min length of char ngrams to be used for training word representations. max_n (int): fasttext training hyperparameter. Max length of char ngrams to be used for training word representations. Set `max_n` to be lesser than `min_n` to avoid char ngrams being used. word_ngrams (int {0,1}): fasttext training hyperparameter. If 1, uses enrich word vectors with subword (ngrams) information. If 0, this is equivalent to word2vec training. 
""" self.mark_char = mark_char self.word_embedding_type = word_embedding_type self.sg = sg self.size = size self.window = window self.alpha = alpha self.min_alpha = min_alpha self.min_count = min_count self.sample = sample self.workers = workers self.hs = hs self.negative = negative self.cbow_mean = cbow_mean self.iter = iter self.min_n = min_n self.max_n = max_n self.word_ngrams = word_ngrams if corpus_format == 'txt': self._sentences = LineSentence(corpus) elif corpus_format == 'json': with open(corpus) as json_data: self._sentences = json.load(json_data) elif corpus_format == 'conll2000': try: self._sentences = list() for chunked_sent in conll2000.chunked_sents(corpus): tokens = list() for chunk in chunked_sent: if hasattr(chunk, '_label') and chunk._label == 'NP': s = '' for w in chunk: s += w[0] + self.mark_char tokens.append(s) else: if isinstance(chunk, nltk.Tree): for w in chunk: tokens.append(w[0]) else: tokens.append(chunk[0]) self._sentences.append(tokens) except Exception: print('Conll2000 dataset is missing from NLTK. See downloading details in the ' 'README file') else: logger.error('invalid corpus format: ' + corpus_format) sys.exit(0) if word_embedding_type == 'fasttext' and word_ngrams == 1: # remove the marking character at the end for subword fasttext model training for i, sentence in enumerate(self._sentences): self._sentences[i] = [ w[:-1] if self.is_marked(w) else w for w in sentence] logger.info('training np2vec model') self._train() def _train(self): """ Train the np2vec model. """ if self.word_embedding_type == 'word2vec': self.model = Word2Vec( self._sentences, sg=self.sg, size=self.size, window=self.window, alpha=self.alpha, min_alpha=self.min_alpha, min_count=self.min_count, sample=self.sample, workers=self.workers, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, iter=self.iter) elif self.word_embedding_type == 'fasttext': self.model = FastText( self._sentences, sg=self.sg, size=self.size, window=self.window, alpha=self.alpha, min_alpha=self.min_alpha, min_count=self.min_count, sample=self.sample, workers=self.workers, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, iter=iter, min_n=self.min_n, max_n=self.max_n, word_ngrams=self.word_ngrams) else: logger.error( 'invalid word embedding type: ' + self.word_embedding_type) sys.exit(0) def save(self, np2vec_model_file='np2vec.model', binary=False): """ Save the np2vec model. 
Args: np2vec_model_file (str): the file containing the np2vec model to load binary (bool): boolean indicating whether the np2vec model to load is in binary format """ if self.word_embedding_type == 'fasttext' and self.word_ngrams == 1: if not binary: logger.error( "if word_embedding_type is fasttext and word_ngrams is 1, " "binary should be set to True.") sys.exit(0) # not relevant to prune fasttext subword model self.model.save(np2vec_model_file) else: # prune non NP terms logger.info('pruning np2vec model') total_vec = 0 vector_size = self.model.vector_size for word in self.model.wv.vocab.keys(): if self.is_marked(word): total_vec += 1 logger.info( "storing %sx%s projection weights for NP's into %s" % (total_vec, vector_size, np2vec_model_file)) with utils.smart_open(np2vec_model_file, 'wb') as fout: fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size))) # store NP vectors in sorted order: most frequent NP's at the top for word, vocab in sorted( iteritems( self.model.wv.vocab), key=lambda item: -item[1].count): if self.is_marked(word): embedding_vec = self.model.wv.syn0[vocab.index] if binary: fout.write( utils.to_utf8(word) + b" " + embedding_vec.tostring()) else: fout.write( utils.to_utf8( "%s %s\n" % (word, ' '.join( "%f" % val for val in embedding_vec)))) @classmethod def load(cls, np2vec_model_file, binary=False, word_ngrams=0): """ Load the np2vec model. Args: np2vec_model_file (str): the file containing the np2vec model to load binary (bool): boolean indicating whether the np2vec model to load is in binary format word_ngrams (int {1,0}): If 1, np2vec model to load uses word vectors with subword ( ngrams) information. Returns: np2vec model to load """ if word_ngrams == 0: return KeyedVectors.load_word2vec_format( np2vec_model_file, binary=binary) elif word_ngrams == 1: return FastText.load(np2vec_model_file) else: logger.error('invalid value for \'word_ngrams\'')