import re

import jieba
import numpy as np
from gensim.models import Word2Vec

sentences = []
with open("/home/siyuan/data/ner_sample.txt", "r") as f:
    cont = f.read()
lines = cont.split("\n")

# Sample 50,000 lines at random and segment each one with jieba.
idx = np.random.permutation(len(lines))
for i in idx[:50000]:
    sentences.append(list(jieba.cut(lines[i], HMM=True)))
print(sentences)

# Skip-gram (sg=1), 100-dimensional vectors.
model = Word2Vec(sentences, sg=1, size=100, window=5, min_count=5,
                 negative=3, sample=0.001, hs=1, workers=4)
model.save("./word2vec_gensim.model")


def isstopword(word):
    """Treat underscores and honorific patterns such as 'X先生' / 'X女士' as stop words."""
    if word == "_":
        return True
    if len(re.findall("(.先生|.女士)", word)) > 0:
        return True
    else:
        return False
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.decomposition import PCA

# s holds the raw text read from the file; replace newlines with spaces.
f = s.replace("\n", " ")

data = []
# iterate through each sentence in the file
for i in sent_tokenize(f):
    temp = []
    # tokenize the sentence into words
    for j in word_tokenize(i):
        temp.append(j.lower())
    data.append(temp)

sentences = data

# train model
model = Word2Vec(sentences, min_count=1)
# summarize the loaded model
print(model)
# summarize vocabulary
words = list(model.wv.vocab)
print(words)
# model.save('model.bin')

# project the word vectors onto two dimensions with PCA and plot them
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
plt.scatter(result[:, 0], result[:, 1])
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 18 16:04:35 2019

@author: chenjiannan
"""
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

path = get_tmpfile("word2vec.model")
model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

model.build_vocab([["hello", "world"]], update=True)
model.train([["hello", "world"]], total_examples=1, epochs=1)
model.wv['hello']

from gensim.test.utils import datapath
from contextlib import contextmanager
from timeit import default_timer

import gensim
from gensim.models import Word2Vec

tokenized_file = "tokenized_dataset.pickle"

# load our tokenized dataset
ids, questions, answers, all_answers = dl.get_dataset_tokens_loaded(dataset_file, tokenized_file)

# we want all vocabulary
alldocs = questions + all_answers
# for reshuffling per pass
doc_list = alldocs[:]
print('Input %d docs in total' % (len(doc_list)))

assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

model = Word2Vec(size=word_dim, window=10, min_count=1, workers=32)
model.build_vocab(alldocs)
# We only want to train on new words, so intersect with google pre-trained.
model.intersect_word2vec_format(g_pretrain_bin_file, binary=True, lockf=0.0)


@contextmanager
def elapsed_timer():
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end - start


def cwidvec2str(cwid, vec):
def delete_noisy_words(text, pathname):
    """Removes the noisy words listed in the file at pathname from the text."""
    with open(pathname) as file:
        noisy_words = [word.replace("\n", "") for word in file.readlines()]
    for odd_word in noisy_words:
        text = text.replace(" {} ".format(odd_word), " ")
    return text


df = pd.read_csv("data/clean_qa.csv", sep='\t')
noisy_words_filepath = "res/noisy_words.txt"

sentences = [[Porter.stem(word)
              for word in delete_noisy_words(sentence.lower(), noisy_words_filepath).split()
              if word]
             for sentence in df["Question"] if sentence]

model = Word2Vec(sentences, size=100, batch_words=5, window=4, min_count=5)

word_vectors = KeyedVectors.load("model/word2vec.model")
vocab = word_vectors.wv.vocab


def preprocessQuery(sentence):
    """Splits a sentence into words, removes noisy words and stems the rest."""
    preprocessed = [Porter.stem(word)
                    for word in delete_noisy_words(sentence.lower(), noisy_words_filepath).split()
                    if word]
    return preprocessed


def intersectWithVocab(sentence):
    """Intersects a tokenized sentence with the vocabulary of the trained model."""
    sentence = [word for word in sentence if word in vocab]
    return sentence
def wmd_query_k_con_4_dia(train_dia, train_con, val_dia, val_con,
                          infer_dia, infer_con, k_set, out_path):
    """
    Use Word Mover's Distance (WMD) to query k candidate concepts for each given diagnosis text.
    :param train_dia:
    :param train_con:
    :param val_dia:
    :param val_con:
    :param infer_dia:
    :param infer_con:
    :param k_set:
    :param out_path:
    :return:
    """
    start_time = time.time()

    # Diagnosis files: one "<diagnosis>\t..." record per line; keep only the diagnosis text.
    train_dia_f = open(train_dia, 'r', encoding='utf-8').readlines()
    train_dia_f = [i.split('\t')[0].rstrip('\n') for i in train_dia_f]
    val_dia_f = open(val_dia, 'r', encoding='utf-8').readlines()
    val_dia_f = [i.split('\t')[0].rstrip('\n') for i in val_dia_f]
    infer_dia_f = open(infer_dia, 'r', encoding='utf-8').readlines()
    infer_dia_f = [i.split('\t')[0].rstrip('\n') for i in infer_dia_f]

    # Concept files: one concept per line.
    train_con_f = open(train_con, 'r', encoding='utf-8').readlines()
    train_con_f = [i.rstrip('\n') for i in train_con_f]
    val_con_f = open(val_con, 'r', encoding='utf-8').readlines()
    val_con_f = [i.rstrip('\n') for i in val_con_f]
    infer_con_f = open(infer_con, 'r', encoding='utf-8').readlines()
    infer_con_f = [i.rstrip('\n') for i in infer_con_f]

    all_dia = train_dia_f + val_dia_f + infer_dia_f
    all_dia = list(set(all_dia))
    all_dia_num = len(all_dia)
    all_con = train_con_f + val_con_f + infer_con_f
    all_con = list(set(all_con))
    all_con_num = len(all_con)
    all_infer_con = list(set(infer_con_f))
    all_infer_con_num = len(all_infer_con)

    # The corpus is all diagnoses followed by all concepts, tokenized on spaces.
    corpus = all_dia + all_con
    corpus = [i.split(' ') for i in corpus]  # TODO load stop words
    corpus_dict = {}
    for idx, line in enumerate(corpus):
        corpus_dict[idx] = line

    w2v_model = Word2Vec(corpus, size=512, min_count=1, window=3, sg=0)  # sg=0: CBOW; sg=1: skip-gram
    wmd_model = WmdSimilarity(corpus=corpus, w2v_model=w2v_model, num_best=len(corpus))

    y_pred_acc = [0 for _ in range(len(infer_con_f))]
    y_pred_acc_5 = [0 for _ in range(len(infer_con_f))]
    y_pred_acc_10 = [0 for _ in range(len(infer_con_f))]
    y_true = [1 for _ in range(len(infer_con_f))]
    result = open(out_path + 'result.txt', 'w', encoding='utf-8')

    for idx, query_dia in enumerate(tqdm(infer_dia_f, desc='Inferring...', leave=False)):
        query_result = wmd_model[query_dia.split(' ')]
        # Keep only the concept entries (their corpus indices come after all diagnoses).
        candidate_con = {}
        for idx_value in query_result:
            if idx_value[0] > all_dia_num - 1:
                candidate_con[idx_value[0]] = idx_value[1]
        assert len(candidate_con) == all_con_num, \
            'The number of extracted concept texts is not {}!'.format(all_con_num)

        candidate_txt_value = {}
        for key, value in candidate_con.items():
            candidate_txt_value[' '.join(corpus_dict[key])] = value
        assert len(candidate_con) == all_con_num, \
            'The number of retrieved candidate concepts does not match the actual number of candidates!!!'

        # Rank candidates by similarity and score Acc@1, Acc@5 and Acc@10.
        sort_candidate_txt_value = sorted(candidate_txt_value.items(),
                                          key=lambda x: x[1], reverse=True)
        max_10_con = [txt_value[0] for txt_value in sort_candidate_txt_value]
        if infer_con_f[idx] == max_10_con[0]:
            y_pred_acc[idx] = 1
        if infer_con_f[idx] in max_10_con[:5]:
            y_pred_acc_5[idx] = 1
        if infer_con_f[idx] in max_10_con[:10]:
            y_pred_acc_10[idx] = 1

    acc = accuracy_score(y_true, y_pred_acc)
    acc_5 = accuracy_score(y_true, y_pred_acc_5)
    acc_10 = accuracy_score(y_true, y_pred_acc_10)
    f1 = f1_score(y_true, y_pred_acc)
    end_time = time.time()
    take_time = end_time - start_time
    per_item_time = take_time / len(infer_con_f)
    print('Dataset {}: Acc={}, Acc@5={}, Acc@10={}, F1={}, total time {}, average time per item {}'.format(
        data_set, round(acc, 3), round(acc_5, 3), round(acc_10, 3),
        round(f1, 3), take_time, round(per_item_time, 2)))
    result.write(
        'Dataset {}: Acc={}, Acc@5={}, Acc@10={}, F1={}, total time {}, average time per item {}'.format(
            data_set, acc, acc_5, acc_10, f1, take_time, per_item_time))
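A minimal invocation sketch for the function above; the imports mirror what the body relies on, and every file path, the k_set value, and the data_set name are hypothetical placeholders (the input files are assumed to be whitespace-tokenized, one item per line).

import time

from gensim.models import Word2Vec
from gensim.similarities import WmdSimilarity
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

data_set = 'demo'  # hypothetical name used in the summary line printed at the end
wmd_query_k_con_4_dia(train_dia='data/train_dia.txt', train_con='data/train_con.txt',
                      val_dia='data/val_dia.txt', val_con='data/val_con.txt',
                      infer_dia='data/infer_dia.txt', infer_con='data/infer_con.txt',
                      k_set=10, out_path='output/')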
    english[english.label == 1][test.columns]
]).reset_index()

# Spanish question 1
data['spa_qura_list_1'] = data['spa_qura1'].apply(lambda x: x.split(' '))
# Spanish question 2
data['spa_qura_list_2'] = data['spa_qura2'].apply(lambda x: x.split(' '))
# Collect both Spanish question columns into a single list of token lists
spa_list = list(data['spa_qura_list_1'])
spa_list.extend(list(data['spa_qura_list_2']))

# Word2Vec on the Spanish questions (skip-gram, 30-dimensional vectors)
model = Word2Vec(spa_list, sg=1, size=30, window=5, min_count=1,
                 negative=3, sample=0.001, hs=1, workers=8)


def seq_to_w2v(seq, model):
    """Concatenate the vectors of the first 30 tokens, zero-padding shorter sequences."""
    words = []
    default = [0 for x in range(30)]
    for i in range(30):
        if i < len(seq):
            words.extend(model[seq[i]])
        else:
            words.extend(default)
    return words
## Matches any Unicode digit (which includes [0-9] and also many other digit characters)
text = re.sub(r'\d', ' ', text)
## Matches Unicode whitespace characters (which includes [ \t\n\r\f\v] and also many other characters)
text = re.sub(r'\s+', ' ', text)

## Preparing dataset
sentences = nltk.sent_tokenize(text)  # paragraph into sentences
sentences = [nltk.word_tokenize(sentence) for sentence in sentences]  # sentences into words
for i in range(len(sentences)):
    sentences[i] = [
        word for word in sentences[i]
        if word not in set(stopwords.words('english'))
    ]

## Training Word2Vec model
model = Word2Vec(sentences, min_count=1)  # min_count=1: keep every word that appears at least once
words = model.wv.vocab  # vocabulary of the Word2Vec model

## Finding word vectors
vector = model.wv['college']  # 100-dimensional vector for the word 'college'

## Most similar words
similar = model.wv.most_similar('college')  # words most similar to 'college'
def word2vec_model(train_data):
    model = Word2Vec(train_data, size=30, window=3, min_count=1, iter=20)
    return model
# on_the               1052
# at_the               1035
# we_'re               1033
# i_was                1018
# of_the               1014
# ca_n't               1010
# are_you               994

bigram_model = Word2Vec(bigram[sentences], size=100)

# Count the bigram tokens (tokens containing "_") that are not stop words.
bigram_model_counter = Counter()
for key in bigram_model.vocab.keys():
    if key not in stopwords.words("english"):
        if len(key.split("_")) > 1:
            bigram_model_counter[key] += bigram_model.vocab[key].count

for key, counts in bigram_model_counter.most_common(50):
    print '{0: <20} {1}'.format(key.encode("utf-8"), counts)

# Example output:
# do_n't               2436
# gon_na               1576
# ca_n
def main(argv):
    # =======================
    #  D E F A U L T S
    # -----------------------
    # Default log level.
    logging.basicConfig(level=logging.INFO)
    # Default data directory.
    data_dir = ''
    # Default output directory.
    output_dir = 'output'

    # ===============================
    #  P A R S E   C L I   A R G S
    # -------------------------------
    # Parse cli args.
    try:
        opts, args = getopt.getopt(argv, "d:o:", ['data_dir=', 'output_dir='])
    except getopt.GetoptError:
        print('data_prep_jigsaw.py -d <data_dir> -o <output_dir>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('data_prep_jigsaw.py -d <data_dir> -o <output_dir>\n' +
                  'Defaults:\n' +
                  ' data_dir=\t\t' +
                  ' output_dir=\t\toutput')
            sys.exit()
        elif opt == '-v':
            logging.getLogger().setLevel(logging.DEBUG)
        elif opt == '-d':
            data_dir = str(arg)
        elif opt == '-o':
            output_dir = str(arg)

    # Set vars with default or passed-in values.
    # Path to data directory.
    data_path = Path(data_dir)
    # Output path.
    output_path = Path(output_dir)

    # Get the data, create dataframes from the CSVs.
    train_path = data_path / 'train.csv'
    train_df = pd.read_csv(train_path, header=0)
    test_path = data_path / 'test.csv'
    test_df = pd.read_csv(test_path, header=0)
    test_labels_path = data_path / 'test_labels.csv'
    test_labels_df = pd.read_csv(test_labels_path, header=0)

    # Print heads if debug.
    logging.debug(train_df.head())
    logging.debug(test_df.head())
    logging.debug(test_labels_df.head())

    # =============================
    #  P R E P A R E   D A T A
    # -----------------------------
    # Drop everything except for comment_text and the labels.
    train_df.drop(['id'], axis=1, inplace=True)

    # Merge test with test labels.
    merged_test_df = pd.merge(test_df, test_labels_df, on='id')
    # Get list of records with -1 for labels (they weren't used in kaggle evaluation).
    # Drop them.

    # Create output dir if it doesn't exist.
    try:
        os.makedirs(output_path)
    except FileExistsError:
        logging.info('Output directory already exists.')

    # Write features and labels to disk.
    csv_path = output_path / 'raw_train_set.csv'
    train_df.to_csv(csv_path)

    # ===================================================
    #  P R E P A R E   W O R D   E M B E D D I N G S
    # ---------------------------------------------------
    # Build vocabulary and word embeddings from source if needed.
    # Store records
    all_labels = []
    tokens = []
    maxsentlen = 0
    maxdoclen = 0
    num_dropped = 0

    # Process csv one line at a time
    with open(csv_path, mode='r') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        lineno = 0
        idx = 0
        for line in csv_reader:
            # Skip header.
            lineno += 1
            sys.stdout.write("Processing line %i \r" % lineno)
            sys.stdout.flush()
            # Begin at index 1.
            idx += 1

            # TODO This is coupled to this field. Change to arg?
            text = line['comment_text']

            # Process raw text.
            # Force lowercase.
            text = text.lower()
            # Remove unwanted tokens.
            text = re.sub("\\n", ' ', text)
            text = re.sub("\\t", ' ', text)
            # Remove single and double backticks.
            text = re.sub("`", '', text)
            # Remove single quotes.
            text = re.sub("'", '', text)
            # Replace multiple periods in sequence with one period.
            text = re.sub("\.{2,}", '.', text)
            # Replace everything except words, '.', '|', '?', and '!' with space.
            text = re.sub('[^\w_|\.|\?|!]+', ' ', text)
            # Replace periods with ' . '.
            text = re.sub('\.', ' . ', text)
            # Replace '?' with ' ? '.
            text = re.sub('\?', ' ? ', text)
            # Replace '!' with ' ! '.
            text = re.sub('!', ' ! ', text)

            # Tokenize by splitting on whitespace.
            # No leading or trailing whitespace is kept.
            # Consecutive spaces are treated as a single space.
            text = text.split()

            # Drop empty reviews.
            if len(text) == 0:
                num_dropped += 1
                continue

            # Split into sentences.
            sentences = []
            sentence = []
            for t in text:
                # Use '.', '!', '?' as markers of end of sentence.
                if t not in ['.', '!', '?']:
                    # Not at end of a sentence.
                    sentence.append(t)
                else:
                    # At end of a sentence.
                    sentence.append(t)
                    # Add sentence to sentences.
                    sentences.append(sentence)
                    # Track longest sentence.
                    if len(sentence) > maxsentlen:
                        maxsentlen = len(sentence)
                    # Reset sentence list.
                    sentence = []

            # If sentence has words, add to list of sentences.
            if len(sentence) > 0:
                sentences.append(sentence)

            # Add split sentences to tokens.
            tokens.append(sentences)

            # Track longest document.
            if len(sentences) > maxdoclen:
                maxdoclen = len(sentences)

            # Build list of labels for record.
            doc_labels = []
            doc_labels.append(line['toxic'])
            doc_labels.append(line['severe_toxic'])
            doc_labels.append(line['obscene'])
            doc_labels.append(line['threat'])
            doc_labels.append(line['insult'])
            doc_labels.append(line['identity_hate'])
            # Add list of labels to list of all labels.
            all_labels.append(doc_labels)

    # Use all processed raw text to train word2vec.
    allsents = [sent for doc in tokens for sent in doc]
    # TODO Make embedding size a cli arg w/ default of 300.
    embedding_size = 300
    model = Word2Vec(allsents, min_count=5, size=embedding_size, workers=4, iter=5)
    model.init_sims(replace=True)

    # Save all word embeddings to matrix
    vocab = np.zeros((len(model.wv.vocab) + 1, embedding_size))
    word2id = {}

    # First row of embedding matrix isn't used so that 0 can be masked.
    for key, val in model.wv.vocab.items():
        # Begin indexes with offset of 1.
        idx = val.__dict__['index'] + 1
        # Build 2D np array (idx, vector)
        vocab[idx, :] = model[key]
        # Dictionary mapping word to index.
        word2id[key] = idx

    # Switch keys/values and store id2word dictionary (for decoding examples).
    id2word = {y: x for x, y in word2id.items()}

    # Normalize embeddings.
    vocab -= vocab.mean()
    vocab /= (vocab.std() * 2)
    # Reset first row to 0.
    vocab[0, :] = np.zeros(embedding_size)

    # Add additional word embedding for unknown words.
    vocab = np.concatenate((vocab, np.random.rand(1, embedding_size)))
    # Index for unknown words.
    unk = len(vocab) - 1

    # Convert words to word indices.
    data = {}
    for idx, doc in enumerate(tokens):
        sys.stdout.write('processing %i of %i records \r' % (idx + 1, len(tokens)))
        sys.stdout.flush()
        dic = {}
        # Get label for each index.
        dic['labels'] = all_labels[idx]
        # Get text of each document.
        dic['text'] = doc
        # Build list of indices representing the words of each sentence;
        # if a word is not a key in the word2id mapping, use unk, defined as vocab[len(vocab)-1].
        indicies = []
        for sent in doc:
            indicies.append(
                [word2id[word] if word in word2id else unk for word in sent])
        # Add indices to dictionary.
        dic['idx'] = indicies
        # Add dictionary containing label, text, indices to data dictionary at index.
        data[idx] = dic

    # Write data dictionary to file.
    data_output_path = output_path / 'jigsaw-WM-Gao-data.bin'
    with open(data_output_path, 'wb') as f:
        msgpack.pack(data, f)

    # Write embeddings to file in numpy binary format.
    embeddings_output_path = output_path / 'jigsaw-WM-EMB-Gao-300'
    np.save(embeddings_output_path, vocab)

    # Write id2word dict to file.
    id2word_output_path = output_path / 'jigsaw-WM-EMB-Gao-id2word.bin'
    with open(id2word_output_path, 'wb') as f:
        msgpack.pack(id2word, f)
# split by line
lines = cleaned_txt.split('\n')

# tokenize by line with jieba
for line in lines:
    if line:
        line_tokens = jieba.cut(line, cut_all=False, HMM=True)
        tokenized_sentence = [token for token in line_tokens]
        tokenized_sentences.append(tokenized_sentence)

# get time information
t2 = time.time()
book_process_time = t2 - t1
list_times.append(book_process_time)

n_books = n + 1
print("\nSentences tokenized !")
print("{} seconds in total and {} seconds per book".format(
    sum(list_times), sum(list_times) / n_books))

print('\nComputing Word2Vec ...')
model = Word2Vec(tokenized_sentences, window=5)
print('\nWord2Vec is computed !')

word_vectors = model.wv
name_model = 'sample_mandarin_embeddings_{}book_model.tsv'.format(n_books)
path_model = os.path.join(path_embeddings, name_model)
word_vectors.save_word2vec_format(path_model)
print('model saved at path : {}'.format(path_model))
    file = open(filename, 'r', encoding="utf8")
    doc = csv.reader(file, delimiter=',')
    for i, row in enumerate(doc):
        if (i >= train_size):
            break
        tweet = row[2]
        line = doc_to_clean_lines(tweet, vocab)
        lines += line
    return lines


# load the vocabulary
file = "sentiment_train.csv"
vocab_filename = file + '_vocab.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)

# load training data
sentences = load_sentences(file, vocab, True)
print('Total training sentences: %d' % len(sentences))

# train word2vec model (workers = cpu cores, window = number of neighbouring words considered)
model = Word2Vec(sentences, size=100, window=5, workers=8, min_count=1)
# summarize vocabulary size in model
words = list(model.wv.vocab)
print('Vocabulary size: %d' % len(words))

# save model in ASCII (word2vec) format
filename = file + '_embedding_word2vec.txt'
model.wv.save_word2vec_format(filename, binary=False)
    lambda x: x.replace(" ", ""))
model_dataframe["separates"] = model_dataframe["sentences"].apply(
    lambda x: x.replace(",", ""))
model_dataframe["separates"] = model_dataframe["separates"].apply(
    lambda x: x.replace(";", ""))
model_dataframe["separates"] = model_dataframe["separates"].apply(
    lambda x: x.replace("\"", ""))
model_dataframe["separates"] = model_dataframe["separates"].apply(
    lambda x: x.replace('"', ''))
model_dataframe["separates"] = model_dataframe["separates"].apply(
    lambda x: x.split())

# -- Train Word2Vec on the tokenized sentences
model = Word2Vec(model_dataframe["separates"], sg=1, size=300, min_count=1, iter=10)
'''
sg = 0 for CBOW, 1 for skip-gram
min_count = 5 (words that occur fewer than 5 times are ignored)
size = 300 (embed into a 300-dimensional vector space)
iter: number of passes over the corpus, similar to epochs in deep learning
workers: number of threads for parallel training, depending on the number of CPU cores
alpha: initial learning rate; min_alpha: the floor the learning rate decays to linearly during training
'''
count = 0
sum = 0.0
average = 0.0
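A minimal follow-up sketch of querying the skip-gram model trained above; the example token '데이터' is an assumed word and may not be present in the vocabulary of a real run.

# Hypothetical usage sketch: nearest neighbours and a raw vector from the model above.
similar_words = model.wv.most_similar('데이터', topn=5)
for word, score in similar_words:
    print(word, round(score, 3))
vector = model.wv['데이터']  # 300-dimensional embedding
print(vector.shape)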
import os
import sys
import logging
import multiprocessing

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    inp = '../policy.seg'
    outp1 = '../policy.model'
    outp2 = '../policy.vector'

    model = Word2Vec(LineSentence(inp), size=300, window=5, min_count=5,
                     workers=multiprocessing.cpu_count(), iter=3)

    # trim unneeded model memory = use (much) less RAM
    # model.init_sims(replace=True)
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
    print('OK')
        # unlabeled.fnames = []
        for m in tar.getmembers():
            if ".txt" in m.name:
                # unlabeled.fnames.append(m.name)
                unlabeled.data.append(read_instance(tar, m.name))
        tar.close()
        return unlabeled.data


def read_instance(tar, ifname):
    inst = tar.getmember(ifname)
    ifile = tar.extractfile(inst)
    content = ifile.read().strip()
    return content


if __name__ == "__main__":
    print("Reading files")
    tarfname = "data/speech.tar.gz"
    docs = read_files(tarfname)

    lmtzr = WordNetLemmatizer()
    print("Lemmatizing and Tokenizing")
    lemmatized = [[lmtzr.lemmatize(word).lower().strip('.,;:')
                   for word in word_tokenize(d.decode("utf-8"))
                   if len(word) >= g.min_length]
                  for d in docs]

    print("Computing Word2Vec Matrix")
    wv = Word2Vec(lemmatized, workers=g.num_jobs)
    wv.save("word2vec.model")
        resp = resp.json()
        data = resp['items']
        start += len(data)

        # DataFrame
        df = pd.DataFrame(data=data)
        df['title'] = df['title'].apply(preprocessing)
        df['description'] = df['description'].apply(preprocessing)
        df_list.append(df)

        # break
        if len(data) != 100:
            break

    return pd.concat(df_list)


if __name__ == "__main__":
    df = request_book_by_query("파이썬")
    target = df['title'] + ' ' + df['description']
    target = target.apply(get_nouns)
    target = target.str.split()

    # Training
    model = Word2Vec(target.to_list(), size=300, window=10, min_count=1)
    model.init_sims(replace=True)

    # Test
    result = model.wv.most_similar("알고리즘", topn=10)
    print("/".join([x[0] for x in result]))
def build_word2vec_model(data, embedding_size=2, save=True):
    model = Word2Vec(data, min_count=0, size=embedding_size)
    if save:
        model.wv.save_word2vec_format('output')
    return model
if (s in frequency):
    store_total = store_total + frequency[s]
# print(len(store))
print("Total count of words appearing at least", a, "times:", store_total)

# Applying Word2Vec
# CBOW model
# size: number of dimensions of the embeddings (default = 100)
# window: maximum distance between the target word and the words around it (default = 5)
# min_count: words whose frequency is below this value are ignored (default = 5)
# workers: number of worker threads used in training (default = 3)
minCount = 20
s = 250
w = 6
cbow_model = Word2Vec(data, min_count=minCount, iter=5, size=s, window=w)
cbow_model.save('CBOWModelFile')
print("size = %d" % s)
print("window = %d" % w)

cModel = Word2Vec.load('CBOWModelFile')
vocab = list(cModel.wv.vocab)

# Plot on a 2D plane
cModel = Word2Vec.load('CBOWModelFile')
vocab = list(cModel.wv.vocab)
X = cModel[vocab]

# Project to two dimensions with t-SNE
tsne = TSNE(n_components=2)
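A minimal sketch of the 2D projection and scatter plot that the comment above sets up; the matplotlib import and the per-word labels are assumptions, not part of the original script.

import matplotlib.pyplot as plt

X_tsne = tsne.fit_transform(X)  # (n_words, 2) coordinates
plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
for word, (x, y) in zip(vocab, X_tsne):
    plt.annotate(word, xy=(x, y))
plt.show()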
from wikipedia import page
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import re

# Get data from wiki:
# https://codesachin.wordpress.com/2015/10/09/generating-a-word2vec-model-from-a-block-of-text-using-gensim-python/
title = "Word2vec"
wikipage = page(title)
raw_content = wikipage.content

alphanumeric_content = re.sub('[^0-9a-zA-Z ]+', ' ', raw_content)

text_file = open("Output.txt", "w")
text_file.write(alphanumeric_content)
text_file.close()

sentences = LineSentence("Output.txt", max_sentence_length=10)
print(sentences)
# exit()

min_count = 2
size = 50
window = 4

model = Word2Vec(sentences, min_count=min_count, size=size, window=window)
# print(model.wv.vocab)
for i in model.wv.vocab:
    print(i)
    print(model[i])
# print(model[page_list[0]])
# print(model.batch_words)
    def train(self, min_count=3, workers=1):
        self.model = Word2Vec(self.corpus, min_count=min_count, workers=workers)
Created on Thu Aug 9 15:53:53 2018

@author: jjuppuluri13
"""
import gensim
from gensim.models import Word2Vec
# import data_proc

print(df.sentence[9])

from nltk.tokenize import word_tokenize
df.sentence = df.sentence.apply(lambda x: word_tokenize(x))
sentences = df.sentence

model = Word2Vec(sentences, size=200, min_count=1)
print(model)
words = list(model.wv.vocab)
# print(words)
# print(model['venlafaxine'])

##### Export X train/test data
sequence = []
i = 0
while i < 5457:
    sequence.append(model[df.sentence[i]])
    i += 1

thefile = open('train_word.txt', 'w')
for item in sequence:
    thefile.write("%s\n" % item)
def run(dataset="biorxiv_medrxiv", test=False):
    millis = int(round(time() * 1000))
    logging.basicConfig(filename=f"{defaultpath}/results/info_{millis}.log",
                        filemode='a',
                        format="%(levelname)s - %(asctime)s: %(message)s",
                        datefmt='%H:%M:%S',
                        level=logging.INFO)

    filepath = f"{defaultpath}/processed/{dataset}/body.csv"
    logging.info(f"Reading {filepath}")
    df = pd.read_csv(filepath, sep="\t")
    if test:
        df = df[:100]
    logging.info(f"Dataset size: {len(df)}")
    logging.info(df.shape)

    df = df.dropna().reset_index(drop=True)
    logging.info("Number of Null lines")
    logging.info(df.isnull().sum())

    nlp = spacy.load('en', disable=['ner', 'parser'])
    brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['body'])

    t = time()
    logging.info("Cleaning")
    txt = [cleaning(doc)
           for doc in nlp.pipe(brief_cleaning, batch_size=8, n_threads=-1)]
    logging.info(printTime(t))

    df_clean = pd.DataFrame({'clean': txt})
    df_clean = df_clean.dropna().drop_duplicates()
    logging.info(df_clean.shape)

    sent = [row.split() for row in df_clean['clean']]
    phrases = Phrases(sent, min_count=30, progress_per=10000)
    sentences = phrases[sent]

    word_freq = defaultdict(int)
    for sent in sentences:
        for i in sent:
            word_freq[i] += 1
    print(len(word_freq))
    sorted(word_freq, key=word_freq.get, reverse=True)

    cores = multiprocessing.cpu_count()
    w2v_model = Word2Vec(min_count=20,
                         window=2,
                         size=300,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         workers=cores - 1)

    t = time()
    w2v_model.build_vocab(sentences, progress_per=10000)
    logging.info(printTime(t))

    t = time()
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count,
                    epochs=30, report_delay=1)
    logging.info(printTime(t))

    w2v_model.init_sims(replace=True)
    w2v_model.save(f"{defaultpath}/results/word2Vec.model")
    logging.info("Finished")
def train(sens, modelfp):
    model = Word2Vec(size=100, window=15, sg=1, min_count=1, workers=4)
    model.build_vocab(sens)
    model.train(sens, total_examples=len(sens), epochs=10)
    model.wv.save(modelfp)
df['verse'] = df['verse'].str.split()

# Remove Arabic stop words
df['verse'] = df['verse'].map(lambda x: [w for w in x if w not in arb_stopwords])

# Exclude these words from the stemmer
stem_not = ['الله', 'لله', 'إلهكم', 'اله', 'لله', 'إلهكم', 'إله', 'بالله', 'ولله']

# [On/Off] Stemming the words to reduce dimensionality, except words in the stem_not list
# df['verse'] = df['verse'].map(lambda x: [w if w in stem_not else st.stem(w) for w in x])

# You can filter for one surah too if you want!
verses = df['verse'].values.tolist()

# train model
model = Word2Vec(verses, min_count=15, window=7, workers=8, alpha=0.22)
# summarize the loaded model

# fit a 2d PCA model to the vectors
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# create a scatter plot of the projection
plt.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)

# Pass list of words as an argument
# disabled for now in order to show the one below
# for i, word in enumerate(words):
#     reshaped_text = arabic_reshaper.reshape(word)
#     artext = get_display(reshaped_text)
wordsDocList = []
targetFolder = 'all'

trainFolder = 'buildDataSet/' + targetFolder + '/google code'
pathDir = os.listdir(trainFolder)
sen2List(pathDir, wordsDocList)

trainFolder = 'buildDataSet/' + targetFolder + '/github'
pathDir = os.listdir(trainFolder)
sen2List(pathDir, wordsDocList)

############## this is the API of doc2vec
# documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(wordsDocList)]
model = Word2Vec(wordsDocList, size=vectorsize, window=10, min_count=0, workers=10, iter=10)
model.save("my_word2vec_model_" + str(vectorsize) + "_noPun")
'''
word_vectors = model.wv
if 'word' in word_vectors.vocab:
    print("bingo")
print("not")
'''
'''
w1 = ["click"]
aa = model.wv.most_similar(positive=w1, topn=6)
print(aa)
vector = model.wv['computer']
print(vector)
train_data = Dataset(train_sents, cate2idx=ent2idx)
train_data.build_vocab_dict(vocab_size=vocab_size)
with open('word2idx.json', 'w') as f:
    f.write(str(train_data.word2idx))

test_data = Dataset(test_sents, word2idx=train_data.word2idx, cate2idx=ent2idx)
test_X, _ = test_data[:]
vocab_size = len(train_data.word2idx)

# Train character-level Word2Vec embeddings and copy them into the embedding matrix.
w2v_train_sents = []
for doc in docs:
    w2v_train_sents.append(list(doc.text))
w2v_model = Word2Vec(w2v_train_sents, size=emb_size)

w2v_embeddings = np.zeros((vocab_size, emb_size))
for char, char_idx in train_data.word2idx.items():
    if char in w2v_model.wv:
        w2v_embeddings[char_idx] = w2v_model.wv[char]
np.save("w2v_embeddings.npy", w2v_embeddings)

seq_len = sent_len + 2 * sent_pad
model = build_lstm_crf_model(num_cates, seq_len=seq_len, vocab_size=vocab_size,
                             model_opts={'emb_matrix': w2v_embeddings,
                                         'emb_size': emb_size,
                                         'emb_trainable': False})
print(model.summary())

train_X, train_y = train_data[:]
print('train_X.shape', train_X.shape)
# coding:utf-8
import sys
import gensim
import sklearn
import numpy as np
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    # Train the model
    model = Word2Vec(LineSentence("train_set.txt"), size=100, window=2,
                     min_count=0, workers=4)
    model.wv.save_word2vec_format('address_word2vec_model')
    print('word2vec model get!')

    Model = gensim.models.KeyedVectors.load_word2vec_format('address_word2vec_model')
    print(Model.wv['霄云路'])
    print(Model.wv.similarity('霄云路', '霄云路'))
from __future__ import print_function
import sys
import multiprocessing

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    # Pass the path to the wiki.tr.txt file produced by preprocess.py,
    # either hard-coded here or as a command-line argument.
    inputFile = sys.argv[1]
    outputFile = "trmodel"

    model = Word2Vec(LineSentence(inputFile), size=400, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    model.wv.save_word2vec_format(outputFile, binary=True)
Author: liuyao8
Description:
"""
from gensim.models import Word2Vec, KeyedVectors, Phrases
from gensim.test.utils import common_texts, get_tmpfile, datapath
from gensim.scripts.glove2word2vec import glove2word2vec


# 0. Common data and helpers
# common_texts: a list of lists; each inner list is the tokenized form of one document or sentence
# get_tmpfile(fname): joins fname onto the temporary directory, e.g. C:\\Users\\liuyao8\\AppData\\Local\\Temp\\<fname>
# datapath(fname): os.path.join(module_path, 'test_data', fname), i.e. a file in the module's test-data directory


# 1. Training word embeddings
# 1.1 Initial training
model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
path = get_tmpfile("word2vec.model")
model.save(path)

# 1.2 Load the model and continue training (streamed training, reading data from disk on-the-fly)
model = Word2Vec.load(path)
model.train([["hello", "world"]], total_examples=1, epochs=1)

# 1.3 The word embeddings obtained from training
word2vector = model.wv            # KeyedVectors
vector = word2vector['computer']  # numpy vector of shape (100,)
path = get_tmpfile("wordvectors.kv")
word2vector.save(path)
word2vector = KeyedVectors.load(path, mmap='r')
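The glove2word2vec import above is otherwise unused; a minimal sketch of how it might be exercised, following gensim's own example with its bundled test_glove.txt and continuing the numbered-comment style, could look like this:

# 1.4 Converting GloVe vectors to word2vec format (sketch; 'test_glove.txt' ships with gensim's test data)
glove_file = datapath('test_glove.txt')
tmp_file = get_tmpfile("test_word2vec.txt")
glove2word2vec(glove_file, tmp_file)
glove_vectors = KeyedVectors.load_word2vec_format(tmp_file)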