def load_word_embedding(data_name='google_news', data_type='bin'):
    logger.info('Start loading word2vec word embedding')
    os_name = get_os_name()
    if os_name == "windows":
        file1 = 'D:/Word_Embeddings/GoogleNews-vectors-negative300.bin.gz'
        file2 = 'D:/Word_Embeddings/freebase-vectors-skipgram1000.bin.gz'
        file3 = 'D:/Word_Embeddings/GoogleNews-vectors-negative300.bin'
        file4 = 'D:/Word_Embeddings/freebase-vectors-skipgram1000.bin'
    elif os_name == 'ubuntu':
        file1 = '/home/hs/Data/Word_Embeddings/GoogleNews-vectors-negative300.bin.gz'
        file2 = '/home/hs/Data/Word_Embeddings/freebase-vectors-skipgram1000.bin.gz'
        file3 = '/home/hs/Data/Word_Embeddings/google_news.bin'
        file4 = '/home/hs/Data/Word_Embeddings/freebase.bin'
    if data_name == 'google_news':
        if data_type == 'bin':
            model = Word2Vec.load_word2vec_format(file3, binary=True)
        else:
            # load .bin.gz data; gzipped/bz2 input works too, no need to unzip
            model = Word2Vec.load_word2vec_format(file1, binary=True)
    else:  # load freebase
        if data_type == 'bin':
            model = Word2Vec.load_word2vec_format(file4, binary=True)
        else:
            model = Word2Vec.load_word2vec_format(file2, binary=True)
    logger.info('Loading word embedding complete')
    return model
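# Note: every loader in this collection calls Word2Vec.load_word2vec_format,
# which was deprecated in gensim 1.0 and removed in 4.0. A minimal sketch of
# the modern equivalent, assuming gensim >= 4.0 and the same GoogleNews file:

from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',  # gzipped input works directly
    binary=True)
vec = kv['dog']                      # 300-d numpy vector
print(kv.most_similar('dog', topn=5))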
def createTransformationMatrix(modelA, modelB):
    # initialize the matrices
    labels = []
    A = []
    B = []
    # keep the common words and add them to the matrices
    nb_words_A = len(modelA.index2word)
    for i in range(0, nb_words_A):
        word = modelA.index2word[i]
        if word in modelB.index2word:
            # add the word to the matrices (and the labels)
            labels.append(word)
            A.append(modelA[word])
            B.append(modelB[word])
    # create the transformation matrix
    TransM, _ = orthogonal_procrustes(np.asarray(A), np.asarray(B), check_finite=False)
    # apply the transformation matrix to the first model matrix
    Z = np.matmul(A, TransM)
    # create the 2 models manually (by first creating a text file and reading it).
    # it would be most efficient not to have to store the results on files like this.
    constructModel(np.asarray(Z), labels, "tmpZ.model.txt")
    constructModel(np.asarray(B), labels, "tmpB.model.txt")
    modelZ_ = Word2Vec.load_word2vec_format('tmpZ.model.txt', binary=False)
    modelB_ = Word2Vec.load_word2vec_format('tmpB.model.txt', binary=False)
    return modelZ_, modelB_
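# The comment above notes that round-tripping through temp files is wasteful.
# A sketch of the same Procrustes alignment kept entirely in memory, assuming
# gensim >= 4.0 (KeyedVectors.add_vectors); constructModel is then unnecessary:

import numpy as np
from gensim.models import KeyedVectors
from scipy.linalg import orthogonal_procrustes

def align_in_memory(labels, A, B):
    A, B = np.asarray(A), np.asarray(B)
    TransM, _ = orthogonal_procrustes(A, B, check_finite=False)
    Z = A @ TransM                        # rotate A into B's space
    kv_z = KeyedVectors(vector_size=Z.shape[1])
    kv_b = KeyedVectors(vector_size=B.shape[1])
    kv_z.add_vectors(labels, Z)           # register words and rows in one call
    kv_b.add_vectors(labels, B)
    return kv_z, kv_b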
def __init__(self, label_vec_f, feature_vec_f, binary=False):
    label2vec = Word2Vec.load_word2vec_format(label_vec_f, binary=binary)
    self.label_embed = label2vec.syn0
    self.dictionary = label2vec.index2word
    self.vocab = label2vec.vocab
    self.feat_embed = Word2Vec.load_word2vec_format(feature_vec_f, binary=binary).syn0
def load_embeddings(path=None):
    path = just.make_path(path)
    binary = path.endswith("gz") or path.endswith("bz2")
    embeddings = Word2Vec.load_word2vec_format(path, binary=binary)
    esize = _get_embedding_size(embeddings)
    return embeddings, esize
def set_embedding_weights(self, embedding_init):
    # load embedding with gensim
    from gensim.models import Word2Vec
    try:
        m = Word2Vec.load_word2vec_format(embedding_init, binary=False)
        edim = m.layer1_size
    except UnicodeDecodeError:
        try:
            m = Word2Vec.load_word2vec_format(embedding_init, binary=True)
            edim = m.layer1_size
        except UnicodeDecodeError:
            # not in word2vec format
            m = Word2Vec.load(embedding_init)
            edim = m.layer1_size
    except ValueError:
        # glove model
        m = {}
        if embedding_init.endswith('gz'):
            fp = gzip.open(embedding_init)
        else:
            fp = open(embedding_init)
        for l in fp:
            le = l.split()
            m[le[0].decode('utf-8')] = numpy.array(
                [float(e) for e in le[1:]], dtype=theano.config.floatX)
        edim = len(le) - 1
    if edim != self.edim:
        raise Exception("Embedding dim and edim don't match")
    m_lower = {}
    vocab = (m.vocab if hasattr(m, 'vocab') else m)
    for k in vocab:
        if k in ['UNKNOWN', 'PADDING']:
            continue
        if self.num:
            m_lower[replace_numerals(k.lower())] = m[k]
        else:
            m_lower[k.lower()] = m[k]
    # fill the weight matrix using the word-to-index mapping self.w2i
    params = numpy.zeros(
        self.tagger.layers[0].layers[0].get_param_vector().shape,
        dtype=theano.config.floatX)
    e = self.edim
    for w in self.w2i:
        if w in m_lower:
            v = m_lower[w]
            i = self.w2i[w]
            params[i * e:(i + 1) * e] = v
    # index through m, not vocab: for gensim models vocab maps to Vocab
    # objects, while m[...] always yields the vector itself
    if 'UNKNOWN' in vocab:
        params[-1 * e:] = m['UNKNOWN']
    if 'PADDING' in vocab:
        params[-2 * e:-1 * e] = m['PADDING']
    self.tagger.layers[0].layers[0].set_param_vector(params)
def load_model(model_fn, model_type):
    logging.info('Loading model: {0}'.format(model_fn))
    if model_type == 'word2vec':
        model = Word2Vec.load_word2vec_format(model_fn, binary=True)
    elif model_type == 'word2vec_txt':
        model = Word2Vec.load_word2vec_format(model_fn, binary=False)
    elif model_type == 'gensim':
        model = Word2Vec.load(model_fn)
    else:
        raise Exception('Unknown model format')
    logging.info('Model loaded: {0}'.format(model_fn))
    return model
def read_glove_model(dim=50, huge=False):
    print "reading GloVe word embedding vectors..."
    # check huge first: dim defaults to 50, so a trailing elif would never fire
    if huge:
        return read_glove_to_dict(glove_vector_huge)
    if dim == 50:
        return Word2Vec.load_word2vec_format(glove_vector_50, binary=False)
    elif dim == 100:
        return Word2Vec.load_word2vec_format(glove_vector_100, binary=False)
    elif dim == 200:
        return Word2Vec.load_word2vec_format(glove_vector_200, binary=False)
    elif dim == 300:
        return Word2Vec.load_word2vec_format(glove_vector_300, binary=False)
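# Raw Stanford GloVe dumps lack the "<vocab_size> <dim>" header line that the
# word2vec text format starts with, so files loaded as above must have been
# converted beforehand. A sketch of skipping the conversion, assuming
# gensim >= 4.0 (whose loader has a no_header flag); the path is a placeholder:

from gensim.models import KeyedVectors

glove = KeyedVectors.load_word2vec_format(
    'glove.6B.50d.txt', binary=False, no_header=True)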
def generate_cnn_train_test(char_name, word_name):
    cnn_vec_dir = 'cnn_vec'
    ensure_path(cnn_vec_dir)
    char_embedding = Word2Vec.load_word2vec_format(char_name, binary=False)
    word_embedding = Word2Vec.load_word2vec_format(word_name, binary=False)
    train_doc, train_label = load_train()
    # test_doc, test_label = load_test()
    train_vec_file = cnn_vec_dir + '/' + 'train.txt'
    # test_vec_file = cnn_vec_dir + '/' + 'test.txt'
    generate_cnn_vec(char_embedding, word_embedding, train_doc, train_label, train_vec_file)
    print 'generate cnn train feature ok'
def load_matrix_and_dictionary(fn, typ, dict_fn=None, filt_dict=None):
    if typ == 'numpy':
        return np.load(fn), load_dictionary_as_dict(dict_fn)
    elif typ == 'glove':
        from glove import Glove
        m = Glove().load_stanford(fn)
        return m.word_vectors, m.dictionary
    elif typ == 'word2vec':
        from gensim.models import Word2Vec
        if 'txt' in fn or 'w2v' in fn:
            m = Word2Vec.load_word2vec_format(fn, binary=False)
        else:
            m = Word2Vec.load_word2vec_format(fn, binary=True)
        return extract_wordvec_matrix_and_dict(m, filt_dict)
    raise Exception('Unknown matrix format: {}'.format(typ))
def setup_w2v(word2vec_model, country_names_json):
    '''Given the path to a word2vec model and a JSON file containing country
    names and codes, set up the indices and vocabulary for geocoding.'''
    prebuilt = Word2Vec.load_word2vec_format(word2vec_model, binary=True)
    vocab_set = set(prebuilt.vocab.keys())
    with open(country_names_json) as f:
        stopword_country_names = json.load(f)
    countries = stopword_country_names.keys()
    idx_country_mapping = {}
    index = numpy.empty(shape=(len(countries), 300), dtype='float64')
    for idx, country in enumerate(countries):
        name = unidecode(country)
        try:
            index[idx] = prebuilt[name]
            idx_country_mapping[idx] = stopword_country_names[country]
        except KeyError:
            # skip countries whose name is not in the embedding vocabulary
            continue
    return {
        'prebuilt': prebuilt,
        'vocab_set': vocab_set,
        'index': index,
        'idx_country_mapping': idx_country_mapping
    }
def __filter_w2v_model(filename, words_to_remove, num_to_keep):
    """Filters the words in the Spanish model, removing every word in the
    given list and keeping a random sample of the rest

    :param filename: the name of the file to read the words in from
    :param words_to_remove: a list of all the words to get rid of
    :param num_to_keep: the number of words to keep
    """
    good_words = list()
    with open(filename, 'r') as f:
        for line in f:
            # keep the line only if it starts with none of the unwanted words
            # (appending inside the inner loop would duplicate each line)
            if not any(line.startswith(word) for word in words_to_remove):
                good_words.append(line.rstrip('\n'))
    random.shuffle(good_words)
    kept_words = good_words[:num_to_keep]
    with open('tempmodel', 'w') as f:
        for word in kept_words:
            f.write(word)
            f.write('\n')
    return Word2Vec.load_word2vec_format('tempmodel')
def tokens_to_word2vec(tokens, model):
    if model == 'word2vec':
        model = Word2Vec.load_word2vec_format(
            os.path.join(DIRNAME, '../word2vec/GoogleNews-vectors-negative300.bin'),
            binary=True)
    elif model == 'glove':
        word_to_vector_glove = {}
        tokens_glove = set(tokens)
        # with open(os.path.join(DIRNAME, '../glove/glove.6B/glove.6B.300d.txt'), 'r') as f:
        with open(os.path.join(DIRNAME, '../glove/glove.42B.300d.txt'), 'r') as f:
            for line in f:
                split_index = line.index(' ')
                word = line[:split_index]
                vector = np.fromstring(line[split_index + 1:], dtype=float, sep=' ')
                assert len(vector) == 300
                if word == '.':
                    word = '</s>'
                if word in tokens_glove:
                    word_to_vector_glove[word] = vector
        return word_to_vector_glove
    word_to_vector = {}
    for word in tokens:
        try:
            arr = model[word]
        except KeyError:
            continue
        word_to_vector[word] = arr
    return word_to_vector
def compute_pair_similarity(benchmark_file, embedding_file, binary_embedding=True):
    logger = logging.getLogger()
    logger.info('Loading embeddings from {}...'.format(embedding_file))
    embedding_model = Word2Vec.load_word2vec_format(embedding_file, binary=binary_embedding)
    pair_similarities = {}
    with open(benchmark_file) as bf:
        for line in bf:
            line = line.strip()
            if line:
                pair = tuple(line.split(','))
                term_1, term_2 = pair
                if term_1 in embedding_model and term_2 in embedding_model:
                    sim_score = float(embedding_model.similarity(term_1, term_2))
                    pair_similarities[pair] = sim_score
    accuracy = []
    for threshold in arange(0.0, 1.1, 0.1):
        similar_pair_count = 0
        for pair in pair_similarities:
            if pair_similarities[pair] >= threshold:
                similar_pair_count += 1
        accuracy.append(float(similar_pair_count) / len(pair_similarities))
    logger.info('Accuracy: {}'.format(accuracy))
    return accuracy
def populate_entity(self, path_vec, path_entity, prod_model=True):
    self.path_vec = path_vec
    self.path_entity = path_entity
    self.prod_model = prod_model
    self.entity_model = Word2Vec.load_word2vec_format(path_vec)
    self.entity2idx = {}
    self.idx2entity = OrderedDict()
    f = open(path_entity, "r")
    for line in f:
        if self.method == "LDA":
            entity = line[0:-1]
            idx = len(self.entity2idx)
            if entity not in self.entity2idx:
                self.entity2idx[entity] = idx
                self.idx2entity[idx] = entity
            else:
                print("dup?")
        else:
            entity = line[0:line.rindex("_")]
            idx = int(line[1 + line.rindex("_"):])
            self.entity2idx[entity] = idx
            self.idx2entity[idx] = entity
def __init__(self, model_file: str) -> None:
    if model_file.endswith(".bin"):
        self.model = Word2Vec.load_word2vec_format(model_file, binary=True)
    elif model_file.endswith(".model"):
        self.model = api.load(model_file[:-6])
    else:
        self.model = Word2Vec.load(model_file)
def run():
    w2v = True
    l1 = 1
    l2 = 1e-3
    iters = 200
    wiki = True
    words_before = 4
    words_after = 4
    shallow_parse = True
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'w:i:c:l:',
                                   ['w2v=', 'iters=', 'l1=', 'l2=', 'wiki=',
                                    'words_before=', 'words_after=', 'shallow_parse='])
    except getopt.GetoptError as e:
        print(e)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-w', '--w2v'):
            option = int(arg)
            if option == 1:
                w2v = True
        elif opt in ('-i', '--iters'):
            iters = int(arg)
        elif opt in ('-c', '--l1'):
            l1 = float(arg)
        elif opt in ('-l', '--l2'):
            l2 = float(arg)
        elif opt == '--wiki':
            option = int(arg)
            if option == 0:
                wiki = False
        elif opt == '--words_before':
            words_before = int(arg)
        elif opt == '--words_after':
            words_after = int(arg)
        elif opt == '--shallow_parse':
            option = int(arg)
            if option == 0:
                shallow_parse = False
        else:
            sys.exit(2)
    if w2v:
        print('Loading word2vec model...')
        if wiki:
            word2vec_model = 'wikipedia-pubmed-and-PMC-w2v.bin'
        else:
            word2vec_model = 'PubMed-w2v.bin'
        w2v = Word2Vec.load_word2vec_format(word2vec_model, binary=True)
        print('Loaded word2vec model')
    else:
        w2v = None
    run_crf(w2v, words_before, words_after, shallow_parse)
def compare(dataset, model_name, pre_model_name):
    # build model
    if os.path.isfile(model_name):
        model = Word2Vec.load(model_name)
        logger.debug("model %s already exists, skipping word vector training", model_name)
    else:
        logger.info("start training word vectors")
        start_time = timeit.default_timer()
        model = wordvector.build_word_vector(dataset, save=True, save_file=model_name)
        logger.info("model %s trained in %.4lfs", model_name,
                    timeit.default_timer() - start_time)
    # find most similar words
    for word in keywords:
        print word
        print model.most_similar(word, topn=10)
    # load pre-trained Google News model
    logger.info("start loading pre-trained dataset")
    start_time = timeit.default_timer()
    pre_model = Word2Vec.load_word2vec_format(pre_model_name, binary=True)
    logger.info("pre-trained dataset loaded in %.4lfs",
                timeit.default_timer() - start_time)
    # find most similar words
    for word in keywords:
        print word
        print pre_model.most_similar(word, topn=10)
def get_model(self):
    if self.word2vec is None:
        print("Loading word2vec...")
        self.word2vec = Word2Vec.load_word2vec_format(self.path, binary=True)
        print("Done!")
    return self.word2vec
def main():
    print "Loading word2vec"
    global word2vec
    word2vec = Word2Vec.load_word2vec_format(sys.argv[2], binary=True)
    tagger = load("taggers/maxent_treebank_pos_tagger/english.pickle")
    f_sentences = codecs.open(sys.argv[1], encoding="utf-8")
    invalid = list()
    valid = list()
    on = False
    for line in f_sentences:
        if line.startswith("#"):
            continue
        if line.startswith("VALID"):
            on = True
            continue
        sentence = Sentence(line.strip(), "ORG", "LOC", 6, 1, 2, tagger)
        for rel in sentence.relationships:
            t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before, rel.between, rel.after)
            if on is True:
                valid.append(t)
            elif on is False:
                invalid.append(t)
    f_sentences.close()
    for v in valid:
        for i in invalid:
            score = similarity_3_contexts(v, i)
            print "VALID", v.e1, v.e2, "\t", v.bet_words
            print "INVALID", i.e1, i.e2, "\t", i.bet_words
            print score
def init_word2vec():
    global WORD2VEC_MODEL
    if not WORD2VEC_MODEL:
        print 'loading word2vec model'
        from gensim.models import Word2Vec
        WORD2VEC_MODEL = Word2Vec.load_word2vec_format(
            'model/GoogleNews-vectors-negative300.bin', binary=True)
        print 'loading word2vec model [done]'
def __init__(self, config, deterministic=False, *args, **kwargs):
    self.config = config
    self.deterministic = deterministic
    self.word2vec_model = Word2Vec.load_word2vec_format(
        config.get('word2vec_model'), binary=True)
    self.word2vec_keys = {word.lower(): word for word in self.word2vec_model.vocab}
    self.thesarus = data.get_thesarus(config.get('thesarus'))
    super(SharedIterator, self).__init__(*args, **kwargs)
def main():
    topic_files = make_topic_map("./topic_sentences/selected_topics.txt", "./data/")
    topic_out_files = make_topic_map("./topic_sentences/selected_topics.txt",
                                     "./topic_sentences/", write=True)
    num_topics = len(topic_files)
    print "loading files"
    sentence_map = {topic: read_data_file(topic_file)
                    for topic, topic_file in topic_files.items()}
    print "loading word2vec"
    word2vec = Word2Vec.load_word2vec_format(word2vec_filepath, binary=True)
    for topic in topic_files:
        sentences, _ = sentence_map[topic]
        sim_scores = compute_sim(sentences, topic, word2vec)
        outfile = topic_out_files[topic]
        count = 0
        for i, sentence in enumerate(sentences):
            if sim_scores[i] > .5:
                outfile.write(sentence.encode('utf8', 'replace') + "\n")
                count += 1
        print "count for topic: ", topic, " is ", count
def evaluate_google():
    # see https://code.google.com/archive/p/word2vec/
    # load pretrained Google embeddings and test
    from gensim.models import Word2Vec
    model_google = Word2Vec.load_word2vec_format(
        'data/GoogleNews-vectors-negative300.bin.gz', binary=True)
    _ = accuracy(model_google, "data/questions-words.txt", False)
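# The accuracy helper above is the pre-1.0 analogy evaluator. A sketch of the
# equivalent in gensim >= 4.0, where it lives on KeyedVectors and takes the
# same question-file format:

from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin.gz', binary=True)
score, sections = kv.evaluate_word_analogies('data/questions-words.txt')
print('overall accuracy: {:.1%}'.format(score))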
def read_test(data_file_name):
    word_name = '/home/niyao/zhaolei/ZhiHu/data/word_embedding.txt'
    word_embedding = Word2Vec.load_word2vec_format(word_name, binary=False)
    x_text = []
    reader = pd.read_table(data_file_name, sep='\t', header=None)
    for i in xrange(reader.shape[0]):
        x_text.append(reader.iloc[i][0])
    # print len(x_text)
    max_document_length = max([len(x.split(',')) for x in x_text])
    print 'the max document length of test is %d' % max_document_length
    j = 0
    x = []
    for features in x_text:
        xi = []
        for id in features.split(','):
            if id in word_embedding:
                xi.append(word_embedding[id])
        for i in xrange(len(xi), max_document_length):
            xi.append(np.zeros(256))
        x.append(xi)
        j += 1
        if j % 1000 == 0:
            print 'load data %d' % j
    x = np.array(x).astype(np.float32).reshape(217360, max_document_length * 256)
    print x.shape
    return x, max_document_length
def load_CUI_vectors():
    '''From De Vine et al., CIKM 2014
    https://github.com/clinicalml/embeddings'''
    m = Word2Vec.load_word2vec_format("DeVine_etal_200.txt.gz")
    return m
def get_word2vec(train_fn="data/rap/input.txt",
                 saved_model_fn="save/save/GoogleNews-vectors-negative300.bin"):
    try:
        print "loading word2vec model at {0}".format(saved_model_fn)
        model = Word2Vec.load_word2vec_format(saved_model_fn, binary=True)
        print "model loaded"
        return model
    except IOError:
        print "no word2vec model found at {0}".format(saved_model_fn)
        with open(train_fn) as f:
            data = f.read()
        clean = TextLoader.clean_str(data)
        lines = [line.split(" ") for line in clean.split('\n')]
        full_data = brown.sents() + movie_reviews.sents() + treebank.sents() + lines
        print "training word2vec model"
        model = Word2Vec(workers=8)
        model.build_vocab(full_data)
        for i in xrange(0, 5):
            print "epoch " + str(i + 1)
            # full_data = shuffle(full_data)
            pb = ProgressBar(maxval=len(full_data))
            chunk_size = len(full_data) / 100
            j = 0
            pb.start()
            while j + chunk_size < len(full_data):
                model.train(full_data[j:j + chunk_size])
                j += chunk_size
                pb.update(j)
        print "done training"
        model.save(saved_model_fn)
        return model
def word_2_vec():
    csv_paths = ['set1.csv', 'set2.csv', 'combined.csv']
    model = Word2Vec.load_word2vec_format(
        '/root/libanghuai/homework/GoogleNews-vectors-negative300.bin', binary=True)
    for csv_path in csv_paths:
        print "deal with %s \n" % csv_path
        out_file_name = "word2vec_result_" + csv_path
        wordpairs = list(csv_parser(csv_path))
        wordpairs = cal_rank(wordpairs, 3)
        ans_list = []
        for wordpair in wordpairs:
            fst_word = wordpair[1]
            sec_word = wordpair[2]
            max_sim = model.similarity(fst_word, sec_word)
            wordpair.append(max_sim)
            ans_list.append(wordpair)
        ans_list = cal_rank(ans_list, 5)
        num = 0
        sum_gap = 0
        for line in ans_list:
            num += 1
            sum_gap += (line[4] - line[6]) * (line[4] - line[6])
        print num
        output_file(out_file_name, ans_list)
        # Spearman rank correlation between the human and model rankings
        print (1 - sum_gap * 6.0 / (num * (num * num - 1)))
def __init__(self, model_name, glove=False, binary=True, dims=300, models_path="F:\\wiki"):
    """
    Constructor for the vector-model wrapper
    :param model_name: name of the model file
    :param glove: flag: is the model in GloVe or word2vec format
    :param binary: flag: is the w2v model in binary or text format
    :param dims: number of vector dimensions in the model
    """
    self._model_name = model_name
    self._model_path = os.path.join(models_path, model_name)
    self._glove = glove
    self._binary = binary
    self.dims = dims
    if self._glove:
        self._model = VectorModelWrap.load_stf(self._model_path, self.dims)
    else:
        self._model = Word2Vec.load_word2vec_format(self._model_path, binary=True) \
            if self._binary else Word2Vec.load(self._model_path)
def __init__(self, model_path, model_type='fasttext', **kwarg):
    if model_type == "fasttext":
        self._model = FastText.load_fasttext_format(model_path)
    elif model_type == "word2vec":
        self._model = Word2Vec.load_word2vec_format(model_path)
    else:
        raise NotImplementedError("other model is not supported")
def load_word2vec_model(model):
    """Cache the word vectors as a memory-mapped array plus a plain-text vocabulary file."""
    embed_data_path = "../data/embed_dat"
    embed_vocab_path = "../data/embed_vocab"
    vector_model_path = "../data/user_vector"
    if os.path.exists(embed_data_path):
        os.remove(embed_data_path)
    if os.path.exists(embed_vocab_path):
        os.remove(embed_vocab_path)
    print "Caching word embeddings in memmapped format..."
    wv = Word2Vec.load_word2vec_format(vector_model_path, binary=True)
    wv.init_sims()  # populate syn0norm (unit-normalised vectors) before use
    print "wv syn0norm shape : " + str(wv.syn0norm.shape)
    fp = np.memmap(embed_data_path, dtype=np.double, mode='w+', shape=wv.syn0norm.shape)
    fp[:] = wv.syn0norm[:]
    with open(embed_vocab_path, "w") as f:
        for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
            f.write(w + "\n")
    del fp, wv
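# A hand-rolled memmap cache like the one above predates gensim's own support
# for memory-mapped reloads. A sketch of the built-in route, assuming current
# gensim; the paths reuse the ones from the function above:

from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format('../data/user_vector', binary=True)
kv.save('../data/user_vector.kv')            # numpy arrays stored alongside
kv = KeyedVectors.load('../data/user_vector.kv', mmap='r')  # pages in lazily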
def generate_word2vec_for_all(self):
    print 20 * "*"
    print "GENERATING NETWORK READY FILES."
    model = Word2Vec.load_word2vec_format(self.word2vec_path, binary=True)
    # model = ""
    for folder in self.folder_list:
        l_dir = os.path.join(self.raw_text_path, folder)
        print("Folder : ", folder)
        op_l_dir = os.path.join(self.op_path, folder)
        if not os.path.exists(op_l_dir):
            os.makedirs(op_l_dir)
        questions_dir = self.get_list_of_dirs(l_dir)
        for question_dir in questions_dir:
            file_list = self.get_list_of_files(os.path.join(l_dir, question_dir))
            if not os.path.exists(os.path.join(op_l_dir, question_dir)):
                os.makedirs(os.path.join(op_l_dir, question_dir))
            print("Question : ", question_dir)
            for fname in file_list:
                with open(os.path.join(l_dir, question_dir, fname), "r") as f:
                    if fname == 'support.txt':
                        is_closest_para_file = True
                        try:
                            text = f.readlines()[0]
                            raw_data_content = ""
                            count = 0
                            for s in sent_tokenize(text):
                                if len(s.split()) > self.num_of_words_in_sent:
                                    raw_data_content += " ".join(
                                        s.split()[:self.num_of_words_in_sent])
                                    raw_data_content += ". "
                                else:
                                    raw_data_content += " ".join(s.split())
                                    raw_data_content += " "
                                count += 1
                                if count == self.num_of_sents_in_closest_para:
                                    break
                        except:
                            raw_data_content = f.readlines()
                    else:
                        is_closest_para_file = False
                        raw_data_content = f.readlines()
                f = open(os.path.join(op_l_dir, question_dir, fname[:-4] + ".pkl"), "w")
                self.write_vecs_to_file(model, raw_data_content, f, is_closest_para_file)
                f.close()
    print 20 * "***"
    print "saving final unknown word2vec dictionary to file"
    f = open(os.path.join(self.common_files_path, self.unknown_words_vec_dict_file), "wb")
    pickle.dump(self.unknown_words_vec_dict, f)
    f.close()
def __init__(self):
    '''initialize'''
    self.s_window = 5
    self.w2v_dim = 200
    self.nb_classes = 4
    self.label_id_dict = {u'S': 0, u'B': 1, u'M': 2, u'E': 3}
    self.train_data_file = "data/msr_training_taged"
    self.w2v_model_file = "data/msr_training_single_word.w2v.bin"
    self.model_hdf5_file = "pkl/w2v-word-segment.model"
    self.loss_history = "pkl/w2v-loss.png"
    self.check_point_file = "pkl/weights-{epoch:03d}.hdf5"
    self.NUM_LIST = [str(i) for i in range(10)]
    self.ENG_LIST = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
    self.w2v_model = Word2Vec.load_word2vec_format(
        self.w2v_model_file, binary=True, unicode_errors='ignore')
    self.lstm_model = self.create_model(self.s_window, self.w2v_dim, self.nb_classes)
    # if os.name == "nt": os.system("cls")
    # else: os.system("clear")
    self.train_model()
def load(filename):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    # check the suffix ([-6:]), not the prefix, for gzipped binary models
    if filename[-6:] == "bin.gz":
        model = Word2Vec.load_word2vec_format(filename, binary=True)
    else:
        model = Word2Vec.load(filename)
    return model
def wordvec_sim(self, write_flag=True):
    print 'load wordvec model: %s/%s' % (macro.DICT_DIR, self.w2v_model_file)
    w2v_model = Word2Vec.load_word2vec_format(
        r'%s/%s' % (macro.DICT_DIR, self.w2v_model_file), binary=True)  # C format
    auto_sim_list = []
    for w1, w2, manu_sim in zip(self.word1_list, self.word2_list, self.manu_sim_list):
        try:
            auto_sim = w2v_model.similarity(w1, w2)
            # map the cosine similarity onto a 0-10 score
            if auto_sim <= 0:
                auto_sim = 1.0
            else:
                auto_sim = auto_sim * 9 + 1
            # print '%-10s\t%-10s\t%-10s\t%-10s' % (w1, w2, manu_sim, auto_sim)
        except:
            auto_sim = 1  # minimum of the cosine score
            print '%-10s\t%-10s\t%-10s\t%-10s' % (w1, w2, manu_sim, '______Not Found______')
        auto_sim_list.append(auto_sim)
    for w1, w2, manu_sim, auto_sim in zip(self.word1_list, self.word2_list,
                                          self.manu_sim_list, auto_sim_list):
        print '%-10s\t%-10s\t%-10s\t%-10s' % (w1, w2, manu_sim, auto_sim)
    if write_flag:
        print 'write result to file...'
        with open('%s/%s' % (macro.RESULTS_DIR, self.ofname), 'w') as fw:
            fw.write(self.headline.strip() + '\tauto_sim_score\n')
            for w1, w2, manu_sim, auto_sim in zip(self.word1_list, self.word2_list,
                                                  self.manu_sim_list, auto_sim_list):
                fw.write('%s\t%s\t%s\t%s\n' % (w1, w2, manu_sim, auto_sim))
    return self.word1_list, self.word2_list, self.manu_sim_list, auto_sim_list, self.headline
def load_model(model_file_name):
    w2v_model = Word2Vec.load_word2vec_format(model_file_name, binary=True)
    # info('loaded {}'.format(model_name))
    w2v_model.init_sims(replace=True)  # to save memory
    vocab, vector_dim = w2v_model.syn0.shape
    # info('The model shape: {} {} (Vocabulary, dimension)'.format(vocab, vector_dim))
    return w2v_model, vector_dim
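# init_sims(replace=True) discards the raw weights to halve memory; it was
# deprecated in gensim 4.0. A sketch of the replacement under that version;
# 'model.bin' is a placeholder path:

from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format('model.bin', binary=True)
kv.fill_norms()                      # precompute per-vector L2 norms
normed = kv.get_normed_vectors()     # unit-length copy of kv.vectors
vocab_size, vector_dim = kv.vectors.shape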
def find_nearest_neighbors(vector_inpath, max_n, wordlist):
    """Find the nearest neighbors for a list of words based on their word embeddings.

    Args:
        vector_inpath (str): Path to vector file. The file must have the following
            format (separated by spaces):
            <index of original vector #1> <index of original vector #2> <Dimension 1> ... <Dimension n>
        max_n (int): Number of nearest neighbors to determine.
        wordlist (list): List of words nearest neighbors should be found for.
    """
    print "Loading vectors...."
    model = w2v.load_word2vec_format(vector_inpath, binary=False)
    print wordlist
    # find nearest neighbors
    for word in wordlist:
        most_similar_with_score = model.most_similar(positive=[word], topn=max_n)
        for v in most_similar_with_score:
            print v
        most_similar_words = [pair[0] for pair in most_similar_with_score]  # only words, not scores
        # print results
        print u"%i most similar words of %s in dataset %s" % (max_n, word, vector_inpath)
        for i in range(len(most_similar_words)):
            print u"%i: %s" % (i + 1, most_similar_words[i])
def __init__(self, vec_file='models/GoogleNews-vectors-negative300.bin', binary=True):
    """
    :param vec_file: the file storing the vectors
    :param binary: whether the vectors are stored in binary format;
        Google News uses binary while Yelp does not
    """
    self._wordvec = Word2Vec.load_word2vec_format(FileIO.filename(vec_file), binary=binary)
def LoadModel(MakeNew=False, useWiki=False):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    typeOfModel = "wiki" if useWiki else "text8"
    if not MakeNew:
        if os.path.isfile("word2vec/" + typeOfModel + ".model"):
            print("Using " + typeOfModel + ".model file")
            model = Word2Vec.load("word2vec/" + typeOfModel + ".model")
            return model
        if os.path.isfile("word2vec/" + typeOfModel + ".bin"):
            print("Using " + typeOfModel + ".bin file")
            model = Word2Vec.load_word2vec_format(
                "word2vec/" + typeOfModel + ".bin", binary=True)  # C binary format
            return model
    print("Generating new model. This may take some time")
    sentences = gensim.models.word2vec.Text8Corpus('word2vec/text8')
    model = Word2Vec(sentences, size=200, workers=4)
    print("Saving model as text8.model")
    model.save('word2vec/text8.model')
    return model
def create_lang_similarty_data(self):
    model = Word2Vec.load_word2vec_format(fp.w2vfilepath)
    lang_list = ['java', 'python', 'matlab', 'html', 'c++', 'c',
                 'mysql', 'javascript', 'sql']
    '''
    for lang_o in lang_list:
        for lan_i in lang_list:
            sim = model.similarity(lang_o, lan_i)
            print(lang_o, lan_i, sim)
    '''
    with open(fp.language_sim, 'w') as mycsvfile:
        # header row: empty corner cell, then one column per language
        temp_list = lang_list[:]
        temp_list.insert(0, " ")
        print(temp_list)
        datawriter = csv.writer(mycsvfile)
        datawriter.writerow(temp_list)
        for lang_o in lang_list:
            lang_sim = ["{0:.2f}".format(abs(model.similarity(lang_o, lang)))
                        for lang in lang_list]
            lang_sim.insert(0, lang_o)
            print(lang_sim)
            datawriter.writerow(lang_sim)
def used_model_m():
    """Look up the vector for a single word."""
    model = Word2Vec.load_word2vec_format(
        'data/GoogleNews-vectors-negative300.bin', binary=True)
    # model = Word2Vec.load('temp/temp.bin')
    b = model['spilt']
    print b
def main():
    analogies_name = config.analogies_name
    model_name = config.model_name
    print("[%i Vocab]" % (config.restrict_vocab_nb or -1))
    print("Analogies: %s | Model: %s" % (analogies_name, model_name))
    start = now()
    analogies = read_analogies(analogies_name)
    analogies = prepare_analogies(analogies)
    model = Word2Vec.load_word2vec_format(model_name, binary=config.is_model_binary)
    model.init_sims(replace=True)
    model_loaded = now()
    print("Model loaded in [%s]" % str(model_loaded - start))
    correct_guesses = perform_experiment(model, analogies=analogies)
    end = now()
    result_ratio, result_percentage = experiment_result_str(correct_guesses, analogies)
    print("Correct predictions out of all predictions: %s [%i%%]"
          % (result_ratio, result_percentage))
    print("It took %s to load the model. After that, it took %s to perform the check"
          % (delta_to_str(model_loaded - start), delta_to_str(end - model_loaded)))
def vectors_to_pickled_dict(desired_words, output_file, norm=True, filename=__BIN_FILE_):
    print("Loading Model")
    model = Word2Vec.load_word2vec_format(filename, binary=True)
    print("Loaded")
    wd2vec = dict()
    if desired_words:
        desired_words = set(desired_words)
    else:
        desired_words = model.vocab.keys()
    for i, wd in enumerate(desired_words):
        if i % 1000 == 0:
            print(i)
        wd = remove_non_ascii(wd).replace("  ", " ").strip()  # for phrases
        wd_key = wd.replace(" ", "_")
        if wd_key in model.vocab:
            ix = model.vocab[wd_key].index
            vector = model.syn0norm[ix] if norm else model.syn0[ix]
            wd2vec[wd.replace("_", " ").strip()] = vector
    with open(output_file, "w+") as f:
        Pickle.dump(wd2vec, f)
def main():
    model = Word2Vec.load_word2vec_format('comments.bin', binary=True)
    badword_list = json.load(open('badword_list.json'))
    vocabulary = json.load(open('vocabulary.json'))
    badwords = []
    for badword in badword_list:
        for word in vocabulary:
            d = distance(badword, word)
            r = ratio(badword, word)
            if d < 2 and r > 0.8:
                badwords.append(word)
                # print(badword + " = " + word + " | Distance: " + str(d) + " Ratio: " + str(r))
    similarities = {}
    for word1 in badwords:
        biggest = 0
        for word2 in vocabulary:
            if word1 != word2:
                try:
                    s = model.similarity(word1, word2)
                    if s > biggest:
                        similarities[word1] = (word2, s)
                        biggest = s
                except KeyError:
                    # one of the words is not in the model vocabulary
                    pass
    for word in similarities:
        print(word + ": " + str(similarities[word]))
def initialize(fword, tword, modelfn, start, debug):
    juman = Juman()
    # parse and check from_word
    ms_f = juman.analysis(fword).mrph_list()
    if len(ms_f) > 1:
        print(u'{} is parsed as multiple words'.format(fword))
        exit(1)
    wm_f = ms_f[0]
    if not wm_f.repname:
        print(u'no repname with {}'.format(fword))
        exit(1)
    fword = wm_f.repname
    # parse and check to_word
    ms_t = juman.analysis(tword).mrph_list()
    if len(ms_t) > 1:
        print(u'{} is parsed as multiple words'.format(tword))
        exit(1)
    wm_t = ms_t[0]
    if not wm_t.repname:
        print(u'no repname with {}'.format(tword))
        exit(1)
    tword = wm_t.repname
    # load and check model
    print(u'loading model...')
    if modelfn.split('.')[-1] == 'model':
        model = Word2Vec.load(modelfn)
    elif modelfn.split('.')[-1] == 'bin':
        model = Word2Vec.load_word2vec_format(modelfn, binary=True, unicode_errors='ignore')
    if fword not in model.vocab:
        raise KeyError(u'{} is not found in the model'.format(fword))
    elif tword not in model.vocab:
        raise KeyError(u'{} is not found in the model'.format(tword))
    model.save('hs0.100m.500.5.18mgt100.model')
    t1 = time.clock() - start
    if debug:
        printtime(t1)
    print(u'constructing id2vocab map...')
    id2vocab = {}
    for i, v in enumerate(model.vocab):
        id2vocab[i] = v
    t2 = time.clock() - t1
    if debug:
        printtime(t2)
    print(u'constructing V...')
    V = []
    for v in model.vocab:
        V.append(model[v])
    V = np.vstack(V)
    t3 = time.clock() - t2
    if debug:
        printtime(t3)
    return fword, tword, model, V, id2vocab, t3
def extract_relations(model_path, n_entities, min_relation_count, out_path, shuffle, dump_vectors):
    print 'Loading model...'
    model = Word2Vec.load_word2vec_format(model_path, binary=True)
    print 'Finished loading model'
    relation_vectors = dict()
    if n_entities > 0:
        if shuffle:
            base_entities = random.sample(model.vocab.keys(), n_entities)
        else:
            base_entities = model.vocab.keys()[:n_entities]
    else:
        base_entities = model.vocab.keys()
    for i, base_entity in enumerate(base_entities):
        print i, base_entity
        for (relation, related_entity) in get_relations_from_base_entity(base_entity):
            related_entity = unicode(related_entity).encode('utf8')
            relation = unicode(relation).encode('utf8')
            if related_entity.startswith(DBPEDIA_PREFIX):
                related_entity_without_prefix = related_entity[len(DBPEDIA_PREFIX):]
                if related_entity_without_prefix in model:
                    v1, v2 = model[base_entity], model[related_entity_without_prefix]
                    if relation in relation_vectors:
                        relation_vectors[relation].append(v2 - v1)
                    else:
                        relation_vectors[relation] = [v2 - v1]
    # print vector_entities
    relations_statistics = []
    mean_relation_vectors = dict()
    n_relations = len(relation_vectors)
    for r, relation in enumerate(relation_vectors):
        print r, '/', n_relations, relation
        vectors = relation_vectors[relation]
        if len(vectors) > min_relation_count:
            cosine_distances = []
            for i, vi in enumerate(vectors):
                for vj in vectors[i + 1:]:
                    cosine_distances.append(1.0 - spatial.distance.cosine(vi, vj))
            if len(cosine_distances) > 1:
                avg_cos, std_cos = np.mean(cosine_distances), np.std(cosine_distances)
                if not isnan(avg_cos) and not isnan(std_cos):
                    count = len(vectors)
                    relations_statistics.append((relation, count, avg_cos, std_cos))
                    if dump_vectors:
                        mean_relation_vectors[relation] = (np.mean(vectors, axis=0),
                                                           count, avg_cos, std_cos)
    print 'Sorting relations'
    relations_statistics.sort(key=lambda x: x[2], reverse=True)
    print 'Writing to csv'
    write_csv(relations_statistics, out_path)
    if dump_vectors:
        print 'Writing vectors dump'
        f = open(out_path + '.vectors.pkl', 'wb')
        cPickle.dump(mean_relation_vectors, f)
def InitModel():
    global _WORDVEC_MODEL
    assert _WORDVEC_MODEL is None, 'InitModel has already been called.'
    print >>sys.stderr, "Loading Word2Vec Models ..."
    start = time.time()
    _WORDVEC_MODEL = Word2Vec.load_word2vec_format(
        '/home/limiao/open_tools/Word2Vec/models/wiki_en_models/wiki.en.text.vector',
        binary=False)
    end = time.time()
    print >>sys.stderr, "Completed! time: ", end - start, "sec."
def QA(question):
    # model = Word2Vec.load('out')
    model = Word2Vec.load_word2vec_format('/home/david/Work/googlenews.bin', binary=True)
    extractor = Rake()
    words = extractor.run(question)
    keywords = [words[i][0] for i in xrange(len(words))]
    return model.most_similar(positive=keywords)[0][0]
def build_w2b_mat(filename, vocab):
    model = Word2Vec.load_word2vec_format('GloVe-1.2/vectors.txt', binary=False)
    # one row per embedding dimension, one column per vocabulary word
    w2v_mat = np.zeros((len(model[vocab[0]]), len(vocab)))
    for j, word in enumerate(vocab):
        w2v_mat[:, j] = model[word]
    return w2v_mat
def __init__(self, corpus):
    self.sequence = []
    self.all_found = set()
    self.entities = dict()
    self.corpus = corpus
    self.seed = set()
    self.model = w2v.load_word2vec_format('news_vectors.bin', binary=True)
    self.candidate_patterns = []
def w2v_model_load():
    global w2v_model
    global w2v_dimension
    if w2v_model is None:
        # w2v_model = Word2Vec.load_word2vec_format("features/karlo/GoogleNews-vectors-negative300.bin", binary=True)
        # w2v_dimension = 300
        w2v_model = Word2Vec.load_word2vec_format("features/karlo/vectors.6B.50d.txt", binary=False)
        w2v_dimension = 50
def __init__(self, tag_data, user_data, k, path):
    self.tag_data = tag_data
    self.user_data = user_data
    self.k = k
    self.model = Word2Vec.load_word2vec_format(path, binary=True)
    self.minimium_model = {}
    self.no_match_tag = []
    self.vec_dict = {}
    self.corr_dict = {}
def load_model(model_path):
    """Load a Word2Vec model and return the model, its number of features, and its word set"""
    model = Word2Vec.load_word2vec_format(model_path, binary=True)
    num_features = model.layer1_size
    model_word_set = set(model.index2word)
    print 'Finished loading model'
    return model, num_features, model_word_set
def main():
    print 'Preprocessing data ...'
    tags = preprocess_data()
    print 'Loading model ...'
    model = Word2Vec.load_word2vec_format(model_file, binary=True)
    print 'Reading and converting data from swda ...'
    data = process_data(model, tags)
    print 'Saving ...'
    save_data(data, data_file)
def loadWord2Vec(filename):
    try:
        logger.info("Trying to load food2vec model from file: {0}".format(filename))
        food2vec = Word2Vec.load_word2vec_format(filename, binary=False)
        logger.info("Food2vec model has been loaded from file: {0}".format(filename))
        return food2vec
    except IOError as e:
        logger.error("Cannot load food2vec model from file: {0}: IOError: {1}".format(
            filename, e.strerror))
        sys.exit(e.errno)
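# Since the snippets in this collection target different gensim generations, a
# small version-agnostic loader can paper over the API break. A sketch, not
# taken from any of the sources above:

def load_w2v_any(path, binary=True):
    """Load word2vec-format vectors under either the pre-1.0 or the modern
    gensim API; returns whatever vector object the installed version provides."""
    try:
        from gensim.models import KeyedVectors  # gensim >= 1.0
        return KeyedVectors.load_word2vec_format(path, binary=binary)
    except ImportError:
        from gensim.models import Word2Vec      # older gensim
        return Word2Vec.load_word2vec_format(path, binary=binary)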