#!/usr/bin/python3
# coding: utf-8
from gensim.models.keyedvectors import KeyedVectors

model = KeyedVectors.load_word2vec_format('~/datasets/WordVec/GoogleNews/GoogleNews-vectors-negative300.bin', binary=True)
model.save_word2vec_format('~/datasets/WordVec/GoogleNews/GoogleNews-vectors-negative300', binary=False)
def ensemble_embedding(self, word_embedding, context_embedding):
    """Replace current syn0 with the sum of context and word embeddings.

    Parameters
    ----------
    word_embedding : str
        Path to word embeddings in GloVe format.
    context_embedding : str
        Path to context embeddings in word2vec_format.

    Returns
    -------
    numpy.ndarray
        Matrix with new embeddings.

    """
    glove2word2vec(context_embedding, context_embedding + '.w2vformat')
    w_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % word_embedding)
    c_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % context_embedding)
    # compare vocab words using keys of dict vocab
    assert set(w_emb.vocab) == set(c_emb.vocab), 'Vocabs are not same for both embeddings'

    # sort context embedding to have words in same order as word embedding
    prev_c_emb = copy.deepcopy(c_emb.syn0)
    for word_id, word in enumerate(w_emb.index2word):
        c_emb.syn0[word_id] = prev_c_emb[c_emb.vocab[word].index]
    # add vectors of the two embeddings
    new_emb = w_emb.syn0 + c_emb.syn0
    self.syn0 = new_emb
    return new_emb
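# Illustrative sketch (not from the original source): how the method above might
# be called. The file names are hypothetical, and `model` stands for an instance
# of the class defining ensemble_embedding, whose own word vectors were already
# converted from GloVe to word2vec format by the caller.
# new_vectors = model.ensemble_embedding('wordrank.words', 'wordrank.contexts')
# print(new_vectors.shape)  # one summed vector per vocabulary word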
def test_type_conversion(self):
    path = datapath('high_precision.kv.txt')
    binary_path = datapath('high_precision.kv.bin')
    model1 = KeyedVectors.load_word2vec_format(path, datatype=np.float16)
    model1.save_word2vec_format(binary_path, binary=True)
    model2 = KeyedVectors.load_word2vec_format(binary_path, datatype=np.float64, binary=True)
    self.assertAlmostEqual(model1["horse.n.01"][0], np.float16(model2["horse.n.01"][0]))
    self.assertEqual(model1["horse.n.01"][0].dtype, np.float16)
    self.assertEqual(model2["horse.n.01"][0].dtype, np.float64)
def load_w2v_model(self):
    # Load pre-trained word2vec model
    model_loc = os.path.join(os.getcwd(), get_str('devise', 'w2v_model_name'))
    word_vectors = KeyedVectors.load_word2vec_format(model_loc, binary=True)
    # Get dimensions of word vector
    word_dim = word_vectors['the'].shape[0]
    return word_vectors, word_dim
def create_and_load_dic(self):
    model = KeyedVectors.load_word2vec_format(self.files_path + '.bin', binary=True)
    kmeans = cluster.KMeans(n_clusters=self.num_clusters)
    kmeans.fit(model.wv.vectors)
    self.w2v_dic = dict(zip(model.wv.index2word, zip(model.wv.vectors, kmeans.labels_)))
    output = open(self.files_path + '.pkl', 'wb')
    pickle.dump(self.w2v_dic, output)
    output.close()
def load_GNews_model():
    """
    Convenience function for loading the pre-trained Google News word2vec
    model vectors published with the original work.

    For more information see: https://code.google.com/archive/p/word2vec/
    """
    model = KeyedVectors.load_word2vec_format('rdata/GoogleNews-vectors-negative300.bin', binary=True)
    return model
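# Minimal usage sketch for the helper above (assumes the Google News binary
# actually exists at rdata/GoogleNews-vectors-negative300.bin and that gensim
# is installed); most_similar returns (word, cosine similarity) pairs.
gnews = load_GNews_model()
print(gnews.most_similar('king', topn=5))
# Classic analogy query: king - man + woman ~ queen
print(gnews.most_similar(positive=['king', 'woman'], negative=['man'], topn=3))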
def __init__(self, config=None):
    super().__init__()
    self.embedding_path = config.get("embedding_path")
    self.embedding_type = config.get("embedding_type")
    if self.embedding_path is None or self.embedding_path == "":
        raise ValueError("Embedding_path is expected.")
    is_binary = self.embedding_type == "bin"
    from gensim.models.keyedvectors import KeyedVectors
    self.embedding = KeyedVectors.load_word2vec_format(self.embedding_path, binary=is_binary)
def test_add_single(self):
    """Test that adding entity in a manual way works correctly."""
    entities = ['___some_entity{}_not_present_in_keyed_vectors___'.format(i) for i in range(5)]
    vectors = [np.random.randn(self.vectors.vector_size) for _ in range(5)]

    # Test `add` on already filled kv.
    for ent, vector in zip(entities, vectors):
        self.vectors.add(ent, vector)

    for ent, vector in zip(entities, vectors):
        self.assertTrue(np.allclose(self.vectors[ent], vector))

    # Test `add` on empty kv.
    kv = EuclideanKeyedVectors(self.vectors.vector_size)
    for ent, vector in zip(entities, vectors):
        kv.add(ent, vector)

    for ent, vector in zip(entities, vectors):
        self.assertTrue(np.allclose(kv[ent], vector))
def load_kv(filename=None, path=None, limit=None):
    if path is not None:
        return KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
    elif filename is not None:
        for dir_path in ASSET_SEARCH_DIRS:
            try:
                path = os.path.join(dir_path, filename)
                return KeyedVectors.load_word2vec_format(path, binary=True, limit=limit)
            except FileNotFoundError:
                continue
        raise FileNotFoundError(
            "Please make sure that 'filename' specifies the word vector binary name "
            "in default search paths or 'path' specifies the file path of the binary.")
    else:
        raise TypeError("load_kv() requires either 'filename' or 'path' to be set.")
def load_model(filepath, keyed_vec=False):
    """
    Instantiate a pre-trained model located at `filepath`.

    If read-only model vectors were trained by another application,
    set `keyed_vec=True`. Otherwise, a word2vec model is assumed.
    """
    if keyed_vec:
        model = KeyedVectors.load(filepath)
    else:
        model = Word2Vec.load(filepath)
    return model
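# Usage sketch for load_model (file names are placeholders, not from the
# original source): a full Word2Vec model keeps its training state, while a
# KeyedVectors file holds read-only vectors.
# full_model = load_model('models/corpus.w2v')                   # via Word2Vec.load
# vectors_only = load_model('models/corpus.kv', keyed_vec=True)  # via KeyedVectors.load
# print(full_model.wv.most_similar('science', topn=3))
# print(vectors_only.most_similar('science', topn=3))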
def load_word2vec_model_from_path(self):
    """
    Load Word2Vec model

    Returns: the Word2Vec model
    """
    word_embeddings_model = KeyedVectors.load_word2vec_format(
        self.word2vec_model_path, binary=True)
    if not word_embeddings_model:
        return None
    return word_embeddings_model
def test_ft_kv_backward_compat_w_360(self):
    kv = EuclideanKeyedVectors.load(datapath("ft_kv_3.6.0.model.gz"))
    ft_kv = FastTextKeyedVectors.load(datapath("ft_kv_3.6.0.model.gz"))

    expected = ['trees', 'survey', 'system', 'graph', 'interface']

    actual = [word for (word, similarity) in kv.most_similar("human", topn=5)]
    self.assertEqual(actual, expected)

    actual = [word for (word, similarity) in ft_kv.most_similar("human", topn=5)]
    self.assertEqual(actual, expected)
def _load_word2vec(path, limit=500000):
    """
    Init word2vec model

    :param path: path to the model
    :param limit: optional
    :return: word2vec model
    """
    print('Loading the semantic word-vector model...')
    w2v = KeyedVectors.load_word2vec_format(path, binary=True, unicode_errors='ignore', limit=limit)
    w2v.init_sims(replace=True)
    print('Loading finished')
    return w2v
def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False):
    """
    Store the input-hidden weight matrix.

    `fname` is the file used to save the vectors in.
    `doctag_vec` is an optional boolean indicating whether to store document vectors.
    `word_vec` is an optional boolean indicating whether to store word vectors
    (if both doctag_vec and word_vec are True, then both vectors are stored in the same file).
    `prefix` is used to uniquely identify doctags from word vocab, and avoid collision
    in case of repeated string in doctag and word vocab.
    `fvocab` is an optional file used to save the vocabulary.
    `binary` is an optional boolean indicating whether the data is to be saved
    in binary word2vec format (default: False).
    """
    total_vec = len(self.wv.vocab) + len(self.docvecs)
    # save word vectors
    if word_vec:
        if not doctag_vec:
            total_vec = len(self.wv.vocab)
        KeyedVectors.save_word2vec_format(self.wv, fname, fvocab, binary, total_vec)
    # save document vectors
    if doctag_vec:
        with utils.smart_open(fname, 'ab') as fout:
            if not word_vec:
                total_vec = len(self.docvecs)
                logger.info("storing %sx%s projection weights into %s", total_vec, self.vector_size, fname)
                fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vector_size)))
            # store as in input order
            for i in range(len(self.docvecs)):
                doctag = u"%s%s" % (prefix, self.docvecs.index_to_doctag(i))
                row = self.docvecs.doctag_syn0[i]
                if binary:
                    fout.write(utils.to_utf8(doctag) + b" " + row.tostring())
                else:
                    fout.write(utils.to_utf8("%s %s\n" % (doctag, ' '.join("%f" % val for val in row))))
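# Sketch of how the export above might be driven (assumes `d2v` is an already
# trained gensim Doc2Vec model; the output file name is illustrative). With
# doctag_vec=True and word_vec=True, words and doctags share one
# word2vec-format file, and the '*dt_' prefix keeps doctag rows from colliding
# with identically spelled vocabulary words.
# d2v.save_word2vec_format('doc2vec_export.txt', doctag_vec=True, word_vec=True, prefix='*dt_')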
def mine(db_file=None, support_threshold=3, similarity_threshold=0.9, model_file=None,
         num_epochs=10, min_items=2, max_items=3):
    blacklist = set([
        "strcpy", "seq_printf", "sprintf", "__mlog_printk", "strcat", "strlcpy",
        "init_timer_key", "IS_ERR", "GFS2_I", "OCFS2_I", "BTRFS_I", "INODE_CACHE",
        "GFS2_SB", "EXT4_SB", "btrfs_sb"
    ])

    graylist = set([
        "dev_err", "dquot_initialize", "ocfs2_inode_lock_full_nested", "warn_slowpath_null",
        "ocfs2_init_dinode_extent_tree", "fs_path_alloc", "btrfs_next_leaf",
        "ocfs2_lock_refcount_tree", "ocfs2_check_dir_for_entry",
        "ocfs2_prepare_dir_for_insert", "warn_slowpath_fmt", "ocfs2_init_dealloc_ctxt"
    ])

    # COMMENT out union to see blacklist only
    blacklist = blacklist.union(graylist)

    # Open the handler database and read handler information
    hdb = HandlerDb(db_file)

    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)

    # A map from an item to all of the handlers that have that item in their context
    import tempfile
    import subprocess

    # Mine three item closed sets using eclat
    tmp_name_tx = "%s/%s" % (tempfile.gettempdir(), next(tempfile._get_candidate_names()))
    tmp_name_mined = "%s/%s" % (tempfile.gettempdir(), next(tempfile._get_candidate_names()))

    # Call miner
    set_counts = defaultdict(set)

    # Write out sentences to tmp file
    # For each handler
    with open(tmp_name_tx, 'w') as f:
        for h in hdb.handlers.keys():
            # For each pair of items, one from context and one from response
            handler = hdb.handlers[h]

            # Print out handler sentence
            if len(handler.context) == 0 or len(handler.response) == 0:
                continue
            sentence = ["PRE|" + x.name for x in handler.context]
            sentence += (["POST|" + x.name for x in handler.response])
            f.write(" ".join(sentence))
            f.write("\n")

    print("SENTENCES AT:", tmp_name_tx)

    # Run eclat
    support_param = "-s-%s" % support_threshold
    max_item_param = "-n{}".format(max_items)
    min_item_param = "-m{}".format(min_items)
    script_dir = os.path.dirname(os.path.realpath(__file__))
    subprocess.call([
        '/program2vec/lib/eclat/eclat/src/eclat', '-ts', support_param,
        max_item_param, min_item_param, '-v %a', tmp_name_tx, tmp_name_mined
    ])
    # os.remove(tmp_name_tx)

    # Read in the frequent itemsets
    rules = set()
    with open(tmp_name_mined, "r") as f:
        raw_mined = f.readlines()
    os.remove(tmp_name_mined)
    raw_mined = [x.rstrip() for x in raw_mined]

    for t in raw_mined:
        items = t.split()
        rule_support = int(items.pop())
        context_functions = set()
        response_functions = set()
        for item in items:
            prefix, item_name = item.split("|")
            if item_name in blacklist:
                continue
            if prefix == "PRE":
                context_functions.add(item_name)
            else:
                response_functions.add(item_name)
        if len(context_functions) != 0 and len(response_functions) != 0:
            rules.add(
                AssociationRule(context=context_functions, response=response_functions, support=rule_support))

    # Convert to specs, but support is already done
    specs = [Spec(set([x])) for x in rules]

    print(support_threshold, scipy.misc.comb(len(specs), 2))
    print(len(specs))

    single_item_responses = defaultdict(set)
    for s in specs:
        for r in s.rules:
            if len(r.response) == 1:
                item = next(iter(r.response))
                single_item_responses[item].add(s)

    model = KeyedVectors.load_word2vec_format(model_file, binary=False)

    print(len(single_item_responses.keys()))

    # INSIGHT: We only need the raw handler list for merges, so calc. for rules on demand
    spec_sims = []
    for response1, response2 in itertools.combinations(single_item_responses, 2):
        specs1 = single_item_responses[response1]
        specs2 = single_item_responses[response2]
        for spec1, spec2 in itertools.product(specs1, specs2):
            spec_sims.append((spec1, spec2, spec1.similarity(spec2, model)))

    spec_sims.sort(key=lambda x: x[2], reverse=True)

    # Go through the spec pairs with sim above threshold
    # Merge them in order
    total_merges = 0
    for i in range(num_epochs):
        specs = set(specs)
        print("Starting epoch {} of {}".format(i + 1, num_epochs))
        merged_this_epoch = 0
        done = 0
        new_sims = []
        for ss in spec_sims:
            spec1 = ss[0]
            spec2 = ss[1]
            sim = ss[2]
            print(sim)
            done += 1
            print(done, len(spec_sims))
            if sim >= similarity_threshold:
                if spec1 not in specs or spec2 not in specs:
                    continue
                # Add merged spec to list of all specs, and remove individual specs
                merged = spec1.merge(spec2, hdb)
                for s in specs:
                    new_sims.append((merged, s, merged.similarity(s, model)))
                specs.add(merged)
                specs.remove(spec1)
                specs.remove(spec2)
                # Recompute support for merged specifications
                merged_this_epoch += 1
            else:
                break

        spec_sims += new_sims
        spec_sims.sort(key=lambda x: x[2], reverse=True)

        specs = list(specs)
        specs.sort(key=lambda x: x.support, reverse=True)
        total_merges += merged_this_epoch
        print("{} merges".format(merged_this_epoch))
        if merged_this_epoch == 0:
            break

    specs.sort(key=lambda x: x.support, reverse=True)
    return specs
print("Data file found, load it!") file = open(data_folder + sep + "data.pkl", 'rb') X, y, texts = pickle.load(file) file.close() # print("After loading raw data") print(X.shape) # print((X[10:20])) # print((y[10:20])) # print((texts[10:20])) X_rest, X_val, y_rest, y_val = train_test_split(X, y, random_state=123, test_size=0.1, shuffle=True) word_vectors = KeyedVectors.load_word2vec_format( '../intentDetection/w2v/wiki.vi.model.bin', binary=True) EMBEDDING_DIM = 400 word_index = txtTokenizer(getData(file_path))[1] vocabulary_size = min(len(word_index) + 1, NUM_WORDS) print(vocabulary_size) embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM)) for word, i in word_index.items(): if i >= NUM_WORDS: continue try: embedding_vector = word_vectors[word] embedding_matrix[i] = embedding_vector except KeyError: embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)
from sklearn.manifold import TSNE
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import csv
from sklearn.metrics.pairwise import cosine_similarity
from util import vector_averaging
from util import vector_averaging_with_tfidf
from util import process_source_code
from util import process_diff_srcml
from util import process_diff_srcml2
from util import word2weight
import sys

csv.field_size_limit(sys.maxsize)

cs_vectors = KeyedVectors.load_word2vec_format(
    "./bi2vec_vectors/cs_vectors_3.txt", binary=False)
java_vectors = KeyedVectors.load_word2vec_format(
    "./bi2vec_vectors/java_vectors_3.txt", binary=False)

with open("sentences_cs.txt", "r") as cs_f:
    cs_data = cs_f.readlines()
with open("sentences_java.txt", "r") as java_f:
    java_data = java_f.readlines()

cs_sentences = [x for x in cs_data]
java_sentences = [x for x in java_data]

cs_word2weight = word2weight(cs_sentences)
java_word2weight = word2weight(java_sentences)
# print cs_word2weight

# Predicting part ----------------------------------------------
checkpoint_prefix = os.path.join(checkpoint_dir, "model")
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)
saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

# Write vocabulary
vocab_processor.save(os.path.join(out_dir, "vocab"))

# Initialize all variables
sess.run(tf.global_variables_initializer())

# initW = None
# loading pretrained vectors from gensim
# reference https://github.com/cahya-wirawan/cnn-text-classification-tf/blob/master/train.py
model = KeyedVectors.load_word2vec_format(
    './Data/GoogleNews-vectors-negative300.bin.gz', binary=True)
vocabulary = vocab_processor.vocabulary_
initW = data_helpers.load_word2vec_google(model, vocabulary)
sess.run(cnn.W.assign(initW))


def train_step(x_batch, y_batch):
    """
    A single training step
    """
    feed_dict = {
        cnn.input_x: x_batch,
        cnn.input_y: y_batch,
        cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
    }
    _, step, summaries, loss, accuracy = sess.run([
        train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy
def __init__(self, file, **kwargs):
    self.model = KeyedVectors.load_word2vec_format(file, binary=False)
    self.embs_cache = {}
def lstm_fun(file):
    center_cluster = {}
    centers = []
    clusters = []
    f1 = open(file, 'r', encoding='utf-8')
    for line in f1:
        values = line.split(':')
        centers.append(values[0])
        clusters.append(values[1])
    f1.close()

    # Represent each word by the mean vector of its cluster
    def findAP(word):
        vec = []
        for i in range(len(clusters)):
            if word in clusters[i]:
                # print("clusters[i]:", clusters[i].split())
                for wd in clusters[i].split():
                    vec.append(embeddings_index[wd].tolist())
                # print(vec)
                # print('vec:', vec)
                vec = np.array(vec)
                vec = vec.mean(axis=0).tolist()  # mean value of columns
                # print('word:', word)
                # print('mean vector:', vec)
                # print('------------')
                break
        if len(vec) == 0:
            vec = embeddings_index[word].tolist()
        return vec

    ###################################################################
    print('(4) load word2vec as embedding...')
    from gensim.models.keyedvectors import KeyedVectors
    w2v_model0 = KeyedVectors.load_word2vec_format('medCorpus.zh.vector', encoding='utf-8')
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
    not_in_model = 0
    in_model = 0
    # the original word vector
    for word, i in word_index.items():
        if word in w2v_model0:
            in_model += 1
            vec0 = w2v_model0[word].tolist()
            vec1 = findAP(word)
            embedding_matrix[i] = np.asarray(vec0 + vec1, dtype='float32')
        else:
            not_in_model += 1
    print(str(not_in_model) + ' words not in w2v model')

    from keras.layers import Embedding
    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    ###################################################################
    print('(5) training model...')
    from keras.layers import Dense, Dropout
    from keras.layers import LSTM
    from keras.models import Sequential

    model = Sequential()
    model.add(embedding_layer)
    model.add(LSTM(400, dropout=0.5))
    # model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(labels.shape[1], activation='softmax'))
    model.summary()
    # plot_model(model, to_file='model.png', show_shapes=True)
    # exit(0)

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    print(model.metrics_names)
    model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=100, batch_size=128)
    # model.save('word_vector_lstm.h5')

    ###################################################################
    print('(6) testing model...')
    finalRes = model.evaluate(x_test, y_test)
    print(finalRes)
    return finalRes[1]
# project = "db4o" # cs_packages = ["Db4objects.","Db4oUnit"] # java_packages = ["db4o."] usage_type = "method" with open(URL,"r") as f: data = f.readlines() keys = list() for line in data: line = line.strip() splits = line.split("-") keys.append(splits[0]) cs_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/cs_vectors_global_local.txt",binary=False) java_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/java_vectors_global_local.txt",binary=False) for key in keys: try: vector = java_vectors[key] k_nearest= cs_vectors.similar_by_vector(vector, topn=50) relevant_k = list() for k in k_nearest: if check_if_token_is_method_signature(k[0]) == True: # if check_package_include(java_packages,k[0]) == True: relevant_k.append(k[0]) if len(relevant_k) != 0:
#!/usr/bin/python3
from gensim.models.keyedvectors import KeyedVectors

VECTORS_LOCATION = '../vectors/GoogleNews-vectors-negative300.bin'
OUTPUT_LOCATION = '../vectors/google_vecs.txt'

word_vectors = KeyedVectors.load_word2vec_format(VECTORS_LOCATION, binary=True)
word_vectors.save_word2vec_format(OUTPUT_LOCATION, binary=False)
def init_model():
    global model
    print('Loading trained model...')
    model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
    print('Model is ready.')
odd = '/home/dpappas/'
###########################################################
w2v_bin_path = '/home/dpappas/bioasq_all/pubmed2018_w2v_30D.bin'
idf_pickle_path = '/home/dpappas/bioasq_all/idf.pkl'
###########################################################
avgdl, mean, deviation = 21.688767020746013, 0.7375801616020308, 1.3411418040865049
print(avgdl, mean, deviation)
###########################################################
k_for_maxpool = 5
k_sent_maxpool = 5
embedding_dim = 30  # 200
###########################################################
print('loading idfs')
idf, max_idf = load_idfs(idf_pickle_path)
print('loading w2v')
wv = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
wv = dict([(word, wv[word]) for word in wv.vocab.keys()])
###########################################################
my_seed = 1
random.seed(my_seed)
torch.manual_seed(my_seed)
###########################################################
print('Compiling model...')
model = Sent_Posit_Drmm_Modeler(embedding_dim=embedding_dim, k_for_maxpool=k_for_maxpool)
if (use_cuda):
    model = model.cuda()
###########################################################
resume_from = '/home/dpappas/bioasq_w2vjpdrmm_demo_run_0/best_dev_checkpoint.pth.tar'
load_model_from_checkpoint(resume_from)
print(indexi)

start = time.time()

num_features = 200    # Word vector dimensionality
min_word_count = 20   # Minimum word count
num_workers = 40      # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

model_name = str(num_features) + "features_" + str(min_word_count) + "minwords_" + str(
    context) + "context_len2alldata"
# Load the trained Word2Vec model.
model_name = 'wordvectors_reuters_nonpoly.txt'
# model = Word2Vec.load(model_name)  # .syn0

# Get word vectors for all words in vocabulary.
model = KeyedVectors.load_word2vec_format(model_name, binary=False)
word_vectors = model.syn0

all = pd.read_pickle('all.pkl')

start1 = time.time()
start = time.time()

# Set number of clusters.
num_clusters = 60
idx, idx_proba = cluster_GMM(num_clusters, word_vectors)
idx_proba[idx_proba < 0.2] = 0
n_clusteri = num_clusters

f = open(filename, 'a')
print("number of k clusters", str(n_clusteri))
f.write("number of k clusters " + str(n_clusteri) + "\n")
start2 = time.time()
f.write("time taken in clustering " + str(start2 - start1) + "\n")

# Uncomment below lines for loading saved cluster assignments and probability of cluster assignments.
from gensim.models.keyedvectors import KeyedVectors
import logging
from scipy import stats
import numpy as np
from sklearn import metrics

file = input('The vector file:')
model = KeyedVectors.load_word2vec_format(file, binary=False)

# verbs
similar = model.most_similar('击败')
print('击败:')
print(similar)
print('\n')

similar = model.most_similar('引用')
print('引用:')
print(similar)
print('\n')

similar = model.most_similar('研究')
print('研究:')
print(similar)
print('\n')

similar = model.most_similar('形成')
print('形成:')
print(similar)
print('\n')

similar = model.most_similar('增加')
print('增加:')
print(similar)
print('\n')
)  # list9+list10+list11+list12, list99+list1010+list1111+list1212
# test_data, test_label = preprocessing(list1+list2+list3+list4), enumerate_list(list111+list222+list333+list444)
# list1+list2+list3+list4, list11+list22+list33+list44

# note: full test dataset is too huge -> memory error, so use part of it
test_data_list = pd.read_csv(
    os.getcwd() + '\\data\\EI-reg-En-part-test.csv')['Tweet'].tolist()
test_label_list = pd.read_csv(
    os.getcwd() + '\\data\\EI-reg-En-part-test.csv')['Affect Dimension'].tolist()
test_data, test_label = preprocessing(test_data_list), test_label_list

print("Train shape:", len(train_data), len(train_label))
print("Validation shape:", len(dev_data), len(dev_label))
print("Test shape:", len(test_data), len(test_label))

# Loading all models
glove_model = KeyedVectors.load_word2vec_format(
    'word2vec.twitter.27B.100d.txt', binary=False)  # load GloVe model
w2v_model = Word2Vec.load('w2v_model.bin')  # load word2vec model
e2v_model = gsm.KeyedVectors.load_word2vec_format(
    'emoji2vec.bin', binary=True)  # load emoji2vec model
print("All Models Loaded!")

# word embedding data with glove pretrained model and real word2vec/w2v
input_data = np.concatenate((train_data, dev_data, test_data))
max_sequence_length = max([len(x) for x in input_data])  # find the length of the longest tweet
print("Max twitter length:", max_sequence_length)
print("input_data shape:", len(input_data))


# Find embedding for corpus
def embedding(data, max_len):
        y_true = sample['labels'].numpy()
        if task not in recalls:
            recalls[task] = 0.
        recalls[task] += get_recall(y_true, y)
        if task not in counts:
            counts[task] = 0
        counts[task] += 1
    recalls = {task: recall / counts[task] for task, recall in recalls.items()}
    if save_argmax:
        return argmaxes
    else:
        return recalls


print('Loading word vectors...')
we = KeyedVectors.load_word2vec_format(args.we_path, binary=1)

testset = CrossTask(
    data_path=args.data_path,
    features_path=args.features_path,
    features_path_3D=args.features_path_3D,
    we=we,
    feature_framerate=args.feature_framerate,
    feature_framerate_3D=args.feature_framerate_3D,
    we_dim=args.we_dim,
    max_words=args.max_words,
)
testloader = DataLoader(
    testset,
    batch_size=1,
    num_workers=args.num_thread_reader,
    shuffle=False,
    avg_topic_vectors, max_topic_vectors = get_topic_emb(lda)
    if maxPool:
        topic_vectors = max_topic_vectors
    else:
        topic_vectors = avg_topic_vectors
    cos_sim = cosine_similarity(user_vectors, topic_vectors)
    user_topic_mapping = create_user_topic_mapping(cos_sim, threshold)
    return user_topic_mapping


"""Create user and topic embeddings using SO_Word2Vec_200"""

word_vectors = KeyedVectors.load(
    "/home/norberteke/PycharmProjects/Thesis/data/SO_pre-trained_vectors.kv", mmap='r')

CUSTOM_FILTERS = [lambda x: strip_numeric, remove_stopwords]


def word2vec_embedding_lookup(words):
    vectors = []
    for w in words:
        try:
            vec = word_vectors[w]
            vectors.append(vec)
        except:
            try:
                w_transformed = w.replace(".", "").replace("=", "").replace(
                    "-", "").replace("*", "").replace("'", "").replace(
def add_embedding(self, embedding_file):
    self.embedding_files.append(KeyedVectors.load_word2vec_format(embedding_file, binary=self.binary))
# Remove umlauts, lowercase
def deUmlaut(value):
    value = re.sub('ä', 'ae', value)
    value = re.sub('ö', 'oe', value)
    value = re.sub('ü', 'ue', value)
    value = re.sub('Ä', 'Ae', value)
    value = re.sub('Ö', 'Oe', value)
    value = re.sub('Ü', 'Ue', value)
    value = re.sub('ß', 'ss', value)
    return value


# Load the model in C binary format
print('Loading Word2Vec model… ', end='')
try:
    model = KeyedVectors.load_word2vec_format('german.model', binary=True)
    print('OK.')
except:
    print('FAIL.')

entry_vectors = []
tokenizer = RegexpTokenizer(r'\w+')
stopwords = set(stopwords.words('german'))

for entry in df['entry']:
    entry = deUmlaut(entry)
    sentence = tokenizer.tokenize(entry)
    # Remove stopwords
    sentence[:] = (word for word in sentence if word.lower() not in stopwords)
    vectors = []
    for word in sentence:
import codecs
from sklearn.manifold import TSNE
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import csv
from sklearn.metrics.pairwise import cosine_similarity
from util import vector_averaging
from util import vector_averaging_with_tfidf
from util import process_source_code
from util import word2weight
import sys

DIMENSION = 25

csv.field_size_limit(sys.maxsize)

PROJECT = "cordova"

cs_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/cs_vectors_10_25_include_functions.txt", binary=False)
java_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/java_vectors_10_25_include_functions.txt", binary=False)

with open("./sentences/sentences_cs_10.txt", "r") as cs_f:
    cs_data = cs_f.readlines()
with open("./sentences/sentences_java_10.txt", "r") as java_f:
    java_data = java_f.readlines()

cs_sentences = [x for x in cs_data]
java_sentences = [x for x in java_data]

cs_word2weight = word2weight(cs_sentences)
java_word2weight = word2weight(java_sentences)
# print cs_word2weight

# Predicting part ----------------------------------------------
# print(cosine_similarity(cs_vectors["while"].reshape(1,-1), java_vectors["class"].reshape(1,-1))
from flask_heroku import Heroku
heroku = Heroku(app)

# Stripe info
stripe_keys = {
    'secret_key': 'sk_test_p0wapFDbjIuNPA2HLHJSao9n00CB0hSEyt',
    'publishable_key': 'pk_test_iLDTkTXOBsqmeIKupSLGYhHQ00SAHfIA4d'
}
stripe.api_key = stripe_keys['secret_key']

# Load Google word2vec model
from gensim.models.keyedvectors import KeyedVectors
basedir = os.path.abspath(os.path.dirname(__file__))
model_path = 'GoogleNews-vectors-negative300-SLIM.bin.gz'
w2v_model = KeyedVectors.load_word2vec_format(os.path.join(basedir, model_path), binary=True, limit=100000)

from DocSim import DocSim
ds = DocSim(w2v_model)


# ======== Routing ===========================================================
# -------- Login -------------------------------------------------------------
@app.route('/', methods=['GET', 'POST'])
def login():
    if not session.get('logged_in'):
        form = forms.LoginForm(request.form)
        if request.method == 'POST':
            username = request.form['username'].lower()
            password = request.form['password']
            if form.validate():
    search_vec = np.zeros(200)
    counter_w = 0
    for w in words:
        if w in word_model:
            search_vec = search_vec + word_model[w]
            counter_w += 1
    search_vec = search_vec / counter_w
    results = game_model.most_similar([search_vec], topn=10)
    df_results = pd.DataFrame.from_records(results, columns=['id', 'similarity'])
    df_results['id'] = df_results.id.astype(np.int64)
    df_results = pd.merge(df_results, data, on='id')
    return df_results.to_json(orient='index')


word_model = KeyedVectors.load_word2vec_format(
    os.path.join(app_path, 'models/glove_w2v_200'))
game_model = KeyedVectors.load_word2vec_format(
    os.path.join(app_path, 'models/gamevectors_w2c'))
data = pickle.load(
    open(os.path.join(app_path, 'data_processed/tf_idf_data.p'), 'rb'))

app = Flask(__name__)


@app.route('/')
def index():
    return render_template(os.path.join(app_path, 'index.html'))


@app.route('/get_search')
def __init__(self, name, data_filename):
    super().__init__(name)
    self.model = KeyedVectors.load_word2vec_format(data_filename, binary=True)
# import gensim
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.7
set_session(tf.Session(config=config))

# basic model for tag recommendation
# Embedding layer -> BiLSTM -> Dense with softmax

# word model
# embeddings_file_bin = '../glove/vectors.bin'
# word_model = KeyedVectors.load_word2vec_format('../glove/vectors.txt', binary=False, unicode_errors='ignore')
word_model = KeyedVectors.load_word2vec_format('word2vec/vec_Body_Title.bin', binary=True, unicode_errors='ignore')
# meta_model = KeyedVectors.load_word2vec_format('metapath2vec/code_metapath2vec/stack_new_1000', binary=True, unicode_errors='ignore')

user_id = pickle.load(open("user.p", 'rb'))
user_tag = pickle.load(open("user_tags.p", 'rb'))
user_num = pickle.load(open("user_num.p", 'rb'))
count = len(user_tag)

meta_model = {}
openfile = open("graph_train.emd", 'r')
for line in openfile:
    arr = line.split()
    meta_model[arr[0]] = arr[1:]
# print meta_model['0']
import csv
from sklearn.metrics.pairwise import cosine_similarity
from util import vector_averaging
from util import vector_averaging_with_tfidf
from util import process_source_code
from util import process_diff_srcml
from util import process_diff_srcml2
from util import word2weight
from util import process_expression
from util import mean_average_precision
from util import average_precision
from util import precision_at_k
import sys

DIMENSION = 20

cs_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/cs_vectors_11_20.txt", binary=False)
java_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/java_vectors_11_20.txt", binary=False)

with open("./sentences/sentences_cs_11.txt", "r") as cs_f:
    cs_data = cs_f.readlines()
with open("./sentences/sentences_java_11.txt", "r") as java_f:
    java_data = java_f.readlines()

cs_sentences = [x for x in cs_data]
java_sentences = [x for x in java_data]

cs_word2weight = word2weight(cs_sentences)
java_word2weight = word2weight(java_sentences)

with codecs.open("./evaluation_data/keywords.csv", "r") as f_csv:
        ax1.plot(self.x, self.losses, label="loss")
        ax1.plot(self.x, self.val_losses, label="val_loss")
        ax1.legend()
        ax2.plot(self.x, self.acc, label="accuracy")
        ax2.plot(self.x, self.val_acc, label="validation accuracy")
        ax2.legend()
        plt.show()


plot = PlotLearning()

# this is how you load the model
model = KeyedVectors.load_word2vec_format(
    "G:\\NLP\\Dataset\\GoogleNews-vectors-negative300.bin", binary=True, limit=100000)

batch_size = 32
embedding_size = 128
nclass = 15

# Convolution
kernel_size = 5
filters1 = 64
filters2 = 128
filters3 = 256
filters4 = 512
filters5 = 1024
pool_size = 4
# if "System." in split[0] or "antlr" in split[0].lower(): cs_signature_tokens.append(split[0]) print "cs tokens : " + str(len(cs_signature_tokens)) for java_emb in java_embeddings: split = java_emb.split(" ") if func(split[0]) == True: if check_package_include(java_packages,split[0]) == True: # if "java." in split[0] or "antlr" in split[0].lower(): java_signature_tokens.append(split[0]) print "java tokens : " + str(len(java_signature_tokens)) print "Loading word embedding..........." cs_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/cs_vectors_new_window3.txt",binary=False) java_vectors = KeyedVectors.load_word2vec_format("./bi2vec_vectors/java_vectors_new_window3.txt",binary=False) print "Finish loading.............." # print cs_vectors.similar_by_vector(java_vectors["java.util.concurrent.locks.Lock.lock()"], topn=30) # print cs_vectors.similar_by_vector(java_vectors["package"], topn=30) def check_if_relevant_k_contains_exact_name(method_source, relevant_k): check = False for k in relevant_k: split = k.split(".")
# coding=utf-8
from gensim.models.keyedvectors import KeyedVectors
import os

bin_file = 'vectors.bin'
text_file = 'vectors.txt'
bin_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), bin_file)
text_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), text_file)

model = KeyedVectors.load_word2vec_format(bin_file, binary=True)
model.save_word2vec_format(text_file, binary=False)
] man = ["he", "man", "male", "him", "boy", "son", "father", "brother"] if (FEMALE): use = woman name = "w" else: use = man name = "m" # adjectives #f = open('out_adjs_'+name,'w') i = 0 for country in countries: wv = KeyedVectors.load_word2vec_format(country, binary=True) for a in adjs: avg = 0. count = 0 for w in use: avg = avg + wv.similarity(a, w) print str(avg) + " " + str(wv.similarity(a, w)) avg = avg / float(len(use)) print avg if (i == 2): break i = i + 1
import gensim
from gensim.models.keyedvectors import KeyedVectors

w2v = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

text = open('w2v.300d.txt', 'w')
for word in w2v.__dict__['vocab']:
    arr = w2v[word]
    # print(arr)
    string = ' '.join([word, ' '.join([str(x) for x in arr])])
    print(string, file=text)
def load_w2v_data(self, binary_file_name):
    self.w2v_model = KeyedVectors.load_word2vec_format(
        os.path.join(DATA_PATH, binary_file_name), binary=True)
for line in f:
    if skip_head:
        skip_head = False
        continue
    else:
        city_list.append(line.split(',')[0])

city_list = list(set(city_list))

# #### Save 'Bin_Tencent_AILab_ChineseEmbedding.bin' in '../embedding'
if 'Bin_Tencent_AILab_ChineseEmbedding.bin' not in os.listdir('../embedding'):
    print('saving word embeddings...')
    embedding_file = '../embedding/Tencent_AILab_ChineseEmbedding.txt'
    wv = KeyedVectors.load_word2vec_format(embedding_file, binary=False)
    wv.init_sims(replace=True)
    wv.save('../embedding/Bin_Tencent_AILab_ChineseEmbedding.bin')

# #### Load Word Embeddings
print('loading word embeddings...')
wv = KeyedVectors.load('../embedding/Bin_Tencent_AILab_ChineseEmbedding.bin', mmap='r')
wv.vectors_norm = wv.vectors  # prevent recalc of normed vectors

# #### Save 'expanded_keywords.csv' in 'resources'
from collections import defaultdict
import numpy as np
import time
import gensim
from gensim.models.keyedvectors import KeyedVectors
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from collections import Counter

get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:

path = r"/Users/venkat_kannan/Documents/glove.6B.50d.txt.w2v"
t0 = time.time()
glove = KeyedVectors.load_word2vec_format(path, binary=False)
t1 = time.time()
print("elapsed %ss" % (t1 - t0))
# 50d: elapsed 17.67420792579651s
# 100d:


# In[44]:

type(glove['pizza'])


# In[45]:

import re, string
punc_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))
def downloadGlove(file="C:/glove/glove.6B.50d.txt.w2v"):
    gant = KeyedVectors.load_word2vec_format(file, binary=False)
    return gant
eclipse_input_file = 'data_eclipse_openj9.csv'
eclipse_output_file = 'data_eclipse_openj9_classified.csv'
we_path = '../Classifier/SO_vectors_200.bin'
encoder = preprocessing.LabelEncoder()


def print_classes():
    print('Classes:')
    for index, encoded_class in enumerate(list(encoder.classes_)):
        print(index, encoded_class)
    print('\n')


print("Loading word embeddings...")
we = KeyedVectors.load_word2vec_format(we_path, binary=True)
print("Loaded word embeddings!")

data_set = utils.read_csv(eclipse_input_file)

train_set = []
pool = []
for example in data_set:
    if type(example['purpose']) is str:
        train_set.append(example)
    else:
        pool.append(example)

train_set_x, train_set_y = utils.split_dataset_to_x_y(train_set)
train_set_x_tokenized = utils.tokenize(train_set_x)
train_set_x_mean_vectors = utils.get_mean_vectors(we, train_set_x_tokenized, train_set_y)
def load_model(self):
    logging.info("Loading FastText model...")
    return KeyedVectors.load(os.path.join("model_fasttext", "model.model"))
def load_emb():
    word_vectors = KeyedVectors.load_word2vec_format(
        '/net/data/cemi/saleh/embeddings/pubmed_s100w10_min.bin', binary=True)
    return word_vectors
def loadEmbeddingsDataset(path, binaryFormat):
    if binaryFormat:
        return KeyedVectors.load_word2vec_format(path, binary=True)
    else:
        return KeyedVectors.load_word2vec_format(path)
def setUp(self):
    self.vectors = EuclideanKeyedVectors.load_word2vec_format(
        datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64)
def train_cnn(features, labels, type, max_sequence_len=MAX_SEQUENCE_LENGTH, max_nb_words=MAX_NUM_WORDS,
              embedding_dim=EMBEDDING_DIM, validation_split=VALIDATION_SPLIT, learning_rate=LEARNING_RATE,
              batch_size=BATCH_SIZE, dropout_rate=DROPOUT_RATE, innerlayer_dropout_rate=INNERLAYER_DROPOUT_RATE):
    '''
    Trains a 1-d CNN on climate change news articles.
    Referenced from https://www.microsoft.com/developerblog/2017/12/04/predicting-stock-performance-deep-learning/
    '''
    labels_index = {0: 0, 1: 1}

    # tokenize text
    tokenizer = Tokenizer(num_words=max_nb_words)
    tokenizer.fit_on_texts(features)
    sequences = tokenizer.texts_to_sequences(features)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=max_sequence_len)

    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    # split the data into a training set and a validation set
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

    x_train = data[:-num_validation_samples]
    y_train = labels[:-num_validation_samples]
    x_val = data[-num_validation_samples:]
    y_val = labels[-num_validation_samples:]

    num_words = min(max_nb_words, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

    if type == 'glove':
        embeddings_index = index_glove()
        print("Preparing embedding matrix")
        for word, i in word_index.items():
            if i >= max_nb_words:
                continue
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                # If vector returns none, zeroes are placed instead
                embedding_matrix[i] = embedding_vector

    elif type == 'google':
        print("Using Google pretrained word embeddings.")
        word_vectors = KeyedVectors.load_word2vec_format(
            './saved_models/GoogleNews-vectors-negative300.bin', binary=True)
        for word, i in word_index.items():
            if i >= max_nb_words:
                continue
            if word in word_vectors.wv.vocab:
                embedding_vector = word_vectors[word]
                if embedding_vector is not None:
                    # If vector returns none, zeroes are placed instead
                    embedding_matrix[i] = embedding_vector

    elif type == 'ours':
        print("Using our trained word embeddings.")
        word_vectors = KeyedVectors.load('./saved_models/final_w2v/w2v_1_both')
        for word, i in word_index.items():
            if i >= max_nb_words:
                continue
            if word in word_vectors.wv.vocab:
                embedding_vector = word_vectors[word]
                if embedding_vector is not None:
                    # If vector returns none, zeroes are placed instead
                    embedding_matrix[i] = embedding_vector

    # load pre-trained word embeddings into an Embedding layer
    # note that we set trainable = False so as to keep the embeddings fixed
    embedding_layer = Embedding(
        num_words,
        EMBEDDING_DIM,
        embeddings_initializer=Constant(embedding_matrix),
        input_length=max_sequence_len,
        trainable=False)

    sequence_input = Input(shape=(max_sequence_len,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # Model architecture
    x = Conv1D(128, 5, activation='relu', kernel_initializer='lecun_uniform')(embedded_sequences)
    x = MaxPooling1D(3)(x)
    x = Dropout(innerlayer_dropout_rate)(x)

    x = Conv1D(128, 5, activation='relu', kernel_initializer='lecun_uniform')(x)
    x = MaxPooling1D(3)(x)
    x = Dropout(innerlayer_dropout_rate)(x)

    x = Conv1D(128, 5, activation='relu', kernel_initializer='lecun_uniform')(x)
    x = MaxPooling1D(35)(x)  # global max pooling
    x = Flatten()(x)

    x = Dense(100, activation='relu', kernel_initializer='lecun_uniform')(x)
    # best initializers: glorot_normal, VarianceScaling, lecun_uniform
    x = Dropout(dropout_rate)(x)

    preds = Dense(len(labels_index), activation='softmax')(x)  # no initialization in output layer

    model = Model(sequence_input, preds)

    adam = optimizers.Adam(lr=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-08,
                           decay=0.0, clipvalue=0.5)  # , clipnorm=1.
    rmsprop = optimizers.RMSprop(lr=LEARNING_RATE, rho=0.9, epsilon=1e-08, decay=0.00)

    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])

    history = History()
    early_stopping = EarlyStopping(monitor='val_loss', patience=10)

    history = model.fit(x_train, y_train,
                        batch_size=32,
                        epochs=24,
                        validation_data=(x_val, y_val),
                        callbacks=[early_stopping, history])

    model.save('./saved_models/classifiers/cnn_earlystop_{}'.format(type))

    # plotting
    plt.figure(1)

    # summarize history for accuracy
    plt.subplot(211)
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.legend(['train', 'test'], loc='upper left')

    # summarize history for loss
    plt.subplot(212)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.savefig('cnn_earlystop_{}.png'.format(type), bbox_inches='tight')
def load_model(self, datatype):
    path = datapath('high_precision.kv.txt')
    kv = KeyedVectors.load_word2vec_format(path, binary=False, datatype=datatype)
    return kv