def __init__(self, sentences=None, size=300, alpha=0.025, window=8, min_count=5,
             sample=0, seed=1, workers=1, min_alpha=0.0001, dm=1, hs=1, negative=0,
             dm_mean=0, train_words=True, train_lbls=True, **kwargs):
    """
    Initialize the model from an iterable of `sentences`. Each sentence is a
    LabeledSentence object that will be used for training.

    The `sentences` iterable can be simply a list of LabeledSentence elements, but for
    larger corpora, consider an iterable that streams the sentences directly from
    disk/network.

    If you don't supply `sentences`, the model is left uninitialized -- use if you plan
    to initialize it in some other way.

    `dm` defines the training algorithm. By default (`dm=1`), distributed memory is used.
    Otherwise, `dbow` is employed.

    `size` is the dimensionality of the feature vectors.

    `window` is the maximum distance between the current and predicted word within a sentence.

    `alpha` is the initial learning rate (will linearly drop to zero as training progresses).

    `seed` = for the random number generator.

    `min_count` = ignore all words with total frequency lower than this.

    `sample` = threshold for configuring which higher-frequency words are randomly
    downsampled; default is 0 (off), useful value is 1e-5.

    `workers` = use this many worker threads to train the model (=faster training with
    multicore machines).

    `hs` = if 1 (default), hierarchical sampling will be used for model training (else set to 0).

    `negative` = if > 0, negative sampling will be used, the int for negative specifies how
    many "noise words" should be drawn (usually between 5-20).

    `dm_mean` = if 0 (default), use the sum of the context word vectors. If 1, use the mean.
    Only applies when dm is used.
    """
    Word2Vec.__init__(self, size=size, alpha=alpha, window=window, min_count=min_count,
                      sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
                      sg=(1 + dm) % 2, hs=hs, negative=negative, cbow_mean=dm_mean, **kwargs)
    self.train_words = train_words
    self.train_lbls = train_lbls
    if sentences is not None:
        self.build_vocab(sentences)
        self.train(sentences)
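# Hedged usage sketch (not part of the class above): assuming the enclosing class is a
# Doc2Vec-style subclass of Word2Vec named `Doc2Vec`, and that a `LabeledSentence(words,
# labels)` container is available in this codebase. Names are illustrative only.
sentences = [
    LabeledSentence(words=["human", "machine", "interface"], labels=["SENT_0"]),
    LabeledSentence(words=["graph", "minors", "survey"], labels=["SENT_1"]),
]
model = Doc2Vec(sentences, size=100, window=8, min_count=1, workers=2, dm=1)
# After training, the label vectors live alongside the word vectors in the shared vocabulary,
# so a label can be looked up the same way as a word.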
def wsp_similarity(cls, start: WSP, end: WSP):
    total_vector = np.zeros(shape=(300,), dtype=np.float32)
    for word in start.onyms:
        if Word2Vec.contains(word):
            total_vector = np.add(Word2Vec.vector(word), total_vector)
    Word2Vec.add_vector(start.name, total_vector)
    return Word2Vec.similarity(start.name, end.word)
def __init__(self, sentences, model_file=None, size=200, alpha=0.025, window=5,
             min_count=5, sample=0, seed=1, workers=16, min_alpha=0.0001, model="cb",
             hs=1, negative=0, cbow_mean=0, iteration=1, word_learn=1, init_adjust=True,
             update_mode=0, normalize_each_epoch=False):
    self.sg = 1 if model == "sg" or model == "dbow" else 0
    self.table = None  # for negative sampling --> this needs a lot of RAM! consider setting back to None before saving
    self.alpha = float(alpha)
    self.window = int(window)
    self.seed = seed
    self.sample = sample
    self.workers = workers
    self.min_alpha = min_alpha
    self.hs = hs
    self.negative = negative
    self.cbow_mean = int(cbow_mean)
    self.iteration = iteration
    self.word_learn = int(word_learn)
    self.layer1_size = size
    self.min_count = min_count
    self.sent_no_hash = {}          # mapping sent_id to index of self.sents
    self.sent_id_list = []          # mapping sent_no to sent_id
    self.sane_vec_len = 100000      # for sanity check
    self.sane_max_sim10 = 0.9       # for sanity check
    self.init_adjust = init_adjust  # for adjustment of initialization
    self.update_mode = update_mode  # 0: SGD, 1: AdaGrad, 2: AdaDelta (3: ADAM not implemented)
    self.normalize_each_epoch = normalize_each_epoch
    if sentences:
        if model_file:
            self.w2v = Word2Vec.load(model_file)
            self.vocab = self.w2v.vocab
            self.layer1_size = self.w2v.layer1_size
            self.build_vec(sentences, has_vocab=True)
        else:
            self.word_learn = 1
            self.w2v = Word2Vec(None, self.layer1_size, self.alpha, self.window,
                                self.min_count, self.sample, self.seed, self.workers,
                                self.min_alpha, self.sg, self.hs, self.negative,
                                self.cbow_mean)
            self.build_vec(sentences, has_vocab=False)
        self.train_iteration(sentences, iteration=iteration)
def trainWord2Vec(doc_list=None, buildvoc=1, passes=10, sg=1, size=100, dm_mean=0,
                  window=5, hs=0, negative=5, min_count=1, workers=1):
    model = Word2Vec(size=size, sg=sg, window=window, hs=hs, negative=negative,
                     min_count=min_count, workers=workers, compute_loss=True)
    if buildvoc == 1:
        print('Building Vocabulary')
        model.build_vocab(doc_list)  # build vocabulary with words + nodeID

    for epoch in range(passes):
        print('Iteration %d ....' % epoch)
        # shuffle(doc_list)  # shuffling gets best results
        model.train(doc_list, total_examples=len(doc_list), epochs=1)
        print(model.running_training_loss)

    print(model.sg, model.window, model.hs, model.min_count)
    print('batch words', model.batch_words)
    return model
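# Hedged usage sketch: assuming `Word2Vec` here is gensim's gensim.models.Word2Vec from a
# version that still accepts `size=` and `compute_loss=True` (as the function above does),
# the helper can be driven with a small in-memory corpus of token lists.
docs = [["node1", "likes", "node2"], ["node2", "follows", "node3"], ["node1", "follows", "node3"]]
w2v_model = trainWord2Vec(doc_list=docs, passes=3, size=50, window=2, workers=1)
print(w2v_model.wv.most_similar("node2", topn=2))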
def init_embedder(dataset):
    '''
    Initialize the embedder by loading it from file if available; otherwise build the
    model from the dataset and save it.
    '''
    fname = DIR_MODEL + '%s_embedder.pkl' % (prefix)
    if os.path.exists(fname):
        print >> sys.stderr, 'embedding model %s found and loaded' % (fname)
        return Word2Vec.load(fname)
    else:
        class x_iterator:
            def __init__(self, dataset):
                self.dataset = dataset

            def __iter__(self):
                for set_x, set_y in self.dataset:
                    for x in set_x:
                        yield x

        embedder = Word2Vec()
        embedder.build(x_iterator(dataset), dim_proj)
        embedder.dump(fname)
        return embedder
def __init__(self):
    self.db = MySQLdb.connect(host="127.0.0.1", user="******", passwd="wmmkscsie",
                              db="recommender_system", charset="utf8")
    self.cursor = self.db.cursor()
    # sql = "SELECT a.relationship_type, a.scenario_type, b.id, b.scenario_e2v_bert FROM movies as a, movies_vector as b Where a.id=b.id and a.id >= 1 and a.id <= 1171 and b.scenario_e2v_bert !=''"
    sql = "SELECT a.relationship_type, a.scenario_type, b.id, b.scenario_e2v_w2v_sg FROM movies as a, movies_vector as b Where a.id=b.id and a.id >= 1 and a.id <= 1171 and b.scenario_e2v_w2v_sg !=''"
    print(sql)
    self.cursor.execute(sql)
    self.movies_information = self.cursor.fetchall()

    # Relationship Model
    #######################
    # self.model = CNN_E2V_BERT()
    # For Produce Vector
    # self.bert_embedding = BertEmbedding(model='bert_12_768_12', dataset_name='wiki_cn', max_seq_length=50)
    # self.relationship_e2v_bert = []
    # self.scenario_e2v_bert = []
    #######################
    self.model = CNN_E2V_W2V_SG()
    # create a word2vec object
    self.t = Word2Vec()
    self.t.train_file_setting("segmentation.txt", "e2v_w2v_sg")
    self.t.load_model()
    self.dimension = self.t.size
    self.relationship_e2v_w2v_sg = []
    self.scenario_e2v_w2v_sg = []
def main():
    tagged_words = brown.tagged_words()
    words_corpus = brown.words()

    word2vec = Word2Vec()
    word2vec.train(words_corpus)
    word_vecs = [word2vec.word2vec(word) for word in words_corpus]

    n_clusters = 10  # random number for now
    kmeans = KMeans(n_clusters)
    kmeans.compute(word_vecs)

    # word-cluster HMM
    p_word = {}
    p_cluster = {}
    p_cluster_given_word = None  # softmax
    p_word_given_cluster = None  # joint probability formula
    p_transition_cluster = None  # count
    p_initial_cluster = None     # count

    # cluster-tag HMM
    p_cluster_given_tag = None  # softmax
    p_transition_tag = None     # count from tagged data
    p_initial_tag = None        # count from tagged data

    hmm_word_cluster = HMM(p_initial_cluster, p_transition_cluster, p_word_given_cluster)
    hmm_cluster_tag = HMM(p_initial_tag, p_transition_tag, p_cluster_given_tag)

    words = []
    clusters = hmm_word_cluster.viterbi(words)
    tags = hmm_cluster_tag.viterbi(clusters)
def test_skipgram(self):
    skipgram = Word2Vec(learning_rate=self.learning_rate)
    W1_m, W2_m, loss_m = skipgram.skipgram(np.asmatrix(self.context_words),
                                           np.asmatrix(self.center_word),
                                           self.W1, self.W2, 0.)

    with tf.name_scope("skipgram"):
        x = tf.placeholder(shape=[self.V, 1], dtype=tf.float32, name="x")
        W1_tf = tf.Variable(self.W1, dtype=tf.float32)
        W2_tf = tf.Variable(self.W2, dtype=tf.float32)
        h = tf.matmul(tf.transpose(W1_tf), x)
        u = tf.stack([tf.matmul(tf.transpose(W2_tf), h) for i in range(len(self.context_words))])
        loss_tf = -tf.reduce_sum([u[i][int(np.where(c == 1)[0])]
                                  for i, c in zip(range(len(self.context_words)), self.context_words)],
                                 axis=0) \
                  + tf.reduce_sum(tf.log(tf.reduce_sum(tf.exp(u), axis=1)), axis=0)
        grad_W1, grad_W2 = tf.gradients(loss_tf, [W1_tf, W2_tf])
        init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        W1_tf, W2_tf, loss_tf, dW1_tf, dW2_tf = sess.run(
            [W1_tf, W2_tf, loss_tf, grad_W1, grad_W2],
            feed_dict={x: self.center_word.reshape(self.V, 1)})
        W1_tf -= self.learning_rate * dW1_tf
        W2_tf -= self.learning_rate * dW2_tf

    for i in range(self.V):
        for j in range(self.N):
            self.assertAlmostEqual(W1_m[i, j], W1_tf[i, j], places=5)
    for i in range(self.N):
        for j in range(self.V):
            self.assertAlmostEqual(W2_m[i, j], W2_tf[i, j], places=5)
    self.assertAlmostEqual(loss_m, float(loss_tf), places=5)
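# Hedged numpy sketch of the forward pass the TensorFlow graph above encodes: a one-hot
# center word x, hidden layer h = W1^T x, scores u = W2^T h (identical for every context
# position), and a softmax cross-entropy loss summed over the context words. Shapes and
# names below are illustrative, not taken from the project under test.
import numpy as np

def skipgram_forward_loss(x, context_words, W1, W2):
    """x: (V,) one-hot center word; context_words: (C, V) one-hot context targets."""
    h = W1.T @ x                        # (N,) hidden representation
    u = W2.T @ h                        # (V,) output scores, shared across context slots
    log_z = np.log(np.sum(np.exp(u)))   # softmax normalizer
    # negative log-likelihood of each true context word, summed over the window
    return float(sum(log_z - u[np.argmax(c)] for c in context_words))

V, N = 6, 3
rng = np.random.default_rng(0)
W1, W2 = rng.normal(size=(V, N)), rng.normal(size=(N, V))
x = np.eye(V)[0]          # center word id 0
ctx = np.eye(V)[[1, 2]]   # two context words, ids 1 and 2
print(skipgram_forward_loss(x, ctx, W1, W2))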
def test_cbow(self):
    cbow = Word2Vec(learning_rate=self.learning_rate)
    W1_m, W2_m, loss_m = cbow.cbow(np.asmatrix(self.context_words),
                                   np.asmatrix(self.center_word),
                                   self.W1, self.W2, 0.)

    with tf.name_scope("cbow"):
        x = tf.placeholder(shape=[self.V, len(self.context_words)], dtype=tf.float32, name="x")
        W1_tf = tf.Variable(self.W1, dtype=tf.float32)
        W2_tf = tf.Variable(self.W2, dtype=tf.float32)
        hh = [tf.matmul(tf.transpose(W1_tf), tf.reshape(x[:, i], [self.V, 1]))
              for i in range(len(self.context_words))]
        h = tf.reduce_mean(tf.stack(hh), axis=0)
        u = tf.matmul(tf.transpose(W2_tf), h)
        loss_tf = -u[int(np.where(self.center_word == 1)[0])] \
                  + tf.log(tf.reduce_sum(tf.exp(u), axis=0))
        grad_W1, grad_W2 = tf.gradients(loss_tf, [W1_tf, W2_tf])
        init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)
        W1_tf, W2_tf, loss_tf, dW1_tf, dW2_tf = sess.run(
            [W1_tf, W2_tf, loss_tf, grad_W1, grad_W2],
            feed_dict={x: self.context_words.T})
        W1_tf -= self.learning_rate * dW1_tf
        W2_tf -= self.learning_rate * dW2_tf

    for i in range(self.V):
        for j in range(self.N):
            self.assertAlmostEqual(W1_m[i, j], W1_tf[i, j], places=5)
    for i in range(self.N):
        for j in range(self.V):
            self.assertAlmostEqual(W2_m[i, j], W2_tf[i, j], places=5)
    self.assertAlmostEqual(loss_m, float(loss_tf), places=5)
def process(args):
    if args.format == "adjlist":
        G = graph.load_adjacencylist(args.input, undirected=args.undirected)
    elif args.format == "edgelist":
        G = graph.load_edgelist(args.input, undirected=args.undirected)
    else:
        raise Exception("unknown file format: '%s'. valid formats: 'adjlist', 'edgelist'" % args.format)

    print("number of nodes: {}".format(len(G.nodes())))  # .format fills the {} placeholders

    num_walks = len(G.nodes()) * args.number_walks  # multiple walks per node
    print("number of walks: {}".format(num_walks))

    data_size = num_walks * args.walk_length
    print("data size (walk*length): {}".format(data_size))

    print("walking...")
    walk_file = walks.write_walks_to_disk(G, args.output, num_paths=args.number_walks,
                                          path_length=args.walk_length, alpha=0,
                                          rand=random.Random(args.seed))

    model = Word2Vec(walk_file, args.output, emb_dimension=args.representation_size,
                     window_size=args.window_size, min_count=0)
    print("Training...")
    model.skip_gram_train()
def main(input, output, iter=5, size=128, worker=4, batch_nodes=10000, negative=5,
         sample=1e-4, output_format="gensim"):
    # load karate graph in csr matrix
    RWG = RandomWalksGeneratorCSR(path=input)

    # init model
    skipgram = Word2Vec(sg=1, iter=iter, min_count=0, size=size, workers=worker,
                        batch_words=batch_nodes, sample=sample, negative=negative)

    # build vocab
    skipgram.build_vocab(RWG)

    # learn embedding
    skipgram.train(RWG)

    if output_format == "gensim":
        skipgram.save(output)
    elif output_format == "txt":
        skipgram.save_word2vec_format(output)
def average_distance(cls, start: WSP, end: WSP):
    total_distance = 0
    for word in start.onyms:
        if Word2Vec.contains(word):
            total_distance += cls.distance(word, end.word)
    average_distance = total_distance / len(start.onyms)
    return average_distance
def load_embeddings(self):
    # check if embeddings saved in cache
    if os.path.exists(R.EMB.format(self.dim)):
        # read from cache and return
        return pickle.load(open(R.EMB.format(self.dim), 'rb'))

    # read model from word2vec
    model = Word2Vec(self.dim).get_model()
    embeddings = []
    for w in self._vocab:
        # if word in model
        if w in model:
            emb = model[w]
        # else check if lower-case of w in model
        elif w.lower() in model:
            emb = model[w.lower()]
        # else fall back to a zero vector
        else:
            emb = np.zeros(self.dim)
        # keep track of embedding
        embeddings.append(emb)

    # stack into a single np.array
    embeddings = np.stack(embeddings)
    # attach to self
    self.emb = embeddings
    # write to cache
    pickle.dump(self.emb, open(R.EMB.format(self.dim), 'wb'))
    # make sure vocab size == num of embeddings
    assert self.vocab_size() == self.emb.shape[0]
    return self.emb
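# Hedged standalone illustration of the lookup fallback above (exact word -> lower-cased
# word -> zero vector), using a plain dict in place of the word2vec model; the names here
# are illustrative only.
import numpy as np

def lookup_with_fallback(word, vectors, dim):
    if word in vectors:
        return vectors[word]
    if word.lower() in vectors:
        return vectors[word.lower()]
    return np.zeros(dim)

vectors = {"apple": np.ones(3)}
print(lookup_with_fallback("Apple", vectors, 3))  # hits the lower-cased entry
print(lookup_with_fallback("pear", vectors, 3))   # falls back to the zero vector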
def test_word2vec():
    data = [
        'Merge multiple sorted inputs into a single sorted output',
        'The API below differs from textbook heap algorithms in two aspects'
    ]
    wv = Word2Vec(vec_len=50)
    wv.train(data, model='cbow')
    print(wv['into'])
def export_vocab(tweets, vocab_size, export=True):
    words = []
    for tweet in tweets:
        words.extend(tweet)
    vocab = Word2Vec.vocab_to_num(words, vocab_size)
    if export:
        np.save('./data/vocab.npy', vocab)
    return vocab
def key_words(self, string, top_number=10):
    # use the pretrained '5.9m' word2vec model
    model_name = '5.9m'
    model_download = W2VModelDownload(bq_project)
    model_download.download_w2v_model('patent_landscapes', model_name)
    word2vec5_9m = Word2Vec('5.9m')
    w2v_runtime = word2vec5_9m.restore_runtime()
    return w2v_runtime.find_similar(string, top_number)
def initialise_model(data):
    input_file = 'test.txt'
    f = open(input_file, 'w')
    input_txt = get_all_text(data)
    f.write(input_txt)
    f.close()

    model = Word2Vec(LineSentence(input_file), size=100, window=5, sg=0, min_count=1, workers=8)
    model.save(input_file + '.model')
    model.save_word2vec_format(input_file + '.vec')
def load_cat2vec_format(cls, cat_model=None, sent_model=None, word_model=None):
    """
    Load sentence vectors
    """
    model = Category2Vec(None)
    count = 0
    if cat_model:
        logger.info("loading %s object(cat) from %s" % (cls.__name__, cat_model))
        for line in open(cat_model, "r"):
            line = line.rstrip()
            if count == 0:
                info = line.split()
                model.cat_len = int(info[0])
                model.layer1_size = int(info[1])
                model.sg = int(info[2])
                model.hs = int(info[3])
                model.negative = int(info[4])
                model.cbow_mean = int(info[5])
                model.cats = empty((model.cat_len, model.layer1_size), dtype=REAL)
                model.cat_no_hash = {}
                model.cat_id_list = []
            else:
                idx = count - 1
                row = line.split("\t")
                cat_id = utils.to_unicode(row[0])
                model.cat_no_hash[cat_id] = idx
                model.cat_id_list.append(cat_id)
                vals = row[1].split()
                for j in xrange(model.layer1_size):
                    model.cats[idx][j] = float(vals[j])
            count += 1

    count = 0
    if sent_model:
        logger.info("loading %s object(sentence) from %s" % (cls.__name__, sent_model))
        for line in open(sent_model, "r"):
            line = line.rstrip()
            if count == 0:
                info = line.split()
                model.sents_len = int(info[0])
                model.sents = empty((model.sents_len, model.layer1_size), dtype=REAL)
                model.sent_no_hash = {}
                model.sent_id_list = []
            else:
                idx = count - 1
                row = line.split("\t")
                sent_id = utils.to_unicode(row[0])
                model.sent_no_hash[sent_id] = idx
                model.sent_id_list.append(sent_id)
                vals = row[1].split()
                for j in xrange(model.layer1_size):
                    model.sents[idx][j] = float(vals[j])
            count += 1

    if word_model:
        logger.info("loading word2vec from %s" % word_model)
        model.w2v = Word2Vec.load(word_model)
        model.vocab = model.w2v.vocab

    return model
def train_model(window_size, embedding_dim, batch_size_word2vec):
    file_to_save_trained_data = ('../../results/word2vec/ver6/ws-' + str(window_size) +
                                 '-embed-' + str(embedding_dim) +
                                 'batch_size-' + str(batch_size_word2vec) + '.pkl')
    word2vec = Word2Vec(window_size=window_size,
                        epoch_word2vec=epoch_word2vec,
                        embedding_dim=embedding_dim,
                        batch_size_word2vec=batch_size_word2vec,
                        file_to_save_trained_data=file_to_save_trained_data)
    vectors, word2int, int2word = word2vec.train()
def main(args: argparse.Namespace) -> None:
    """Main entrypoint for the script."""
    # Make sure we have at least one file.
    if len(args.filenames) == 0:
        logger.error('At least one text file is required!')
        exit(1)

    set_seed(args.seed)

    tokenizer = Tokenizer(max_tokens=args.max_tokens,
                          min_word_frequency=args.min_word_frequency,
                          sample_threshold=args.sample_threshold)

    start_time = time.time()
    logger.info('Building vocabulary from corpus...')
    tokenizer.build(filenames=args.filenames)
    logger.info('Finished building vocabulary (took {:.2f} seconds)'.format(
        time.time() - start_time))

    dataset = make_dataset(args.filenames, tokenizer,
                           window_size=args.window_size,
                           batch_size=args.batch_size,
                           epochs=args.epochs)
    model = Word2Vec(tokenizer,
                     hidden_size=args.hidden_size,
                     batch_size=args.batch_size,
                     n_negative_samples=args.n_negative_samples,
                     lambda_power=args.lambda_power,
                     bias=args.bias)

    # Create output directory
    args.output_dir.mkdir(parents=True, exist_ok=True)
    run_name = args.run_name or args.filenames[0].stem
    logdir = args.output_dir / get_next_run_id(args.output_dir, run_name)

    logger.info('Starting training (for {} epochs).'.format(args.epochs))
    model.train(dataset, logdir, args.initial_lr, args.target_lr,
                args.log_freq, args.save_freq)

    # Save embeddings and vocab
    #
    # The weights of the projection layer are components of the
    # embedding vectors. The i-th row of the weight matrix is the
    # embedding vector for the word whose encoded index is i.
    proj = model.weights[0].numpy()
    np.save(logdir / 'proj_weights', proj)

    # Save the tokenizer state
    tokenizer.save(logdir / 'tokenizer.json')

    # Save a list of the vocabulary words
    with open(logdir / 'vocab.txt', 'w') as file:
        for word in tokenizer.words:
            file.write(f'{word}\n')
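# Hedged follow-up sketch: once the script above has written `proj_weights.npy` and
# `vocab.txt` into the run's logdir, the embeddings can be queried directly with numpy.
# The file names come from the save calls above; the nearest-neighbour helper itself is
# illustrative, and paths assume the current directory is the run's logdir.
import numpy as np

def most_similar(query, words, vectors, topn=5):
    # cosine similarity of the query row against every embedding row
    idx = words.index(query)
    normed = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
    sims = normed @ normed[idx]
    best = np.argsort(-sims)
    return [(words[i], float(sims[i])) for i in best if i != idx][:topn]

vectors = np.load('proj_weights.npy')
with open('vocab.txt') as f:
    words = f.read().split('\n')[:len(vectors)]
print(most_similar(words[0], words, vectors))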
def export_vocab(comments, categories, vocab_size, export=True):
    words = []
    for key in comments:
        words.extend(list(itertools.chain.from_iterable(comments[key])))
    for key in categories:
        words.extend(list(itertools.chain.from_iterable(categories[key])))
    vocab = Word2Vec.vocab_to_num(words, vocab_size)
    if export:
        with open("../resources/vocab.json", "w") as f:
            json.dump(vocab, f, indent=2)
    return vocab
def init_from_config(args):
    global w2v, sparql_backend, entity_linker, facts_ranker, facts_extractor
    global wiki_url
    config_options = globals.config
    w2v = Word2Vec.init_from_config(config_options)
    sparql_backend = globals.get_sparql_backend(config_options)
    wiki_url = WikiUrl(config_options)
    entity_linker = EntityLinker.init_from_config(config_options, wiki_url)
    facts_ranker = Ranker.init_from_config(config_options)
    facts_extractor = FactExtractor.init_from_config(config_options)
def __init__(self, dataset_file=None, cv_folds=10):
    """
    :param dataset_file: path to a labeled dataset file.
    :param cv_folds: int, number of folds for cross validation
    """
    self.dataset_file = dataset_file
    self.cv_folds = cv_folds

    # read dataset
    dataset = pd.read_csv(self.dataset_file)
    text = dataset['tweet']
    self.Y = dataset['label']

    # -- Option 1: word2vec embeddings --
    w2v = Word2Vec()
    self.X = w2v.getVectors(text)
    # -- Option 2: count vectorization --
    # self.X = self.features_extraction(text)
    # -- Option 3: TFIDF --
    # self.X = self.tfidfFeatureExtraction(text)

    info('Done loading and vectorizing data.')
    info("--- Sentiment CLASSIFIERS ---")
    info("fitting ... ")
    self.accuracies = {}

    # classifiers to use
    classifiers = [
        # RandomForestClassifier(n_estimators=100),
        # SGDClassifier(),
        LinearSVC(),
        # LinearDiscriminantAnalysis(),
        # LogisticRegression(),
        # GaussianNB(),
        # DecisionTreeClassifier()
    ]

    # RUN classifiers
    for c in classifiers:
        self.classify(c)

    info('results ...')
    for k, v in self.accuracies.items():
        string = '\tAcc. {:.2f}% F1. {:.2f}% P. {:.2f} R. {:.2f} : {}'
        print(string.format(v[0] * 100, v[1] * 100, v[2] * 100, v[3] * 100, k))
    info("DONE!")
def word2vec(rdd):
    sentences = parse_sentences(rdd)
    sentences_without_id = sentences.map(lambda (_id, sent): sent)
    model = Word2Vec(size=100, hs=0, negative=8)
    dd2v = DistDoc2VecFast(model, learn_hidden=True, num_partitions=15, num_iterations=20)
    dd2v.build_vocab_from_rdd(sentences_without_id)
    print "*** done training words ****"
    print "*** len(model.vocab): %d ****" % len(model.vocab)
    return dd2v, sentences
def ExtractSent2Vec(filename):
    model = Word2Vec(LineSentence(filename), size=512, window=5, sg=0, min_count=5, workers=8)
    model.save(filename + '.model')
    model.save_word2vec_format(filename + '-01.vec')

    model = Sent2Vec(LineSentence(filename), model_file=filename + '.model')
    model.save_sent2vec_format(filename + '-02.vec')
def main():
    contexts = np.fromfile("./data/npcontexts.dat", dtype=int)
    neighbors = np.fromfile("./data/npneighbors.dat", dtype=int)
    skipgram = Word2Vec(contexts, neighbors, 35000, 10, 0.001, 64, "sg.ckpt", batch_size=500)
    skipgram.train(2)
def create_dataset(tweets, window, datafile="mapped_tweets.npy", export=True):
    if tweets is None:
        try:
            tweets = np.load(datafile).item()
        except FileNotFoundError:
            print("cannot find " + datafile)
            exit(1)
    contexts, neighbors = Word2Vec.create_dataset(tweets, window)
    if export:
        print("saving train set to file")
        contexts = np.array(contexts)
        neighbors = np.array(neighbors)
        contexts.tofile('./data/npcontexts.dat')
        neighbors.tofile('./data/npneighbors.dat')
def __init__(self, data):
    self.data = data
    self.corpus = None
    self.liu = LiuLexicon()
    self.subj = SubjLexicon()
    self.buildTweetCorpus()
    self.word_vec_model = Word2Vec(self.corpus)
    self.glove_vec_model = Glove(100, self.corpus)
    self.clusters = Cluster(100)
    self.initEncoders()
    self.topicVecs = self.word_vec_model.getVectorsForTopics(self.topicenc.classes_)
    self.collectTopUnigrams()
    self.collectTopBigrams()
def main(text):
    params = getattr(parameters, text)
    w2v = Word2Vec(params['file'],
                   window_size=params['window_size'],
                   learning_rate=params['learning_rate'],
                   vocab_size=params['vocab_size'],
                   embedding_size=params['embedding_size'],
                   n_negative=params['n_negative'])
    w2v.fit(n_iter=params['n_iter'], num_proc=params['num_proc'])
    print(w2v.process_time)
    print(w2v.process_time[-1] - w2v.process_time[0])
def main():
    # with open("/Users/johnkarasev/PycharmProjects/TweetGrouper/word2vec/contexts.json") as fp:
    #     contexts = json.load(fp)
    # with open("/Users/johnkarasev/PycharmProjects/TweetGrouper/word2vec/neighbors.json") as fp:
    #     neighbors = json.load(fp)
    print("Reading dat files")
    npn = np.fromfile("npneighbors.dat", dtype=int)
    print(str(npn.shape[0]))
    npc = np.fromfile("npcontexts.dat", dtype=int)
    print(str(npc.shape[0]))
    print("finished read")

    # train skipgram model
    skipgram = Word2Vec(npn, npc, 35000, 10, 0.001, 64, "sg.ckpt", batch_size=500)
    skipgram.train(5)

    # train cbow model
    cbow = Word2Vec(npc, npn, 35000, 10, 0.001, 64, "sg.ckpt", batch_size=500)
    cbow.train(5)
def getTextualFeature(text_reading_path):
    # Train and save the Word2Vec model for the text file.
    # Note that the dimension of the resulting feature vector can be changed via 'size'.
    model = Word2Vec(LineSentence(text_reading_path), size=500, window=5, sg=0, min_count=5, workers=8)
    model.save(text_reading_path + '.model')

    # Train and save the Sentence2Vec model for the sentence file.
    model = Sent2Vec(LineSentence(text_reading_path), model_file=text_reading_path + '.model')
    model.save_sent2vec_format(text_reading_path + '.vec')


program = os.path.basename(sys.argv[0])
def create_trainset(window, export=True):
    with open("mapped_comments.json") as f:
        comments = json.load(f)

    sentences = []
    for key, index in zip(comments, range(len(comments))):
        progress(index, len(comments), "combining sentences")
        sentences.extend(comments[key])
    sentences = list(filter(lambda x: x, sentences))
    print("finished")
    sentences = np.array(sentences)

    contexts, neighbors = Word2Vec.create_dataset(sentences, window)
    if export:
        npc = np.array(contexts)
        npn = np.array(neighbors)
        npc.tofile('npcontexts.dat')
        npn.tofile('npneighbors.dat')
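# Hedged sketch of what a `create_dataset(sentences, window)`-style helper typically does:
# emit parallel (center, neighbor) id arrays by sliding a window over each token-id
# sequence, which is the format the skip-gram trainers above read back from the .dat files.
# This is an illustration of the technique, not the project's actual helper.
import numpy as np

def make_skipgram_pairs(sentences, window):
    contexts, neighbors = [], []
    for sent in sentences:
        for i, center in enumerate(sent):
            lo, hi = max(0, i - window), min(len(sent), i + window + 1)
            for j in range(lo, hi):
                if j != i:
                    contexts.append(center)
                    neighbors.append(sent[j])
    return np.array(contexts), np.array(neighbors)

npc, npn = make_skipgram_pairs([[1, 2, 3, 4], [5, 6, 7]], window=2)
print(npc[:6], npn[:6])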
def __init__(self, sentences, model_file=None, size=200, alpha=0.025, window=5,
             min_count=5, sample=0, seed=1, workers=16, min_alpha=0.0001, model="cb",
             hs=1, negative=0, cbow_mean=0, iteration=1, word_learn=1, init_adjust=True,
             update_mode=0, normalize_each_epoch=False):
    self.sg = 1 if model == "sg" or model == "dbow" else 0
    self.table = None  # for negative sampling --> this needs a lot of RAM! consider setting back to None before saving
    self.alpha = float(alpha)
    self.window = int(window)
    self.seed = seed
    self.sample = sample
    self.workers = workers
    self.min_alpha = min_alpha
    self.hs = hs
    self.negative = negative
    self.cbow_mean = int(cbow_mean)
    self.iteration = iteration
    self.word_learn = int(word_learn)
    self.cat_learn = 1
    self.layer1_size = size
    self.min_count = min_count
    self.sent_no_hash = {}          # mapping sent_id to index of self.sents
    self.sent_id_list = []          # mapping sent_no to sent_id
    self.cat_no_hash = {}           # mapping cat_id to index of self.cats
    self.cat_id_list = []           # mapping cat_no to cat_id
    self.sane_vec_len = 100000      # for sanity check
    self.sane_max_sim10 = 0.9       # for sanity check
    self.init_adjust = init_adjust  # for adjustment of initialization
    self.update_mode = update_mode  # 0: SGD, 1: AdaGrad, 2: AdaDelta, 3: ADAM
    self.normalize_each_epoch = normalize_each_epoch  # normalize vectors after each epoch
    if sentences:
        if model_file:
            self.w2v = Word2Vec.load(model_file)
            self.vocab = self.w2v.vocab
            self.layer1_size = self.w2v.layer1_size
            self.build_vec(sentences, has_vocab=True)
        else:
            self.word_learn = 1
            self.w2v = Word2Vec(None, self.layer1_size, self.alpha, self.window,
                                self.min_count, self.sample, self.seed, self.workers,
                                self.min_alpha, self.sg, self.hs, self.negative,
                                self.cbow_mean)
            self.build_vec(sentences, has_vocab=False)
        self.train_iteration(sentences, iteration=iteration)
def word2vec_feat(reviews):
    w2v_model_file = "../../models/laptop.word2vec.model"
    w2v_model = Word2Vec.load(w2v_model_file)
    bags = []
    for review in reviews:
        bag = []
        for sent in review.sentences:
            instance = None
            count = 0.
            for w in sent:
                if w not in w2v_model:
                    continue
                if count == 0:
                    instance = w2v_model[w]
                    count += 1.
                else:
                    instance += w2v_model[w]
                    count += 1.
            instance /= count
            bag.append(instance.tolist())
        bags.append(bag)
    save_sparse_feature(corpus_name="laptop", view_name="word2vec", features=bags)
    save_view_info(view_name="word2vec", dim=100, data_format="sparse", view_type="continuous")
def load(cls, fname, mmap=None):
    model = super(Sentence2Vec, cls).load(fname, mmap)
    if os.path.isfile(fname + "_w2v"):
        model.w2v = Word2Vec.load(fname + "_w2v", mmap)
        model.vocab = model.w2v.vocab
    return model
w2v = Word2Vec(vocabulary_size=vocabulary_size,
               architecture='cbow',
               # loss_type='nce_loss',
               n_steps=2001)
# print(w2v.get_params())
w2v.fit(words)
print(w2v.final_embeddings.shape)
print(len(w2v.sort('the')))
print('words closest to %s:' % 'the')
print(w2v.sort('the')[:10])
# print([reverse_dictionary[i] for i in range(3)])
# print(w2v.transform([0,1,2,3]).shape)

save_path = w2v.save('models/test_model')
print(w2v.final_embeddings[0, 0])
print(save_path)

# restore a saved model
w2c_restored = Word2Vec.restore(save_path)
print(w2c_restored.final_embeddings[0, 0])
print(w2c_restored.dictionary['the'])
print(list(w2c_restored.reverse_dictionary.items())[:5])
def main():
    optparser = OptionParser()
    optparser.add_option("-p", "--pro", dest="product")
    (options, args) = optparser.parse_args()
    (train_file, test_file) = CORPUS[options.product]

    train_reviews = load_dataset(DATA_PATH + train_file)
    test_reviews = load_dataset(DATA_PATH + test_file)
    n_cates, cate_index = get_categories(train_reviews + test_reviews)

    vocab_size = 1000
    vocab_index = get_vocab(train_reviews, vocab_size)

    train_bags = [extract_unigram(vocab_index, vocab_size, review)
                  for review in train_reviews]
    train_X = [bag2vec(bag) for bag in train_bags]
    train_labels = [extract_labels(cate_index, review)
                    for review in train_reviews]

    test_bags = [extract_unigram(vocab_index, vocab_size, review)
                 for review in test_reviews]
    test_X = [bag2vec(bag) for bag in test_bags]
    test_labels = [extract_labels(cate_index, review)
                   for review in test_reviews]

    # add word2vec feature
    w2v_model_file = "../../models/laptop.word2vec.model"
    w2v_model = Word2Vec.load(w2v_model_file)
    train_X2 = word2vec_feat(train_reviews, w2v_model)
    train_X = merge_features(train_X, train_X2)
    test_X2 = word2vec_feat(test_reviews, w2v_model)
    test_X = merge_features(test_X, test_X2)

    labelwise_acc = []
    labelwise_output = []
    for cate in range(n_cates):
        # train a binary svm model
        train_Y = get_Y(train_labels, cate)
        prob = svm_problem(train_Y, train_X)
        # param = svm_parameter("-s 0 -t 0 -b 1")
        param = svm_parameter("-s 0 -t 2 -b 1")
        m = svm_train(prob, param)
        # test
        test_Y = get_Y(test_labels, cate)
        p_label, p_acc, p_val = svm_predict(test_Y, test_X, m, '-b 1')
        labelwise_acc.append(p_acc)
        labelwise_output.append(p_label)

    # evaluation
    p, r, f = microF1(labelwise_output, test_labels)

    # output
    out_dir = "results/rbf/"
    out_dir = "results/"
    out_file = out_dir + options.product + ".txt"
    cates = list(cate_index.items())
    cates = sorted(cates, key=lambda x: x[1])
    labelwise_acc = [(cates[i][0], labelwise_acc[i][0]) for i in range(n_cates)]
    labelwise_acc = sorted(labelwise_acc, key=lambda x: x[1])
    with open(out_file, 'w') as out:
        out.write("Precision:\t{}\nRecall:\t{}\nF1:\t{}\n".format(p, r, f))
        print("{}\n{}\n{}".format(p, r, f))
        for cate_i in range(n_cates):
            out.write("{}:\t{}\n".format(labelwise_acc[cate_i][0],
                                         labelwise_acc[cate_i][1]))
        return dists
    best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
    # ignore (don't return) words from the input
    result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
    return result[:topn]


if __name__ == "__main__":
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    # debug level shows a more comprehensive result: OOV words and wrong predictions
    test_words = []

    import analogy as analogy

    # load model
    # sys.argv  1: model  2: analogy question  3: res  4: word  5: vector
    # create analogy result
    model = Word2Vec.load_word2vec_format(sys.argv[1], binary=True, encoding='iso-8859-1')
    accuracy = model.accuracy(sys.argv[2], restrict_vocab=30000,
                              most_similar=analogy.most_similar, use_lowercase=False)
    # list returned as: incorrect, section, correct
    print(accuracy[0])

    # write analogy result to file
    writeAnRes(accuracy, 'incorrect')

    # read analogy result file
    fullList = []
    with io.open("analogy_res.txt", 'r', encoding='utf-8') as infile:
        for line in infile.readlines():
            test_words = line.split(":", 1)[1].split()
            fullList.append(test_words)
    # writeVec2file(fullList)
    # read type and vector to form word2vec