Example #1
def train_model(in_file_name, out_file_name, use_plain_word2vec=False, size=100, phrases_n_gram=1, threads=4):
    options = {
        'size': size,
    }

    if use_plain_word2vec:
        if phrases_n_gram > 1:
            phrases_file_name = '{}.phrases'.format(in_file_name)
            word2vec.word2phrase(in_file_name, phrases_file_name, verbose=True)
            in_file_name = phrases_file_name

        if threads:
            options['threads'] = threads

        # noinspection PyCallingNonCallable
        word2vec.word2vec(in_file_name, out_file_name, verbose=True, **options)
    else:
        sentences = LineSentence(in_file_name)
        for i in range(phrases_n_gram - 1):
            n_gram_transformer = Phrases(sentences)
            sentences = n_gram_transformer[sentences]

        if threads:
            options['workers'] = threads

        model = Word2Vec(sentences, **options)
        model.save(out_file_name)
Example #2
def main():
    word2vec.word2phrase('./text8', './text8-phrases', verbose=True)
    word2vec.word2vec('./text8-phrases', './text8.bin', size=100, verbose=True)
    word2vec.word2clusters('./text8',
                           './text8-clusters.txt',
                           100,
                           verbose=True)
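A hedged follow-up, not part of the example: once the three calls above have produced text8.bin and text8-clusters.txt, they can be loaded and queried through the same package (cosine() returns index and similarity arrays; 'dog' is just an arbitrary query word assumed to be in the text8 vocabulary).

import word2vec

model = word2vec.load('./text8.bin')
clusters = word2vec.load_clusters('./text8-clusters.txt')

indexes, metrics = model.cosine('dog', n=5)
print(model.vocab[indexes])   # the five nearest words
print(clusters['dog'])        # cluster id assigned to 'dog'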
Example #3
def train():
    word2vec.word2phrase('all.txt', 'phrase.txt', verbose=True)
    word2vec.word2vec('phrase.txt',
                      'vec.bin',
                      min_count=50,
                      size=50,
                      verbose=False)
Example #4
def preprocess():
    """Preprocess given data and build vocabulary file."""

    def extend_vocab():
        """."""
        ground_truth_vocab = ""
        with open("gt_input.txt", "r") as f:
            for line in f:
                ground_truth_vocab += line.strip()
                ground_truth_vocab += " "
        ground_truth_vocab = ground_truth_vocab[:-1]

        text8_vocab = ""
        with open("./text8-phrases", "r") as f:
            for line in f:
                text8_vocab += line

        return text8_vocab + " " + ground_truth_vocab

    logging.info("Beginning preprocessing:")

    # convert the text8 corpus so that it includes bigram phrases
    word2vec.word2phrase("./text8", "./text8-phrases", verbose=True, min_count=1)

    logging.info("Done creating test8-phrases.")

    # extend text8-phrases vocab with ground truth vocab then write to file
    full_vocab = extend_vocab()
    with open("./text8-phrases-extra", "w") as f:
        f.write(full_vocab)

    logging.info("Done creating test8-phrases-extra")
    logging.info("Done preprocessing")
Example #5
def word_clusters(
    corpora,
    size=100,
    verbose=True,
    text='text.txt',
    phrases='phrases.txt',
    binary='text.bin',
    clusters='clusters.txt'
):
    """Produce word2vec word clusters."""
    words = []
    for corpus in corpora:
        for document in corpus.documents:
            for sentence in document.sentences:
                for word in sentence.words:
                    words.append(word.lower().strip(punctuation + whitespace))
    with io.open(text, mode='w', encoding='utf-8') as file:
        file.write(u' '.join(words))
    word2vec.word2phrase(text, phrases, verbose=verbose)
    word2vec.word2vec(phrases, binary, size=size, verbose=verbose)
    word2vec.word2clusters(text, clusters, size, verbose=verbose)
    json_clusters = clusters.rsplit('.', 1)[0] + '.json'  # swap the .txt extension for .json
    with io.open(clusters, mode='r', encoding='utf-8') as file:
        d = dict(
            (w, int(c)) for w, c in map(split, file.read().splitlines())
        )
    with io.open(json_clusters, mode='w', encoding='utf-8') as file:
        json.dump(d, file, indent=4, ensure_ascii=False)
    return d
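A hedged usage sketch, not from the original project: the returned dict maps word to cluster id, so it can be inverted to see which words landed in the same cluster.

from collections import defaultdict

def group_by_cluster(word_to_cluster):
    """Invert a {word: cluster_id} mapping into {cluster_id: [words]}."""
    groups = defaultdict(list)
    for word, cluster_id in word_to_cluster.items():
        groups[cluster_id].append(word)
    return groups

# e.g. groups = group_by_cluster(word_clusters(corpora))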
Example #6
File: wordvec.py Project: pamge/ML2017
def main():
    words, pos_tags = load_data('all.txt')
    word2vec.word2phrase('all.txt', 'word2phrase.txt', verbose=False)
    word2vec.word2vec('word2phrase.txt',
                      'word2vec.bin',
                      alpha=0.087,
                      hs=1,
                      size=100,
                      verbose=False)
    model = word2vec.load('word2vec.bin')
    words_table, words_vec = get_most_frequent_words(500, model, pos_tags)
    tsne = TSNE(n_components=2, random_state=87)
    words_t_vec = tsne.fit_transform(words_vec)
    # show
    figure = pyplot.figure(figsize=(12, 6), dpi=150)
    pyplot.scatter(words_t_vec[:, 0],
                   words_t_vec[:, 1],
                   c='b',
                   alpha=0.2,
                   s=15)
    texts = []
    for vec, text in zip(words_t_vec, words_table):
        texts.append(pyplot.text(vec[0], vec[1], text, size=5))
    adjust_text(texts, arrowprops=dict(arrowstyle='-', color='k', lw=0.5))
    pyplot.show()
    figure.savefig('figure.png')
Example #7
def train_model(in_file_name,
                out_file_name,
                use_plain_word2vec=False,
                size=100,
                phrases_n_gram=1,
                threads=4):
    options = {
        'size': size,
    }

    if use_plain_word2vec:
        if phrases_n_gram > 1:
            phrases_file_name = '{}.phrases'.format(in_file_name)
            word2vec.word2phrase(in_file_name, phrases_file_name, verbose=True)
            in_file_name = phrases_file_name

        if threads:
            options['threads'] = threads

        # noinspection PyCallingNonCallable
        word2vec.word2vec(in_file_name, out_file_name, verbose=True, **options)
    else:
        sentences = LineSentence(in_file_name)
        for i in range(phrases_n_gram - 1):
            n_gram_transformer = Phrases(sentences)
            sentences = n_gram_transformer[sentences]

        if threads:
            options['workers'] = threads

        model = Word2Vec(sentences, **options)
        model.save(out_file_name)
Example #8
def main():

    if '--download-nltk' in argv:
        nltk.download('punkt')
        nltk.download('maxent_treebank_pos_tagger')
        nltk.download('averaged_perceptron_tagger')
        nltk.download('brown')

    if not isfile('wordvec.bin') or '--train' in argv:
        print("\nwords to phrases...")
        wv.word2phrase('./HarryPotter/HarryPotter.txt', 'phrase', verbose=1)
        print("\nphrases to vectors...")
        wv.word2vec('phrase', 'wordvec.bin', size=50, verbose=1)
        print("")

    print("\nload model...")
    model = wv.load('wordvec.bin')
    print("model shape: " + repr(model.vectors.shape))

    X, Y = [], []
    if '--load-vector' in argv:
        if isfile('X.npy') and isfile('Y.npy'):
            X = np.load('X.npy')
            Y = np.load('Y.npy')
        else:
            print("can't load X.npy, Y.npy")
            return
    else:
        print("TSNE...")
        tsne = TSNE(n_components=2, learning_rate=10, random_state=0)
        vectors = tsne.fit_transform(X=model.vectors[:SIZE, :])
        X = vectors[:, 0]
        Y = vectors[:, 1]

    print("start plot...(using nltk.corpus.brown)")
    brown_tagged_sents = brown.tagged_sents(categories='news')
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    words = unigram_tagger.tag(model.vocab[:SIZE])
    texts = []
    plt.figure(figsize=(12, 8))

    for x, y, word in zip(X, Y, words):
        print("word: (%s, %s)" % (word[0], word[1]), end="")

        if filter_words(word[0], word[1]):
            print("\r\t\t\t\tplot")
            plt.plot(x, y, 'o')
            texts.append(plt.text(x, y, word[0], fontsize=8))

        else:
            print("\r\t\t\t\tignore")

    adjust_text(texts,
                force_text=1,
                arrowprops=dict(arrowstyle="-", color="k", lw=1))

    plt.savefig("wordvec.png", dpi=100)
    plt.show()
Example #9
def checkForSemanticIndex(carrel):

    # configure
    MODEL = 'reader.bin'
    TXT = 'model.txt'
    PHRASES = 'model.phrases'

    # require
    from pathlib import Path
    from word2vec import word2vec, word2phrase
    import os

    # initialize
    localLibrary = configuration('localLibrary')
    model = localLibrary / carrel / ETC / MODEL

    # see if we have been here previously
    if not model.exists():

        # initialize some more
        stopwords = localLibrary / carrel / ETC / STOPWORDS
        corpus = localLibrary / carrel / ETC / CORPUS
        txt = str(Path.home() / TXT)
        phrases = str(Path.home() / PHRASES)

        # tokenize
        click.echo('Indexing. This needs to be done only once.', err=True)
        click.echo('Step #1 of 6: Tokenizing corpus...', err=True)
        tokens = open(corpus).read().split()

        # normalize
        click.echo('Step #2 of 6: Normalizing tokens...', err=True)
        tokens = [token.lower() for token in tokens if token.isalpha()]

        # remove stop words
        click.echo('Step #3 of 6: Removing stop words...', err=True)
        stopwords = open(stopwords).read().split()
        tokens = [token for token in tokens if token not in stopwords]

        # save
        click.echo('Step #4 of 6: Saving tokens...', err=True)
        with open(txt, 'w') as handle:
            handle.write(' '.join(tokens))

        # create phrases
        click.echo('Step #5 of 6: Creating phrases...', err=True)
        word2phrase(txt, phrases, verbose=True)

        # do the work
        click.echo('Step #6 of 6: Indexing...', err=True)
        word2vec(phrases, str(model), size=100, binary=True, verbose=True)

        # clean up and done
        os.remove(txt)
        os.remove(phrases)
        click.echo('\nDone. Happy searching!', err=True)
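A hedged sketch of querying the finished index; the 'love' query term is arbitrary and the load path is shorthand for the reader.bin location built above, while load(), cosine() and vocab are the package's actual API.

import word2vec

index = word2vec.load('reader.bin')            # adjust to the carrel's etc directory
indexes, metrics = index.cosine('love', n=10)
for word, score in zip(index.vocab[indexes], metrics):
    print('%s\t%0.3f' % (word, score))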
Example #10
def extract(dim, data, trained):
    if not trained:
        word2vec.word2phrase(data, data+'-phrases', verbose=True)
        word2vec.word2vec(data+'-phrases', data+'.bin', size=dim, verbose=True)
    model = word2vec.load(data+'.bin')
    keys = model.vocab
    features = model.vectors
    dic = dict(zip(keys,features))
    print(len(dic))
    return dic
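A hedged usage sketch (the corpus path and query word are assumptions): with trained=False the call first builds all.txt-phrases and all.txt.bin, after which the word-to-vector dict can be queried directly.

vectors = extract(100, 'all.txt', trained=False)
print(vectors['the'][:5])   # first components of the vector for 'the', assuming it is in the vocabulary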
Example #11
File: wordvec.py Project: willtuna/ML2017
def word_training(path, embedded_size):
    dirname = os.path.dirname(path)
    filename = os.path.basename(path)
    phrasesname = os.path.join(dirname, '{}-phrases'.format(filename))
    modelname = os.path.join(dirname, '{}.bin'.format(filename))
    print('Training...')
    word2vec.word2phrase(path, phrasesname)
    word2vec.word2vec(phrasesname, modelname, size=embedded_size)
    print('Training Done!!!')
    return modelname
Example #12
    def build_vector_model(self, file_name, verbose=True, size=200):
        # Build word vector phrase models
        word2vec.word2phrase(self.token_dir + "/" + file_name,
                             self.phrase_dir + "/" + file_name,
                             verbose=verbose)

        # Build the word vector now
        word2vec.word2vec(self.phrase_dir + "/" + file_name,
                          self.vector_dir + "/" + file_name + ".bin",
                          size=size,
                          verbose=verbose)
Example #13
def word_to_vec(config_path: str, dimension: int, T = ""):
    folder = "data/word2vec"
    words_file = os.path.join(folder, f"{T}words-noisefiltered-{dimension}")
    phrases_file = os.path.join(folder, f"{T}phrases-noisefiltered-{dimension}")
    w2v_file = os.path.join(folder, f"{T}noisefiltered-{dimension}.bin")
    import word2vec

    word2vec.word2phrase(words_file, phrases_file, verbose=True)
    word2vec.word2vec(phrases_file, w2v_file, size=dimension)
    logging.info("wrote to " + w2v_file)
    return word2vec.load(w2v_file)
Example #14
def train():
    movie_set = cornell_movie_set.MovieSet()
    movie_set.parse_movie_set('train')
    word2vec.word2phrase('cornell_movie_train.txt',
                         'movie_phrases_train.txt',
                         verbose=True)
    word2vec.word2vec('movie_phrases_train.txt',
                      'movie_train.bin',
                      size=100,
                      verbose=True)
    model = word2vec.load('movie_train.bin')
    return model
Example #15
def getWordVector(_fileName):
    phraseName = _fileName + "-phrases.txt"
    binName = _fileName + ".bin"
    outPutName = _fileName + "_100d.vocab"
    file = open(outPutName, 'wt')
    word2vec.word2phrase(_fileName, phraseName, verbose=True)
    word2vec.word2vec(phraseName, binName, size=100, verbose=True)
    model = word2vec.load(binName)
    for key in model.vocab:
        vectorLine = key
        for value in model[key]:
            vectorLine = vectorLine + " " + str(value)
        file.write(vectorLine + "\n")
    file.close()
Example #16
def train_model(docfile_root="corpora/Thaliana/documents-processed"):
    print "phrases..."
    word2vec.word2phrase(docfile_root + ".txt",
                         docfile_root + "-phrases.txt",
                         verbose=True)
    #print "word2vec"
    #word2vec.word2vec(docfile_root + "-phrases.txt", docfile_root + ".bin", size=1000, verbose=True, min_count=1)
    print "word2cluster"
    word2vec.word2clusters(docfile_root + ".txt",
                           docfile_root + '-clusters.txt',
                           10000,
                           verbose=True,
                           min_count=1,
                           threads=4)
Example #17
def create_word2vec_model(save_text_file):
    '''run word2vec on the text corpus and create a model'''

    save_phrases = save_text_file + '_phrases'
    save_model = save_text_file + '.bin'
    save_cluster = save_text_file + '-cluster.txt'

    # create phrases for processing
    word2vec.word2phrase(save_text_file, save_phrases, verbose=True)

    # create model
    word2vec.word2vec(save_phrases, save_model, size=100, verbose=True)

    # create cluster
    word2vec.word2clusters(save_text_file, save_cluster, 100, verbose=True)
Example #18
def create_model():
    global created
    if created:
        model = word2vec.load('movie_all.bin')
        return model
    else:
        movie_set = parse_movie_set.MovieSet()
        movie_set.parse_movie_set('train')
        movie_set.parse_movie_set('test')
        movie_set.parse_movie_set('all')
        word2vec.word2phrase('parsed_all.txt', 'parsed_all_phrases.txt', verbose=True)
        word2vec.word2vec('parsed_all_phrases.txt', 'movie_all.bin', size=100, verbose=True)
        model = word2vec.load('movie_all.bin')
        created = True
        return model
Example #19
def files_10():
    count = 1
    while count < 11:
        for i in os.listdir(directory):
            if i == 'text' + str(count):  
                text = "text" + str(count) + "-adapted" 
                vec = "text" + str(count) + "-vec.bin"
                cluster = "text" + str(count) + "-clusters.txt" 
                result = "text" + str(count) + "-result.txt" 
                #result_list = []
                stopwordsss = ["in", "it", "as", "my", "do", "is", "don't", "doesn't", "am", "it's", "i", "you", "and", "to", "the", "on", "but", "that", "are", "so", "to", "me", "of", "with", "try", 'a', 'about', 'after', 'all', 'also', 'always', 'am', 'an', 'and', 'any', 'are', 'at', 'be', 'been', 'being', 'but', 'by', 'came', 'can', "can't", 'come', 'could' , 'did', 'do', 'does', 'doing', 'else', 'for', 'from', 'get', 'give', 'goes', 'going', 'had', 'happen', 'has', 'have', 'having', 'how', 'in', 'into', 'really', 'if', 'see', 'plus', 'then',  "i'll", "then", "or", "will", "i'm", "too", "doesn't", "don't", "will", "that's", "-", "i've", "would", "making", "usually", "what", "hasn't", "it's", "hmmm", "really", "this", "someone", "not", "i'll", "like", "this", "e", "=", "just", "more", "actually", "most", "one", ":", "very", "b", "yes", "same"]                                                                       
                word2vec.word2phrase(i, text, verbose=True)
                #model = word2vec.load(vec)
                reading = open(text).read().lower().replace(',', ' ').replace('.', ' ').replace('/', ' ').replace('-', ' ').replace('(', ' ').replace(')', ' ').replace('?', ' ').split()
                # drop stop words from the token list
                for stopword in stopwordsss:
                    reading = [token for token in reading if token != stopword]
                print(reading)
                iterat = 0
                s = ' '
                s = s.join(reading)
                with open(text, "w") as result_file:
                    result_file.write(s)
                word2vec.word2vec(text, vec, 100, verbose=True)
                word2vec.word2clusters(text, cluster, 100, verbose=True)  # word2clusters expects the text corpus, not the .bin model
                # train the gensim model once for this file, then query it per word
                word2vec_model = Word2Vec([reading], min_count=1)
                vocabulary = word2vec_model.wv.vocab
                for word in reading:
                    sim_words = word2vec_model.wv.most_similar(word)[:3]
                    #indexes, metrics = model.similar(word.lower())
                    #real_time = model.generate_response(indexes, metrics).tolist()
                    #result_list.append(sim_words)
                    #result_list.append('\n\n')
                    if iterat == 0:
                        with open(result, "w") as result_file:
                            result_file.write(str(word) + '\n' + str(sim_words) + '\n\n')
                        iterat = 1
                    elif iterat == 1:
                        with open(result, "a") as result_file:
                            result_file.write(str(word) + '\n' + str(sim_words) + '\n\n') 
                count += 1
            else:
                continue
Example #20
def main():
    argv1 = "sparkler.config"
    argv2 = "Sparkler"
    configs = read_config(argv1, argv2)
    train_file_obj = codecs.open(configs['train_file'], 'r')
    train_lines = train_file_obj.readlines()
    train_file_obj.close()
    dimension_input = int(configs['dimension_input'])
    word2vec.word2phrase(cleanText('data/200andOcean.txt', 'clean200Ocean'),
                         'ocean-full-phrases',
                         verbose=True)
    word2vec.word2vec('ocean-full-phrases',
                      'ocean.bin',
                      size=dimension_input,
                      verbose=True,
                      min_count=5)
    model = word2vec.load('ocean.bin')
    word = 'ocean'
    print(closeWords(model, word, 5))
    print(model.vectors.shape)
Example #21
def w2v(dim_w2v=dim_w2v, data_training=None):
    if data_training is None:
        print('nooooooo')
        return 0
    with open("word2vec_corpus.tmp", "w") as f:
        f.write(("\n".join(data_training) + "\n"))
    print('running word2vec ...')
    word2vec.word2phrase('word2vec_corpus.tmp',
                         'word2vec_corpus_phrases',
                         verbose=True)
    word2vec.word2vec('word2vec_corpus_phrases',
                      'word2vec_corpus.bin',
                      size=dim_w2v,
                      verbose=True,
                      window=5,
                      cbow=0,
                      binary=1,
                      min_count=1,
                      sample='1e-5',
                      hs=1,
                      iter_=5)
Example #22
# step 1 install word2vec (ref: http://nbviewer.jupyter.org/github/danielfrg/word2vec/blob/master/examples/word2vec.ipynb)
import word2vec
import numpy as np
import scipy.io as sio

vector_size = 100
amount_nearest = 100

word2vec.word2phrase('text8', 'text8-phrases', verbose=True)
word2vec.word2vec('text8-phrases', 'text8.bin', size=vector_size, verbose=True)
word2vec.word2clusters('text8', 'text8-clusters.txt', vector_size, verbose=True)

# read the trained model
model = word2vec.load('text8.bin')

# list of vague motivation coming from mind (topic: potential problems for enterprise)
motivation = ['enterprise',
              'business',
              'solution',
              'entrepreneur',
              'latent',
              'problem',
              'funds',
              'management',
              'quality',
              'projects']

# start get nearest clusters by picking the similar words
amount_motivation = len(motivation)
motivation_vector = []
nearest_indexes = []
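The excerpt stops before the loop it sets up; a minimal sketch of one plausible continuation, assuming the intent is to collect each motivation word's vector, its nearest neighbours, and its cluster (my reading, not the author's code):

clusters = word2vec.load_clusters('text8-clusters.txt')

for word in motivation:
    if word not in model.vocab:
        continue
    motivation_vector.append(model[word])                    # the word's embedding
    indexes, metrics = model.cosine(word, n=amount_nearest)  # nearest-neighbour indexes
    nearest_indexes.append(indexes)
    print(word, clusters[word])                              # cluster id of the word itself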
Example #23
def testWord2Phrase():
    # Run word2phrase to group multi-word phrases, e.g. "Los Angeles" -> "Los_Angeles".
    # This creates a text8-phrases file that we can use as a better input for word2vec.
    # Note that you could easily skip this step and use the original data as input for word2vec.
    word2vec.word2phrase('/D/test/text8/text8', '/D/test/text8/text8-phrases', verbose=True)
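A small illustrative check, my addition rather than part of the test: after word2phrase runs, multi-word phrases in the output are joined with underscores, which is what the comments above describe.

with open('/D/test/text8/text8-phrases') as handle:
    tokens = handle.read(200000).split()
print([t for t in tokens if '_' in t][:10])   # tokens such as 'united_states', if they occur in the corpus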
Example #24
#!/usr/bin/env python3

import word2vec
import nltk
import numpy as np
import matplotlib.pyplot as plt

from sklearn.manifold import TSNE
from adjustText import adjust_text

import sys

txt = sys.argv[1]

phrase_file_name = "text_phrases.bin"
word2vec.word2phrase(txt, phrase_file_name, verbose=True)

model_file_name = "text.bin"
word2vec.word2vec(phrase_file_name, model_file_name, verbose=True, cbow=1)

model = word2vec.load(model_file_name)

words = []
word_vectors = []
for word in model.vocab:
    words.append(word)
    word_vectors.append(model[word])
words = np.array(words)
word_vectors = np.array(word_vectors)

num_plotted_words = 1000
Example #25
def setup_module(module):
    word2vec.word2phrase(input_, output_phrases, verbose=False)
    word2vec.word2vec(input_, output_bin, size=10, binary=1, verbose=False)
    word2vec.word2vec(input_, output_txt, size=10, binary=0, verbose=False)
    word2vec.word2clusters(input_, output_clusters, 10, verbose=True)
Example #26
# coding: utf-8

import word2vec
import gensim

word2vec.word2phrase('./refined_text.txt', './wiki-phrase', verbose=True)
word2vec.word2vec('./wiki-phrase',
                  './word2vec_model.bin',
                  size=100,
                  verbose=True)

word2vec.word2clusters(
    '/Users/KYD/Documents/wiki_project/refined_text.txt',
    '/Users/KYD/Documents/wiki_project/refined_text_cluster.txt',
    100,
    verbose=True)
Example #27
# -*- coding: utf-8 -*-
import plyvel
import re, string
import sys, locale
import word2vec
import os

reload(sys)
sys.setdefaultencoding(locale.getdefaultlocale()[1])


model_path = os.path.abspath('model.bin')
text_path = os.path.abspath('text.txt')
phrase_path = os.path.abspath('phrases.txt')

word2vec.word2phrase(text_path, phrase_path, verbose=True)
word2vec.word2vec(phrase_path, model_path, binary=1, verbose=True)
model = word2vec.load(model_path)

indexes, metrics = model.cosine('seymour')
print (string.join(model.vocab[indexes], ' '))

Example #28
    else:
        features[7] = len(sentence1) / len(sentence2)

    return features

# Uses treetagger-python (Installation https://github.com/miotto/treetagger-python ; http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/)
try:
    semanticsimilarity_lookuptable = pickle.load(open('semanticsimilarity_lookuptable.pkl', 'rb'))
except Exception:
    semanticsimilarity_lookuptable = {}

print "Build Word2Vec Corpus"
dir = os.path.dirname(os.path.abspath(__file__))
try:
    # on OSX for some reason this does not work
    word2vec.word2phrase(dir + '/text8', dir + '/text8-phrases', verbose=True)
    word2vec.word2vec(dir + '/text8-phrases', dir + '/text8.bin', size=100, verbose=True)
except Exception as e:
    print e

model = word2vec.load(dir + '/text8.bin')
print "Finish"

def computeSemantics(sentence1, sentence2):
def computeSemanticSimilarityFeatures(sentence1, sentence2):
    features = [0] * 9

    if (sentence1 + sentence2) not in semanticsimilarity_lookuptable:
        def prepareSentence(sentence):
            return sentence.replace('-', ' ').replace('$', ' ')
Example #29
def test_run_word2phrase():
    word2vec.word2phrase(input_text, output_phrases)
    assert os.path.exists(output_phrases)
Example #30
import word2vec
import os
print os.getcwd()
for file in os.listdir(os.getcwd()):
    print file
word2vec.word2phrase(r'C:\devbox\DeepSphere\word2vec\text8', r'C:\devbox\DeepSphere\word2vec\text8-phrases', verbose=True)  # raw strings so \t is not read as a tab character
Example #31
from gensim.models.keyedvectors import KeyedVectors
import word2vec
import timeit

start_time = timeit.default_timer()

word2vec.word2phrase('3g-p.txt', '3g-phrases.txt', verbose=True)
word2vec.word2vec('3g-phrases.txt', '3g.bin', size=100, verbose=True)

elapsed = timeit.default_timer() - start_time
InMinutes = elapsed / 60

word2vec.word2clusters('3g-p.txt', '3g-clusters.txt', 100, verbose=True)

model = KeyedVectors.load_word2vec_format('3g.bin', binary=True)

model.save_word2vec_format('3g-vectors.txt', binary=False)

print("The Totatl Execution Time in Minutes is: ", InMinutes)
Example #32
import word2vec as w2v
import clean_articles
from pprint import pprint
import sys

if len(sys.argv) > 1 and sys.argv[1] in ['-t', '-train']:

    # Add new articles to file
    clean_articles.clean()

    # Train new model
    w2v.word2phrase('combined', './text-phrases', verbose=True)
    w2v.word2vec('./text-phrases', 'text.bin', size=100, verbose=True)  # train on the phrases file created above
    w2v.word2clusters('combined', 'text-clusters.txt', 100, verbose=True)

# Initialize pre-trained model
model_old = w2v.load('text8.bin')
model = w2v.load('text.bin')
clusters = w2v.load_clusters('text-clusters.txt')
model.clusters = clusters

#ind = clusters['Trump']
#print(clusters.get_words_on_cluster(ind))
print(len(model_old.vocab))
print(len(model.vocab))

# King - man + woman : "Man is to King as Woman is to Queen"
# Trump - America + Germany
pos = ['Putin', 'America']
neg = ['Russia']
leader = model.analogy(pos, neg)
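In this package analogy() returns a pair of index and metric arrays, so leader above is a tuple; a hedged follow-up that turns it into readable words, mirroring the pattern in the abstract-similarity example further down:

indexes, metrics = leader
print(model.vocab[indexes])                                 # candidate answers to Putin - Russia + America
print(model.generate_response(indexes, metrics).tolist())   # (word, similarity) pairs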
Example #33
import sys

if sys.argv[1] == "train":
    print("Phrasing starts")

    MIN_COUNT = 5
    WORDVEC_DIM = 100
    WINDOW = 5
    NEGATIVE_SAMPLES = 5
    ITERATIONS = 0
    MODEL = 1
    LEARNING_RATE = 0.025
    model_name = sys.argv[2] + ".txt"
    model_phrase = sys.argv[2] + "-phrase.txt"

    word2vec.word2phrase(model_name, model_phrase, verbose=True)
    print("===============")
    print("Training starts")
    # train model
    word2vec.word2vec(train=model_phrase,
                      output=sys.argv[3] + ".bin",
                      size=WORDVEC_DIM,
                      min_count=MIN_COUNT,
                      window=WINDOW,
                      negative=NEGATIVE_SAMPLES,
                      alpha=LEARNING_RATE,
                      verbose=False)
else:
    # load model for plotting
    model = word2vec.load("hp/" + sys.argv[1])
Example #34
import word2vec

word2vec.word2phrase('topics.xml', 'topics-phrases.txt', verbose=True)

    line = line.split()[1:]
    for term in line:
        try:
            if  stop.stop3(term.split('/')[1]):
                continue
        except:
            print count
            print 'ERROR'
            print term
            #quit()
        term = term.split('/')[0]
        l.write(term + ' ')
    l.write('\n')
f.close()
l.close()

quit()
word2vec.doc2vec(cleanFileName, fileName, cbow=0, size=50, window=10, negative=5, hs=0, sample='1e-4', threads=12, iter_=20, min_count=1, verbose=True)
word2vec.word2phrase(cleanFileName, fileName, verbose=True)
word2vec.word2clusters(cleanFileName, fileName, 100, verbose=True)

model = word2vec.load(fileName)
print model.vectors.shape

quit()
indexes, metrics = model.cosine('_*1')
model.generate_response(indexes, metrics).tolist()

'''
Find the words similar to the important terms and save them to vectorSim.json.
'''
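A hedged sketch of what that note seems to describe, collecting similar words for a list of important terms and saving them to vectorSim.json; key_terms and the file handling are my illustration, not the project's code.

import json

key_terms = ['_*1']   # placeholder list; the project's real terms are not shown
vector_sim = {}
for term in key_terms:
    if term not in model.vocab:
        continue
    indexes, metrics = model.cosine(term)
    vector_sim[term] = model.generate_response(indexes, metrics).tolist()

with open('vectorSim.json', 'w') as handle:
    json.dump(vector_sim, handle, ensure_ascii=False, indent=2)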
Example #36
# In[ ]:

data = re.sub(
    '\s+', ' ',
    re.sub('[:\.\(\)0123456789%,–\?\\\&\']', '',
           ' '.join(product_fr.values).replace('-', ' ').lower()))

# In[ ]:

with open('data/names.txt', 'w') as f:
    f.write(data)

# In[ ]:

word2vec.word2phrase('data/names.txt', 'data/names-phrases.txt', verbose=True)

# In[ ]:

word2vec.word2vec('data/names.txt',
                  'data/names-model.bin',
                  size=100,
                  verbose=True)

# In[ ]:

word2vec.word2clusters('data/names.txt',
                       'data/names-clusters.txt',
                       100,
                       verbose=True)
Example #37
def train(path):
    word2vec.word2phrase(path, path+'-phrases', verbose=True)
    word2vec.word2vec(path+'-phrases', path+'.bin', size=100, binary=True, verbose=True)
    word2vec.word2clusters(path, path + '.clusters.txt', 100, verbose=True)
    model = word2vec.load(path+'.bin')
    return model
Example #38
def setup_module(module):
    word2vec.word2phrase(input_, output_phrases, verbose=False)
    word2vec.word2vec(input_, output_bin, size=10, binary=1, verbose=False)
    word2vec.word2vec(input_, output_txt, size=10, binary=0, verbose=False)
    word2vec.word2clusters(input_, output_clusters, 10, verbose=True)
Example #39
from gensim.models import Word2Vec
import word2vec
import numpy as np
from argparse import ArgumentParser
import os
import sys
import multiprocessing

word2vec.word2phrase('all.txt', 'phrases.txt', verbose=True)
#word2vec.word2vec('phrases.txt' , 'my_model.bin',size=100, verbose=True)
# parser = ArgumentParser()
# parser.add_argument('--train', action='store_true',
#                     help='Set this flag to train word2vec model')
# parser.add_argument('--corpus-path', type=str, default='all.txt',
#                     help='Text file for training')
# parser.add_argument('--model-path', type=str, default='my_model.bin',
#                     help='Path to save word2vec model')
# parser.add_argument('--plot-num', type=int, default=600,
#                     help='Number of words to perform dimensionality reduction')
# args = parser.parse_args()
MIN_COUNT = 6
WORDVEC_DIM = 300
WINDOW = 5
NEGATIVE_SAMPLES = 10
ITERATIONS = 5
MODEL = 1
LEARNING_RATE = 1e-3
#CPU_COUNT = multiprocessing.cpu_count()
# train model
word2vec.word2vec(
    train='phrases.txt',
Example #40
import word2vec
import sys
import getpass

user = getpass.getuser()
if user == 'ctnuser':
    root = '/home/ctnuser/bjkomer/'
elif user == 'bjkomer':
    root = '/home/bjkomer/'

if len(sys.argv) == 2:
    dim = int(sys.argv[1])
else:
    dim = 100

word2vec.word2phrase(root + 'word2vec/text8',
                     root + 'semantic-network/data/text8-phrases', verbose=True)

word2vec.word2vec(root + 'semantic-network/data/text8-phrases',
                  root + 'semantic-network/data/text8-%s.bin'%dim, size=dim,
                  verbose=True)

word2vec.word2clusters(root + 'word2vec/text8',
                       root + 'semantic-network/data/text8-%s-clusters.txt'%dim, dim,
                       verbose=True)
Example #41
#train['link_pred'] = (train.temp2 >= 1) | (train.temp1 >= train.temp1.quantile(0.7))
#accuracy = (train.link_pred == train.link.astype(bool)).mean()
#print 'Accuracy is {acc}'.format(acc=accuracy)

## Try word2vec train

import word2vec
from sklearn.metrics.pairwise import cosine_similarity as cosine

# Create txt file from node_info
all_abst_file_name = 'all_abstracts.txt'
all_phrases_file_name = 'all_abstracts_phrases.txt'
word2vec_out_file_name = 'all_abstracts.bin'

with open(pth(all_abst_file_name), 'w') as f:
    for abstract in node_info.abstract.as_matrix():
        f.write(abstract + '\n')
        
word2vec.word2phrase(pth(all_abst_file_name), pth(all_phrases_file_name), verbose=True)
word2vec.word2vec(pth(all_phrases_file_name), pth(word2vec_out_file_name), \
                    size=30, iter_=3, verbose=True)

model = word2vec.load(pth(word2vec_out_file_name))


indexes, metrics = model.cosine('applications', 20)


indexes, metrics = model.analogy(pos=['theorem', 'false'], neg=['true'], n=10)

model.vocab[indexes]
Example #42
def train_word2phrase():
    word2vec.word2phrase("all_of_sefaria.txt", "word2phrase.bin", verbose=True)