def preprocess():
    train_df = load_data_from_csv(train_file)
    val_df = load_data_from_csv(validation_file)
    test_df = load_data_from_csv(test_file)

    train_df = data_clean(train_df, train_after_clean)
    val_df = data_clean(val_df, val_after_clean)
    test_df = data_clean(test_df, test_after_clean)

    # the second column holds the text content
    train_content = train_df.iloc[:, 1]
    val_content = val_df.iloc[:, 1]
    test_content = test_df.iloc[:, 1]

    all_content = []
    all_content.extend(train_content)
    all_content.extend(val_content)
    all_content.extend(test_content)

    print(len(all_content))

    all_seg_words = seg_words(all_content)

    # write one segmented sentence per line
    with open(seg_text, "w+") as txt_write:
        for sentence in tqdm(all_seg_words):
            sentence = sentence.replace("\n", "") + "\n"
            txt_write.write(sentence)

    word2vec.word2vec(seg_text, embedding_bin, min_count=5, size=100, verbose=True)
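A minimal follow-up sketch, assuming preprocess() above completed and `embedding_bin` still names the file it produced; the vectors can then be inspected with the same word2vec package:

model = word2vec.load(embedding_bin)                 # load the binary trained above
print(model.vectors.shape)                           # (vocabulary size, 100)
indexes, metrics = model.cosine(model.vocab[0])      # nearest neighbours of one word
print(model.generate_response(indexes, metrics).tolist())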
Example #2
def main():
    words, pos_tags = load_data('all.txt')
    word2vec.word2phrase('all.txt', 'word2phrase.txt', verbose=False)
    word2vec.word2vec('word2phrase.txt',
                      'word2vec.bin',
                      alpha=0.087,
                      hs=1,
                      size=100,
                      verbose=False)
    model = word2vec.load('word2vec.bin')
    words_table, words_vec = get_most_frequent_words(500, model, pos_tags)
    tsne = TSNE(n_components=2, random_state=87)
    words_t_vec = tsne.fit_transform(words_vec)
    # show
    figure = pyplot.figure(figsize=(12, 6), dpi=150)
    pyplot.scatter(words_t_vec[:, 0],
                   words_t_vec[:, 1],
                   c='b',
                   alpha=0.2,
                   s=15)
    texts = []
    for vec, text in zip(words_t_vec, words_table):
        texts.append(pyplot.text(vec[0], vec[1], text, size=5))
    adjust_text(texts, arrowprops=dict(arrowstyle='-', color='k', lw=0.5))
    pyplot.show()
    figure.savefig('figure.png')
Example #3
def main():
    # size: dimensionality, threads: number of threads, binary: output format (0 = plain text)
    word2vec.word2vec(train='out_rename.txt',
                      output='knock90.txt',
                      size=300,
                      threads=4,
                      binary=0)

    t_index = OrderedDict()  # word -> index dictionary

    with open('knock90.txt', 'rt') as f:
        for i, line in enumerate(f):
            line = line.strip().split(' ')

            if i == 0:  # the first line holds the vocabulary size and the dimensionality
                words_count = int(line[0])
                size = int(line[1])
                # create the matrix
                matrix_90 = np.zeros([words_count, size], dtype=np.float64)
                continue

            # occasionally a row has fewer than 300 dimensions
            if len(line[1:]) < 300:
                continue

            word = line[0]
            t_index[word] = i - 1

            matrix_90[i - 1] = line[1:]

    io.savemat('knock90_300', {'knock90_300': matrix_90})
    with open('./pickles/knock90_idx_t', 'wb') as f:
        pickle.dump(t_index, f)
def test_single_static_model(args):
    params = json.load(open("conf/{}.json".format(args.model), "r"))
    embedding_size = params['embedding_size']
    filter_sizes = params['filter_sizes']
    num_filters = params['num_filters']
    dropout_keep_prob = params['dropout_keep_prob']
    sequence_length = params['sequence_length']
    num_classes = params['num_classes']
    batch_size = params['batch_size']
    num_epochs = params['num_epochs']
    train_data = params['train_data']
    test_data = params['test_data']

    w2v = word2vec(args.word2vec)  # load the embedding wrapper once and reuse it
    w2c = w2v.word2vec
    embedding_size = w2v.embedding_size
    datas = DataSetWord2vecEval(sequence_length=sequence_length,
                                batch_size=batch_size,
                                train_data=train_data,
                                test_data=test_data,
                                word2vec=w2c,
                                embedding_size=embedding_size)
    vocab_size = datas.vocab_size
    checkpoint_dir = os.path.abspath(
        os.path.join("{}".format(args.model), "checkpoints"))
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    model = eval(args.model)(sequence_length, num_classes, embedding_size,
                             filter_sizes, num_filters)
    model.load(checkpoint_file)
    model.eval(datas)
Example #5
def train_model(in_file_name, out_file_name, use_plain_word2vec=False, size=100, phrases_n_gram=1, threads=4):
    options = {
        'size': size,
    }

    if use_plain_word2vec:
        if phrases_n_gram > 1:
            phrases_file_name = '{}.phrases'.format(in_file_name)
            word2vec.word2phrase(in_file_name, phrases_file_name, verbose=True)
            in_file_name = phrases_file_name

        if threads:
            options['threads'] = threads

        # noinspection PyCallingNonCallable
        word2vec.word2vec(in_file_name, out_file_name, verbose=True, **options)
    else:
        sentences = LineSentence(in_file_name)
        for i in range(phrases_n_gram - 1):
            n_gram_transformer = Phrases(sentences)
            sentences = n_gram_transformer[sentences]

        if threads:
            options['workers'] = threads

        model = Word2Vec(sentences, **options)
        model.save(out_file_name)
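A hypothetical invocation of the train_model helper above; the corpus and output file names are placeholders, not taken from the original project:

# plain word2vec binary, learning bigram phrases first
train_model('corpus.txt', 'corpus_w2v.bin', use_plain_word2vec=True, phrases_n_gram=2)
# gensim Word2Vec, saved in gensim's native format
train_model('corpus.txt', 'corpus_gensim.model', use_plain_word2vec=False, size=200)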
def train_lstm_mean_model(args):
    sequence_length = args.sequence_length
    num_classes = args.num_classes
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    train_data = args.train_data
    test_data = args.test_data
    data_exists = args.data_exists
    w2v = word2vec(args.word2vec)  # load the embedding wrapper once and reuse it
    w2c = w2v.word2vec
    embedding_size = w2v.embedding_size
    datas = DataSetWord2vecMeanRnn(sequence_length=sequence_length,
                                   batch_size=batch_size,
                                   train_data=train_data,
                                   test_data=test_data,
                                   exists=data_exists,
                                   word2vec=w2c,
                                   embedding_size=embedding_size)
    params = {
        "embedding_size": embedding_size,
        "sequence_length": sequence_length,
        "num_classes": num_classes,
        "batch_size": batch_size,
        "num_epochs": num_epochs,
        "train_data": train_data,
        "test_data": test_data,
        "model": args.model
    }
    if not os.path.exists("conf"):
        os.mkdir("conf")
    json.dump(params, open("conf/{}.json".format(args.model), "w"))
    model = eval(args.model)(sequence_length, num_classes, embedding_size)
    model.fit(datas, num_epochs)
Example #7
def build_vectors():
    word2vec.word2vec(training_file,
                      './by1.bin',
                      size=vector_size,
                      verbose=True)
    model = word2vec.load('./by1.bin')
    return model
Example #9
def train_word2vec(word2vec_size=128):
    seg_file = "/home/chenyu/intent_reco/output/seg.txt"
    word2vec_output_file = "/home/chenyu/intent_reco/output/word2vec_" + str(
        word2vec_size) + ".bin"
    print "Start training word2vec"
    word2vec.word2vec(seg_file,
                      word2vec_output_file,
                      size=word2vec_size,
                      verbose=True)
    print "End training word2vec"

    print "Start creating dictionary ..."
    word_dic = {}
    model = word2vec.load(word2vec_output_file)
    voc_size = model.vocab.size
    for i in range(voc_size):
        word_dic[model.vocab[i]] = model.vectors[i].tolist()
    print "End creating dictionary"

    word_dict_path = "/home/chenyu/intent_reco/output/word_dic_" + str(
        word2vec_size) + ".json"
    print "Start storing dictionary ..."
    with open(word_dict_path, "w") as f:
        json.dump(word_dic, f)
    print "End storing dictionary"
Example #10
def get_pos_dictionary_matrix():
    txt_fname = 'tags.txt'
    vec_fname = 'vec.bin'
    vec_size = 15
    with open(txt_fname, 'w') as tags_file:
        words = masc_tagged.tagged_words()
        tags_file.write(' '.join([w[1] for w in words if w[1]]))

    word2vec.word2vec(txt_fname,
                      vec_fname,
                      size=vec_size,
                      negative=5,
                      sample=1,
                      cbow=1,
                      window=3,
                      verbose=False)
    model = word2vec.load(vec_fname)
    pos_dictionary = {}
    count = 2
    for tag in model.vocab:
        pos_dictionary[tag] = count
        count += 1
    pos_dictionary['UNK'] = 1
    pos_dictionary['<pad>'] = 0
    pos_matrix = np.concatenate((np.zeros((2, vec_size), dtype='float'),
                                 model.vectors), axis=0)
    return pos_dictionary, torch.tensor(pos_matrix)
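One way the returned dictionary and matrix could be consumed, sketched under the assumption that a PyTorch embedding layer is the target; the 'NN' tag is only an illustration and may not be in the vocabulary:

import torch
import torch.nn as nn

pos_dictionary, pos_matrix = get_pos_dictionary_matrix()
pos_embedding = nn.Embedding.from_pretrained(pos_matrix.float(),
                                             padding_idx=pos_dictionary['<pad>'])
tag_ids = torch.tensor([pos_dictionary.get('NN', pos_dictionary['UNK'])])
print(pos_embedding(tag_ids).shape)   # torch.Size([1, 15])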
Example #11
def main():
    word2vec.word2phrase('./text8', './text8-phrases', verbose=True)
    word2vec.word2vec('./text8-phrases', './text8.bin', size=100, verbose=True)
    word2vec.word2clusters('./text8',
                           './text8-clusters.txt',
                           100,
                           verbose=True)
Example #12
def main():
    # learn embeddings
    word2vec.word2vec()
    # convert training,test and eval data into np arrays
    DataProcessor.build_data()
    # this calculates sentiments for the data
    lstm.lstm_script()
Example #13
def main():
    train_file = 'tokens81.txt'
    output_file = 'vectors.txt'
    matrix_file = 'matrix_x300'
    dict_index_file = 'dict_index_t'
    word2vec.word2vec(train=train_file,
                      output=output_file,
                      size=300,
                      threads=3,
                      binary=0)

    with open(output_file, 'rt') as f:
        status = f.readline().split(' ')
        size_dict = int(status[0])
        size_x = int(status[1])

        dict_index_t = OrderedDict()
        matrix_x = np.zeros([size_dict, size_x], dtype=np.float64)

        for i, line in enumerate(f):
            vecs = line.strip().split(' ')
            dict_index_t[vecs[0]] = i
            matrix_x[i] = vecs[1:]

    io.savemat(matrix_file, {'matrix_x300': matrix_x})
    with open(dict_index_file, 'wb') as f:
        pickle.dump(dict_index_t, f)
Example #14
def word_clusters(
    corpora,
    size=100,
    verbose=True,
    text='text.txt',
    phrases='phrases.txt',
    binary='text.bin',
    clusters='clusters.txt'
):
    """Produce word2vec word clusters."""
    words = []
    for corpus in corpora:
        for document in corpus.documents:
            for sentence in document.sentences:
                for word in sentence.words:
                    words.append(word.lower().strip(punctuation + whitespace))
    with io.open(text, mode='w', encoding='utf-8') as file:
        file.write(u' '.join(words))
    word2vec.word2phrase(text, phrases, verbose=verbose)
    word2vec.word2vec(phrases, binary, size=size, verbose=verbose)
    word2vec.word2clusters(text, clusters, size, verbose=verbose)
    # str.rstrip('.txt') strips characters rather than the suffix, so build the name explicitly
    json_clusters = clusters[:-4] + '.json' if clusters.endswith('.txt') else clusters + '.json'
    with io.open(clusters, mode='r', encoding='utf-8') as file:
        d = dict(
            (w, int(c)) for w, c in (line.split() for line in file.read().splitlines())
        )
    with io.open(json_clusters, mode='w', encoding='utf-8') as file:
        json.dump(d, file, indent=4, ensure_ascii=False)
    return d
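A small usage sketch for the cluster dictionary returned above; `corpora` stands in for whatever corpus objects the caller already has:

from collections import defaultdict

clusters = word_clusters(corpora)              # {word: cluster_id}
by_cluster = defaultdict(list)
for word, cluster_id in clusters.items():
    by_cluster[cluster_id].append(word)
for cluster_id, members in sorted(by_cluster.items()):
    print(cluster_id, members[:10])            # a few words per cluster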
Example #15
 def build_model(self, record_list_fname, cache_dir, tmp_file):
     self.__load_seg_plainstr(record_list_fname, cache_dir, tmp_file)
     word2vec.word2vec(tmp_file,
                       self.model_name + '.bin',
                       size=self.vec_out,
                       verbose=True)
     return word2vec.load(self.model_name + '.bin')
Example #16
def solve(dataId, usingExist=True):

    dataId = str(dataId)
    dataPath = './data/' + dataId + '.txt'
    binPath = './out/' + dataId + '.bin'
    outputPath = "out/ans" + dataId + ".txt"

    if not os.path.exists(binPath) or not usingExist:
        word2vec.word2vec(dataPath, binPath, size=100, verbose=True)

    # load binPath with word2vec
    model = word2vec.load(binPath)

    # open the output file
    output = codecs.open(outputPath, "w", "utf-8")

    ClustersNumber = 10
    WordNumber = len(model.vectors)

    # run the K-means algorithm
    kmeans = KMeans(n_clusters=ClustersNumber,
                    random_state=0).fit(model.vectors)

    # get the cluster label for each word ID; labels range over [0, ClustersNumber)
    label = kmeans.labels_
    # get the score of each word, i.e. the negative of its distance to the cluster centre
    scores = []
    for i in xrange(WordNumber):
        scores.append(kmeans.score([model.vectors[i]]))

    # put word IDs belonging to the same cluster into the same list
    allCluster = []
    for i in xrange(ClustersNumber):
        allCluster.append([])
    for i in xrange(len(label)):
        allCluster[label[i]].append(i)

    # define an ordering of two word IDs by comparing their scores
    def comparator(a, b):

        vala = scores[a]
        valb = scores[b]

        if vala > valb: return 1
        elif vala == valb: return 0
        else: return -1

    # process each cluster separately
    for clusterId in xrange(len(allCluster)):
        output.write("-----------------------------------cluster " +
                     str(clusterId) + ":\n")

        # sort by score from high to low
        allCluster[clusterId].sort(cmp=comparator, reverse=True)

        # take the top 30
        for x in allCluster[clusterId][:30]:
            # write the negated score, i.e. the distance
            output.write(model.vocab[x] + "  " + str(-scores[x]) + "\n")
    print '\n'
Example #17
def create():
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    data_path_ = base_dir + '/data_.txt'
    glove_data = base_dir + '/code/GloVe-master/data_.txt'
    shutil.copyfile(data_path_, glove_data)
    os.system("cd GloVe-master;sh demo.sh")
    os.remove(glove_data)
    os.remove(base_dir + '/code/GloVe-master/vectors.bin')
    if not os.path.exists(base_dir + '/vectors'):
        os.mkdir(base_dir + '/vectors')
    shutil.move(base_dir + '/code/GloVe-master/vectors.txt',
                base_dir + '/vectors/glove_vectors.txt')
    shutil.move(base_dir + '/code/GloVe-master/vocab.txt',
                base_dir + '/vocab.txt')
    word2vec.word2vec(data_path_,
                      base_dir + '/vectors/word2vec_vectors.txt',
                      size=300,
                      verbose=True,
                      binary=0,
                      min_count=5)
def train_vector(input_set, vector_size, min_count, words_path, bin_path):
    with open(words_path, 'w') as f:
        for i in input_set:
            f.write(i)
            f.write(' ')
    word2vec.word2vec(words_path, bin_path, size=vector_size, min_count=min_count, verbose=True)
Example #19
 def word_vector(self):
     print('%.2f: start converting word vectors' % (time.time() - now))
     word2vec.word2vec(self.word_file,
                       self.bin_file,
                       binary=1,
                       verbose=False)
     print('%.2f: word vector conversion finished' % (time.time() - now))
Example #20
def train_word2vec(filepath='./data/hp_allphrase.txt', size=100, window=5, neg=5):
    
    # Turn training data into as better input for word2vec
    #word2vec.word2phrase('./data/hp_all.txt', './data/hp_allphrase.txt', verbose=True)
    
    # Train model
    word2vec.word2vec(filepath, './model/hp.bin', size=size, window=window, negative=neg, verbose=False)
Example #21
def train():
    word2vec.word2phrase('all.txt', 'phrase.txt', verbose=True)
    word2vec.word2vec('phrase.txt',
                      'vec.bin',
                      min_count=50,
                      size=50,
                      verbose=False)
Example #22
def create_model(input_path, output_path):
    word2vec.word2vec(input_path, output_path, size=10, binary=1, verbose=True)
    assert os.path.isfile(output_path)
    # return word2vec.load(output_path)
    return word2vec.WordVectors.from_binary(output_path, encoding='ISO-8859-1')
Example #23
 def get_word_vec(self, file_in, size):
     """
     Args:
         file_in (string): path to the training text file
         size (int): size of the word embeddings
     The trained model is stored in self.file_word2vec_bin.
     """
     word2vec.word2vec(file_in, self.file_word2vec_bin, size, verbose=False)
Example #24
def generate_model(_size=150):
    # model_file = 'model90.bin'
    word2vec.word2vec('undersore.txt',
                      model_file,
                      size=_size,
                      min_count=10,
                      verbose=True)
    print('Model generated: ' + model_file)
Example #26
 def train_model(self):
     word2vec(self.src_file,
              self.model_file,
              window=self.window,
              hs=self.hs,
              alpha=self.alpha,
              size=self.size,
              verbose=self.verbose)
Example #27
def main():

    if '--download-nltk' in argv:
        nltk.download('punkt')
        nltk.download('maxent_treebank_pos_tagger')
        nltk.download('averaged_perceptron_tagger')
        nltk.download('brown')

    if not isfile('wordvec.bin') or '--train' in argv:
        print("\nwords to phrases...")
        wv.word2phrase('./HarryPotter/HarryPotter.txt', 'phrase', verbose=1)
        print("\nphrases to vectors...")
        wv.word2vec('phrase', 'wordvec.bin', size=50, verbose=1)
        print("")

    print("\nload model...")
    model = wv.load('wordvec.bin')
    print("model shape: " + repr(model.vectors.shape))

    X, Y = [], []
    if '--load-vector' in argv:
        if isfile('X.npy') and isfile('Y.npy'):
            X = np.load('X.npy')
            Y = np.load('Y.npy')
        else:
            print("can't load X.npy, Y.npy")
            return
    else:
        print("TSNE...")
        tsne = TSNE(n_components=2, learning_rate=10, random_state=0)
        vectors = tsne.fit_transform(X=model.vectors[:SIZE, :])
        X = vectors[:, 0]
        Y = vectors[:, 1]

    print("start plot...(using nltk.corpus.brown)")
    brown_tagged_sents = brown.tagged_sents(categories='news')
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    words = unigram_tagger.tag(model.vocab[:SIZE])
    texts = []
    plt.figure(figsize=(12, 8))

    for x, y, word in zip(X, Y, words):
        print("word: (%s, %s)" % (word[0], word[1]), end="")

        if filter_words(word[0], word[1]):
            print("\r\t\t\t\tplot")
            plt.plot(x, y, 'o')
            texts.append(plt.text(x, y, word[0], fontsize=8))

        else:
            print("\r\t\t\t\tignore")

    adjust_text(texts,
                force_text=1,
                arrowprops=dict(arrowstyle="-", color="k", lw=1))

    plt.savefig("wordvec.png", dpi=100)
    plt.show()
Example #28
def pre_train_word_embedding():
    word2vec.word2vec('./data2/word2vec_corpus.txt',
                      './data2/word_embedding.bin',
                      size=200,
                      window=10,
                      sample='1e-5',
                      cbow=0,
                      save_vocab='./data2/worddict',
                      min_count=6)
Example #29
def checkForSemanticIndex(carrel):

    # configure
    MODEL = 'reader.bin'
    TXT = 'model.txt'
    PHRASES = 'model.phrases'

    # require
    from pathlib import Path
    from word2vec import word2vec, word2phrase
    import os

    # initialize
    localLibrary = configuration('localLibrary')
    model = localLibrary / carrel / ETC / MODEL

    # see if we have been here previously
    if not model.exists():

        # initialize some more
        stopwords = localLibrary / carrel / ETC / STOPWORDS
        corpus = localLibrary / carrel / ETC / CORPUS
        txt = str(Path.home() / TXT)
        phrases = str(Path.home() / PHRASES)

        # tokenize
        click.echo('Indexing. This needs to be done only once.', err=True)
        click.echo('Step #1 of 6: Tokenizing corpus...', err=True)
        tokens = open(corpus).read().split()

        # normalize
        click.echo('Step #2 of 6: Normalizing tokens...', err=True)
        tokens = [token.lower() for token in tokens if token.isalpha()]

        # remove stop words
        click.echo('Step #3 of 6: Removing stop words...', err=True)
        stopwords = open(stopwords).read().split()
        tokens = [token for token in tokens if token not in stopwords]

        # save
        click.echo('Step #4 of 6: Saving tokens...', err=True)
        with open(txt, 'w') as handle:
            handle.write(' '.join(tokens))

        # create phrases
        click.echo('Step #5 of 6: Creating phrases...', err=True)
        word2phrase(txt, phrases, verbose=True)

        # do the work
        click.echo('Step #6 of 6: Indexing...', err=True)
        word2vec(phrases, str(model), size=100, binary=True, verbose=True)

        # clean up and done
        os.remove(txt)
        os.remove(phrases)
        click.echo('\nDone. Happy searching!', err=True)
 def __init__(self, originData=None, w2vModelPath="vectors.w2v", vectorSize=100):
     self.__model = None
     self.__vectorSize = vectorSize
     if type(originData) is str:
         word2vec.word2vec(
             originData, 
             w2vModelPath, 
             size=vectorSize, 
             verbose=True)
         self.__model = word2vec.load(w2vModelPath)
Example #31
def w2v_train():
    print '.....train word2vec start at ', time.strftime(
        '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    word2vec.word2vec(corpus_file,
                      model_file,
                      size=300,
                      verbose=True,
                      threads=30)
    print '.....finish training word2vec end at ', time.strftime(
        '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
Example #32
def word_training(path, embedded_size):
    dirname = os.path.dirname(path)
    filename = os.path.basename(path)
    phrasesname = os.path.join(dirname, '{}-phrases'.format(filename))
    modelname = os.path.join(dirname, '{}.bin'.format(filename))
    print('Training...')
    word2vec.word2phrase(path, phrasesname)
    word2vec.word2vec(phrasesname, modelname, size=embedded_size)
    print('Training Done!!!')
    return modelname
Example #33
def extract(dim, data, trained):
    if not trained:
        word2vec.word2phrase(data, data + '-phrases', verbose=True)
        word2vec.word2vec(data + '-phrases', data + '.bin', size=dim, verbose=True)
    model = word2vec.load(data + '.bin')
    keys = model.vocab
    features = model.vectors
    dic = dict(zip(keys, features))
    print(len(dic))
    return dic
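A hedged example of consuming the word-to-vector dictionary that extract() returns; the corpus path and the two words are placeholders and assume they appear in the training data:

import numpy as np

dic = extract(100, 'corpus.txt', trained=False)
a, b = dic['king'], dic['queen']
similarity = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print('cosine similarity:', similarity)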
Example #34
 def train(self):
     if not os.path.isfile(self.trained_fname):
         print("Previous training '" + self.trained_fname + "' not found. Begin training on input '" +
               self.input_fname + "' into " + str(self.train_dimensions) + " dimensions ...")
         self.trained_fname = 'src/resources/output' + str(self.train_dimensions)
         word2vec.word2vec(self.input_fname, self.trained_fname, size=self.train_dimensions)
     else:
         print("Trained data seems to exist at '" + self.trained_fname + "'")
     print("Loading training results...")
     self.model = word2vec.load(self.trained_fname, kind='bin')
def create_word2vec_model(save_text_file):
    '''run word2vec on the text corpus and create a model'''

    save_phrases = save_text_file + '_phrases'
    save_model = save_text_file + '.bin'
    save_cluster = save_text_file + '-cluster.txt'

    # create phrases for processing
    word2vec.word2phrase(save_text_file, save_phrases, verbose=True)

    # create model
    word2vec.word2vec(save_phrases, save_model, size=100, verbose=True)

    # create cluster
    word2vec.word2clusters(save_text_file, save_cluster, 100, verbose=True)
Example #36
def test_verbose():
    saved_stdout = sys.stdout

    try:
        sys.stdout = io.StringIO()

        word2vec.word2vec(input_, output_bin, size=10, binary=1, verbose=True)
        output = sys.stdout.getvalue()

        assert "b'" not in output
        assert "Starting training" in output
        assert "\\r" not in output
        assert "\r" in output

    finally:
        sys.stdout = saved_stdout
def w2v_bin(general_bin_file_path, general_corpus_file_path, corpus_name):
    """

    :param general_bin_file_path:
    :param general_corpus_file_path:
    :param corpus_name:
    :return:
    """
    # combine all files in one corpus
    text_file_path = ''.join((general_bin_file_path, corpus_name, '.text'))
    corpus_path = ''.join((general_corpus_file_path, corpus_name, '\\'))
    # create .text file for word2vec
    concatenate_files(corpus_path, text_file_path)

    # create word2vec .bin file
    word2vec_bin_path = ''.join((general_bin_file_path, corpus_name, '.bin'))
    word2vec.word2vec(text_file_path, word2vec_bin_path, size=200, verbose=True)  # size of word vectors
Example #38
def testWord2Vec():
    # Train the model using the word2phrase output.
    # That generates a text8-phrases.bin file containing the word vectors in a binary format.
    word2vec.word2vec('/D/test/text8/text8-phrases.txt', '/D/test/text8/text8-phrases.bin', size=100, verbose=True)
    # bin file layout:
    # the first line stores vocab_size and vector_size, read as: vocab_size, vector_size = list(map(int, header.split()))
    # the remaining lines store the word vectors

    # Predictions
    model = word2vec.load('/D/test/text8/text8-phrases.bin')

    # take a look at the vocabulary as a numpy array
    print model.vocab   # vocabulary

    # or take a look at the whole matrix
    print model.vectors.shape   # word vectors
    print model.vectors

    # retrieve the vector of an individual word
    print model['dog'].shape
    print model['dog'][:10]

    # we can do simple queries to retrieve words similar to "socks" based on cosine similarity:
    indexes, metrics = model.cosine('socks')

    # it is possible to get the words for those indexes
    print model.vocab[indexes]

    # there is a helper function to create a combined response: a numpy record array
    print model.generate_response(indexes, metrics).tolist()

    # since we trained the model on the output of word2phrase we can ask for the similarity of "phrases"
    indexes, metrics = model.cosine('los_angeles')  # word indexes and cosine similarities
    print model.generate_response(indexes, metrics).tolist()  # words and cosine similarities

    # it is possible to do more complex queries, like analogies such as: king - man + woman = queen.
    # This method returns, like cosine, the indexes of the words in the vocab and the metric
    indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'], n=10)
    print model.generate_response(indexes, metrics).tolist()
Example #39
def setup_model(input, 
                output, 
                binary=1, 
                cbow=0, 
                size=300, 
                window=10, 
                negative=5, 
                hs=0, 
                threads=12, 
                iter_=5, 
                min_count=5, 
                verbose=False):
    """ setup default value here for word2vec parameters
    """
    return word2vec.word2vec(input, output, binary=binary, cbow=cbow, size=size, window=window, negative=negative, hs=hs, threads=threads, iter_=iter_, min_count=min_count, verbose=verbose)
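A hypothetical call to the setup_model helper above; the corpus and output paths are placeholders, and the trained binary can then be loaded with word2vec.load as in the other examples:

setup_model('corpus.txt', 'corpus.bin', size=200, threads=4, verbose=True)
model = word2vec.load('corpus.bin')
print(model.vectors.shape)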
Example #40
def training():
    '''
        Train using the file '../tmp/book-seg.txt'.
        Make sure that you have run the function 'text_seg' to do segmentation first.
    '''
    word2vec.word2vec('../tmp/book-seg.txt', '../tmp/book.bin', size=300, verbose=True)
Example #41
import word2vec
import sys
import getpass

user = getpass.getuser()
if user == 'ctnuser':
    root = '/home/ctnuser/bjkomer/'
elif user == 'bjkomer':
    root = '/home/bjkomer/'

if len(sys.argv) == 2:
    dim = int(sys.argv[1])
else:
    dim = 100

word2vec.word2phrase(root + 'word2vec/text8',
                     root + 'semantic-network/data/text8-phrases', verbose=True)

word2vec.word2vec(root + 'semantic-network/data/text8-phrases',
                  root + 'semantic-network/data/text8-%s.bin'%dim, size=dim,
                  verbose=True)

word2vec.word2clusters(root + 'word2vec/text8',
                       root + 'semantic-network/data/text8-%s-clusters.txt'%dim, dim,
                       verbose=True)
Example #42
File: news.py Project: xialei/poc
def wordvec():
    # words occurring fewer than min_count times are discarded
    word2vec.word2vec(r'D:\nlp\corpora\segs.txt', 'vectors.bin', size=100, window=10, sample='1e-3', hs=1, negative=0, threads=12, iter_=5, min_count=10, binary=1, cbow=0, verbose=True)
Example #43
# -*- coding: utf-8 -*-
"""
Created on Sat Jan 30 21:17:57 2016

@author: dudu
"""

import word2vec

if __name__ == '__main__':
    path = '/home/dudu/hack_cambridge/all.txt'
    out_path = '/home/dudu/hack_cambridge/cambridge/word2vec_model.bin'
    word2vec.word2vec(path, out_path, size=10, verbose=True)
Example #44
# -*- coding: utf-8 -*-
import word2vec

if __name__ == '__main__':
    word2vec.word2vec("raw_text","text8.bin",size=100,verbose=True)
Example #45
sentences = sent_tokenize(textbooks.decode('utf8'))
print 'sentence tokenization finished'

count = 0
outLines = list()
for s in sentences:
	count = count+1
	if count % 10000 == 0:
		print count
	tokens = word_tokenize(s)
	if len(tokens) < 3:
		continue
	outLines.append(str.join(' ', tokens))

print 'word tokenization finished'
outFile.write((str.join('\n', outLines)).encode('utf8'))

textbooksFile.close()
outFile.close()

#
# word2vec.word2phrase(
# 	'data/books/textbooks.txt', 'data/books/phrases', verbose=True)

print 'starting word2vec'
word2vec.word2vec(
	'data/allTokenized.txt', 'data/model_allTokenized.bin',
	 verbose=True, min_count = 5, threads = 4, size = 300, 
	 window = 9, iter_ = 10)
print 'finish'
Example #46
def preprocess():
    wakati_text_file = config.get("word2vec", "wakati.file.path")
    word2vec.word2vec(wakati_text_file, wakati_bin_file, size=300, verbose=True)
Example #47
def preprocess():
    word2vec.word2vec('wakati_text.txt', 'wakati_text.bin', size=300, verbose=True)
Example #48
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
# Split train/test set
# TODO: This is very crude, should use cross-validation
x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
vocab_size = len(vocabulary)
print("Vocabulary Size: {:d}".format(vocab_size))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

data = x_train.flatten()
iterations = 1000
data = data[data!=468]
w2v = word2vec.word2vec(data,vocabulary,vocabulary_inv,vocab_size,iterations)
final_embeddings = w2v.runWord2Vec()

accuracies = [] 
# Training
# ==================================================

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        cnn = TextCNN(
            sequence_length=x_train.shape[1],
            num_classes=2,
# step 1 install word2vec (ref: http://nbviewer.jupyter.org/github/danielfrg/word2vec/blob/master/examples/word2vec.ipynb)
import word2vec
import numpy as np
import scipy.io as sio

vector_size = 100
amount_nearest = 100

word2vec.word2phrase('text8', 'text8-phrases', verbose=True)
word2vec.word2vec('text8-phrases', 'text8.bin', size=vector_size, verbose=True)
word2vec.word2clusters('text8', 'text8-clusters.txt', vector_size, verbose=True)

# read the trained model
model = word2vec.load('text8.bin')

# list of loosely related motivation words (topic: potential problems for an enterprise)
motivation = ['enterprise',
              'business',
              'solution',
              'entrepreneur',
              'latent',
              'problem',
              'funds',
              'management',
              'quality',
              'projects']

# start getting the nearest clusters by picking similar words
amount_motivation = len(motivation)
motivation_vector = []
nearest_indexes = []
        features[7] = len(sentence1) / len(sentence2)

    return features

# Uses treetagger-python (Installation https://github.com/miotto/treetagger-python ; http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/)
try:
    semanticsimilarity_lookuptable = pickle.load(open('semanticsimilarity_lookuptable.pkl', 'rb'))
except Exception:
    semanticsimilarity_lookuptable = {}

print "Build Word2Vec Corpus"
dir = os.path.dirname(os.path.abspath(__file__))
try:
    # on OSX for some reason this does not work
    word2vec.word2phrase(dir + '/text8', dir + '/text8-phrases', verbose=True)
    word2vec.word2vec(dir + '/text8-phrases', dir + '/text8.bin', size=100, verbose=True)
except Exception as e:
    print e

model = word2vec.load(dir + '/text8.bin')
print "Finish"

def computeSemanticSimilarityFeatures(sentence1, sentence2):
    features = [0] * 9

    if (sentence1 + sentence2) not in semanticsimilarity_lookuptable:
        def prepareSentence(sentence):
            return sentence.replace('-', ' ').replace('$', ' ')

        tt = TreeTagger(language='english')
Example #51
import word2vec

word2vec.word2phrase('text8', 'text8-phrases', verbose=True)
word2vec.word2vec('text8-phrases', 'text8.bin', size=100, verbose=True)

# word2vec.word2clusters('text8', 'text8-clusters.txt', 10, verbose=True)

# word2vec.word2phrase('enwik9', 'enwik9-phrases', verbose=True)
# word2vec.word2vec('enwik9-phrases', 'enwik9.bin', size=100, verbose=True)
Example #52
def setup_module(module):
    word2vec.word2phrase(input_, output_phrases, verbose=False)
    word2vec.word2vec(input_, output_bin, size=10, binary=1, verbose=False)
    word2vec.word2vec(input_, output_txt, size=10, binary=0, verbose=False)
    word2vec.word2clusters(input_, output_clusters, 10, verbose=True)
Example #53
# -*- coding: utf-8 -*-
import plyvel
import re, string
import sys, locale
import word2vec
import os

reload(sys)
sys.setdefaultencoding(locale.getdefaultlocale()[1])


model_path = os.path.abspath('model.bin')
text_path = os.path.abspath('text.txt')
phrase_path = os.path.abspath('phrases.txt')

word2vec.word2phrase(text_path, phrase_path, verbose=True)
word2vec.word2vec(phrase_path, model_path, binary=1, verbose=True)
model = word2vec.load(model_path)

indexes, metrics = model.cosine('seymour')
print (string.join(model.vocab[indexes], ' '))

Example #54
print args

# read the parameter file

# Read a JSON file with the training parameters
with open(args.parameter_file, 'r') as input:
    runs = json.load(input)
    # print runs
    for run_code in runs:
        print "Processing %s " % run_code

        output = "dewiki-" + run_code + ".bin"

        word2vec.word2vec(args.text_file, output=output,
                          verbose=True,
                          **runs[run_code])

        print "Successfully finished processing %s" % run_code
Example #55
import word2vec
from timeit import default_timer as timer

start = timer()
word2vec.word2vec('data-test.txt', 'vectors-model.bin', cbow=0, size=100, window=10, negative=5, hs=0, sample='1e-4', threads=8, iter_=20, min_count=1, verbose=True)
end = timer()
print('Model generated in %f seconds' % (end - start))
Example #57
#train['link_pred'] = (train.temp2 >= 1) | (train.temp1 >= train.temp1.quantile(0.7))
#accuracy = (train.link_pred == train.link.astype(bool)).mean()
#print 'Accuracy is {acc}'.format(acc=accuracy)

## Try word2vec train

import word2vec
from sklearn.metrics.pairwise import cosine_similarity as cosine

# Create txt file from node_info
all_abst_file_name = 'all_abstracts.txt'
all_phrases_file_name = 'all_abstracts_phrases.txt'
word2vec_out_file_name = 'all_abstracts.bin'

with open(pth(all_abst_file_name), 'w') as f:
    for abstract in node_info.abstract.as_matrix():
        f.write(abstract + '\n')
        
word2vec.word2phrase(pth(all_abst_file_name), pth(all_phrases_file_name), verbose=True)
word2vec.word2vec(pth(all_phrases_file_name), pth(word2vec_out_file_name), \
                    size=30, iter_=3, verbose=True)

model = word2vec.load(pth(word2vec_out_file_name))


indexes, metrics = model.cosine('applications', 20)


indexes, metrics = model.analogy(pos=['theorem', 'false'], neg=['true'], n=10)

model.vocab[indexes]
Example #58
import os
import sys
sys.path.append(os.path.abspath(__file__ + "/../../"))

import pandas as pd
import word2vec as w2v

#w2v.word2phrase('text8.txt', 'text8-phrases', verbose=True)

w2v.word2vec('training_text.txt', 'training_text_clean.bin', size=100, verbose=True)

#w2v.word2clusters('/Users/Henrik/Downloads/test/text8', '/Users/Henrik/Downloads/test/text8-clusters.txt', 100, verbose=True)