def get_word2vec_model():

    with open("../sentence_level_corpus_all_information_normalized.csv") as f:
        data = f.readlines()

    sentences = []
    for i in data[1:]:
        sen = i.split("|")[1]
        words = nltk.word_tokenize(sen)
        sentences.append(words)

    sentences_1 = [
        i for i in Text8Corpus('/home2/hk/workshop/Data/text8/text8')
    ]
    sentences = sentences + sentences_1

    model = Word2Vec(sentences,
                     size=100,
                     window=5,
                     min_count=1,
                     workers=5,
                     iter=10)
    model.wv.save(path)
    # model.save("word2vec.model")
    vector = model.wv['brother-in-law']  # numpy vector of a word
    print(vector)
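A note on the snippet above: model.wv.save(path) persists only the trained KeyedVectors, and path is assumed to be defined elsewhere. A minimal sketch of reloading and querying the saved vectors, using a hypothetical filename:

from gensim.models import KeyedVectors

# Reload the vectors written by model.wv.save(path); the filename here is an assumption.
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')
print(wv['brother-in-law'])                      # same numpy vector as printed above
print(wv.most_similar('brother-in-law', topn=5))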
Example #2
File: test.py Project: errazudin/talks
def train_models(corpus_file, output_name):
    output_file = '{:s}_ft'.format(output_name)
    if not os.path.isfile(os.path.join(MODELS_DIR, '{:s}.vec'.format(output_file))):
        print('Training fasttext on {:s} corpus..'.format(corpus_file))
        %time !{FT_HOME}fasttext skipgram -input {corpus_file} -output {MODELS_DIR+output_file}  -lr {lr} -dim {dim} -ws {ws} -epoch {epoch} -minCount {minCount} -neg {neg} -loss {loss} -t {t}
    else:
        print('\nUsing existing model file {:s}.vec'.format(output_file))
        
    output_file = '{:s}_ft_no_ng'.format(output_name)
    if not os.path.isfile(os.path.join(MODELS_DIR, '{:s}.vec'.format(output_file))):
        print('\nTraining fasttext on {:s} corpus (without char n-grams)..'.format(corpus_file))
        %time !{FT_HOME}fasttext skipgram -input {corpus_file} -output {MODELS_DIR+output_file}  -lr {lr} -dim {dim} -ws {ws} -epoch {epoch} -minCount {minCount} -neg {neg} -loss {loss} -t {t} -maxn 0
    else:
        print('\nUsing existing model file {:s}.vec'.format(output_file))
        
    output_file = '{:s}_gs'.format(output_name)
    if not os.path.isfile(os.path.join(MODELS_DIR, '{:s}.vec'.format(output_file))):
        print('\nTraining word2vec on {:s} corpus..'.format(corpus_file))
        
        # Text8Corpus class for reading space-separated words file
        %time gs_model = Word2Vec(Text8Corpus(corpus_file), **params); gs_model
        # Direct local variable lookup doesn't work properly with magic statements (%time)
        locals()['gs_model'].save_word2vec_format(os.path.join(MODELS_DIR, '{:s}.vec'.format(output_file)))
        print('\nSaved gensim model as {:s}.vec'.format(output_file))
    else:
        print('\nUsing existing model file {:s}.vec'.format(output_file))
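As the comment in the snippet above notes, Text8Corpus simply streams a space-separated text file as lists of tokens, chunked to at most max_sentence_length words. A minimal sketch of that behaviour, assuming a local text8-style file named 'corpus.txt':

from gensim.models.word2vec import Text8Corpus

# Each iteration yields a list of tokens, e.g. ['anarchism', 'originated', 'as', ...]
corpus = Text8Corpus('corpus.txt', max_sentence_length=20)
for i, sentence in enumerate(corpus):
    print(sentence)
    if i == 2:      # only peek at the first few chunks
        break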
Example #3
 def fit_transform(self, data, return_ids=False, overwrite_corpus=True):
     filtered_data = self.filter.filter_dataset(data)
     if overwrite_corpus:
         self.save_corpus(filtered_data)
         self.tf_idf = self.tf_idf.fit(
             filtered_data['description'].values.astype('U'))
         corpus = Text8Corpus(self.corpus_path)
         self.word2vec = Word2Vec(corpus, size=300, min_count=1)
     tf_idf_weightings = dict(
         zip(self.tf_idf.get_feature_names(), self.tf_idf.idf_))
     vectors = []
     ids = []
     counter_empty = 0
     for i, d in enumerate(filtered_data['description'].values.astype('U')):
         descriptor_count = 0
         weighted_terms = []
         terms = d.split(' ')
         for term in terms:
             if term in tf_idf_weightings.keys():
                 tf_idf_weighting = tf_idf_weightings[term]
                 word_vector = self.word2vec.wv.get_vector(term).reshape(
                     1, 300)
                 weighted_word_vector = tf_idf_weighting * word_vector
                 weighted_terms.append(weighted_word_vector)
                 descriptor_count += 1
         if len(weighted_terms) == 0:
             counter_empty += 1
         review_vector = [np.zeros(300)] if not len(
             weighted_terms) else sum(weighted_terms) / len(weighted_terms)
         vectors.append(review_vector)
         ids.append(filtered_data['id'][i])
     # print('Wines without description:', counter_empty)
     vectors = np.concatenate(vectors)
     return (vectors, np.array(ids)) if return_ids else vectors
Example #4
def train_glove(path):
    import itertools
    from gensim.models.word2vec import Text8Corpus
    from gensim.scripts.glove2word2vec import glove2word2vec
    from glove import Corpus, Glove
    #import os
    #import struct
    sentences = list(itertools.islice(Text8Corpus(path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=10)
    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    file_name = 'embeddings_models/model_glove_' + str(TRAINING_SENTENCES)
    glove.save(file_name)
    glove2word2vec(file_name, file_name + '_modified')
    """
    command = 'python -m gensim.scripts.glove2word2vec -i ' +file_name+' -o '+file_name+'_modified'
    os.system(command)
    with open(file_name+'_modified', mode='rb') as file: # b is important -> binary
        fileContent = file.read()
        print 'Content',fileContent
    """
    print('Finished')
    return glove
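Once glove2word2vec has produced the '_modified' file, the embeddings can be queried through gensim's KeyedVectors. A minimal sketch, assuming the converted file exists in plain-text word2vec format and reusing the TRAINING_SENTENCES global from the snippet above:

from gensim.models import KeyedVectors

# Load the converted embeddings; the path mirrors the one built in train_glove().
converted = 'embeddings_models/model_glove_' + str(TRAINING_SENTENCES) + '_modified'
wv = KeyedVectors.load_word2vec_format(converted, binary=False)
print(wv.most_similar('king', topn=5))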
Example #5
def load_wv():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    sentences = Text8Corpus('text8')
    model = Word2Vec(sentences, size=200)
    model.save('features/text8_w2v_features/text8.model')
    model.wv.save_word2vec_format('features/text8_w2v_features/text.model.bin',
                                  binary=True)
Example #6
def main():
    psr = argparse.ArgumentParser()
    psr.add_argument('-d', '--dim', default=200, type=int)
    psr.add_argument('-p', '--path', default='ubuntu_data/train.txt')
    args = psr.parse_args()
    sentences = Text8Corpus(args.path)
    print('training')
    model = Word2Vec(sentences,
                     size=args.dim,
                     window=5,
                     min_count=5,
                     workers=4)
    model.save('ubuntu_word2vec_' + str(args.dim) + '.model')
    print('saved.')
Example #7
def preprocess_hebrew(reports_df, corpus_name):
    """
    Hebrew reports preprocessing
    :param reports_df: dataframe of hebrew reports
    :param corpus_name: Name of the corpus
    :return: model name, vocab list, model_name, raw_text_file to tokenize on
    """
    reports_df = reports_df[conf.col_report][reports_df[conf.col_source] == conf.HEBREW_REPORT_INDICATOR]

    # preprocess
    reports_df = reports_df.apply(sub_preporcess_hebrew)
    text_reports = reports_df.values
    text_reports = ' '.join(list(text_reports)).encode('utf-8')

    # Save raw text
    date = get_date_for_model_name()
    model_name = date + '_bert_' + conf.HEBREW_IDENTIFIER_NAME + '_' + corpus_name
    raw_text_file = conf.DATA_PATH + model_name + '.txt'

    if os.path.exists(raw_text_file):
        os.remove(raw_text_file)

    with open(raw_text_file, "wb") as text_file:
        text_file.write(text_reports)

    # Load training data.
    sentences = Text8Corpus(raw_text_file)

    # Train a toy bigram model.
    phrases = Phrases(sentences, min_count=7, threshold=100, max_vocab_size=len(text_reports))
    del text_reports

    # Export the trained model = use less RAM, faster processing. Model updates no longer possible.
    bigram = Phraser(phrases)

    reports_df = reports_df.str.split(' ')
    reports_df = reports_df.apply(lambda x: bigram[x])
    reports_df = reports_df.apply(lambda x: " ".join(x))

    reports_df.to_csv(raw_text_file, header=None, index=None, sep=' ', mode='a')
    del reports_df

    counter = Counter()
    with open(raw_text_file, encoding="utf-8") as f:
        for line in f:
            counter.update(line.split())

    vocab = list(counter.keys())
    return model_name, vocab, raw_text_file
Example #8
File: utils.py Project: lschuell/kbp
def collocation(filepath):
    '''Creates a corpus that accounts for collocations: frequently co-occurring bigrams are merged (new york -> new_york)'''

    abs_path = os.getcwd() + "/"
    corpus = Text8Corpus(datapath(abs_path + filepath))
    phrases = Phrases(corpus)
    collocations = Phraser(phrases)
    text_list = [collocations[line] for line in corpus]
    flattened_list = [i for sub in text_list for i in sub]
    flattened_corpus = " ".join(flattened_list)

    outfile = open(filepath + ".collocation", "w")

    outfile.write(flattened_corpus)

    outfile.close()
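The "new york -> new_york" merging described in the docstring can be illustrated on a toy corpus. A minimal sketch, with min_count and threshold lowered so the bigram is promoted even from a handful of sentences:

from gensim.models.phrases import Phrases, Phraser

toy_sentences = [
    ['i', 'love', 'new', 'york'],
    ['new', 'york', 'is', 'huge'],
    ['she', 'moved', 'to', 'new', 'york'],
]
# 'new york' co-occurs in every sentence, so it clears the low threshold and gets merged.
phrases = Phrases(toy_sentences, min_count=1, threshold=1)
bigram = Phraser(phrases)
print(bigram[['i', 'love', 'new', 'york']])     # ['i', 'love', 'new_york']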
Example #9
def main():
    docs = list(itertools.islice(Text8Corpus('text8'), None))
    ''' Make model '''
    corpus = Corpus()
    corpus.fit(docs, window=10)
    ''' Load Model '''
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)

    glove.add_dictionary(corpus.dictionary)

    print('man')
    pprint.pprint(glove.most_similar('man', number=10))
    print('flog')
    pprint.pprint(glove.most_similar('flog', number=10))

    return
Example #10
def create_graph():
    #ctx = mx.gpu(0)
    bert_embedding = BertEmbedding(model='bert_12_768_12',
                                   dataset_name='book_corpus_wiki_en_cased')

    text8Corpus = Text8Corpus(
        "/Users/divyakoyyalagunta/Projects/codenames/text8.txt",
        max_sentence_length=10)

    # sentences = ["urged Filipinos to stop weeping for the man who had laughed all the way to the bank",
    #  "Soon after setting off we came to a forested valley along the bank of the Gwaun",
    #   "The condom balloon was denied official entry status this year",
    #   "The marine said, get down behind that grass bank, sir, and he immediately lobbed a mills grenade into the river"]
    for sentence in text8Corpus:
        joined_sentence = (" ").join(sentence)

        add_sentence_emb(bert_embedding, joined_sentence)

    print("Number of embeddings", len(embedding_vector_averages))
    print("Number of words in word_to_idx_dict", len(word_to_idx_dict.keys()))

    tree_idx = 0
    mod = 50000
    emb_size = 768
    t = AnnoyIndex(emb_size, metric='angular')

    for x in range(len(embedding_vector_averages)):
        if (tree_idx % mod == 0):
            print("ADDED ", tree_idx, " EMBEDDINGS TO ANNOY TREE")

        embedding = embedding_vector_averages[x][0]

        if len(embedding) == 0:
            continue

        t.add_item(x, embedding)

        tree_idx += 1

    t.build(100)

    idx_to_word_dict = {v: k for k, v in word_to_idx_dict.items()}

    t.save('annoy_tree_bert_emb_768_test.ann')
    np.save('annoy_tree_index_to_word_bert_emb_768_test.npy', idx_to_word_dict)
Example #11
 def createModel(self,
                 pathCorpus,
                 min_count=5,
                 size=300,
                 workers=8,
                 window=5,
                 iter=5,
                 sg=1,
                 negative=10):
     sentences = Text8Corpus(datapath(pathCorpus))
     model = Word2Vec(
         sentences,
         min_count=min_count,  # Ignore words that appear less than this
         size=size,  # Dimensionality of word embeddings
         workers=workers,  # Number of processors
         window=window,  # Context window for words during training
         iter=iter,  # Number of epochs training over corpus
         sg=sg,  # skip gram true
         negative=negative)
     return model
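The createModel helper resolves its corpus path with gensim's datapath, which points into the package's bundled test-data directory, so it can be exercised with the small lee_background.cor file that ships with gensim. A minimal usage sketch, assuming the method lives on a class that can be instantiated without arguments (hypothetical name EmbeddingTrainer):

from gensim.test.utils import datapath

# 'lee_background.cor' is part of gensim's test data, so datapath() can resolve it.
print(datapath('lee_background.cor'))

trainer = EmbeddingTrainer()                    # hypothetical wrapper class
model = trainer.createModel('lee_background.cor', size=50, iter=2)
print(len(model.wv.vocab))                      # vocabulary size of the toy model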
Example #12
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.word2vec import Text8Corpus

# Using params from Word2Vec_FastText_Comparison
params = {
    'alpha': 0.05,
    'vector_size': 100,
    'window': 5,
    'epochs': 5,
    'min_count': 5,
    'sample': 1e-4,
    'sg': 1,
    'hs': 0,
    'negative': 5,
}
model = Word2Vec(Text8Corpus(text8_path), **params)
wv = model.wv
print("Using trained model", wv)

###############################################################################
# 3. Construct AnnoyIndex with model & make a similarity query
# ------------------------------------------------------------
#
# An instance of ``AnnoyIndexer`` needs to be created in order to use Annoy in Gensim.
# The ``AnnoyIndexer`` class is located in ``gensim.similarities.annoy``.
#
# ``AnnoyIndexer()`` takes two parameters:
#
# * **model**: A ``Word2Vec`` or ``Doc2Vec`` model.
# * **num_trees**: A positive integer. ``num_trees`` effects the build
#   time and the index size. **A larger value will give more accurate results,
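# A minimal sketch of constructing the indexer and querying it, assuming the
# gensim-4-style model trained above and the optional `annoy` package being installed:

from gensim.similarities.annoy import AnnoyIndexer

# Build the Annoy index from the trained model; more trees = more accurate, slower build.
annoy_index = AnnoyIndexer(model, num_trees=100)

# Approximate nearest neighbours for a query word, routed through the Annoy index.
print(wv.most_similar('science', topn=5, indexer=annoy_index))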
Example #13
def multibleutest():
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size
    sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=30)
    modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size
    modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4)
    if not os.path.exists("./multiBleu_log/"):
        os.makedirs("./multiBleu_log/")

    gloveA = Glove.load(modelA)
    gloveA.add_dictionary(corpus.dictionary)
    gloveB = Glove.load(modelB)
    gloveB.add_dictionary(corpus.dictionary)

    # glove embeddings
    gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :]
    gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :]

    with tf.Session() as sess:
        model = GAN('G_test',
                    FLAGS.size,
                    FLAGS.num_layers,
                    FLAGS.vocab_size,
                    _buckets,
                    FLAGS.feature_size,
                    FLAGS.baseline,
                    FLAGS.lr,
                    FLAGS.lr_decay,
                    FLAGS.grad_norm,
                    critic=None,
                    use_attn=FLAGS.use_attn,
                    output_sample=True,
                    input_embed=True,
                    batch_size=FLAGS.batch_size,
                    D_lambda=FLAGS.lambda_dis,
                    G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two),
                    dtype=tf.float32)

        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        print('read in model from {}'.format(ckpt.model_checkpoint_path))

        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        prompt = None
        cheatsheetMAP = {}
        with open(FLAGS.test_path, 'r') as source:
            for line in source.readlines():
                line = line.strip()
                if line == "":
                    prompt = None
                    continue
                elif prompt == None:
                    cheatsheetMAP[line] = []
                    prompt = line
                else:
                    cheatsheetMAP[prompt].append(line)

        answer = []
        with open("./multiBleu_log/" + FLAGS.file_head + "_ref.txt",
                  'w') as ffop:
            with open("./multiBleu_log/" + FLAGS.file_head + "_Q.txt",
                      'w') as fop:
                feature, output_file, output_list = [], [], []
                for i in range(FLAGS.feature_size):
                    output_file.append("./multiBleu_log/" + FLAGS.file_head +
                                       "_{}.txt".format(i))
                    feature.append([[
                        3 if x == i else 0 for x in range(FLAGS.feature_size)
                    ]])
                    output_list.append([])

                for p, refs in cheatsheetMAP.items():
                    check = False
                    token_ids = data_utils.sentence_to_token_ids(
                        tf.compat.as_bytes(p), vocab, normalize_digits=False)
                    token_ids.append(data_utils.EOS_ID)
                    if len(token_ids) > _buckets[-1][0]:
                        continue
                    encoder_pad = [data_utils.PAD_ID
                                   ] * (_buckets[-1][0] - len(token_ids))
                    encoder_lens = [len(token_ids)]
                    token_ids = list(token_ids) + encoder_pad
                    encoder_inputs = []
                    for idx in token_ids:
                        encoder_inputs.append([idx])
                    decoder_inputs = [[data_utils.GO_ID]]

                    outputs_list = []
                    for x in range(FLAGS.feature_size):
                        A, outputs, log_prob = model.dynamic_decode_G(sess, encoder_inputs, encoder_lens, \
                                                                decoder_inputs, feature[x], gloveA_emb, gloveB_emb)
                        outputs = [output_ids[0] for output_ids in outputs]
                        if data_utils.EOS_ID in outputs:
                            outputs = outputs[:outputs.index(data_utils.EOS_ID
                                                             )]
                        if data_utils.UNK_ID in outputs:
                            check = True
                            break
                        outputs_list.append(" ".join([
                            tf.compat.as_str(rev_vocab[output])
                            for output in outputs
                        ]))

                    if check:
                        continue
                    else:
                        fop.write(p)
                        fop.write('\n')
                        for x in refs:
                            ffop.write(x + '\n')
                        ffop.write('\n')
                        answer.append(refs)
                        for x in range(FLAGS.feature_size):
                            output_list[x].append(outputs_list[x])

            for x in range(FLAGS.feature_size):
                with open(output_file[x], 'w') as op:
                    bleu = []
                    for i, line in enumerate(output_list[x]):
                        op.write(line)
                        op.write("\n")
                        score = sentence_bleu(answer[i], line)
                        bleu.append(score)
                    op.write("My BLEU: {}".format(sum(bleu) / len(bleu)))
                    op.write('\n')
Example #14
            scorer=None)  # we will use our score_item function redefinition
        new_s = []
        for words, score in bigrams:
            if score is not None:
                words = delimiter.join(words)
            new_s.append(words)
        return [utils.to_unicode(w) for w in new_s]


if __name__ == '__main__':
    logging.basicConfig(
        format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
        level=logging.INFO)
    logging.info("running %s", " ".join(sys.argv))

    # check and process cmdline input
    program = os.path.basename(sys.argv[0])
    if len(sys.argv) < 2:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    infile = sys.argv[1]

    from gensim.models import Phrases  # noqa:F811 for pickle
    from gensim.models.word2vec import Text8Corpus
    sentences = Text8Corpus(infile)

    # test_doc = LineSentence('test/test_data/testcorpus.txt')
    bigram = Phrases(sentences, min_count=5, threshold=100)
    for s in bigram[sentences]:
        print(utils.to_utf8(u' '.join(s)))
Example #15
import re
import numpy as np

from gensim.models.word2vec import Text8Corpus
import glove
from multiprocessing import Pool
from scipy import spatial
import itertools

sentences = list(itertools.islice(Text8Corpus('text8'),None))
# NOTE: the original snippet called .fit() on the raw string read from iv.txt, which
# cannot work; building a glove.Corpus over the text8 sentences is presumably what
# was intended here.
iv_text = open("iv.txt", "r").read()

corpus = glove.Corpus()
corpus.fit(sentences, window=10)
def word2vec():
    data = Text8Corpus('data/corpus_winestyle.txt')
    return Word2Vec(data, size=300)
def Similarity_Criteria(data1, criteria, MAX_CAT, Max_Iter):
    import pandas as pd
    import numpy as np

    text_to_be_used_later = data1['Text'].tolist()
    data1['Text'] = cleanText(data1)
    data1['Category'] = data1['Category'].astype(str)
    y = data1['Category'].tolist()
    cols_in_data_final = [w.replace('[', '_') for w in y]
    cols_in_data_final = [w.replace(']', '_') for w in cols_in_data_final]
    cols_in_data_final = [w.replace('<', '_') for w in cols_in_data_final]
    data1['Category'] = cols_in_data_final

    if criteria == 'Doc2Vec':
        from gensim.models.doc2vec import Doc2Vec, TaggedDocument
        from nltk.tokenize import word_tokenize
        data = data1['Text'].astype(str).tolist()

        tagged_data = [
            TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
            for i, _d in enumerate(data)
        ]

        vec_size = 100
        ALPHA = 0.01
        max_epochs = 100
        DM = 1
        MIN_COUNT = 5
        MIN_ALPHA = 0.0001

        model = Doc2Vec(size=vec_size,
                        alpha=ALPHA,
                        min_alpha=MIN_ALPHA,
                        min_count=MIN_COUNT,
                        dm=DM,
                        seed=1234,
                        workers=1)

        model.build_vocab(tagged_data)

        for epoch in range(int(max_epochs)):
            #print('iteration {0}'.format(epoch))
            model.train(tagged_data,
                        total_examples=model.corpus_count,
                        epochs=model.iter)
            # decrease the learning rate
            model.alpha -= 0.0002
            # fix the learning rate, no decay
            model.min_alpha = model.alpha

        freq_count = data1['Category'].value_counts()
        freq_count = pd.DataFrame(freq_count)
        freq_count['catName'] = freq_count.index
        freq_count.columns = ['Cnt', 'catName']

        new_categories_to_fill = []
        MAX_CAT = MAX_CAT

        j = 1
        while len(np.unique(data1['Category'])) > MAX_CAT and j < Max_Iter:
            new_categories_to_fill = []
            for i in range(len(data1)):
                original_cat = data1.iloc[i]['Category']
                most_similar_cat = data1.iloc[int(
                    model.docvecs.most_similar([i])[0][0])]['Category']
                orig_freq_cnt = freq_count[freq_count['catName'] ==
                                           original_cat]['Cnt'].values[0]
                most_similar_freq_cnt = freq_count[
                    freq_count['catName'] == most_similar_cat]['Cnt'].values[0]
                if orig_freq_cnt >= most_similar_freq_cnt:
                    new_categories_to_fill.append(original_cat)
                else:
                    new_categories_to_fill.append(most_similar_cat)
            data1['Category'] = new_categories_to_fill
            freq_count = data1['Category'].value_counts()
            freq_count = pd.DataFrame(freq_count)
            freq_count['catName'] = freq_count.index
            freq_count.columns = ['Cnt', 'catName']

            j += 1
            print('Iteration....', j)

    if criteria == 'TfIdf':

        from sklearn.feature_extraction.text import TfidfVectorizer
        from scipy.sparse.csr import csr_matrix
        from nltk.corpus import stopwords
        from nltk.tokenize import word_tokenize
        from nltk.stem import WordNetLemmatizer
        import re
        from sklearn.metrics.pairwise import linear_kernel

        tf = TfidfVectorizer(input=data1['Text'].tolist(),
                             analyzer='word',
                             lowercase=False,
                             ngram_range=(1, 10),
                             sublinear_tf=True,
                             norm='l2')
        tfidf_matrix = tf.fit_transform(data1['Text'].tolist())
        cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
        np.fill_diagonal(cosine_similarities, -1)
        distances = np.argmax(cosine_similarities, axis=1)

        freq_count = data1['Category'].value_counts()
        freq_count = pd.DataFrame(freq_count)
        freq_count['catName'] = freq_count.index
        freq_count.columns = ['Cnt', 'catName']

        new_categories_to_fill = []
        MAX_CAT = MAX_CAT

        j = 1
        while len(np.unique(data1['Category'])) > MAX_CAT and j < Max_Iter:
            new_categories_to_fill = []
            for i in range(len(data1)):
                original_cat = data1.iloc[i]['Category']
                most_similar_cat = data1.iloc[distances[i]]['Category']
                orig_freq_cnt = freq_count[freq_count['catName'] ==
                                           original_cat]['Cnt'].values[0]
                most_similar_freq_cnt = freq_count[
                    freq_count['catName'] == most_similar_cat]['Cnt'].values[0]
                if orig_freq_cnt >= most_similar_freq_cnt:
                    new_categories_to_fill.append(original_cat)
                else:
                    new_categories_to_fill.append(most_similar_cat)
            data1['Category'] = new_categories_to_fill
            freq_count = data1['Category'].value_counts()
            freq_count = pd.DataFrame(freq_count)
            freq_count['catName'] = freq_count.index
            freq_count.columns = ['Cnt', 'catName']

            j += 1
            print('Iteration....', j)

    if criteria == 'Word2Vec-PretrainedGoogle':
        import gensim.models.keyedvectors as word2vec
        model = word2vec.KeyedVectors.load_word2vec_format(
            'GoogleNews-vectors-negative300.bin.gz', binary=True)

        from scipy import spatial

        index2word_set = set(model.wv.index2word)

        def avg_feature_vector(sentence, model, num_features, index2word_set):
            words = sentence.split()
            feature_vec = np.zeros((num_features, ), dtype='float32')
            n_words = 0
            for word in words:
                if word in index2word_set:
                    n_words += 1
                    feature_vec = np.add(feature_vec, model[word])
            if (n_words > 0):
                feature_vec = np.divide(feature_vec, n_words)
            return feature_vec

        foo = avg_feature_vector(data1['Text'][0],
                                 model=model,
                                 num_features=300,
                                 index2word_set=index2word_set)
        foo = pd.DataFrame(foo).T
        fooToFill = foo[0:0]
        for i in range(len(data1)):
            foo = avg_feature_vector(data1['Text'][i],
                                     model=model,
                                     num_features=300,
                                     index2word_set=index2word_set)
            foo = pd.DataFrame(foo).T
            fooToFill = pd.concat([fooToFill, foo], axis=0)

        from sklearn.metrics.pairwise import cosine_distances
        cosine_similarities = 1 - cosine_distances(fooToFill, fooToFill)
        np.fill_diagonal(cosine_similarities, -1)
        distances = np.argmax(cosine_similarities, axis=1)

        freq_count = data1['Category'].value_counts()
        freq_count = pd.DataFrame(freq_count)
        freq_count['catName'] = freq_count.index
        freq_count.columns = ['Cnt', 'catName']

        new_categories_to_fill = []
        MAX_CAT = MAX_CAT

        j = 1
        while len(np.unique(data1['Category'])) > MAX_CAT and j < Max_Iter:
            new_categories_to_fill = []
            for i in range(len(data1)):
                original_cat = data1.iloc[i]['Category']
                most_similar_cat = data1.iloc[distances[i]]['Category']
                orig_freq_cnt = freq_count[freq_count['catName'] ==
                                           original_cat]['Cnt'].values[0]
                most_similar_freq_cnt = freq_count[
                    freq_count['catName'] == most_similar_cat]['Cnt'].values[0]
                if orig_freq_cnt >= most_similar_freq_cnt:
                    new_categories_to_fill.append(original_cat)
                else:
                    new_categories_to_fill.append(most_similar_cat)
            data1['Category'] = new_categories_to_fill
            freq_count = data1['Category'].value_counts()
            freq_count = pd.DataFrame(freq_count)
            freq_count['catName'] = freq_count.index
            freq_count.columns = ['Cnt', 'catName']

            j += 1
            print('Iteration....', j)

    if criteria == 'Word2Vec-Text8Corpus':
        from gensim.models.word2vec import Text8Corpus
        from gensim.models import Word2Vec
        w2v_model2 = Word2Vec(Text8Corpus('text8'),
                              size=100,
                              window=5,
                              min_count=5,
                              workers=4)
        index2word_set = set(w2v_model2.wv.index2word)

        def avg_feature_vector_Text8(sentence, model, num_features,
                                     index2word_set):
            words = sentence.split()
            feature_vec = np.zeros((num_features, ), dtype='float32')
            n_words = 0
            for word in words:
                if word in index2word_set:
                    n_words += 1
                    feature_vec = np.add(feature_vec, model[word])
            if (n_words > 0):
                feature_vec = np.divide(feature_vec, n_words)
            return feature_vec

        foo = avg_feature_vector_Text8(data1['Text'][0],
                                       model=w2v_model2,
                                       num_features=100,
                                       index2word_set=index2word_set)
        foo = pd.DataFrame(foo).T
        fooToFill = foo[0:0]
        for i in range(len(data1)):
            foo = avg_feature_vector_Text8(data1['Text'][i],
                                           model=w2v_model2,
                                           num_features=100,
                                           index2word_set=index2word_set)
            foo = pd.DataFrame(foo).T
            fooToFill = pd.concat([fooToFill, foo], axis=0)

        from sklearn.metrics.pairwise import cosine_distances
        cosine_similarities = 1 - cosine_distances(fooToFill, fooToFill)
        np.fill_diagonal(cosine_similarities, -1)
        distances = np.argmax(cosine_similarities, axis=1)

        freq_count = data1['Category'].value_counts()
        freq_count = pd.DataFrame(freq_count)
        freq_count['catName'] = freq_count.index
        freq_count.columns = ['Cnt', 'catName']

        new_categories_to_fill = []
        MAX_CAT = MAX_CAT

        j = 1
        while len(np.unique(data1['Category'])) > MAX_CAT and j < Max_Iter:
            new_categories_to_fill = []
            for i in range(len(data1)):
                original_cat = data1.iloc[i]['Category']
                most_similar_cat = data1.iloc[distances[i]]['Category']
                orig_freq_cnt = freq_count[freq_count['catName'] ==
                                           original_cat]['Cnt'].values[0]
                most_similar_freq_cnt = freq_count[
                    freq_count['catName'] == most_similar_cat]['Cnt'].values[0]
                if orig_freq_cnt >= most_similar_freq_cnt:
                    new_categories_to_fill.append(original_cat)
                else:
                    new_categories_to_fill.append(most_similar_cat)
            data1['Category'] = new_categories_to_fill
            freq_count = data1['Category'].value_counts()
            freq_count = pd.DataFrame(freq_count)
            freq_count['catName'] = freq_count.index
            freq_count.columns = ['Cnt', 'catName']

            j += 1
            print('Iteration....', j)

    data1['Text'] = text_to_be_used_later
    return data1
Example #18
def seriestest():
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size
    sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=30)
    modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size
    modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4)

    gloveA = Glove.load(modelA)
    gloveA.add_dictionary(corpus.dictionary)
    gloveB = Glove.load(modelB)
    gloveB.add_dictionary(corpus.dictionary)

    # glove embeddings
    gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :]
    gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :]

    with tf.Session() as sess:
        # build the model
        model = StarGAN('G_test',
                        FLAGS.size,
                        FLAGS.num_layers,
                        FLAGS.vocab_size,
                        _buckets,
                        FLAGS.feature_size,
                        FLAGS.baseline,
                        FLAGS.lr,
                        FLAGS.lr_decay,
                        FLAGS.grad_norm,
                        critic=None,
                        use_attn=FLAGS.use_attn,
                        output_sample=True,
                        input_embed=True,
                        batch_size=FLAGS.batch_size,
                        D_lambda=FLAGS.lambda_dis,
                        G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two),
                        dtype=tf.float32)
        #sess.run(tf.variables_initializer(tf.global_variables()))
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        print('read in model from {}'.format(ckpt.model_checkpoint_path))

        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        sys.stdout.write('> ')
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            if sentence.strip() == 'exit()':
                break
            # step
            number = 0
            feature = []
            for f in range(FLAGS.feature_size):
                feature.append(
                    [[3 if x == f else 0 for x in range(FLAGS.feature_size)]])

            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), vocab, normalize_digits=False)
            print(token_ids)
            token_ids.append(data_utils.EOS_ID)
            encoder_pad = [data_utils.PAD_ID
                           ] * (_buckets[-1][0] - len(token_ids))
            encoder_lens = [len(token_ids)]
            # feature in my implementation
            token_ids = list(token_ids) + encoder_pad
            encoder_inputs = []
            for idx in token_ids:
                encoder_inputs.append([idx])
            print(encoder_inputs)
            decoder_inputs = [[data_utils.GO_ID]]

            for x in range(FLAGS.feature_size):
                A, outputs, log_prob = model.dynamic_decode_G(sess, encoder_inputs, encoder_lens, \
                                                             decoder_inputs, feature[x], gloveA_emb, gloveB_emb)
                #print(A)
                #outputs = [int(np.argmax(logit, axis=1)) for logit in outputs]
                outputs = [output_ids[0] for output_ids in outputs]
                if data_utils.EOS_ID in outputs:
                    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                print(
                    feature[x], ':', " ".join([
                        tf.compat.as_str(rev_vocab[output])
                        for output in outputs
                    ]))
                print(log_prob)

            sys.stdout.write('> ')
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #19
def filetest():
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    if not os.path.exists('./logout/'):
        os.makedirs('./logout/')

    glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size
    sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=30)
    modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size
    modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4)

    gloveA = Glove.load(modelA)
    gloveA.add_dictionary(corpus.dictionary)
    gloveB = Glove.load(modelB)
    gloveB.add_dictionary(corpus.dictionary)

    # glove embeddings
    gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :]
    gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :]

    with tf.Session() as sess:
        # build the model
        model = StarGAN('G_test',
                        FLAGS.size,
                        FLAGS.num_layers,
                        FLAGS.vocab_size,
                        _buckets,
                        FLAGS.feature_size,
                        FLAGS.baseline,
                        FLAGS.lr,
                        FLAGS.lr_decay,
                        FLAGS.grad_norm,
                        critic=None,
                        use_attn=FLAGS.use_attn,
                        output_sample=True,
                        input_embed=True,
                        batch_size=FLAGS.batch_size,
                        D_lambda=FLAGS.lambda_dis,
                        G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two),
                        dtype=tf.float32)
        #sess.run(tf.variables_initializer(tf.global_variables()))
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        print('read in model from {}'.format(ckpt.model_checkpoint_path))

        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        output_path = 'responses_six.txt'
        output_log = open(output_path, 'w')

        name_list = ['Monica', 'Joey', 'Chandler', 'Phoebe', 'Ross', 'Rachel']

        with open(FLAGS.test_path, 'r') as sentences:
            # step
            number = 0
            feature = []
            output_file = []
            output_list = []
            output_file.append('./logout/1.txt')
            output_file.append('./logout/2.txt')
            output_file.append('./logout/3.txt')
            output_file.append('./logout/4.txt')
            output_file.append('./logout/5.txt')
            output_file.append('./logout/6.txt')
            for f in range(FLAGS.feature_size):
                feature.append(
                    [[3 if x == f else 0 for x in range(FLAGS.feature_size)]])
                output_list.append([])

            for id, sentence in enumerate(sentences.readlines()):
                if id % 2 == 1:
                    continue
                number = number + 1

                token_ids = data_utils.sentence_to_token_ids(
                    tf.compat.as_bytes(sentence),
                    vocab,
                    normalize_digits=False)
                token_ids.append(data_utils.EOS_ID)
                if len(token_ids) > _buckets[-1][0]:
                    continue

                output_line = 'input : ' + sentence
                output_log.write(output_line)

                encoder_pad = [data_utils.PAD_ID
                               ] * (_buckets[-1][0] - len(token_ids))
                encoder_lens = [len(token_ids)]
                # feature in my implementation
                token_ids = list(token_ids) + encoder_pad
                encoder_inputs = []
                for idx in token_ids:
                    encoder_inputs.append([idx])
                decoder_inputs = [[data_utils.GO_ID]]

                for x in range(FLAGS.feature_size):
                    A, outputs, log_prob = model.dynamic_decode_G(sess, encoder_inputs, encoder_lens, \
                                                             decoder_inputs, feature[x], gloveA_emb, gloveB_emb)
                    outputs = [output_ids[0] for output_ids in outputs]
                    if data_utils.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                    output_list[x].append(" ".join([
                        tf.compat.as_str(rev_vocab[output])
                        for output in outputs
                    ]))

                    output_line = name_list[x] + ': ' + " ".join([
                        tf.compat.as_str(rev_vocab[output])
                        for output in outputs
                    ]) + '\n'
                    output_log.write(output_line)
                output_log.write('\n')

                if number % 10 == 0:
                    print('parsing line ', number)

                if number == 1000:
                    output_log.close()
                    break

            for x in range(FLAGS.feature_size):
                with open(output_file[x], 'w') as op:
                    for line in output_list[x]:
                        op.write(line)
                        op.write('\n')
Example #20
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.word2vec import Text8Corpus

params = {
    'alpha': 0.05,
    'size': 100,
    'window': 5,
    'iter': 5,
    'min_count': 5,
    'sample': 1e-4,
    'sg': 1,
    'hs': 0,
    'negative': 5
}

model = Word2Vec(Text8Corpus('text8'), **params)
print(model)

from gensim.similarities.index import AnnoyIndexer

model.init_sims()
annoy_index = AnnoyIndexer(model, 100)


def GetWords(vector):
    return model.most_similar([vector], topn=5, indexer=annoy_index)


vector = model.wv.syn0norm[0]
GetWords(vector)
 def __iter__(self):
     corpus = Text8Corpus(self.fn)
     for doc in corpus:
         yield doc
Example #22
def disctest():
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size
    sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=30)
    modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size
    modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4)
    gloveA = Glove.load(modelA)
    gloveA.add_dictionary(corpus.dictionary)
    gloveB = Glove.load(modelB)
    gloveB.add_dictionary(corpus.dictionary)

    # glove embeddings
    gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :]
    gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :]

    with tf.Session() as sess:
        model = GAN('D_test',
                    FLAGS.size,
                    FLAGS.num_layers,
                    FLAGS.vocab_size,
                    _buckets,
                    FLAGS.feature_size,
                    FLAGS.baseline,
                    FLAGS.lr,
                    FLAGS.lr_decay,
                    FLAGS.grad_norm,
                    critic=None,
                    use_attn=FLAGS.use_attn,
                    output_sample=True,
                    input_embed=True,
                    batch_size=FLAGS.batch_size,
                    D_lambda=FLAGS.lambda_dis,
                    G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two),
                    dtype=tf.float32)

        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        print('read in model from {}'.format(ckpt.model_checkpoint_path))

        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        sys.stdout.write('> ')
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            if sentence.strip() == 'exit()':
                break
            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), vocab, normalize_digits=False)
            token_ids.append(data_utils.EOS_ID)
            encoder_pad = [data_utils.PAD_ID
                           ] * (_buckets[-1][0] - len(token_ids))
            encoder_lens = [len(token_ids)]
            token_ids = list(token_ids) + encoder_pad
            encoder_inputs = []
            for idx in token_ids:
                encoder_inputs.append([idx])

            rf, c = model.dynamic_decode_D(sess, encoder_inputs, encoder_lens,
                                           gloveA_emb)

            print('rf : ', rf)
            print('c  : ', c)

            sys.stdout.write('> ')
            sys.stdout.flush()
            sentence = sys.stdin.readline()
import logging
from gensim.models.word2vec import Word2Vec, Text8Corpus

# Enable logging.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Read data.
sentences = Text8Corpus('data/ja.text8')

# Train the model using Skip-gram.
model = Word2Vec(sentences, size=100, window=5, sg=1)
# Save the model.
model.save('models/model.bin')

####################################################
# Load the model.
model = Word2Vec.load('models/model.bin')

# 1. Get the word embedding.
ret1 = model.wv['猫']
print(ret1)
# [ 0.02126932  0.15553345  0.10472752  0.82008636  0.323413    0.42847344
#  -0.05886601 -0.03228    -0.49861073 -0.13823172  0.15907998 -0.2955121
#   0.15381564  0.02959213  0.45588103  0.04573615  0.10711562 -0.7498988
#   0.34765413  0.5668533   0.0082125   0.40620092  0.1419684  -0.15594704
#   0.2681074  -0.00587511 -0.17240909 -0.04313468  0.01801641 -0.08588244
#  -0.26890314 -0.58127177 -0.29637957  0.09391042  0.26176983 -0.09310274
#  -0.05240794  0.4360441   0.25374362  0.2856923  -0.09161343 -0.34498295
#   0.4329259   0.14602754  0.03789869 -0.16791926 -0.4877344   0.17858095
#   0.4094406  -0.0850195  -0.11097047 -0.22874318  0.20079853 -0.22009209
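For a quick sanity check of the trained Japanese vectors, the nearest neighbours of a word can be listed as well. A minimal sketch, assuming '猫' is frequent enough in ja.text8 to be kept in the vocabulary:

# 2. List the most similar words (by cosine similarity).
ret2 = model.wv.most_similar('猫', topn=5)
print(ret2)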
Example #24
from util.utils import get_logger

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = get_logger(__name__)


class Word2VecModel:
    def __init__(self):
        self.default_model_path = 'text8.model'
        self.model = None

    def get_model(self):
        if self.model:
            return self.model
        logger.info('Start loading word2vec model...')
        word2Vec = Word2Vec()
        self.model = word2Vec.load_word2vec_format(os.getenv('WORD2VEC_MODEL_PATH', self.default_model_path),
                                                   binary=True)
        # self.model = word2Vec.load(os.getenv('WORD2VEC_MODEL_PATH', self.default_model_path))
        self.model.init_sims(replace=True)
        logger.info('Finish loading word2vec model...')
        return self.model


if __name__ == '__main__':
    # test word2vec
    text8Corpus = Text8Corpus(fname='/home/diepdt/Documents/word2vec/text8')
    model = Word2Vec(text8Corpus, workers=4)
    model.save('text8.model')
    model.most_similar(['apple'])
Example #25
def train_GAN():
    os.environ['CUDA_VISIBLE_DEVICES'] = '1'
    from keras.backend.tensorflow_backend import set_session
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    set_session(tf.Session(config=config))

    if not os.path.exists(FLAGS.model_dir):
        os.makedirs(FLAGS.model_dir)

    if not os.path.exists(FLAGS.pretrain_dir):
        os.makedirs(FLAGS.pretrain_dir)

    if not os.path.exists(FLAGS.gan_dir):
        os.makedirs(FLAGS.gan_dir)

    def build_summaries():
        train_loss = tf.Variable(0.)
        tf.summary.scalar("train_loss", train_loss)
        summary_vars = [train_loss]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars


    feature, data, train, data_voc, train_voc = \
        data_utils.prepare_data(FLAGS.feature_path, FLAGS.feature_size, FLAGS.data_dir, \
                        FLAGS.data_path, FLAGS.train_path, FLAGS.vocab_size)

    vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
    glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size
    data_utils.combine_corpus(data_voc, train_voc, vocab_path,
                              glove_corpus_path, 28)
    sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=30)
    modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size
    modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4)

    if not os.path.exists(modelA):
        gloveA = Glove(no_components=FLAGS.size, learning_rate=0.05)
        gloveA.fit(corpus.matrix, epochs=300, no_threads=4, verbose=True)
        gloveA.add_dictionary(corpus.dictionary)
        gloveA.save(modelA)  # 512

    if not os.path.exists(modelB):
        gloveB = Glove(no_components=int(FLAGS.size * 3 / 4),
                       learning_rate=0.05)
        gloveB.fit(corpus.matrix, epochs=300, no_threads=4, verbose=True)
        gloveB.add_dictionary(corpus.dictionary)
        gloveB.save(modelB)  # 384

    gloveA = Glove.load(modelA)
    gloveA.add_dictionary(corpus.dictionary)
    gloveB = Glove.load(modelB)
    gloveB.add_dictionary(corpus.dictionary)

    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    with tf.Session() as sess:
        model = GAN('GAN',
                    FLAGS.size,
                    FLAGS.num_layers,
                    FLAGS.vocab_size,
                    _buckets,
                    FLAGS.feature_size,
                    FLAGS.baseline,
                    FLAGS.lr,
                    FLAGS.lr_decay,
                    FLAGS.grad_norm,
                    critic=None,
                    use_attn=FLAGS.use_attn,
                    output_sample=True,
                    input_embed=True,
                    batch_size=FLAGS.batch_size,
                    D_lambda=FLAGS.lambda_dis,
                    G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two),
                    dtype=tf.float32)
        # build summary and intialize
        summary_ops, summary_vars = build_summaries()
        sess.run(tf.variables_initializer(tf.global_variables()))
        log_dir = os.path.join(FLAGS.model_dir, 'log')
        writer = tf.summary.FileWriter(log_dir, sess.graph)
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('read in model from {}'.format(ckpt.model_checkpoint_path))
            model.saver.restore(sess, ckpt.model_checkpoint_path)

        # load in train and dev(valid) data with buckets
        train_set = read_data_with_buckets(train, FLAGS.max_train_data_size)
        data_set = read_data_with_buckets(data, FLAGS.max_train_data_size)

        train_buckets_sizes = [len(train_set[b]) for b in range(len(_buckets))]
        train_total_size = float(sum(train_buckets_sizes))
        print('each buckets has: {d}'.format(d=train_buckets_sizes))
        train_buckets_scale = [
            sum(train_buckets_sizes[:i + 1]) / train_total_size
            for i in range(len(train_buckets_sizes))
        ]

        # main process
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []

        # glove embeddings
        gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :]
        gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :]
        ### ------------------------------------------------------------ ###
        ###                           Pretrain                           ###
        ### ------------------------------------------------------------ ###
        while True:
            # get batch from a random selected bucket
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])  # random pick bucket

            # get batch for the pretraining data
            feature_inputs_f, encoder_inputs_f, decoder_inputs_f, weights_f, seq_lens_f, _,  \
            feature_inputs_b, encoder_inputs_b, decoder_inputs_b, weights_b, seq_lens_b, _,  = \
                get_batch_with_buckets(FLAGS.feature_size, data_set, FLAGS.batch_size, bucket_id)

            # pretrain start !
            start_time = time.time()
            forloss, _ , _, _ = model.train_previous(sess, encoder_inputs_f, feature_inputs_f, \
                                                decoder_inputs_f, weights_f, encoder_inputs_b, \
                                                feature_inputs_b, decoder_inputs_b, weights_b, \
                                                bucket_id, gloveA_emb, gloveB_emb, seq_lens_f, seq_lens_b)
            step_loss = forloss
            step_time += (time.time() -
                          start_time) / FLAGS.steps_per_checkpoint
            loss += np.mean(step_loss) / FLAGS.steps_per_checkpoint / (
                FLAGS.Gstep * 2 + FLAGS.Dstep + 1)
            #print('pretrain : ',step_loss)
            ### ------------------------------------------------------------ ###
            ###                         Train GAN                            ###
            ### ------------------------------------------------------------ ###
            for _ in range(FLAGS.Dstep):
                # get batch from a random selected bucket
                random_number_01 = np.random.random_sample()
                bucket_id = min([
                    i for i in range(len(train_buckets_scale))
                    if train_buckets_scale[i] > random_number_01
                ])  # random pick bucket

                # get batch for the pretraining data
                feature_inputs_f, encoder_inputs_f, decoder_inputs_f, seq_lens_f, \
                feature_inputs_b, decoder_inputs_b, weights_b, \
                real_inputs, real_feature , real_seq_lens= \
                    get_gan_data(feature, FLAGS.feature_size, train_set, FLAGS.batch_size, bucket_id)

                # D_step start !
                start_time = time.time()
                _, D_loss = model.train_gan(sess, encoder_inputs_f, decoder_inputs_f, feature_inputs_f, \
                                                decoder_inputs_b, weights_b, feature_inputs_b, \
                                                real_inputs, real_feature, bucket_id, gloveA_emb, gloveB_emb, \
                                                disc = True,real_seq_len=real_seq_lens, forward_seq_len=seq_lens_f)
                step_loss = D_loss
                step_time += (time.time() -
                              start_time) / FLAGS.steps_per_checkpoint
                loss += np.mean(step_loss) / FLAGS.steps_per_checkpoint / (
                    FLAGS.Gstep * 2 + FLAGS.Dstep + 1)
                #print('D_step : ', step_loss)
            for _ in range(FLAGS.Gstep):
                # get batch from a random selected bucket
                random_number_01 = np.random.random_sample()
                bucket_id = min([
                    i for i in range(len(train_buckets_scale))
                    if train_buckets_scale[i] > random_number_01
                ])  # random pick bucket

                # get batch for the pretraining data
                feature_inputs_f, encoder_inputs_f, decoder_inputs_f, seq_lens_f, \
                feature_inputs_b, decoder_inputs_b, weights_b, \
                real_inputs, real_feature, real_seq_lens = \
                    get_gan_data(feature, FLAGS.feature_size, train_set, FLAGS.batch_size, bucket_id)

                # G_step start !
                start_time = time.time()
                _, for_reward = model.train_gan(sess, encoder_inputs_f, decoder_inputs_f, feature_inputs_f, \
                                                decoder_inputs_b, weights_b, feature_inputs_b, \
                                                real_inputs, real_feature, bucket_id, gloveA_emb, gloveB_emb, \
                                                forward = True,real_seq_len=real_seq_lens , forward_seq_len=seq_lens_f)

                step_loss = for_reward
                step_time += (time.time() -
                              start_time) / FLAGS.steps_per_checkpoint
                loss += np.mean(step_loss) / FLAGS.steps_per_checkpoint / (
                    FLAGS.Gstep * 2 + FLAGS.Dstep + 1)
                #print('for_loss :', step_loss)
                # get batch from a random selected bucket
                random_number_01 = np.random.random_sample()
                bucket_id = min([
                    i for i in range(len(train_buckets_scale))
                    if train_buckets_scale[i] > random_number_01
                ])  # random pick bucket

                # get batch for the pretraining data
                feature_inputs_f, encoder_inputs_f, decoder_inputs_f, seq_lens_f, \
                feature_inputs_b, decoder_inputs_b, weights_b, \
                real_inputs, real_feature, real_seq_lens = \
                    get_gan_data(feature, FLAGS.feature_size, train_set, FLAGS.batch_size, bucket_id)

                # G_step start !
                start_time = time.time()
                _, back_reward = model.train_gan(sess, encoder_inputs_f, decoder_inputs_f, feature_inputs_f, \
                                                decoder_inputs_b, weights_b, feature_inputs_b, \
                                                real_inputs, real_feature, bucket_id, gloveA_emb, gloveB_emb, \
                                                backward = True,real_seq_len=real_seq_lens , forward_seq_len=seq_lens_f)

                step_loss = back_reward
                step_time += (time.time() -
                              start_time) / FLAGS.steps_per_checkpoint
                loss += np.mean(step_loss) / FLAGS.steps_per_checkpoint / (
                    FLAGS.Gstep * 2 + FLAGS.Dstep + 1)
                #print('back_loss :', step_loss)
            current_step += 1
            # log, save and eval
            if current_step % FLAGS.steps_per_checkpoint == 0:
                perplexity = math.exp(
                    float(loss)) if loss < 300 else float('inf')
                print(
                    "Generator step %d; learning rate %.4f; learning_rate_star %.6f; D_lr %6f; step-time %.2f; perplexity "
                    "%.2f; loss %.2f" %
                    (model.global_F_step.eval(), model.learning_rate.eval(),
                     model.learning_rate_star.eval(), model.D_lr.eval(),
                     step_time, perplexity, loss))
                # Decay the learning rates if no improvement was seen over the last 3 checkpoints.
                if len(previous_losses) > 2 and loss > max(
                        previous_losses[-3:]):
                    sess.run(model.op_lr_decay)
                    sess.run(model.op_D_lr_decay)
                    sess.run(model.learning_rate_star_decay)
                previous_losses.append(loss)

                # write summary
                feed_dict = {}
                feed_dict[summary_vars[0]] = loss
                summary_str = sess.run(summary_ops, feed_dict=feed_dict)
                writer.add_summary(summary_str, model.global_F_step.eval())
                writer.flush()
                # Save checkpoint and zero timer and loss.
                ckpt_path = os.path.join(FLAGS.model_dir, "ckpt")
                model.saver.save(sess,
                                 ckpt_path,
                                 global_step=model.global_F_step)

                gan_path = os.path.join(FLAGS.gan_dir, "ckpt_prev")
                model.saver.save(sess,
                                 gan_path,
                                 global_step=model.global_F_step)
                step_time, loss = 0.0, 0.0

                sys.stdout.flush()
Example #26
    def train(self, pathCorpus, epochs=60, compute_loss=True):
        sentences = Text8Corpus(datapath(pathCorpus))
        self.model.train(sentences,
                         epochs=epochs,
                         total_examples=self.model.corpus_count,
                         compute_loss=compute_loss)
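A rough sketch (not part of the original example) of how this method might be wired up, assuming the surrounding class keeps a gensim Word2Vec in self.model and builds its vocabulary from the same corpus before train() is called; the class name W2VTrainer and the corpus file name are illustrative only.

from gensim.models import Word2Vec
from gensim.models.word2vec import Text8Corpus
from gensim.test.utils import datapath

class W2VTrainer:
    def __init__(self, pathCorpus, size=100, window=5, min_count=1, workers=4):
        sentences = Text8Corpus(datapath(pathCorpus))
        self.model = Word2Vec(size=size, window=window,
                              min_count=min_count, workers=workers)
        # build_vocab scans the corpus once and sets self.model.corpus_count,
        # which train() relies on below
        self.model.build_vocab(sentences)

    def train(self, pathCorpus, epochs=60, compute_loss=True):
        sentences = Text8Corpus(datapath(pathCorpus))
        self.model.train(sentences,
                         epochs=epochs,
                         total_examples=self.model.corpus_count,
                         compute_loss=compute_loss)

# usage (the corpus file name is hypothetical):
# trainer = W2VTrainer('my_corpus.txt')
# trainer.train('my_corpus.txt', epochs=5)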
Example #27
def filetest():
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size
    sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=30)
    modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size
    modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4)
    gloveA = Glove.load(modelA)
    gloveA.add_dictionary(corpus.dictionary)
    gloveB = Glove.load(modelB)
    gloveB.add_dictionary(corpus.dictionary)

    # glove embeddings
    gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :]
    gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :]

    if not os.path.exists('./fileTest_log/'):
        os.makedirs('./fileTest_log/')

    with tf.Session() as sess:
        model = GAN('G_test',
                    FLAGS.size,
                    FLAGS.num_layers,
                    FLAGS.vocab_size,
                    _buckets,
                    FLAGS.feature_size,
                    FLAGS.baseline,
                    FLAGS.lr,
                    FLAGS.lr_decay,
                    FLAGS.grad_norm,
                    critic=None,
                    use_attn=FLAGS.use_attn,
                    output_sample=True,
                    input_embed=True,
                    batch_size=FLAGS.batch_size,
                    D_lambda=FLAGS.lambda_dis,
                    G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two),
                    dtype=tf.float32)

        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        print('read in model from {}'.format(ckpt.model_checkpoint_path))

        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        with open(FLAGS.test_path, 'r') as sentences:
            # number of input lines processed so far
            number = 0
            feature = []
            output_file = []
            for idx in range(FLAGS.feature_size):
                output_file.append('./fileTest_log/%s.txt' % idx)

            output_list = []
            for f in range(FLAGS.feature_size):
                feature.append(
                    [[3 if x == f else 0 for x in range(FLAGS.feature_size)]])
                output_list.append([])

            for id, sentence in enumerate(sentences.readlines()):
                token_ids = data_utils.sentence_to_token_ids(
                    tf.compat.as_bytes(sentence),
                    vocab,
                    normalize_digits=False)
                token_ids.append(data_utils.EOS_ID)
                if len(token_ids) > _buckets[-1][0]:
                    continue
                encoder_pad = [data_utils.PAD_ID
                               ] * (_buckets[-1][0] - len(token_ids))
                encoder_lens = [len(token_ids)]
                token_ids = list(token_ids) + encoder_pad
                encoder_inputs = [[idx] for idx in token_ids]
                decoder_inputs = [[data_utils.GO_ID]]

                for x in range(FLAGS.feature_size):
                    A, outputs, log_prob = model.dynamic_decode_G(sess, encoder_inputs, encoder_lens, \
                                                             decoder_inputs, feature[x], gloveA_emb, gloveB_emb)
                    outputs = [output_ids[0] for output_ids in outputs]
                    if data_utils.EOS_ID in outputs:
                        outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                    output_list[x].append(" ".join([
                        tf.compat.as_str(rev_vocab[output])
                        for output in outputs
                    ]))

                if number % 1000 == 0:
                    print('fileTest: generating line %d' % number)
                if number == 1000:
                    break
                number += 1
            for x in range(FLAGS.feature_size):
                with open(output_file[x], 'w') as op:
                    for line in output_list[x]:
                        op.write(line)
                        op.write('\n')
#model.wv.save_word2vec_format('article_vec','article_vocab')

# sentences = LineSentence('new_data/word_seg')
# model = Word2Vec(sentences,min_count=5, size=200,window=5, negative=5, sg=1,
#                  hs=0, iter=1, workers=4 )
# model.save("word_seg_word2vec.model")
# model.get_latest_training_loss()
# save the vocabulary
#model.wv.save_word2vec_format('word_seg_vec','word_seg_vocab')

# similarity search

# ---- example using the text8 file ----

# load the data in the format the model expects
sentences = Text8Corpus('new_data/text8')
# build the model and train it
model = Word2Vec(sentences,
                 size=200,
                 window=5,
                 min_count=5,
                 workers=4,
                 sg=1,
                 hs=0,
                 negative=5,
                 iter=1)
# save the model
model.save("text8_word2vec.model")
# look up the vector of a single word
print(model.wv['computer'])
# similarity search to check the quality of the embeddings
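A small sketch of the similarity checks the comment above refers to; most_similar, similarity and doesnt_match are standard gensim KeyedVectors calls, and the query words are only examples that happen to occur in text8.

# words most similar to 'computer' by cosine similarity
print(model.wv.most_similar('computer', topn=10))
# cosine similarity between two specific words
print(model.wv.similarity('computer', 'software'))
# which word does not belong with the others
print(model.wv.doesnt_match(['computer', 'software', 'hardware', 'banana']))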
Example #29
def maxbleutest():
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
    glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size
    sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=30)
    modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size
    modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4)
    if not os.path.exists("./maxBleu_log/"):
        os.makedirs("./maxBleu_log/")

    gloveA = Glove.load(modelA)
    gloveA.add_dictionary(corpus.dictionary)
    gloveB = Glove.load(modelB)
    gloveB.add_dictionary(corpus.dictionary)

    # glove embeddings
    gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :]
    gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :]

    with tf.Session() as sess:
        model = GAN('G_test',
                    FLAGS.size,
                    FLAGS.num_layers,
                    FLAGS.vocab_size,
                    _buckets,
                    FLAGS.feature_size,
                    FLAGS.baseline,
                    FLAGS.lr,
                    FLAGS.lr_decay,
                    FLAGS.grad_norm,
                    critic=None,
                    use_attn=FLAGS.use_attn,
                    output_sample=True,
                    input_embed=True,
                    batch_size=FLAGS.batch_size,
                    D_lambda=FLAGS.lambda_dis,
                    G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two),
                    dtype=tf.float32)

        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        print('read in model from {}'.format(ckpt.model_checkpoint_path))

        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)
        group_data, _ = data_utils.training_data_grouping(
            FLAGS.train_path, FLAGS.feature_path, FLAGS.feature_size)

        with open("./maxBleu_log/" + FLAGS.file_head + "_ref_msg.txt",
                  'w') as fop:
            with open(FLAGS.test_path, 'r') as sentences:
                number = 0
                feature, output_file, output_list = [], [], []
                for i in range(FLAGS.feature_size):
                    output_file.append("./maxBleu_log/" + FLAGS.file_head +
                                       "_{}.txt".format(i))
                    feature.append([[
                        3 if x == i else 0 for x in range(FLAGS.feature_size)
                    ]])
                    output_list.append([])

                for sentence in sentences.readlines():
                    if number % 100 == 0:
                        print("maxBleuTesting: parsing line {}".format(number))

                    if number == 500:
                        break

                    token_ids = data_utils.sentence_to_token_ids(
                        tf.compat.as_bytes(sentence),
                        vocab,
                        normalize_digits=False)
                    token_ids.append(data_utils.EOS_ID)
                    if len(token_ids) > _buckets[-1][0]:
                        continue
                    encoder_pad = [data_utils.PAD_ID
                                   ] * (_buckets[-1][0] - len(token_ids))
                    encoder_lens = [len(token_ids)]
                    token_ids = list(token_ids) + encoder_pad
                    encoder_inputs = [[idx] for idx in token_ids]
                    decoder_inputs = [[data_utils.GO_ID]]

                    check = False
                    outputs_list = []
                    for x in range(FLAGS.feature_size):
                        A, outputs, log_prob = model.dynamic_decode_G(sess, encoder_inputs, encoder_lens, \
                                                                decoder_inputs, feature[x], gloveA_emb, gloveB_emb)
                        outputs = [output_ids[0] for output_ids in outputs]
                        if data_utils.EOS_ID in outputs:
                            outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                        if data_utils.UNK_ID in outputs:
                            check = True
                            break
                        outputs_list.append(" ".join([
                            tf.compat.as_str(rev_vocab[output])
                            for output in outputs
                        ]))

                    if check:
                        continue
                    else:
                        number += 1
                        fop.write(sentence)
                        for x in range(FLAGS.feature_size):
                            output_list[x].append(outputs_list[x])

            for x in range(FLAGS.feature_size):
                with open(output_file[x], 'w') as op:
                    pos_lst = []
                    for i, line in enumerate(output_list[x]):
                        op.write(line)
                        op.write("\n")
                        grps_bleu = []
                        for _, sentence_lst in group_data.items():
                            bleu = -1
                            for ref in sentence_lst:
                                score = sentence_bleu(ref, line)
                                if score > bleu:
                                    bleu = score
                            grps_bleu.append(bleu)

                        max_bleu = max(grps_bleu)
                        # predicted persona: indices of the groups that achieve the maximum BLEU
                        max_candidates = [
                            i for i, x in enumerate(grps_bleu) if x == max_bleu
                        ]
                        if len(max_candidates) > 1:
                            pos_lst.append(-1)
                        else:
                            pos_lst.append(max_candidates[0])

                    tranc_pos_lst = [
                        pos_lst[i] for i in range(len(pos_lst))
                        if pos_lst[i] != -1
                    ]
                    op.write("MaxBLEU Acc: {}".format(
                        tranc_pos_lst.count(x) / len(tranc_pos_lst)))
Example #30
from gensim.models.word2vec import Word2Vec, BrownCorpus, Text8Corpus
import os, pdb, itertools

word_count = 0
sent_count = 0
for f in os.listdir("./rawdata/training-monolingual.tokenized.shuffled"):
	if not os.path.isfile("./rawdata/training-monolingual.tokenized.shuffled/" + f):
		continue
	with open("rawdata/training-monolingual.tokenized.shuffled/"+f,"r") as file:
		for line in file:
			sent_count += 1		
			word_count += len(line.strip().split(" "))
# count sentences and words in the Brown and text8 corpora as well,
# since total_examples / total_words must be provided when training
brown = BrownCorpus("/home/david/nltk_data/corpora/brown/")
text8 = Text8Corpus("./rawdata/text8")
sent = itertools.chain(text8, brown)
for snt in sent:
	sent_count += 1 
	word_count += len(snt)


print "SENTENCE COUNT " + str(sent_count)
print "WORD COUNT " + str(word_count)