Example No. 1
def word_embedding(sentences,embedding_size,windows_len):
    """
    Build the co-occurrence matrix for `sentences` and fit a GloVe model,
    returning the model together with the word->id and id->word mappings.
    """

    corpus_model = Corpus()

    corpus_model.fit(sentences,window=windows_len)

    glove_model = Glove(no_components=embedding_size, learning_rate=0.05)
    glove_model.fit(corpus_model.matrix,
                    epochs=10,      # epochs=0 would leave the vectors untrained
                    no_threads=2)

    corpus_dict=corpus_model.dictionary
    corpus_inverse_dict=dict(map(reversed, corpus_dict.items()))

    return glove_model,corpus_dict,corpus_inverse_dict
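
A minimal usage sketch for the helper above, with hypothetical toy sentences and sizes:

sentences = [['the', 'quick', 'brown', 'fox'],
             ['the', 'lazy', 'dog']]

model, word2id, id2word = word_embedding(sentences, embedding_size=30, windows_len=5)
print(word2id['fox'], id2word[word2id['fox']])
print(model.word_vectors[word2id['fox']].shape)  # (30,)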
Example No. 2
def test_supplied_dict_missing():

    dictionary = {'a': 1,
                  'naïve': 0}

    corpus = [['a', 'naïve', 'fox']]

    model = Corpus(dictionary=dictionary)

    # 'fox' is missing from the supplied dictionary, so fitting should fail
    with pytest.raises(KeyError):
        model.fit(corpus, max_map_size=0, window=10)
Example No. 3
def pretrain(self, data_src):
    if not os.path.isfile("glove.model"):
        data_src = DataClean([
                            ["[^a-z]", " "],  # only letters
                            [" [ ]+", " "],   # remove extra spaces
                            ], html_clean=True, split_words=True).fit(data_src).transform(data_src)
        corpus_model = Corpus()
        corpus_model.fit(data_src, window=self.window)
        glove = Glove(no_components=self.num_features, learning_rate=self.learning_rate)
        glove.fit(corpus_model.matrix, epochs=self.epochs, verbose=True)
        glove.add_dictionary(corpus_model.dictionary)
        glove.save("glove.model")
def main():
    corpus_model = Corpus.load('bioc-corpus-AZ2.model')
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=16, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('bioc-glove-AZ2.model')
def train_glove(sentences):
    print 'training glove model...'
    t0 = time()
    
    num_features = 300    # Word vector dimensionality
    context = 5          # Context window size
    learning_rate = 0.05
    
    corpus = Corpus()
    corpus.fit(sentences, window=context)

    glove = Glove(no_components=num_features, learning_rate=learning_rate)
    glove.fit(corpus.matrix, epochs=30, no_threads=8, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    print 'took %0.5fs.' % (time() - t0)
    return glove
Example No. 6
def test_supplied_dictionary():

    dictionary = {'a': 2,
                  'naïve': 1,
                  'fox': 0}

    corpus = [['a', 'naïve', 'fox']]

    model = Corpus(dictionary=dictionary)
    model.fit(corpus, max_map_size=0, window=10)

    assert model.dictionary == dictionary

    assert model.matrix.shape == (len(dictionary),
                                  len(dictionary))

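    # co-occurrence counts are stored in the upper triangle (row = smaller id),
    # so the row for the largest id ('a' == 2) must be empty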
    assert (model.matrix.tocsr()[2]).sum() == 0
Example No. 7
    def run_glove(self):
        """ run global vector """
        #sentences = [["hi","good","to"],["see","u"]]
        sentences = self.get_sentences()

        print '\n' + '-'*80
        print "Fitting words into corpus"
        corpus = Corpus()
        corpus.fit(sentences, window=10)

        print "Running Glove"
        glove = Glove(no_components=200, learning_rate=0.05)
        glove.fit(corpus.matrix, epochs=5, no_threads=10, verbose=True)
        glove.add_dictionary(corpus.dictionary)

        print "Fitting words and vectors into unique_words and vectors200"
        unique_words = []
        vectors200 = []

        cnt1 = 0
        length1 = len(glove.inverse_dictionary)
        for word_id in glove.inverse_dictionary:
            cnt1 += 1
            unique_words.append(glove.inverse_dictionary[word_id])
            vectors200.append(glove.word_vectors[word_id])

            sys.stdout.write("\rStatus: %s / %s"%(cnt1, length1))
            sys.stdout.flush()

        print '\n' + "Processing vectors200"
        processed_vectors200 = []

        cnt2 = 0
        length2 = len(vectors200)
        for vector in vectors200:
            cnt2 += 1
            processed_vector = []  # start a fresh list for each word vector
            for float_num in vector:
                processed_vector.append(float_num)

            processed_vectors200.append(processed_vector)

            sys.stdout.write("\rStatus: %s / %s"%(cnt2, length2))
            sys.stdout.flush()

        return unique_words, processed_vectors200
Example No. 8
def test_corpus_construction():

    corpus_words = ['a', 'naïve', 'fox']
    corpus = [corpus_words]

    model = Corpus()
    model.fit(corpus, max_map_size=0, window=10)

    for word in corpus_words:
        assert word in model.dictionary

    assert model.matrix.shape == (len(corpus_words),
                                  len(corpus_words))

    expected = [[0.0, 1.0, 0.5],
                [0.0, 0.0, 1.0],
                [0.0, 0.0, 0.0]]

    assert (model.matrix.todense().tolist()
            == expected)
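
The expected values follow from the distance weighting applied inside the context window: a co-occurrence between words d tokens apart contributes 1/d, and counts are stored in the upper triangle (smaller id as the row). A small sketch that recomputes the matrix for this single sentence, assuming that scheme:

import numpy as np

sentence = ['a', 'naïve', 'fox']
ids = {w: i for i, w in enumerate(sentence)}

matrix = np.zeros((len(ids), len(ids)))
for i, left in enumerate(sentence):
    # every word to the right of `left` (within the window) contributes 1/distance
    for distance, right in enumerate(sentence[i + 1:], start=1):
        row, col = sorted((ids[left], ids[right]))
        matrix[row, col] += 1.0 / distance

print(matrix.tolist())  # [[0.0, 1.0, 0.5], [0.0, 0.0, 1.0], [0.0, 0.0, 0.0]]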
Example No. 9
def build_model_glove(args):

    from glove import Glove, Corpus

    if not os.path.exists(args.corpus_model) or \
            max(map(os.path.getmtime, args.input)) >= os.path.getmtime(args.corpus_model):

        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')

        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(args.corpus_model)

        logging.info('Dict size: %s' % len(corpus_model.dictionary))
        logging.info('Collocations: %s' % corpus_model.matrix.nnz)
    else:
        # Try to load a corpus from disk.
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(args.corpus_model)

        logging.info('Dict size: %s' % len(corpus_model.dictionary))
        logging.info('Collocations: %s' % corpus_model.matrix.nnz)

    # Train the GloVe model and save it to disk.
    logging.info('Training the GloVe model')

    glove = Glove(no_components=CONFIG['glove']['size'], learning_rate=CONFIG['glove']['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=CONFIG['glove']['epochs'],
              no_threads=args.workers, verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
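
build_model_glove reads its hyperparameters from a CONFIG mapping that is not shown in this snippet; a hypothetical configuration matching the keys it accesses might look like:

# hypothetical values; only the keys mirror what build_model_glove reads
CONFIG = {
    'glove': {
        'window': 10,
        'size': 100,
        'learning_rate': 0.05,
        'epochs': 30,
    },
}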
def build_glove_embeddings(training, testing, args):
    
    ''' Trains a GloVe model on the sentiment140 dataset

    @Arguments:
        training: iterable of (text, label) pairs used for training
        testing: iterable of (text, label) pairs used for testing
        args: parsed options providing vecsize, learningRate, window,
            epochs, parallelism, verbose and pca

    @Return:
        Training and testing feature matrices in scikit-learn format
    '''
        
    # initialize model
    glove = Glove(no_components = args.vecsize, learning_rate = args.learningRate)
    
    txtSource = chain( imap(lambda (txt,lbl): txt, training), imap(lambda (txt,lbl): txt, testing))
    
    # read in the data to train on
    corpus_model = Corpus()
    corpus_model.fit( imap(preprocess.tokenize, txtSource), window = args.window)
        
    # fit the model using the given parameters
    logging.info("Training GloVe")
    glove.fit(corpus_model.matrix, epochs = args.epochs, no_threads = args.parallelism, verbose = args.verbose)
    
    # add a dictionary just to make it easier for similarity queries
    glove.add_dictionary(corpus_model.dictionary)
    
    transformer = lambda words: glove.transform_paragraph(words, use_pca = args.pca)

    fromTraining = to_sklearn_format(transformer, training, args.vecsize)
    fromTesting = to_sklearn_format(transformer, testing, args.vecsize)
    
    return fromTraining, fromTesting
Example No. 11
def test_fitting():
    """
    Verify that the square error diminishes with fitting
    """

    num_sentences = 5000
    seed = 10

    corpus = Corpus()

    corpus.fit(generate_training_corpus(num_sentences,
                                        vocabulary_size=50,
                                        seed=seed))

    # Check that the performance is poor without fitting
    glove_model = Glove(no_components=100, learning_rate=0.05)
    glove_model.fit(corpus.matrix,
                    epochs=0,
                    no_threads=2)

    log_cooc_mat = corpus.matrix.copy()
    log_cooc_mat.data = np.log(log_cooc_mat.data)
    log_cooc_mat = np.asarray(log_cooc_mat.todense())

    repr_matrix = _reproduce_input_matrix(glove_model)

    assert ((repr_matrix - log_cooc_mat) ** 2).sum() > 30000.0

    # Check that it is good with fitting
    glove_model = Glove(no_components=100, learning_rate=0.05)
    glove_model.fit(corpus.matrix,
                    epochs=500,
                    no_threads=2)

    repr_matrix = _reproduce_input_matrix(glove_model)

    assert ((repr_matrix - log_cooc_mat) ** 2).sum() < 1500.0
Example No. 12
def test_supplied_dict_missing_ignored():

    dictionary = {'a': 0,
                  'fox': 1}

    corpus = [['a', 'naïve', 'fox']]

    model = Corpus(dictionary=dictionary)
    model.fit(corpus, max_map_size=0, window=10, ignore_missing=True)

    assert model.dictionary == dictionary

    assert model.matrix.shape == (len(dictionary),
                                  len(dictionary))

    # Ensure that context windows and context window
    # weights are preserved. 
    full_model = Corpus()
    full_model.fit(corpus, window=10)

    assert (full_model.matrix.todense()[0, 2]
            == model.matrix.todense()[0, 1]
            == 0.5)
Example No. 13
                        default='',
                        help='Get closest words to this word.')
    args = parser.parse_args()


    if args.create:
        # Build the corpus dictionary and the cooccurrence matrix.
        print('Pre-processing corpus')

        if args.wiki:
            print('Using wikipedia corpus')
            get_data = read_wikipedia_corpus
        else:
            get_data = read_corpus

        corpus_model = Corpus()
        corpus_model.fit(get_data(args.create), window=10)
        corpus_model.save('corpus.model')
        
        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)

    if args.train:
        # Train the GloVe model and save it to disk.

        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_model = Corpus.load('corpus.model')

            print('Dict size: %s' % len(corpus_model.dictionary))
Example No. 14
def glove(docsents,
          n_dim,
          random_state=0,
          min_tf=1,
          docnames=None,
          keywords=None,
          **kwargs):
    '''
        Interface for the GloVe algorithm using the python-glove package.

        Output: Returns DocModel object containing the results of the word embedding.

        Inputs:
            docsents: Iterable of document token iterables (nested by document, will
                be flattened).
            n_dim: Number of dimensions to use in the embedding model.
            random_state: Integer for seeding the random generator. Allows
                for making reproducible embedding models.
            min_tf: Minimum number of times a token must appear in
                the corpus to be added to the model.
            docnames: document names that will appear in DocModel
                for convenience.
            keywords: list of lists of words that will form the basis of a
                hyper-rotated version of a new embedding model. The list
                [['hat','dog'],['cat',]] would orient the first (arbitrary)
                dimension of the vector space to hat + dog, and the second
                to the vector for cat minus its component along hat + dog to
                preserve orthogonality.
            **kwargs: other keyword arguments fed directly into the
                Glove.fit call.
    '''

    # count frequencies
    fdist = Counter([w for s in pretendsents(docsents) for w in s])
    sfdist = list(sorted(fdist.items(), key=lambda x: x[1], reverse=True))
    dictionary = {wf[0]: i for i, wf in enumerate(sfdist)}
    cutoff = calc_cutoffind([f for w, f in sfdist], min_tf)

    # calculate corpus matrix
    corpus = Corpus(dictionary=dictionary)
    corpus.fit(pretendsents(docsents),
               window=10)  # GloVe found that bigger windows helped
    corpus.matrix = corpus.matrix.tocsr()[:cutoff, :cutoff].tocoo()

    # train glove model
    glove = Glove(no_components=n_dim,
                  learning_rate=0.05,
                  random_state=random_state)
    glove.fit(corpus.matrix, **kwargs)

    # modify dictionary after cutoff applied
    cutoff_dictionary = {
        wf[0]: i
        for i, wf in enumerate(sfdist) if wf[1] > min_tf
    }
    glove.add_dictionary(cutoff_dictionary)
    vocab = [w for w, f in sfdist if f > min_tf]

    # if keywords provided, transform vector space to new basis based on keywords
    if keywords is not None:
        glove = supervised_vectors(glove, keywords)

    # transform documents to single vectors
    transpar = lambda doc: glove_transform_paragraph(
        glove, doc, ignore_missing=True)
    docvectors = [transpar(doc) for doc in pretenddocs(docsents)]

    # words associated with each dimension of the embedding space
    dimwords = np.zeros((n_dim, len(vocab)))
    for dim in range(n_dim):
        # create natural basis unit vector
        e = np.zeros(n_dim)
        e[dim] = 1
        for i, w in enumerate(vocab):
            dimwords[dim, i] = glove_projection(glove, w, e)

    return DocModel(np.vstack(docvectors), dimwords, vocab, docnames=docnames)
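
The supervised_vectors helper used above is not shown; the hyper-rotation described in the docstring amounts to Gram-Schmidt over the summed keyword vectors, so a minimal sketch of that basis construction (a hypothetical helper, assuming the model exposes word_vectors and dictionary) could be:

import numpy as np

def keyword_basis(glove, keywords):
    """Orthonormal directions built from keyword groups, e.g. [['hat', 'dog'], ['cat']]."""
    basis = []
    for group in keywords:
        # sum the vectors of the group's words (e.g. hat + dog)
        v = np.sum([glove.word_vectors[glove.dictionary[w]] for w in group], axis=0)
        # subtract the components along directions already chosen, preserving orthogonality
        for b in basis:
            v = v - np.dot(v, b) * b
        basis.append(v / np.linalg.norm(v))
    return np.array(basis)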
Example No. 15
from glove import Glove
from glove import Corpus
from gensim import corpora
import time

dic_file=r'/home/dannl/tmp/newstech/glove/news.dic'
corpus_file='/home/dannl/tmp/newstech/news.txt'
cooc_file='/home/dannl/tmp/newstech/glove/word.cooc'

def read_corpus(filename):
    with open(filename, 'r') as datafile:
        for line in datafile:
            yield line.split()[1:]

# get a cooccurrence matrix
oldtime=time.time()
dictionary = corpora.Dictionary.load(dic_file)

# corpus_cooc = Corpus()
# corpus_cooc.fit(read_corpus(corpus_file), window=10)

corpus_cooc = Corpus(dictionary=dictionary.token2id)
corpus_cooc.fit(read_corpus(corpus_file), window=10,ignore_missing=True)
corpus_cooc.save(cooc_file)

print('Dict size: %s' % len(corpus_cooc.dictionary))
print('Collocations: %s' % corpus_cooc.matrix.nnz)

print 'time cost:%.2f'%(time.time()-oldtime,)
Example No. 16
#-*- coding:utf-8 -*-
'''
Created on 2016-3-12

@author: dannl
'''
from glove import Glove
from glove import Corpus
import time

cooc_file='/home/dannl/tmp/newstech/glove/word.cooc'
model_file='/home/dannl/tmp/newstech/glove/glove.model'

oldtime=time.time()
# load the saved co-occurrence matrix
corpus_cooc = Corpus.load(cooc_file)

# get a model
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus_cooc.matrix, epochs=5,no_threads=4, verbose=True)
glove.add_dictionary(corpus_cooc.dictionary)
glove.save(model_file)

# count=0
# for word,wid in corpus_cooc.dictionary.items():
#     count+=1
#     if count>100:
#         break
#     print word,wid
    
print('Dict size: %s' % len(corpus_cooc.dictionary))
Example No. 17
from __future__ import print_function
import argparse
import pprint
# import gensim
from glove import Glove
from glove import Corpus

sentences = [['你', '是', '谁'], ['我', '是', '中国人']]
corpus_model = Corpus()
corpus_model.fit(sentences, window=10)
#corpus_model.save('corpus.model')
print('Dict size: %s' % len(corpus_model.dictionary))
print(corpus_model.dictionary)
print('Collocations: %s' % corpus_model.matrix.nnz)
print(corpus_model.matrix)
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=10,
          no_threads=1, verbose=True)
glove.add_dictionary(corpus_model.dictionary)
glove.save('glove.model')
glove = Glove.load('glove.model')
corpus_model.save('corpus.model')
corpus_model = Corpus.load('corpus.model')
print("most similar to 我")
print(glove.most_similar('我', number=10))
# the full matrix of word vectors
print(glove.word_vectors)
# the word vector for a specific token

print("你")
print(glove.word_vectors[glove.dictionary['你']])
Example No. 18
wordnet_lemmatizer = WordNetLemmatizer()
lines_with_lemmas = []

#stop words contain the set of stop words

for line in lines:
    # lemmatize every word of the current line (split first if it is still a raw string)
    words = line.split() if isinstance(line, str) else line
    temp_line = [wordnet_lemmatizer.lemmatize(word) for word in words]
    lines_with_lemmas.append(temp_line)

# Corpus.fit below expects an iterable of token lists, so keep the lemmas as lists
lines = lines_with_lemmas

# creating a corpus object
corpus = Corpus()
# training the corpus to generate the co-occurrence matrix which is used in GloVe
corpus.fit(lines, window=10)
# creating a Glove object which will use the matrix created above to build the embeddings
# we can set the learning rate (it uses gradient descent) and the number of components
glove = Glove(no_components=5, learning_rate=0.05)

glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')




def main():
    corpus_model = Corpus()
    corpus_model.fit(itertexts(), window=10, max_map_size=1000000)
    corpus_model.save('bioc-corpus-AZ2.model')
Example No. 20
def test_supplied_dict_checks():

    dictionary = {'a': 4, 'naïve': 1, 'fox': 0}

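    # the largest id (4) is not len(dictionary) - 1, so the constructor should reject it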
    with pytest.raises(Exception):
        Corpus(dictionary=dictionary)
data_all = pd.concat([train, test])

len_train = train.shape[0]

qs = []
ts = []
ds = []
sentences = []
for q, t in zip(data_all['question1'].values.tolist(),
                data_all['question2'].values.tolist()):
    sentences.append(q.split(' '))
    sentences.append(t.split(' '))
    qs.append(q.split(' '))
    ts.append(t.split(' '))

corpus_model = Corpus()
corpus_model.fit(sentences, window=10)
corpus_model.save(path + 'corpus.mdl')

corpus_model = Corpus.load(path + 'corpus.mdl')

glove = Glove(no_components=200, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=10, no_threads=7, verbose=True)
glove.add_dictionary(corpus_model.dictionary)
glove.save(path + 'glove.glv')
glove = Glove.load(path + 'glove.glv')
print glove

qt_sims_dists = []
qt_diff = []
Example No. 22
mlp10 = mlp_model(10)
mlp10_accuracy = train_test(mlp10, x, y, folds)

mlp100 = mlp_model(100)
mlp100_accuracy = train_test(mlp100, x, y, folds)

mlp1000 = mlp_model(1000)
mlp1000_accuracy = train_test(mlp1000, x, y, folds)

print((mlp1_accuracy, mlp10_accuracy, mlp100_accuracy, mlp1000_accuracy))

#3CNN
#Glove Vectors from reviews
c = [review.split() for review in data.data]

corpus = Corpus()
corpus.fit(c, window=10)

glv = Glove(no_components=100, learning_rate=0.05)
glv.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)

glv.add_dictionary(corpus.dictionary)

embeddings_index = glv.dictionary

BASE_DIR = ''
GLOVE_DIR = BASE_DIR + '/glove.6B/'
TEXT_DATA_DIR = 'txt_sentoken/'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
Example No. 23
        for article in tom_data:
            article_body = article['article_body']            
            word_tokens = article_body.lower().translate(delchars).split(' ')
            yield [w for w in word_tokens if ( (not w in stop_words) and (not len(w) <= 4) )]  

if __name__ == '__main__':      

    want_TSNE = False
    want_GRAPH = True
    
    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')

    filepattern = 'tom_articles_1*.json'

    corpus_model = Corpus()
    corpus_model.fit(read_corpus(filepattern) , window=5)
    corpus_model.save('corpus.model')
    
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)  

    print('Training the GloVe model')

    glove = Glove(no_components=50, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=100,
                no_threads=4, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save('glove.model')
Example No. 24
def main(model_select):
    data = pd.read_excel("./data/doc_set_final4.xlsx")
    data.token = data.token.apply(lambda x: literal_eval(x))
    data = data.sample(frac=1, random_state=1234)

    token_list = data.token.tolist()
    target = data[['new_class', 'new_small_class']]
    train_x_data, test_x_data, train_y, test_y = train_test_split(
        token_list,
        target,
        test_size=0.3,
        stratify=target,
        shuffle=True,
        random_state=1234)

    if model_select == 'w2v':
        w2v_name = 'base_token'
        print("모델 학습")
        word2vec_kargs = {
            'num_features': 300,
            'num_workers': 4,
            'window': 8,
            'seed': 1234,
            'min_word_count': 5,
            'min_alpha': 0.025,
            'iter': 30
        }
        model = word2vec_model(train_x_data, **word2vec_kargs)
        print("모델 저장")
        model_name = './model/word_embedding/Word2vec1({}).model'.format(
            w2v_name)
        model.save(model_name)

    elif model_select == 'd2v':
        TaggedDocument = namedtuple('TaggedDocument', 'words tags')
        tagged_train_docs = [
            TaggedDocument(d, [c[1]['new_class'], c[1]['new_small_class']])
            for d, c in zip(train_x_data, train_y.iterrows())
        ]
        print("모델 학습")
        doc2vec_kargs = {
            'size': 300,
            'window': 8,
            'min_count': 5,
            'alpha': 0.025,
            'min_alpha': 0.025,
            'workers': 4,
            'seed': 1234,
            'iter': 50
        }
        model = doc2vec_model(tagged_train_docs, **doc2vec_kargs)
        print("모델 저장")
        model.save('./model/word_embedding/Doc2vec_new_small2_4.model')

    elif model_select == 'fasttext':
        print("모델 학습")
        ft_kargs = {
            'size': 300,
            'window': 5,
            'min_count': 3,
            'workers': 4,
            'seed': 1234
        }
        model = fasttext_model(train_x_data, **ft_kargs)
        print("모델 저장")
        model.save('./model/word_embedding/FastText.model')

    elif model_select == 'glove':
        glove_kargs = {
            'size': 300,
            'lr': 0.005,
            'random_state': 1234,
            'no_threads': 4,
            'epoch': 30
        }
        corpus = Corpus()
        corpus.fit(train_x_data, window=8)
        glove = Glove(no_components=glove_kargs['size'],
                      learning_rate=glove_kargs['lr'])
        glove.fit(corpus.matrix,
                  epochs=glove_kargs['epoch'],
                  no_threads=glove_kargs['no_threads'],
                  verbose=True)
        glove.add_dictionary(corpus.dictionary)
        print("모델 저장")
        glove.save('./model/word_embedding/glove.model')
    else:
        print("3가지 방식 중에 고르시오")
Example No. 25
        return noPunctutations

    for l in jsonLines:
        temp = json.loads(l)
        # tokenize, lemmatize, stop-word and punctuation removal
        contentList = preprocess(temp["content"])

        fulltext.append(contentList)

        #fulltext+=temp["content"]
    #print(fulltext)

    print(len(fulltext))

    print(len(fulltext[0]))

    print(len(fulltext[1]))

    #mainlist=[]
    #mainlist.append(fulltext)
    corpus = Corpus()
    corpus.fit(fulltext, window=10)
    glove = Glove(no_components=5, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save('glove.model')
    print(glove.word_vectors[glove.dictionary['summary']])
    print(glove.word_vectors[glove.dictionary['mapped']])
    print(glove.word_vectors[glove.dictionary['low']])
    print(glove.most_similar('low'))
Example No. 26
def seriestest():
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

    glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size
    sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=30)
    modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size
    modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4)

    gloveA = Glove.load(modelA)
    gloveA.add_dictionary(corpus.dictionary)
    gloveB = Glove.load(modelB)
    gloveB.add_dictionary(corpus.dictionary)

    # glove embeddings
    gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :]
    gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :]

    with tf.Session() as sess:
        # build the model
        model = StarGAN('G_test',
                        FLAGS.size,
                        FLAGS.num_layers,
                        FLAGS.vocab_size,
                        _buckets,
                        FLAGS.feature_size,
                        FLAGS.baseline,
                        FLAGS.lr,
                        FLAGS.lr_decay,
                        FLAGS.grad_norm,
                        critic=None,
                        use_attn=FLAGS.use_attn,
                        output_sample=True,
                        input_embed=True,
                        batch_size=FLAGS.batch_size,
                        D_lambda=FLAGS.lambda_dis,
                        G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two),
                        dtype=tf.float32)
        #sess.run(tf.variables_initializer(tf.global_variables()))
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        print('read in model from {}'.format(ckpt.model_checkpoint_path))

        vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)
        vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

        sys.stdout.write('> ')
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            if sentence.strip() == 'exit()':
                break
            # step
            number = 0
            feature = []
            for f in range(FLAGS.feature_size):
                feature.append(
                    [[3 if x == f else 0 for x in range(FLAGS.feature_size)]])

            token_ids = data_utils.sentence_to_token_ids(
                tf.compat.as_bytes(sentence), vocab, normalize_digits=False)
            print(token_ids)
            token_ids.append(data_utils.EOS_ID)
            encoder_pad = [data_utils.PAD_ID
                           ] * (_buckets[-1][0] - len(token_ids))
            encoder_lens = [len(token_ids)]
            # feature in my implementation
            token_ids = list(token_ids) + encoder_pad
            encoder_inputs = []
            for idx in token_ids:
                encoder_inputs.append([idx])
            print(encoder_inputs)
            decoder_inputs = [[data_utils.GO_ID]]

            for x in range(FLAGS.feature_size):
                A, outputs, log_prob = model.dynamic_decode_G(sess, encoder_inputs, encoder_lens, \
                                                             decoder_inputs, feature[x], gloveA_emb, gloveB_emb)
                #print(A)
                #outputs = [int(np.argmax(logit, axis=1)) for logit in outputs]
                outputs = [output_ids[0] for output_ids in outputs]
                if data_utils.EOS_ID in outputs:
                    outputs = outputs[:outputs.index(data_utils.EOS_ID)]
                print(
                    feature[x], ':', " ".join([
                        tf.compat.as_str(rev_vocab[output])
                        for output in outputs
                    ]))
                print(log_prob)

            sys.stdout.write('> ')
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example No. 27
def train_glove(inst, meta_data={}):

    start_total = datetime.now()

    meta_data["glove_params"] = settings.GLOVE_PARAMS

    glove_paramgrid = ParameterGrid(settings.GLOVE_PARAMS)

    for params in glove_paramgrid:

        start = datetime.now()
        # MAKE CORPUS
        # set corpus filepath
        corpus_fp = os.path.join(settings.WVEC_OPT_DIRP, '{}_window{}.glovecorpus'.format(
            settings.DATASET,
            params["window"]))
        # load if corpus exists
        if os.path.isfile(corpus_fp):
            logging.info("Loading existing corpus {}.".format(corpus_fp))
            corpus_model = Corpus.load(corpus_fp)
            logging.info("Successfully loaded existing corpus {}.".format(corpus_fp))
        # make a new coocurrence corpus if it does not exist
        else:
            logging.info("Creating new corpus at {}.".format(corpus_fp))
            corpus_model = Corpus()
            corpus_model.fit(inst, window=params["window"])
            os.makedirs(settings.WVEC_OPT_DIRP, exist_ok=True)
            corpus_model.save(corpus_fp)

        logging.info("Dict size: {}.".format(len(corpus_model.dictionary)))
        logging.info("Collocations: {}.".format(corpus_model.matrix.nnz))

        # GLOVE VECTOR TRAINING
        glove = Glove(no_components=params["dims"], learning_rate=params["lr"])

        logging.info("Start fitting GloVe with parameters: {}.".format(params))
        glove.fit(corpus_model.matrix, epochs=params["epochs"],
                  no_threads=params["njobs"], verbose=False)
        glove.add_dictionary(corpus_model.dictionary)

        os.makedirs(settings.WVEC_OPT_DIRP, exist_ok=True)
        model_name = 'glove.{}_w{}_lr{}_ep{}.{}d.glovemodel'.format(settings.DATASET,
                                                                    params["window"],
                                                                    params["lr"],
                                                                    params["epochs"],
                                                                    params["dims"])
        glove.save(os.path.join(settings.WVEC_OPT_DIRP, model_name))

        duration = (datetime.now() - start).total_seconds()
        meta_data["models"][model_name] = params
        meta_data["models"][model_name]["duration_training"] = duration

        logging.info("Finished fitting GloVe {} in {}s with parameters: {}.".format(
            model_name,
            duration,
            params))
        # SIMILARITY TEST
        for test_word in settings.TESTSIM_WORDS:
            if test_word not in meta_data["most_similar"]:
                meta_data["most_similar"][test_word] = {}

            logging.info("Querying model {} for {} most similar to \'{}\':".format(
                model_name,
                settings.N_TESTSIM,
                test_word))
            sim = glove.most_similar(test_word, number=settings.N_TESTSIM)
            meta_data["most_similar"][test_word][model_name] = sim

            logging.info(pprint.pformat(sim))

    total_duration = (datetime.now() - start_total).total_seconds()
    meta_data["glove_duration_training"] = total_duration

    return meta_data
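
train_glove relies on a settings module that is not part of the snippet; a hypothetical set of values matching the attributes it reads (a ParameterGrid-style GLOVE_PARAMS, dataset name, output directory, similarity-test words) might be:

# hypothetical settings; only the attribute names mirror what train_glove accesses
GLOVE_PARAMS = {
    'window': [5, 10],
    'dims': [100, 300],
    'lr': [0.05],
    'epochs': [30],
    'njobs': [4],
}
DATASET = 'mycorpus'
WVEC_OPT_DIRP = 'models/wvec_opt'
TESTSIM_WORDS = ['good', 'bad']
N_TESTSIM = 10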
Example No. 28
        for line in datafile:
            #print(line.lower().split(' '))
            yield line.lower().split(' ')


# Build the corpus dictionary and the cooccurrence matrix.
print('Pre-processing corpus')
print(
    'You can use saved Corpus Model. In order to do this, Enter nothing for the file name'
)
print('To train GloVe based on new dataset, Enter dataset\'s name')
file_name = input("Enter file name: ")
if file_name:
    get_data = read_corpus(file_name)

    corpus_model = Corpus()
    corpus_model.fit(get_data, window=10)
    corpus_model.save('corpus.model')

    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
if not file_name:
    corpus_model = Corpus.load('corpus.model')
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

# Train the GloVe model and save it to disk.
print('Training the GloVe model')

glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=int(10), no_threads=4, verbose=True)
Example No. 29
# reads .txt files
def read_corpus(filename):

    delchars = [chr(c) for c in range(256)]
    delchars = [x for x in delchars if not x.isalnum()]
    delchars.remove(' ')
    delchars = ''.join(delchars)
    table = str.maketrans(dict.fromkeys(delchars))

    with open(filename, 'r') as datafile:
        for line in datafile:
            yield line.lower().translate(table).split(' ')


get_data = read_corpus('data/articles.txt')
corpus_model = Corpus()
corpus_model.fit(get_data, window=10)
epochs = 1000
no_threads = 8
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus_model.matrix,
          epochs=epochs,
          no_threads=no_threads,
          verbose=True)
glove.add_dictionary(corpus_model.dictionary)

print("Most similar to Male ==>" + str(glove.most_similar('male')))
print(
    "---------------------------------------------------------------------------"
)
print("Most similar to Population ==>" + str(glove.most_similar('population')))
Example No. 30
                previous_message[index] = -1
        except IndexError:
            previous_message[index] = -1

texts = []
classes = []
for row in csvsequence:
    texts.append(clean(row[3]).split())
    classes.append(row[0])

# Calculate distribution, to account for 95th percentile of messages.
max_sentence_length = int(np.mean([len(x) for x in texts]) + (norm.ppf(0.95) * np.std([len(x) for x in texts])))

print("Max sentence length: {}, put that in settings.json.".format(max_sentence_length))

corpus = Corpus()
try:
    print("Loading pretrained corpus...")
    corpus = Corpus.load("cache/corpus.p")
except:
    print("Training corpus...")
    corpus.fit(texts, window=max_sentence_length)
    corpus.save("cache/corpus.p")

glove = Glove(no_components=number_components, learning_rate=0.05)
try:
    print("Loading pretrained GloVe vectors...")
    glove = Glove.load("cache/glove.p")
except:
    print("Training GloVe vectors...")
    # More epochs seems to make it worse
Example No. 31
"""
Created on Fri Sep 14 12:45:30 2018

@author: charlie
"""

import itertools
from gensim.models.word2vec import Text8Corpus
from glove import Corpus, Glove
import os

cur_dir = os.getcwd()
glove_fname = '/glove.model'
corpus_fname = "/corpus.model"
if os.path.exists(cur_dir + glove_fname):
    glove = Glove.load(cur_dir+glove_fname)
#    corpus = Corpus.load(cur_dir+corpus_fname)
else:
    sentences = list(itertools.islice(Text8Corpus('text/text8'), None))
    corpus = Corpus()
    corpus.fit(sentences, window = 10)
    
    glove = Glove(no_components=100, learning_rate = 0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    
    glove.save(cur_dir + glove_fname)
    corpus.save(cur_dir+corpus_fname)

glove.most_similar('men')  # the argument is a single word (string), not a list
glove.word_vectors[glove.dictionary['perfect']]
Example No. 32
from glove import Glove, Corpus
from gensim import utils, corpora, matutils, models
import os

corpus_file_name = ''

wiki = models.word2vec.LineSentence(corpus_file_name)
id2word = corpora.Dictionary(wiki)
id2word.filter_extremes(keep_n=30000)
word2id = dict((word, id) for id, word in id2word.iteritems())

# Filter all wiki documents to contain only those 30k words.
filter_text = lambda text: [word for word in text if word in word2id]
filtered_wiki = lambda: (filter_text(text) for text in wiki)  # generator

corpus = Corpus()

corpus.fit(filtered_wiki(), window=10)

HERE = os.path.dirname(os.path.dirname(__file__))
PS_FILE = os.path.join(HERE, "glove_default_30k.model")

glove = Glove.load(PS_FILE)

glove.add_dictionary(corpus.dictionary)

glove.save('glove_default_30k_with_dict.model')
Example No. 33
    def __iter__(self):
        file_count = 0
        for file_path in file_path_list:
            file_count += 1
            print(f"Now file name:{file_path}, now file count:{file_count}")
            with open(file_path, "r", encoding="utf-8") as f:
                for line in f:
                    if args.use_segment == 0:
                        yield list(line.strip())
                    else:
                        yield list(jieba.cut(line.strip()))


sentences = Text(file_path_list)
corpus_model = Corpus()
corpus_model.fit(sentences, window=args.window_size)

model = Glove(no_components=args.embedding_size)
model.fit(corpus_model.matrix, epochs=args.iters, no_threads=args.cpu_count)
model.add_dictionary(corpus_model.dictionary)

end_time = time.time()
use_time = round(end_time - start_time, 2)

model_name = args.output_path
if not os.path.isdir(model_name):
    model.save(model_name)
else:
    # the output path is an existing directory: build a file name inside it, then save
    model_name = os.path.join(
        model_name, f"glove_{args.window_size}_{args.embedding_size}.model")
    model.save(model_name)
def matrix_glove_embedding(click_all,
                           flag,
                           mode,
                           threshold=0,
                           dim=100,
                           epochs=30,
                           learning_rate=0.5):
    """
        glove 原理 + 矩阵分解:
            窗口内 加权统计 共线性词频
        
        四种向量化方式:
            flag='item' mode='all':
                sku1 sku2 sku3 sku4 sku5 user
            flag='user' mode='all':
                user1 user2 user3 user4 user5 sku
            flag='item',mode='only':
                item1 item2 item3 item4 item5
            flag='user' mode='only'
                user1 user2 user3 user4 user5
    """
    import psutil
    from glove import Glove
    from glove import Corpus

    if flag == 'user':
        group_by_col, agg_col = 'item_id', 'user_id'
    if flag == 'item':
        group_by_col, agg_col = 'user_id', 'item_id'

    data_ = click_all.groupby([
        group_by_col
    ])[agg_col].agg(lambda x: ','.join(list(x))).reset_index()
    if mode == 'only':
        list_data = list(data_[agg_col].map(lambda x: x.split(',')))
    if mode == 'all':
        data_['concat'] = data_[agg_col] + ',' + data_[group_by_col].map(
            lambda x: 'all_' + x)
        list_data = data_['concat'].map(lambda x: x.split(','))

    corpus_model = Corpus()
    corpus_model.fit(list_data, window=999999)

    glove = Glove(no_components=dim, learning_rate=learning_rate)
    glove.fit(corpus_model.matrix,
              epochs=epochs,
              no_threads=psutil.cpu_count(),
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    keys = glove.dictionary.keys()
    if mode == 'only':
        glove_embedding = {flag: {}}
    if mode == 'all':
        glove_embedding = {'user': {}, 'item': {}}
    for k in keys:
        if 'all' not in k:
            glove_embedding[flag][k] = glove.word_vectors[glove.dictionary[k]]
        if 'all' in k:
            flag_ = group_by_col.split('_')[0]
            k_ = k.split('_')[1]
            glove_embedding[flag_][k_] = glove.word_vectors[
                glove.dictionary[k]]

    return glove_embedding
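
A toy usage sketch for matrix_glove_embedding, assuming a pandas click log with user_id and item_id columns (all values hypothetical):

import pandas as pd

click_all = pd.DataFrame({
    'user_id': ['u1', 'u1', 'u2', 'u2', 'u2'],
    'item_id': ['i1', 'i2', 'i2', 'i3', 'i1'],
})

emb = matrix_glove_embedding(click_all, flag='item', mode='all', dim=8, epochs=5)
print(emb['item']['i1'])  # 8-dimensional item vector
print(emb['user']['u1'])  # 8-dimensional user vector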
Example No. 35
import csv

year1 = 2007
year2 = 2008
filename = 'C:/Users/Marija/PyCharmProjects/scraping/arxivData '
data = []
type = 'title'

while year2 < 2018:
    with open(filename + str(year1) + '-' + str(year2) + '.csv',
              'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
        for row in reader:
            # Titles only
            sentence = row[0][2:len(row[0]) - 1]
            sentence = sentence.replace('\\n', ' ').replace('.', '').replace(
                ',', '').replace(':',
                                 '').replace(')',
                                             '').replace('(',
                                                         '').lower().split(' ')
            data.append(sentence)
    year1 += 1
    year2 += 1

corpus = Corpus()
corpus.fit(data, window=10)
glove = Glove(no_components=100, learning_rate=0.025)
glove.fit(corpus.matrix, epochs=5, no_threads=4)
glove.add_dictionary(corpus.dictionary)
glove.save(type + 'Glove.txt')
def topk_recall_glove_embedding(click_all,
                                dict_label,
                                k=100,
                                dim=88,
                                epochs=30,
                                learning_rate=0.5):

    import psutil
    from glove import Glove
    from glove import Corpus

    data_ = click_all.groupby(
        ['pred',
         'user_id'])['item_id'].agg(lambda x: ','.join(list(x))).reset_index()
    list_data = list(data_['item_id'].map(lambda x: x.split(',')))

    corpus_model = Corpus()
    corpus_model.fit(list_data, window=999999)

    glove = Glove(no_components=dim, learning_rate=learning_rate)
    glove.fit(corpus_model.matrix,
              epochs=epochs,
              no_threads=psutil.cpu_count(),
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    list_user_id = []
    list_item_similar = []
    list_score_similar = []
    print('------- GloVe recall ---------')
    for i, row in tqdm(data_.iterrows()):

        list_item_id = row['item_id'].split(',')

        dict_item_id_score = {}
        for i, item in enumerate(list_item_id[::-1]):
            most_topk = glove.most_similar(item, number=k)
            for item_similar, score_similar in most_topk:
                if item_similar not in list_item_id:
                    if item_similar not in dict_item_id_score:
                        dict_item_id_score[item_similar] = 0
                    sigma = 0.8
                    dict_item_id_score[item_similar] += 1.0 / (
                        1 + sigma * i) * score_similar
        dict_item_id_score_topk = sorted(dict_item_id_score.items(),
                                         key=lambda kv: kv[1],
                                         reverse=True)[:k]
        assert len(dict_item_id_score_topk) == k
        dict_item_id_set = set([
            item_similar
            for item_similar, score_similar in dict_item_id_score_topk
        ])
        assert len(dict_item_id_set) == k
        for item_similar, score_similar in dict_item_id_score_topk:
            list_item_similar.append(item_similar)
            list_score_similar.append(score_similar)
            list_user_id.append(row['user_id'])

    topk_recall = pd.DataFrame({
        'user_id': list_user_id,
        'item_similar': list_item_similar,
        'score_similar': list_score_similar
    })
    topk_recall['next_item_id'] = topk_recall['user_id'].map(dict_label)
    topk_recall['pred'] = topk_recall['user_id'].map(
        lambda x: 'train' if x in dict_label else 'test')

    return topk_recall
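
The score accumulation above decays with how far back an item sits in the user's reversed click sequence: a neighbour found from the i-th most recent item contributes score_similar / (1 + sigma * i) with sigma = 0.8. A quick look at the first few decay weights under that formula:

sigma = 0.8
weights = [1.0 / (1 + sigma * i) for i in range(5)]
print([round(w, 3) for w in weights])  # [1.0, 0.556, 0.385, 0.294, 0.238]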
Example No. 37
## FastText Embedding
## =============================================================================
#
#modelft = FastText(masterList1, size=150, window=3, min_count=5, workers=10)
#
#modelft['forecast']
#modelft.most_similar('forecast')[:5]
#
#modelft.save("fasttext.model")
#
## =============================================================================
## GloVe Embedding
## =============================================================================
#
# creating a corpus object
corpus = Corpus()

# training the corpus to generate the co-occurrence matrix which is used in GloVe
corpus.fit(masterList1, window=5)
# creating a Glove object which will use the matrix created above to build the embeddings
# we can set the learning rate (it uses gradient descent) and the number of components
glove = Glove(no_components=150, learning_rate=0.05)

glove.fit(corpus.matrix, epochs=30, no_threads=10, verbose=True)
glove.add_dictionary(corpus.dictionary)

glove.word_vectors[glove.dictionary['forecast']]

t1 = glove.most_similar('forecast')[:5]

glove.save('glove.model')
Example No. 38
from __future__ import print_function
from glove import Glove
from glove import Corpus
'''Load the dataset'''
# store the text lines in a list
i = 1
lines = []
for line in open("processed.txt", encoding='utf-8'):
    lines.append(line.split(' '))
    print("appending line " + str(i))
    i += 1

# prepare the dataset
corpus_model = Corpus()
corpus_model.fit(lines, window=10)
#corpus_model.save('corpus.model')
print('Dictionary size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)
'''Train the model'''
gl = Glove(no_components=200, learning_rate=0.05)
gl.fit(corpus_model.matrix, epochs=5, no_threads=1, verbose=True)
gl.add_dictionary(corpus_model.dictionary)
'''Save the model'''
gl.save('glove.model')
Example No. 39
from glove import Corpus, Glove

corpus = Corpus()
sentences = [['나는', '정말', '화난다'], ['너도', '정말', '화나지']]
corpus.fit(sentences, window=5)
# build the co-occurrence matrix GloVe will use from the training data

glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=20, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
# train with 4 threads and 20 epochs

model_result1 = glove.most_similar("나는")
print(model_result1)
Example No. 40
cats = []
with open('yahoo_train.txt', 'r') as file:
    for line in file:
        d = json.loads(line)

        uris.append(d[0])
        questions.append(d[1])
        answers.append(d[2])
        cats.append(d[3])

def get_lines():
    for a in answers:
        yield a.split()

# Build the corpus dictionary and co-occurrence matrix
corpus_model = Corpus()
corpus_model.fit(get_lines(), window=8)

print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)

# Train GloVe model
#glove = Glove(no_components = no_comp, learning_rate=0.05)
glove = Glove.load_stanford('vectors.6B.100d.txt')
glove.fit(corpus_model.matrix, epochs=10, no_threads=4, verbose=True)
glove.add_dictionary(corpus_model.dictionary)

# Save
with open('model.glove', 'w+') as file:
    file.write('%i %i \n' % (len(glove.dictionary), no_comp))
    for (word, idx) in glove.dictionary.iteritems():
Example No. 41
def train_StarGAN():
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    from keras.backend.tensorflow_backend import set_session
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    set_session(tf.Session(config=config))

    if not os.path.exists(FLAGS.model_dir):
        os.makedirs(FLAGS.model_dir)

    if not os.path.exists(FLAGS.pretrain_dir):
        os.makedirs(FLAGS.pretrain_dir)

    if not os.path.exists(FLAGS.stargan_dir):
        os.makedirs(FLAGS.stargan_dir)

    def build_summaries():
        train_loss = tf.Variable(0.)
        tf.summary.scalar("train_loss", train_loss)
        summary_vars = [train_loss]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars


    feature, data, train, data_voc, train_voc = \
        data_utils.prepare_data(FLAGS.feature_path, FLAGS.feature_size, FLAGS.data_dir, \
                        FLAGS.data_path, FLAGS.train_path, FLAGS.vocab_size)

    glove_corpus_path = FLAGS.glove_model + ".txt.voc%d" % FLAGS.vocab_size
    vocab_path = os.path.join(FLAGS.data_dir, "vocab%d" % FLAGS.vocab_size)

    data_utils.combine_corpus(data_voc, train_voc, vocab_path,
                              glove_corpus_path, 28)

    sentences = list(itertools.islice(Text8Corpus(glove_corpus_path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=30)
    modelA = FLAGS.glove_model + "_%d.model" % FLAGS.size
    modelB = FLAGS.glove_model + "_%d.model" % (FLAGS.size * 3 / 4)
    if not os.path.exists(modelA):
        gloveA = Glove(no_components=FLAGS.size, learning_rate=0.05)
        gloveA.fit(corpus.matrix, epochs=300, no_threads=4, verbose=True)
        gloveA.add_dictionary(corpus.dictionary)
        gloveA.save(modelA)  # 512

    if not os.path.exists(modelB):
        gloveB = Glove(no_components=int(FLAGS.size * 3 / 4),
                       learning_rate=0.05)
        gloveB.fit(corpus.matrix, epochs=300, no_threads=4, verbose=True)
        gloveB.add_dictionary(corpus.dictionary)
        gloveB.save(modelB)  # 384

    gloveA = Glove.load(modelA)
    gloveA.add_dictionary(corpus.dictionary)
    gloveB = Glove.load(modelB)
    gloveB.add_dictionary(corpus.dictionary)

    vocab, rev_vocab = data_utils.initialize_vocabulary(vocab_path)

    with tf.Session() as sess:
        # build the model
        model = StarGAN('StarGAN',
                        FLAGS.size,
                        FLAGS.num_layers,
                        FLAGS.vocab_size,
                        _buckets,
                        FLAGS.feature_size,
                        FLAGS.baseline,
                        FLAGS.lr,
                        FLAGS.lr_decay,
                        FLAGS.grad_norm,
                        critic=None,
                        use_attn=FLAGS.use_attn,
                        output_sample=True,
                        input_embed=True,
                        batch_size=FLAGS.batch_size,
                        D_lambda=FLAGS.lambda_dis,
                        G_lambda=(FLAGS.lambda_one, FLAGS.lambda_two),
                        dtype=tf.float32)
        # build summary and initialize
        summary_ops, summary_vars = build_summaries()
        sess.run(tf.variables_initializer(tf.global_variables()))
        log_dir = os.path.join(FLAGS.model_dir, 'log')
        writer = tf.summary.FileWriter(log_dir, sess.graph)
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('read in model from {}'.format(ckpt.model_checkpoint_path))
            model.saver.restore(sess, ckpt.model_checkpoint_path)

        # load in train and dev(valid) data with buckets
        train_set = read_data_with_buckets(train, FLAGS.max_train_data_size)
        data_set = read_data_with_buckets(data, FLAGS.max_train_data_size)

        train_buckets_sizes = [len(train_set[b]) for b in range(len(_buckets))]
        train_total_size = float(sum(train_buckets_sizes))
        print('each buckets has: {d}'.format(d=train_buckets_sizes))
        train_buckets_scale = [
            sum(train_buckets_sizes[:i + 1]) / train_total_size
            for i in range(len(train_buckets_sizes))
        ]

        # main process
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []

        # glove embeddings
        gloveA_emb = gloveA.word_vectors[:FLAGS.vocab_size, :]
        gloveB_emb = gloveB.word_vectors[:FLAGS.vocab_size, :]
        ### ------------------------------------------------------------ ###
        ###                           Pretrain                           ###
        ### ------------------------------------------------------------ ###
        while True:
            # get batch from a random selected bucket
            random_number_01 = np.random.random_sample()
            bucket_id = min([
                i for i in range(len(train_buckets_scale))
                if train_buckets_scale[i] > random_number_01
            ])  # random pick bucket

            # get batch for the pretraining data
            feature_inputs_f, encoder_inputs_f, decoder_inputs_f, weights_f, seq_lens_f, _,  \
            feature_inputs_b, encoder_inputs_b, decoder_inputs_b, weights_b, seq_lens_b, _,  = \
                get_batch_with_buckets(FLAGS.feature_size, data_set, FLAGS.batch_size, bucket_id)

            # pretrain start !
            start_time = time.time()
            forloss, _ , _, _ = model.train_previous(sess, encoder_inputs_f, feature_inputs_f, \
                                                decoder_inputs_f, weights_f, encoder_inputs_b, \
                                                feature_inputs_b, decoder_inputs_b, weights_b, \
                                                bucket_id, gloveA_emb, gloveB_emb, seq_lens_f, seq_lens_b)
            step_loss = forloss
            step_time += (time.time() -
                          start_time) / FLAGS.steps_per_checkpoint
            loss += np.mean(step_loss) / FLAGS.steps_per_checkpoint / (
                FLAGS.Gstep * 2 + FLAGS.Dstep + 1)
            #print('pretrain : ',step_loss)
            ### ------------------------------------------------------------ ###
            ###                         Train StarGAN                        ###
            ### ------------------------------------------------------------ ###
            for _ in range(FLAGS.Dstep):
                # get batch from a random selected bucket
                random_number_01 = np.random.random_sample()
                bucket_id = min([
                    i for i in range(len(train_buckets_scale))
                    if train_buckets_scale[i] > random_number_01
                ])  # random pick bucket

                # get batch for the pretraining data
                feature_inputs_f, encoder_inputs_f, decoder_inputs_f, seq_lens_f, \
                feature_inputs_b, decoder_inputs_b, weights_b, \
                real_inputs, real_feature , real_seq_lens= \
                    get_stargan_data(feature, FLAGS.feature_size, train_set, FLAGS.batch_size, bucket_id)

                # D_step start !
                start_time = time.time()
                _, D_loss = model.train_StarGAN(sess, encoder_inputs_f, decoder_inputs_f, feature_inputs_f, \
                                                decoder_inputs_b, weights_b, feature_inputs_b, \
                                                real_inputs, real_feature, bucket_id, gloveA_emb, gloveB_emb, \
                                                disc = True,real_seq_len=real_seq_lens, forward_seq_len=seq_lens_f)
                step_loss = D_loss
                step_time += (time.time() -
                              start_time) / FLAGS.steps_per_checkpoint
                loss += np.mean(step_loss) / FLAGS.steps_per_checkpoint / (
                    FLAGS.Gstep * 2 + FLAGS.Dstep + 1)
                #print('D_step : ', step_loss)
            for _ in range(FLAGS.Gstep):
                # get batch from a random selected bucket
                random_number_01 = np.random.random_sample()
                bucket_id = min([
                    i for i in range(len(train_buckets_scale))
                    if train_buckets_scale[i] > random_number_01
                ])  # random pick bucket

                # get batch for the pretraining data
                feature_inputs_f, encoder_inputs_f, decoder_inputs_f, seq_lens_f, \
                feature_inputs_b, decoder_inputs_b, weights_b, \
                real_inputs, real_feature, real_seq_lens = \
                    get_stargan_data(feature, FLAGS.feature_size, train_set, FLAGS.batch_size, bucket_id)

                # G_step start !
                start_time = time.time()
                _, for_reward = model.train_StarGAN(sess, encoder_inputs_f, decoder_inputs_f, feature_inputs_f, \
                                                decoder_inputs_b, weights_b, feature_inputs_b, \
                                                real_inputs, real_feature, bucket_id, gloveA_emb, gloveB_emb, \
                                                forward = True,real_seq_len=real_seq_lens , forward_seq_len=seq_lens_f)

                step_loss = for_reward
                step_time += (time.time() -
                              start_time) / FLAGS.steps_per_checkpoint
                loss += np.mean(step_loss) / FLAGS.steps_per_checkpoint / (
                    FLAGS.Gstep * 2 + FLAGS.Dstep + 1)
                #print('for_loss :', step_loss)
                # get batch from a random selected bucket
                random_number_01 = np.random.random_sample()
                bucket_id = min([
                    i for i in range(len(train_buckets_scale))
                    if train_buckets_scale[i] > random_number_01
                ])  # random pick bucket

                # get batch for the pretraining data
                feature_inputs_f, encoder_inputs_f, decoder_inputs_f, seq_lens_f, \
                feature_inputs_b, decoder_inputs_b, weights_b, \
                real_inputs, real_feature, real_seq_lens = \
                    get_stargan_data(feature, FLAGS.feature_size, train_set, FLAGS.batch_size, bucket_id)

                # G_step start !
                start_time = time.time()
                _, back_reward = model.train_StarGAN(sess, encoder_inputs_f, decoder_inputs_f, feature_inputs_f, \
                                                decoder_inputs_b, weights_b, feature_inputs_b, \
                                                real_inputs, real_feature, bucket_id, gloveA_emb, gloveB_emb, \
                                                backward = True,real_seq_len=real_seq_lens , forward_seq_len=seq_lens_f)

                step_loss = back_reward
                step_time += (time.time() -
                              start_time) / FLAGS.steps_per_checkpoint
                loss += np.mean(step_loss) / FLAGS.steps_per_checkpoint / (
                    FLAGS.Gstep * 2 + FLAGS.Dstep + 1)
                #print('back_loss :', step_loss)
            current_step += 1
            # log, save and eval
            if current_step % FLAGS.steps_per_checkpoint == 0:
                perplexity = math.exp(
                    float(loss)) if loss < 300 else float('inf')
                print(
                    "Generator step %d; learning rate %.4f; learning_rate_star %.6f; D_lr %6f; step-time %.2f; perplexity "
                    "%.2f; loss %.2f" %
                    (model.global_F_step.eval(), model.learning_rate.eval(),
                     model.learning_rate_star.eval(), model.D_lr.eval(),
                     step_time, perplexity, loss))
                # Decrease learning rate if no improvement was seen over last 3 times.
                if len(previous_losses) > 2 and loss > max(
                        previous_losses[-3:]):
                    sess.run(model.op_lr_decay)
                    sess.run(model.op_D_lr_decay)
                    sess.run(model.learning_rate_star_decay)
                previous_losses.append(loss)

                # write summary
                feed_dict = {}
                feed_dict[summary_vars[0]] = loss
                summary_str = sess.run(summary_ops, feed_dict=feed_dict)
                writer.add_summary(summary_str, model.global_F_step.eval())
                writer.flush()
                # Save checkpoint and zero timer and loss.
                ckpt_path = os.path.join(FLAGS.model_dir, "ckpt")
                model.saver.save(sess,
                                 ckpt_path,
                                 global_step=model.global_F_step)

                stargan_path = os.path.join(FLAGS.stargan_dir, "ckpt_prev")
                model.saver.save(sess,
                                 stargan_path,
                                 global_step=model.global_F_step)
                step_time, loss = 0.0, 0.0

                sys.stdout.flush()
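The gloveA_emb and gloveB_emb arrays fed to train_StarGAN above are pre-trained embedding lookup tables. A minimal sketch of how such a matrix could be assembled from a glove-python model and a token list follows; the function name, vocabulary, and embedding size are assumptions, not part of the original script.

import numpy as np
from glove import Glove

def build_embedding_matrix(glove_path, id_to_token, embedding_size):
    # Load a trained glove-python model and copy its vectors into a dense
    # matrix ordered by our own token ids; tokens missing from the GloVe
    # dictionary keep a zero vector.
    glove = Glove.load(glove_path)
    matrix = np.zeros((len(id_to_token), embedding_size), dtype=np.float32)
    for idx, token in enumerate(id_to_token):
        if token in glove.dictionary:
            matrix[idx] = glove.word_vectors[glove.dictionary[token]]
    return matrix

# hypothetical usage:
# gloveA_emb = build_embedding_matrix('gloveA.model', vocab_A, FLAGS.emb_size)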
Exemplo n.º 42
0
                        default='',
                        help='Get closest words to this word.')
    args = parser.parse_args()


    if args.create:
        # Build the corpus dictionary and the cooccurrence matrix.
        print('Pre-processing corpus')

        if args.wiki:
            print('Using wikipedia corpus')
            get_data = read_wikipedia_corpus
        else:
            get_data = read_corpus

        corpus_model = Corpus()
        corpus_model.fit(get_data(args.create), window=10)
        corpus_model.save('corpus.model')
        
        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)

    if args.train:
        # Train the GloVe model and save it to disk.

        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_model = Corpus.load('corpus.model')

            print('Dict size: %s' % len(corpus_model.dictionary))
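The --train branch is truncated above. Assuming the standard glove-python example-script flags (--train holding the epoch count and --parallelism the thread count), it would plausibly continue along these lines:

        print('Training the GloVe model')
        glove = Glove(no_components=100, learning_rate=0.05)
        glove.fit(corpus_model.matrix, epochs=int(args.train),
                  no_threads=args.parallelism, verbose=True)
        glove.add_dictionary(corpus_model.dictionary)
        glove.save('glove.model')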
Exemplo n.º 43
0
def fit_corpus(corpus):

    model = Corpus()
    model.fit(corpus)

    return model
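A minimal usage sketch for fit_corpus, with a tiny hand-made token list (the sentences themselves are illustrative only):

sentences = [['the', 'quick', 'brown', 'fox'], ['the', 'lazy', 'dog']]
corpus_model = fit_corpus(sentences)
print(len(corpus_model.dictionary))  # vocabulary size
print(corpus_model.matrix.nnz)       # number of non-zero co-occurrence entries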
Exemplo n.º 44
0
def read_data(filenames):
    """
    input - filenames
    output - a list of words in the question
    """
    for f in filenames:
        for i, r in f.iterrows():
            for q in question_cols:
                yield question_to_wordlist(r[q])


filenames = [train_df, test_df]
#filenames = [train_df]
print("Preprocessing corpus")
get_data = read_data
corpus_model = Corpus()
corpus_model.fit(get_data(filenames), window=10)

#corpus_model.save(COMPUTE_DATA_PATH + '/corpus.model')

print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)
#corpus_model = Corpus.load(COMPUTE_DATA_PATH + '/corpus.model')

glove = Glove(no_components=300, learning_rate=0.05)
print("Starting training")
glove.fit(corpus_model.matrix, epochs=1000, no_threads=6, verbose=True)

#glove = Glove.load(COMPUTE_DATA_PATH + '/glove.model')

glove.add_dictionary(corpus_model.dictionary)
# save the file in pickled format
if not os.path.exists('pickledData/'):
    print('creating directory to save the pickled file')
    os.makedirs('pickledData/')
with open('pickledData/' + DICTIONARY_FILE, 'wb') as f:
    pickle.dump(pickle_files, f)

print("Fim arquivos e variaveis em %s segundos", (time.time() - start_time))
# ===========================================

# ===========================================
print("Inicio criacao modelo glove")
start_time = time.time()

corpus = Corpus()
# fit the corpus to build the co-occurrence matrix used by GloVe,
# with the window size defining how many context words are considered
corpus.fit(X_train, window=10)
# create the GloVe model with the dimensionality (no_components) and learning_rate constants declared at the start
glove = Glove(no_components=EMBEDDING_DIM, learning_rate=GLOVE_LEARNING_RATE)
glove.fit(corpus.matrix,
          epochs=GLOVE_NUM_EPOCHS_TRAINING,
          no_threads=4,
          verbose=True
          )  #glove.fit(corpus.matrix, epochs=10, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

if not os.path.exists('models/'):
    print('creating directory to save the models')
    os.makedirs('models/')
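The snippet creates models/ but the save call itself is cut off; a hedged guess at what follows (the filename is an assumption):

# persist the trained GloVe model inside the freshly created directory
glove.save('models/glove.model')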
Exemplo n.º 46
0
                     iter=args.epochs,
                     size=args.size,
                     sg=args.sg,
                     window=args.window,
                     min_count=args.min_count,
                     workers=args.workers)
    model.save(args.save + 'fastText.model')
    model = FastText.load(args.save + 'fastText.model')
    print("완성된fastText 임베딩 크기 확인:", model.wv.vectors.shape)
elif (args.model == 'word2vec'):
    model = Word2Vec(sentences=tokenized_data,
                     size=args.size,
                     window=args.window,
                     min_count=args.min_count,
                     workers=args.workers)
    model.save(args.save + 'Word2Vec.model')
    model = Word2Vec.load(args.save + 'Word2Vec.model')
    print("완성된word2vec 임베딩 크기 확인:", model.wv.vectors.shape)
elif (args.model == 'glove'):
    corpus = Corpus()
    corpus.fit(tokenized_data, window=5)
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=20, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save(args.save + 'glove.model')
    model = Glove.load(args.save + 'glove.model')
    print("Checking the completed glove embedding size:", model.word_vectors.shape)

print(model.wv.most_similar("핸드폰"))
print(model.wv.most_similar("도로"))
Exemplo n.º 47
0
    delchars.remove(' ')
    delchars = ''.join(delchars)

    with open(filename, 'r') as datafile:
        for line in datafile:
            # list of tokenized words, with the characters in delchars stripped out
            yield line.lower().translate(str.maketrans('', '', delchars)).split(' ')


if __name__ == '__main__':

    # initialize glove object
    glove = Glove(no_components=100, learning_rate=0.05)
    
    # read in the data to train on; this file is shakespeare text
    corpus_model = Corpus()
    corpus_model.fit(read_corpus("data/input.txt"), window=10)
        
    # fit the model using the given parameters
    glove.fit(corpus_model.matrix, epochs=10, no_threads=1, verbose=True)
              
    # add a dictionary just to make it easier for similarity queries
    glove.add_dictionary(corpus_model.dictionary)

    # save glove object to file
    glove.save_obj('glove.model.obj')
    
    # for each word in the words list, find the 5 most similar words in this
    # corpus and report how similar they are
    words = ['sky', 'queen', 'car']
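The comment above asks for the 5 nearest words to each entry of words; a minimal sketch of that query using glove-python's most_similar:

    for word in words:
        # nearest neighbours by similarity in the trained embedding space
        print(word, glove.most_similar(word, number=5))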
Exemplo n.º 48
0
def main_procesing_corpus(korpus: str, size: int):

    [people_vect_dict, my_corpus] = read_corpus(in_path + korpus, size)

    corpus = Corpus()
    corpus.fit(my_corpus, window=10)

    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    # glove.save('glove.model')

    person_result_dict = {}
    f = open(r"C:\Users\tymon.czarnota\Desktop\PADT1\output_{}_{}.tsv".format(
        korpus, size),
             'w',
             encoding='utf-8')
    ff = open(
        r"C:\Users\tymon.czarnota\Desktop\PADT1\output_{}_{}_META.tsv".format(
            korpus, size),
        'w',
        encoding='utf-8')
    for key in people_vect_dict:
        ppl = str(key)
        for prof in people_dict:
            for mm in people_dict[prof]:
                if str(mm) == str(key):
                    ppl = ppl + "<--->" + prof + "\n"
        for l in people_vect_dict[key]:
            ff.write(ppl)
            a = [glove.word_vectors[glove.dictionary[w]] for w in l]
            a_mean = np.mean(a, axis=0, dtype=np.float64)
            if key not in person_result_dict:
                person_result_dict[key] = []
            person_result_dict[key].append(a_mean)
            text = ""
            for val in person_result_dict[key]:
                for single in val:
                    text = text + str(single) + "\t"
            text = text + "\n"
            f.write(text)

    f = open(out_path + r"output_{}_{}_WHOLE.tsv".format(korpus, size),
             'w',
             encoding='utf-8')
    ff = open(out_path + r"output_{}_{}_WHOLE_META.tsv".format(korpus, size),
              'w',
              encoding='utf-8')
    for key in person_result_dict:
        a = np.mean(person_result_dict[key], axis=0, dtype=np.float64)
        str_a = ""
        for el in a:
            str_a = str_a + str(el) + "\t"
        str_a = str_a + "\n"
        f.write(str_a)

        str_key = ""
        for prof in people_dict:
            for mm in people_dict[prof]:
                if str(mm) == str(key):
                    str_key = str(key) + "<--->" + prof + "\n"
        ff.write(str_key)
Exemplo n.º 49
0
    checkX.append((a1, a2))
    checkY.append(flag[check[i]["gold_label"]])
for i in range(len(test)):
    test[i] = json.loads(test[i])
    if flag.get(test[i]["gold_label"]) is None:
        continue
    a1 = test[i]["sentence1"].split()
    a2 = test[i]["sentence2"].split()
    sentense.append(a1)
    sentense.append(a2)
    maxlen1 = max(maxlen1, len(a1))
    maxlen2 = max(maxlen2, len(a2))
    testX.append((a1, a2))
    testY.append(flag[test[i]["gold_label"]])

corpus_model = Corpus()
corpus_model.fit(sentense, window=10)
print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)

embedding_dim=300
batch_size = 32
type_size=3
dict_size=len(corpus_model.dictionary)
maxlen = maxlen1 + maxlen2
glove = Glove(no_components=embedding_dim, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=1,no_threads=8, verbose=True)
glove.add_dictionary(corpus_model.dictionary)
#glove.load('glove.model')
def get(X,Y,maxlen1,maxlen2):
    tmpX = np.zeros((len(X),maxlen1+maxlen2+1), dtype=int)
Exemplo n.º 50
0
        vocab = []
        vocab.extend(fulltext)
        vocab.extend(description)
        domain_vocab.append(vocab)
        #fulltext+=temp["content"]

    print("Total number of documents (description and content) ",
          len(vocab))  #1446
    #print(domain_vocab)
    maxlen_content = max([len(item) for item in fulltext])
    maxlen_description = max([len(item) for item in description])
    print("total no of words in content ", maxlen_content)  #59080
    print("total no of words in description", maxlen_description)  #1480

    #GloVe Implementation
    corpus = Corpus()
    corpus.fit(vocab, window=10)
    print(corpus)
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    #print(corpus.dictionary)

    word2idx = corpus.dictionary  # unique content word as key and index as value
    content_vector = glove.word_vectors  # content vector with word embeddings

    print("corpus.dictionary(word2idx)_lenth", len(word2idx))
    print("EMBEDDING VECTOR LENGTH", len(content_vector))
    """
    with open("/Users/prathibha/Documents/Project/Try1_embed.tsv","w+") as my_csv:
       
Exemplo n.º 51
0
from glove import Glove
from glove import Corpus

vocab_count = 50000
# write vocab to file
if not os.path.exists('Embedding/main_cat/glove'):
    os.makedirs('Embedding/main_cat/glove')


# In[ ]:


if not os.path.exists("Embedding/main_cat/glove/glove.model"):

    corpus_model = Corpus()
    corpus_model.fit(sentences, window=10)
    #corpus_model.save('corpus.model')
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    
    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=100,
              no_threads=10, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    
    glove.save('Embedding/main_cat/glove/glove.model') # save the model
    corpus_model.save('Embedding/main_cat/glove/corpus.model') # save the dictionary


glove = Glove.load('Embedding/main_cat/glove/glove.model')
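With the model reloaded, the vocab_count declared earlier suggests a fixed-size embedding matrix is built next; a sketch under that assumption (the 300-dimensional layout mirrors no_components above):

import numpy as np

# rows follow the GloVe dictionary indices, capped at vocab_count words
num_words = min(vocab_count, len(glove.dictionary))
embedding_matrix = np.zeros((num_words, 300), dtype=np.float32)
for word, idx in glove.dictionary.items():
    if idx < num_words:
        embedding_matrix[idx] = glove.word_vectors[idx]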
import itertools
from gensim.models.word2vec import Text8Corpus
from glove import Corpus, Glove

# to download and unpack the text8 corpus, run these commands:

# wget http://mattmahoney.net/dc/text8.zip -P /tmp
# unzip text8.zip


sentences = list(itertools.islice(Text8Corpus('/tmp/text8'), None))
corpus = Corpus()
corpus.fit(sentences, window=10)
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

print(glove.most_similar('frog', number=10))
print(glove.most_similar('girl', number=10))
print(glove.most_similar('car', number=10))
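Besides most_similar, individual vectors can be read directly once the dictionary has been added; a minimal sketch:

# raw 100-dimensional vector for a single token
frog_vec = glove.word_vectors[glove.dictionary['frog']]
print(frog_vec.shape)  # (100,)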