Example #1
File: test_glove.py  Project: Ron3/tails
    def test_pythonObj(self):
        """Measure a nested Python object graph and print the report."""

        class A():
            def __init__(self):
                self.b1 = B()
                self.b2 = B()
                self.list = [1000, 23424.2, 'asdf0', u'unicode编码', self.b1]
                self.dic = {
                    132323423412312311: 'utf8编码',
                    '232': self.b2,
                    self.b2: set([1,2]),
                    u'unicode编码': None,
                    123: (11,1,111),
                    11: (11,1,111),
                }

        class B():
            def __init__(self):
                self.none = None
                self.str = '1111'
                self.int = 15151515151515155151
                self.float = 11231231231212342323.
                self.list = [1,2,3,4]
                self.dict = {1:2, 2:3}
                self.tuple = (1,2,3, 4)

                return

        glove = Glove(A())
        glove.meaure()
        print(glove.report)
Example #2
def word_embedding(sentences, embedding_size, windows_len):
    """
    Build a GloVe model and vocabulary mappings from a list of sentences.
    """

    corpus_model = Corpus()

    corpus_model.fit(sentences, window=windows_len)

    # Note: epochs=0 performs no training passes, so the word vectors
    # keep their random initialization.
    glove_model = Glove(no_components=embedding_size, learning_rate=0.05)
    glove_model.fit(corpus_model.matrix,
                    epochs=0,
                    no_threads=2)

    log_cooc_mat = corpus_model.matrix.copy()
    log_cooc_mat.data = np.log(log_cooc_mat.data)
    log_cooc_mat = np.asarray(log_cooc_mat.todense())

    corpus_dict = corpus_model.dictionary
    corpus_inverse_dict = dict(map(reversed, corpus_dict.items()))

    return glove_model, corpus_dict, corpus_inverse_dict
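A short usage sketch for the helper above; the token lists and sizes are illustrative:

model, word_to_id, id_to_word = word_embedding(
    [["deep", "learning"], ["deep", "glove"]],
    embedding_size=16,
    windows_len=2)
vector = model.word_vectors[word_to_id["deep"]]  # untrained here, since epochs=0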
Example #3
class MyGloVe:
    def initiate_model(self, input_corpus):
        self.corpus_model = Corpus()
        self.corpus_model.fit(self.__read_corpus(input_corpus), window=10)

        self.glove = Glove(no_components=100, learning_rate=0.05)
        self.glove.fit(self.corpus_model.matrix, epochs=200)
        self.glove.add_dictionary(self.corpus_model.dictionary)

    def cosine_similarity(self, first_text, second_text):
        first = self.__average_feature_vector(first_text)
        second = self.__average_feature_vector(second_text)

        return 1 - spatial.distance.cosine(first, second)

    def __read_corpus(self, input_corpus):
        for line in input_corpus:
            yield line

    def __average_feature_vector(self, text):
        words = text.split()
        words_no = 0
        feature_vector = numpy.zeros((100, ), dtype="float32")

        for word in words:
            if word in self.glove.dictionary:
                word_idx = self.glove.dictionary[word]
                words_no += 1
                feature_vector = numpy.add(feature_vector,
                                           self.glove.word_vectors[word_idx])

        if words_no > 0:
            feature_vector = numpy.divide(feature_vector, words_no)

        return feature_vector
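A hypothetical usage sketch for the class above, assuming `input_corpus` is an iterable of token lists (which is what `Corpus.fit` expects):

# Hypothetical input: an iterable of token lists.
docs = [["king", "queen", "royal"], ["palace", "crown", "royal"]]
model = MyGloVe()
model.initiate_model(docs)
print(model.cosine_similarity("king queen", "palace crown"))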
Example #4
 def __init__(self):
     # Corpus model
     vocab = dict(torch.load("../data/dialogue.vocab.pt", "text"))
     self.corpus_model = Corpus(dictionary=vocab['tgt'].stoi)
     # Model
     self.glove = Glove(no_components=args.no_components,
                        learning_rate=args.learning_rate)
Example #5
    def initiate_model(self, input_corpus):
        self.corpus_model = Corpus()
        self.corpus_model.fit(self.__read_corpus(input_corpus), window=10)

        self.glove = Glove(no_components=100, learning_rate=0.05)
        self.glove.fit(self.corpus_model.matrix, epochs=200)
        self.glove.add_dictionary(self.corpus_model.dictionary)
Example #6
def build_model_glove(args):

    from glove import Glove, Corpus

    if not os.path.exists(args.corpus_model) or \
            max(map(os.path.getmtime, args.input)) >= os.path.getmtime(args.corpus_model):

        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')

        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(args.corpus_model)

        logging.info('Dict size: %s' % len(corpus_model.dictionary))
        logging.info('Collocations: %s' % corpus_model.matrix.nnz)
    else:
        # Try to load a corpus from disk.
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(args.corpus_model)

        logging.info('Dict size: %s' % len(corpus_model.dictionary))
        logging.info('Collocations: %s' % corpus_model.matrix.nnz)

    # Train the GloVe model and save it to disk.
    logging.info('Training the GloVe model')

    glove = Glove(no_components=CONFIG['glove']['size'], learning_rate=CONFIG['glove']['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=CONFIG['glove']['epochs'],
              no_threads=args.workers, verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
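For reference, the CONFIG entries read by this function might look like the following; the values are illustrative, not taken from the source project:

CONFIG = {
    'glove': {
        'window': 10,          # context window for Corpus.fit
        'size': 100,           # no_components of the GloVe model
        'learning_rate': 0.05,
        'epochs': 30,
    }
}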
Example #7
    def Myself_Model(self,
                     corpus_path,
                     save=None,
                     back_corpus=None,
                     epochs=10,
                     no_threads=8,
                     no_components=100,
                     learning_rate=0.05):
        """
        Train a GloVe model on the corpus at `corpus_path` and return it.
        """

        self.get_data = self.read_corpus(corpus_path)
        corpus_model = Corpus()
        corpus_model.fit(self.get_data, window=10)
        if back_corpus is not None:
            # Hand the fitted corpus back to the caller
            # (assumes back_corpus is a list-like accumulator).
            back_corpus.append(corpus_model)

        #self.glove = Glove()
        self.glove = Glove(no_components=no_components,
                           learning_rate=learning_rate)
        self.glove.fit(corpus_model.matrix,
                       epochs=epochs,
                       no_threads=no_threads,
                       verbose=True)
        self.glove.add_dictionary(corpus_model.dictionary)

        if save is not None:
            #save = 'model/articles_glove.model'
            self.glove.save(save)

        self.model = self.glove
        return self.glove
Example #8
 def trainShake2(self):
     corpus = Corpus()
     shakespeare_words = self.shakespeare_lines()
     # corpus.fit(shakespeare_corpus + sonnets_corpus, window=10)
     corpus.fit(shakespeare_words, window=10)
     self.glove = Glove(no_components=100, learning_rate=0.05)
     self.glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
     self.glove.add_dictionary(corpus.dictionary)
Example #9
def getGloveEmbedding(seqs, size=300, window=10, epochs=20):
    corpus = Corpus()
    corpus.fit(seqs, window=window)

    glove = Glove(no_components=size, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=epochs, verbose=True)

    return corpus.dictionary, glove.word_vectors
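The returned dictionary maps each word to its row in the vector matrix; a small lookup sketch with illustrative sequences:

dictionary, vectors = getGloveEmbedding([["hello", "world"], ["hello", "glove"]],
                                        size=8, epochs=5)
hello_vec = vectors[dictionary["hello"]]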
Example #10
def test_word_vector_by_word_without_fitting():
    glove_model = Glove()
    glove_model.dictionary = {"word": 0}

    with pytest.raises(Exception) as ex:
        glove_model.word_vector_by_word("word")

    assert str(ex.value) == "Model must be fit before querying"
Example #11
def get_embedding(words):
    from glove import Glove
    global glove_obj
    if not glove_obj:
        glove_obj = Glove()
    if words not in glove_cache:
        glove_cache[words] = glove_obj.get_vector(words)
    return glove_cache[words]
Example #12
def test_word_vector_by_word_without_dictionary():
    glove_model = Glove()
    glove_model.word_vectors = [[1, 2, 3]]

    with pytest.raises(Exception) as ex:
        glove_model.word_vector_by_word("word")

    assert str(ex.value) == "No word dictionary supplied"
Example #13
def get_glove_model(co_occurrence_dic, epochs=25):
    assert isinstance(co_occurrence_dic, dict)

    model = Glove(co_occurrence_dic)
    for epoch in range(epochs):  # train the model
        err = model.train()
        print("epoch %d, error %.3f" % (epoch, err), flush=True)
    return model.W
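Unlike most examples on this page, this snippet targets a GloVe implementation whose constructor takes a nested co-occurrence dictionary and exposes train() and an embedding matrix W; an illustrative call under that assumption:

# Illustrative counts: focus word -> {context word: co-occurrence weight}.
co_occurrence_dic = {
    "ice":   {"cold": 4.0, "water": 2.0},
    "steam": {"hot": 3.0, "water": 2.0},
}
embeddings = get_glove_model(co_occurrence_dic, epochs=5)  # rows of model.W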
Example #14
    def fit_matrix(self, coo_mat):
        model = Glove(no_components=self.n_features, learning_rate=0.05)
        model.fit(coo_mat,
                  epochs=self.n_epochs,
                  no_threads=self.n_threads,
                  verbose=True)
        self.model = model

        return self
Example #15
    def Pre_Existing_Model(self, corpus_path):

        self.get_data = self.read_corpus(corpus_path)

        glove = Glove()
        stanford = glove.load_stanford(self.model_kind)
        self.model = stanford

        return stanford
Example #16
    def build_model_glove(self,
                          file_model_glove,
                          vector_size=300,
                          window=100,
                          learning_rate=0.05,
                          workers=4,
                          epochs=100,
                          use_stored_model=True):
        """ Build glove model
        
        Args:
        --------
        file_model_glove: str,
            Filename to save model (or load model if it exists under this name).
        vector_size: int,
            Dimensions of word vectors (default = 100) 
        window: int,
            Window size for context words (small for local context, 
            larger for global context, default = 50)
        learning_rate: float,
            Learning rate for training of GloVe model (defaut set to 0.05).
        workers: int,
            Number of threads to run the training on (should not be more than number of cores/threads, default = 4).
        epochs: int,
            Number of training iterations (default=100). 
        use_stored_model: bool,
            Load stored model if True, else train new model.
        """
        from glove import Corpus, Glove

        # Check if model already exists and should be loaded
        if os.path.isfile(file_model_glove) and use_stored_model:
            print("Load stored GloVe model ...")
            self.model_glove = Glove.load(file_model_glove)
        else:
            if use_stored_model:
                print("Stored GloVe model not found!")

            print("Calculating new GloVe model...")

            # Creating a corpus object
            corpus = Corpus()

            # Training the corpus generates the co-occurrence matrix used by GloVe
            MS_docs = self.corpus
            corpus.fit(MS_docs, window=window)

            self.model_glove = Glove(no_components=vector_size,
                                     learning_rate=learning_rate)
            self.model_glove.fit(corpus.matrix,
                                 epochs=epochs,
                                 no_threads=workers,
                                 verbose=True)
            self.model_glove.add_dictionary(corpus.dictionary)

            # Save model
            self.model_glove.save(file_model_glove)
Example #17
def glove_benchmark(sentences_iterable, window=5, size=100, alpha=0.05,
                    epochs=10, num_threads=1):
    """ Wrapper around the steps needed to build the GloVe model,
    so that performance measurements can be taken.
    """
    corpus_model = Corpus()
    corpus_model.fit(sentences_iterable, window=window)
    glove = Glove(no_components=size, learning_rate=alpha)
    glove.fit(corpus_model.matrix, epochs=epochs,
              no_threads=num_threads, verbose=False)
Example #18
 def __init__(self, glove_components=300, min_df=5, max_df=0.4):
     self.glove_model = Glove(no_components=glove_components)
     self.tf_idf_model = TfidfVectorizer(min_df=min_df,
                                         max_df=max_df,
                                          token_pattern=r'[^\s]+',
                                         lowercase=False)
     self.word_mapping = None
     self.embedding_dim = glove_components
     self.wmd = None
     self._r = None
Example #19
    def __init__(self, options):
        super(TextModel, self).__init__()

        self.options = options
        self.glove = Glove()
        self._GloVe = self.glove.get_embeddings()
        self.embeddings = nn.EmbeddingBag(constants.VOCAB_SIZE,
                                          constants.EMBED_DIM,
                                          mode=constants.REDUCE_TYPE)
        self.embeddings.weight = nn.Parameter(torch.tensor(self._GloVe))
Example #20
def main():
    """
    Load the saved GloVe models and export their embeddings as text files.
    """
    glove_words_model = Glove.load('glove_words_0.001_1_win1.model')
    glove_chars_model = Glove.load('glove_chars_0.001_1_win3.model')
    save_words_path = 'glove_words_embed.txt'
    save_chars_path = 'glove_chars_embed.txt'
    etl_glove(glove_ana=glove_words_model, save_txt_path=save_words_path)
    etl_glove(glove_ana=glove_chars_model, save_txt_path=save_chars_path)
Example #21
    def prepare_data(self):
        test, train, val = utils.load_test_train_val(self.data_num) # df

        train_texts = list(train.posts)

        glove = Glove()
        glove.create_custom_embedding([word for text in train_texts for word in text.split()])

        self.train_tuple = utils.process_data(train, glove, self.max_words, self.max_posts)
        self.test_tuple = utils.process_data(test, glove, self.max_words, self.max_posts)
        self.val_tuple = utils.process_data(val, glove, self.max_words, self.max_posts)
Example #22
def glove_vectors(x,
                  embedding_size,
                  epochs=50,
                  lr=0.05,
                  alpha=0.75,
                  max_count=100,
                  tmp_loc='glove.w2vmodel'):
    # Create the dict ourselves so that the ids correspond to each value's location in the df, counting from the first column downwards.
    df = pd.DataFrame(x)
    word_id_dict = create_vocab_dict(df)
    # Creating a corpus object
    corpus = Corpus(dictionary=word_id_dict)
    # Training the corpus generates the co-occurrence matrix used by GloVe.
    # Distance scaling: standard GloVe down-weights an occurrence count based on how far
    # a context word is from the focus word. It should not be used here, since distance
    # has no meaning for purely categorical variables.
    corpus.fit(df.values.tolist(),
               window=len(df.columns),
               distance_scaling=False)
    # alpha is the weighting of the loss based on how likely a co-occurrence (Xij) is; less likely = less weight.
    glove = Glove(no_components=embedding_size,
                  learning_rate=lr,
                  alpha=alpha,
                  max_count=max_count)
    glove.fit(
        corpus.matrix, epochs=epochs, no_threads=1, verbose=True
    )  # glove paper: 50 epochs for dimensionality <300, 100 otherwise
    glove.add_dictionary(corpus.dictionary)
    glove.save_word2vec_format(tmp_loc)

    model = KeyedVectors.load_word2vec_format(tmp_loc)
    if os.path.exists(tmp_loc):
        os.remove(tmp_loc)
    return model
Example #23
def main():
    corpus_model = Corpus.load('bioc-corpus-AZ2.model')
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=16, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('bioc-glove-AZ2.model')
Example #24
def get_embeddings(prepared_input):
    corpus = Corpus()
    corpus.fit(prepared_input, window=10)
    glove = Glove(no_components=5, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save('glove.model')
Example #25
def train_glove_fashionrec(dimensionality, context, epochs):
    """ Train with Glove on IG corpora"""
    total_count, vocab_size = corpus_stats("data/clean2_corpus.txt")
    print("total word count: {}, vocabulary size: {}".format(
        total_count, vocab_size))
    fileName = "results/training/glove_fashion_epochs" + str(
        epochs) + "_d" + str(dimensionality) + "_c" + str(
            context) + "_" + ".txt"
    corpus = readCorpus()
    lines = corpus.split("\n")
    linessplit = map(lambda x: x.split(" "), lines)
    corpus_model = Corpus()
    start_time = datetime.now()
    corpus_model.fit(linessplit, window=context)
    corpusModelFile = "trained/glove_fashion_epochs" + str(
        epochs) + "_d" + str(dimensionality) + "_c" + str(
            context) + "_corpus" + ".model"
    corpus_model.save(corpusModelFile)
    glove = Glove(no_components=dimensionality, learning_rate=0.05)
    glove.fit(corpus_model.matrix,
              epochs=int(epochs),
              no_threads=8,
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    time_elapsed = datetime.now() - start_time
    gloveModelFile = "trained/glove_fashion_epochs" + str(epochs) + "_d" + str(
        dimensionality) + "_c" + str(context) + "_vecs" + ".model"
    glove.save(gloveModelFile)
    notes = "Glove Fashion Data," + str(dimensionality) + " dim, " + str(
        context) + " context, " + str(
            epochs) + " epochs \n" + "Training time: " + str(time_elapsed)
    save_to_file(fileName, notes)
    gloveVecFile = "trained/glove_fashion_epochs" + str(epochs) + "_d" + str(
        dimensionality) + "_c" + str(context) + "_vecs" + ".vec"
    save_glove_bin_to_vec(glove, gloveVecFile)
Example #26
def train_glove(target_group, glove_para, src_file, save_model_name):
    """
    example: train_glove(target_group='words', glove_para=glove_para_word)
    after save the mode, u can use it by : glove_ana = Glove.load('glove_words.model')
    :param target_group: 'words' or 'chars'
    :param glove_para: glove_para_word = {'window_size':4, 'no_components':300, 'learning_rate':0.05, 'no_epochs':2, 'parallelism':4}
    :return:
    """
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(src_file=src_file,
                                 words_or_chars=target_group),
                     window=glove_para['window_size']
                     )  #avg word size is 6 for each sentence
    corpus_model.save('corpus_model_{}.model'.format(target_group))
    print(target_group)
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    print('Training the GloVe model')

    glove = Glove(no_components=glove_para['no_components'],
                  learning_rate=glove_para['learning_rate'])
    glove.fit(corpus_model.matrix,
              epochs=glove_para['no_epochs'],
              no_threads=glove_para['parallelism'],
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save(save_model_name)
Example #27
def train_glove(src_filename, dim=100):
    corpus = Corpus()
    corpus.fit(get_lines(src_filename), window=10)
    glove = Glove(no_components=dim, learning_rate=0.001)
    glove.fit(corpus.matrix, epochs=100, no_threads=20, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save(DATA_DIR + 'glove.{}d.model'.format(dim))
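A follow-up sketch for reloading and querying the saved model with glove_python's Glove.load and most_similar; the query word is hypothetical:

glove = Glove.load(DATA_DIR + 'glove.100d.model')
for word, score in glove.most_similar('king', number=5):
    print(word, score)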
Example #28
def test_fitting():
    """
    Verify that the square error diminishes with fitting
    """

    num_sentences = 5000
    seed = 10

    corpus = Corpus()

    corpus.fit(generate_training_corpus(num_sentences, vocabulary_size=50, seed=seed))

    # Check that the performance is poor without fitting
    glove_model = Glove(no_components=100, learning_rate=0.05)
    glove_model.fit(corpus.matrix, epochs=0, no_threads=2)

    log_cooc_mat = corpus.matrix.copy()
    log_cooc_mat.data = np.log(log_cooc_mat.data)
    log_cooc_mat = np.asarray(log_cooc_mat.todense())

    repr_matrix = _reproduce_input_matrix(glove_model)

    assert ((repr_matrix - log_cooc_mat) ** 2).sum() > 30000.0

    # Check that it is good with fitting
    glove_model = Glove(no_components=100, learning_rate=0.05)
    glove_model.fit(corpus.matrix, epochs=500, no_threads=2)

    repr_matrix = _reproduce_input_matrix(glove_model)

    assert ((repr_matrix - log_cooc_mat) ** 2).sum() < 1500.0
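The helper _reproduce_input_matrix is not shown in this example. A minimal sketch of what it plausibly computes, based on the GloVe objective (w_i · w_j + b_i + b_j ≈ log X_ij) and glove_python's word_vectors and word_biases attributes:

import numpy as np

def _reproduce_input_matrix(glove_model):
    # The fit drives w_i . w_j + b_i + b_j towards log(X_ij), so the trained
    # vectors and biases approximately rebuild the log co-occurrence matrix.
    vectors = glove_model.word_vectors
    biases = glove_model.word_biases
    return (np.dot(vectors, vectors.T)
            + biases[np.newaxis, :]
            + biases[:, np.newaxis])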
Example #29
def train_and_save_model(data_dir,
                         model_name='LeGlove',
                         num_epochs=10,
                         parallel_threads=1):
    '''
    This function processes all the data into a training
    corpus and fits a GloVe model to this corpus. 

    Parameters:
        data_dir (string):          master directory containing all jurisdiction-level directories
        model_name (string):        name of model to be used for output
        num_epochs (int):           number of epochs for which to train model
        parallel_threads (int):     number of parallel threads to use for training

    The trained model is saved as "[model_name].model" into the current directory.
    '''

    corpus_model = Corpus()
    corpus_model.fit(read_corpus(data_dir), window=CONTEXT_WINDOW)

    glove = Glove(no_components=NUM_COMPONENTS, learning_rate=LEARNING_RATE)
    glove.fit(corpus_model.matrix,
              epochs=num_epochs,
              no_threads=parallel_threads,
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save(model_name + '.model')
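The module-level constants used above are not shown in the snippet; plausible, purely illustrative values:

CONTEXT_WINDOW = 10   # window passed to Corpus.fit
NUM_COMPONENTS = 100  # embedding dimensionality
LEARNING_RATE = 0.05  # initial AdaGrad learning rate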
Example #30
def train_glove(path):
    import itertools
    from gensim.models.word2vec import Text8Corpus
    from gensim.scripts.glove2word2vec import glove2word2vec
    from glove import Corpus, Glove
    #import os
    #import struct
    sentences = list(itertools.islice(Text8Corpus(path), None))
    corpus = Corpus()
    corpus.fit(sentences, window=10)
    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    file_name = 'embeddings_models/model_glove_' + str(TRAINING_SENTENCES)
    glove.save(file_name)
    glove2word2vec(file_name, file_name + '_modified')
    """
    command = 'python -m gensim.scripts.glove2word2vec -i ' +file_name+' -o '+file_name+'_modified'
    os.system(command)
    with open(file_name+'_modified', mode='rb') as file: # b is important -> binary
        fileContent = file.read()
        print 'Content',fileContent
    """
    print('Finished')
    return glove
Example #31
 def __init__(self, data):
     self.data = data
     self.corpus = None
     self.liu = LiuLexicon()
     self.subj = SubjLexicon()
     self.buildTweetCorpus()
     self.word_vec_model = Word2Vec(self.corpus)
     self.glove_vec_model = Glove(100, self.corpus)
     self.clusters = Cluster(100)
     self.initEncoders()
     self.topicVecs = self.word_vec_model.getVectorsForTopics(
         self.topicenc.classes_)
     self.collectTopUnigrams()
     self.collectTopBigrams()
Example #32
def train(path, freq, window, dim, lr, epochs):
    lines = []
    dic = {}
    print("Start of train method")
    try:
        for f in os.listdir(path):
            text = open(path + '/' + f, 'r').read()
            text = re.sub('\n', ' ', text)
            text = text.split()
            for word in text:
                if word in dic:
                    dic[word] += 1
                else:
                    dic[word] = 1
        print("Created Dictionary for frequencies of words.")
        for f in os.listdir(path):
            text = open(path + '/' + f, 'r').read()
            text = re.sub('\n', ' ', text)
            text = text.split()
            text = [word for word in text if dic[word] > freq]
            lines.append(text)
        print(
            "Converted preprocessed text data in input format of array of array of words."
        )
        corpus = Corpus()
        corpus.fit(lines, window=window)
        glove = Glove(no_components=dim, learning_rate=lr)
        glove.fit(corpus.matrix, epochs=epochs, verbose=True)
        glove.add_dictionary(corpus.dictionary)
        glove.save('glove.model')
        print("Saved the trained model to glove.model.")
    except Exception as e:
        print("Error occurred while training the glove model:", e)
Example #33
def train_model(line):
    corpus = Corpus()
    corpus.fit(line)
    glove = Glove(no_components=5, learning_rate=0.05, random_state=0)
    glove.fit(corpus.matrix, epochs=10, no_threads=100, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save('glove.model')
    return glove
Example #34
def get_model():
    ''' lazy initialization for glove model so it works in pool '''
    global model
    if model is None:
        print('loading the glove model...')
        model = Glove.load('w2v/glove_lemma_stopwords')
    return model
Example #35
 def __init__(self,data_src,num_features=100,window=10,learning_rate=0.05,epochs=10):
     self.learning_rate = learning_rate
     self.num_features = num_features
     self.window = window
     self.epochs = epochs
     self.pretrain(data_src)
     self.model = Glove.load("glove.model")
Example #36
def train_glove(sentences):
    print('training glove model...')
    t0 = time()
    
    num_features = 300    # Word vector dimensionality
    context = 5          # Context window size
    learning_rate = 0.05
    
    corpus = Corpus()
    corpus.fit(sentences, window=context)

    glove = Glove(no_components=num_features, learning_rate=learning_rate)
    glove.fit(corpus.matrix, epochs=30, no_threads=8, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    print('took %0.5fs.' % (time() - t0))
    return glove
Example #37
File: Glove.py  Project: Denffer/yelp-re
    def run_glove(self):
        """ run global vector """
        #sentences = [["hi","good","to"],["see","u"]]
        sentences = self.get_sentences()

        print('\n' + '-'*80)
        print("Fitting words into corpus")
        corpus = Corpus()
        corpus.fit(sentences, window=10)

        print "Running Glove"
        glove = Glove(no_components=200, learning_rate=0.05)
        glove.fit(corpus.matrix, epochs=5, no_threads=10, verbose=True)
        glove.add_dictionary(corpus.dictionary)

        print "Fitting words and vectors into unique_words and vectors200"
        unique_words = []
        vectors200 = []

        cnt1 = 0
        length1 = len(glove.inverse_dictionary)
        for word_id in glove.inverse_dictionary:
            cnt1 += 1
            unique_words.append(glove.inverse_dictionary[word_id])
            vectors200.append(glove.word_vectors[word_id])

            sys.stdout.write("\rStatus: %s / %s"%(cnt1, length1))
            sys.stdout.flush()

        print '\n' + "Processing vectors200"
        processed_vectors200 = []
        processed_vector = []

        cnt2 = 0
        length2 = len(vectors200)
        for vector in vectors200:
            cnt2 += 1
            for float_num in vector:
                processed_vector.append(float_num)

            processed_vectors200.append(processed_vector)

            sys.stdout.write("\rStatus: %s / %s"%(cnt2, length2))
            sys.stdout.flush()

        return unique_words, processed_vectors200
Example #38
def main():
    corpus_model = Corpus.load('bioc-corpus-AZ2.model')
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=16, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('bioc-glove-AZ2.model')
Example #39
File: main.py  Project: agude/attalos
def load_wv_model(word_vector_file, word_vector_type):
    if word_vector_type == WordVectorTypes.glove.name:
        from glove import Glove
        glove_model = Glove.load_stanford(word_vector_file)
        wv_model = GloveWrapper(glove_model)
    else: 
        import word2vec
        w2v_model = word2vec.load(word_vector_file)
        wv_model = W2VWrapper(w2v_model)
    return wv_model
Example #40
def create_vectors_dataset(input_files, vector_files, max_len=500):
    print('Creating word vectors file')

    training_set_file, test_set_file = input_files
    train_word_file, test_word_file = vector_files
    
    train_stories = pickle.load(open(training_set_file,'r'))
    test_stories = pickle.load(open(test_set_file,'r'))

    train_stories = [(reduce(lambda x,y: x + y, map(list,fact)),q) for fact,q in train_stories]
    test_stories = [(reduce(lambda x,y: x + y, map(list,fact)),q) for fact,q in test_stories]

    vocab = sorted(reduce(lambda x, y: x | y, (set(story + [answer]) for story, answer in train_stories + test_stories)))

    # Reserve 0 for masking via pad_sequences
    vocab_size = len(vocab) + 1
    story_maxlen = max(map(len, (x for x, _ in train_stories + test_stories)))


    print('-')
    print('Vocab size:', vocab_size, 'unique words')
    print('Story max length:', story_maxlen, 'words')
    print('Number of training stories:', len(train_stories))
    print('Number of test stories:', len(test_stories))
    print('-')
    print('Here\'s what a "story" tuple looks like (input, query, answer):')
    print(train_stories[0])
    print('-')
    print('Vectorizing the word sequences...')

    word_idx = dict((c, i + 1) for i, c in enumerate(vocab))

    answer_vocab = sorted(reduce(lambda x, y: x | y, (set([answer]) for _, answer in train_stories + test_stories)))
    # Reserve 0 for masking via pad_sequences
    answer_dict = dict((word, i) for i, word in enumerate(answer_vocab))
    print('Answers dict len: {0}'.format(len(answer_dict)))

    # I need to check also if this exist
    word_vectors_dir = 'word_vectors/glove.42B.300d.txt'
    word_vectors_model = Glove.load_stanford(word_vectors_dir)

    inputs_train, answers_train = get_word_vectors(train_stories, answer_dict, 
                                                   max_len, word_vectors_model)
    inputs_test, answers_test = get_word_vectors(test_stories, answer_dict, max_len,
                                                 word_vectors_model)

    with h5py.File(train_word_file,'w') as train_f:
        _ = train_f.create_dataset('inputs',data=inputs_train)
        _ = train_f.create_dataset('answers',data=answers_train)
    with h5py.File(test_word_file,'w') as test_f:
        _ = test_f.create_dataset('inputs',data=inputs_test)
        _ = test_f.create_dataset('answers',data=answers_test)
        
    return (inputs_train, answers_train),(inputs_test, answers_test)
Example #41
File: test_glove.py  Project: Ron3/tails
    def test_measure(self):
        """

        :return:
        """
        class A():
            pass

        a = A()

        for i in xrange(100):
            a1 = A()
            for j in xrange(100):
                a2 = A()
                setattr(a1, 'a%s' % j, a2)
            setattr(a, 'a%s' % i, a1)

        glove = Glove(a)
        glove.meaure()
        print(glove.report)
Example #42
	def __init__(self, data):
		self.data = data
		self.corpus = None
		self.liu = LiuLexicon()
		self.subj = SubjLexicon()
		self.buildTweetCorpus()
		self.word_vec_model = Word2Vec(self.corpus)
		self.glove_vec_model = Glove(100, self.corpus)
		self.clusters = Cluster(100)
		self.initEncoders()
		self.topicVecs = self.word_vec_model.getVectorsForTopics(self.topicenc.classes_)
		self.collectTopUnigrams()
		self.collectTopBigrams()
Example #43
 def pretrain(self,data_src):
     if not os.path.isfile("glove.model"):
         data_src = DataClean([
                             ["[^a-z]"," "],  # only letters
                             [" [ ]+", " "],  # remove extra spaces
                             ],html_clean=True,split_words=True).fit(data_src).transform(data_src)
         corpus_model = Corpus()
         corpus_model.fit(data_src,window=self.window)
         glove = Glove(no_components=self.num_features,learning_rate=self.learning_rate)
         glove.fit(corpus_model.matrix,epochs=self.epochs,verbose=True)
         glove.add_dictionary(corpus_model.dictionary)
         glove.save("glove.model")
def glove_vector_download_and_save(url, outdir, maxmegabytes):

    # construct filenames
    filename_full = os.path.basename(url)
    filename_name = os.path.splitext(filename_full)[0]

    # create file-specific output directory
    dirname_file = "{}/{}".format(outdir, filename_name)
    if not os.path.isdir(dirname_file):
        os.mkdir(dirname_file)

    # download file
    filename_save = "{}/{}".format(dirname_file, filename_full)
    if not os.path.isfile(filename_save):
        print("downloading {}...".format(filename_save))
        urllib.urlretrieve(url, filename_save)

    # extract zip
    print("extracting {}...".format(filename_save))
    with zipfile.ZipFile(filename_save, "r") as z:
        z.extractall(dirname_file)

    # build model for each file
    file_pattern = "{}/*.txt".format(dirname_file)
    for file_glove_in in glob.glob(file_pattern):

        try:
            # ensure file isn't too big
            filesize = os.path.getsize(file_glove_in) / 1024 / 1024
            if filesize > maxmegabytes:
                print("skipping {}M file {}...".format(filesize, file_glove_in))

            else:

                # load vectors
                print("importing glove vectors from {}".format(file_glove_in))
                model = Glove.load_stanford(file_glove_in)

                # save model object
                file_glove_out = "{}.obj".format(os.path.splitext(file_glove_in)[0])
                print("saving glove model to {}...".format(file_glove_out))
                model.save_obj(file_glove_out)

                # delete extracted file
                os.remove(file_glove_in)

        except MemoryError as e:
            print(e.strerror)
Example #45
def test_stanford_loading():

    model = Glove.load_stanford('glove/tests/stanford_test.txt')

    assert model.word_vectors is not None
    assert model.word_vectors.shape == (100, 25)
    assert len(model.dictionary) == 100

    # Python 2/3 compatibility. Check the ellipsis
    # character is in the dictionary.
    try:
        # Python 2
        assert unichr(8230) in model.dictionary
    except NameError:
        # Python 3
        assert '…' in model.dictionary
Example #46
    def from_glove_model(cls, vector_file):
        """
        WARNING: `glove_python` is required to use this function!

        Load a GloVe vector model.
        :param vector_file: path to a GloVe model file (or an already-loaded model)
        :return: a `Vectors` object
        """
        from glove import Glove

        model = Glove.load_stanford(vector_file) if isinstance(vector_file, str) else vector_file
        vocab = model.dictionary.keys()

        vectors = {}

        dims = model.no_components  # vector dimensionality

        dimension_names = ['f%02d' % i for i in range(dims)]
        for word in vocab:
            vectors[word] = zip(dimension_names, model.word_vectors[model.dictionary[word]])

        return Vectors(vectors)
Example #47
    def download_and_save_vectors_glove(self, url, outdir, datafile=None, maxmegabytes=None):
        '''
            download and save pre-trained glove model
        '''

        # download file
        dirname_file = self.download_and_extract_file(url, outdir)

        # extract file
        file_in = "{}/{}.txt".format(dirname_file, datafile)

        # build output filename
        fullpath_out = self.download_fullpath(outdir, datafile)

        # catch memory exceptions
        try:

            # ensure file isn't too big
            filesize = os.path.getsize(file_in) / 1024 / 1024
            filesize_ok = (not maxmegabytes or filesize <= int(maxmegabytes))

            # download specific file and/or files under specific limit
            if filesize_ok:
                print("importing glove vectors from {}".format(file_in))
                model = Glove.load_stanford(file_in)

                # save model object to specified output directory
                print("saving glove model to {}...".format(fullpath_out))
                model.save_obj(fullpath_out)
            else:
                print("skipping file {}...".format(file_in))


        except MemoryError as e:
            print(e.strerror)

        # remove extracted directory
        shutil.rmtree(dirname_file)
Example #48
def get_data(args):

    feature_set_names = CONFIG['train']['features']
    if set(feature_set_names).intersection(['word2vec', 'doc2vec']) and not args.embedding:
        raise RuntimeError("--embedding argument must be supplied")

    # get Y labels
    training_set = read_tsv(args.train)
    y_labels = training_set["sentiment"]

    sentences = [obj['review'] for obj in read_json_lines(args.sentences)]

    if not args.embedding or feature_set_names == ['bow']:
        # don't drop NaNs -- have a sparse matrix here
        return False, (get_bow_features(sentences), y_labels)

    # load embedding
    if CONFIG['pretrain']['algorithm'] == 'word2vec':
        embedding = word2vec.Word2Vec.load(args.embedding)
    elif CONFIG['pretrain']['algorithm'] == 'glove':
        embedding = Glove.load(args.embedding)
        # dynamically add GloveWrapper mixin
        embedding.__class__ = type('MyGlove', (Glove, GloveWrapper), {})

    # get feature vectors
    if 'doc2vec' in CONFIG['train']['features']:
        embedding_vectors = get_doc2vec_features(sentences, embedding)
    elif 'word2vec' in CONFIG['train']['features']:
        embedding_vectors = get_word2vec_features(sentences, embedding)
    else:
        raise RuntimeError("Invalid config setting train:features=%s" % CONFIG['train']['features'])

    if 'bow' in feature_set_names:
        return True, get_mixed_features(sentences, embedding_vectors, y_labels)
    else:
        # matrix is dense -- drop NaNs
        return False, drop_nans(embedding_vectors, y_labels)
Example #49
def get_data(args):

    feature_set_names = CONFIG['train']['features']
    if set(feature_set_names).intersection(['embedding']) and not args.embedding:
        raise RuntimeError("--embedding argument must be supplied")

    # get input data
    sentences, y_labels = sample_by_y(args)

    if not args.embedding or feature_set_names == ['bow']:
        # don't drop NaNs -- have a sparse matrix here
        X = get_bow_features(sentences)
        return False, (X, y_labels)

    # load embedding
    if CONFIG['pretrain']['algorithm'] == 'word2vec':
        from gensim.models import word2vec
        embedding = word2vec.Word2Vec.load(args.embedding)
    elif CONFIG['pretrain']['algorithm'] == 'glove':
        from glove import Glove
        embedding = Glove.load(args.embedding)
        # dynamically add GloveWrapper mixin
        embedding.__class__ = type('MyGlove', (Glove, GloveWrapper), {})

    # get feature vectors
    if 'embedding' in CONFIG['train']['features']:
        embedding_vectors = get_word2vec_features(sentences, embedding)
    else:
        raise RuntimeError("Invalid config setting train:features=%s" % CONFIG['train']['features'])

    if 'bow' in feature_set_names:
        X, y_labels = get_mixed_features(sentences, embedding_vectors, y_labels)
        return True, (X, y_labels)
    else:
        # matrix is dense -- drop NaNs
        X, y_labels = drop_nans(embedding_vectors, y_labels)
        return False, (X, y_labels)
Example #50
def build_glove_embeddings(training, testing, args):
    
    ''' Trains a GloVe model on the sentiment140 dataset

    @Arguments:
        training: the loaded sentiment140 training set, as (text, label) pairs
        testing: the loaded sentiment140 test set, as (text, label) pairs
        args: settings namespace providing vecsize, learningRate, window,
            epochs, parallelism, verbose and pca

    @Return:
        Training and testing feature matrices in sklearn format
    '''

    # initialize model
    glove = Glove(no_components = args.vecsize, learning_rate = args.learningRate)
    
    txtSource = chain(imap(lambda pair: pair[0], training), imap(lambda pair: pair[0], testing))
    
    # read in the data to train on
    corpus_model = Corpus()
    corpus_model.fit( imap(preprocess.tokenize, txtSource), window = args.window)
        
    # fit the model using the given parameters
    logging.info("Training GloVe")
    glove.fit(corpus_model.matrix, epochs = args.epochs, no_threads = args.parallelism, verbose = args.verbose)
    
    # add a dictionary just to make it easier for similarity queries
    glove.add_dictionary(corpus_model.dictionary)
    
    transformer = lambda words: glove.transform_paragraph(words, use_pca = args.pca)

    fromTraining = to_sklearn_format(transformer, training, args.vecsize)
    fromTesting = to_sklearn_format(transformer, testing, args.vecsize)
    
    return fromTraining, fromTesting
Example #51
def test_fitting():
    """
    Verify that the square error diminishes with fitting
    """

    num_sentences = 5000
    seed = 10

    corpus = Corpus()

    corpus.fit(generate_training_corpus(num_sentences,
                                        vocabulary_size=50,
                                        seed=seed))

    # Check that the performance is poor without fitting
    glove_model = Glove(no_components=100, learning_rate=0.05)
    glove_model.fit(corpus.matrix,
                    epochs=0,
                    no_threads=2)

    log_cooc_mat = corpus.matrix.copy()
    log_cooc_mat.data = np.log(log_cooc_mat.data)
    log_cooc_mat = np.asarray(log_cooc_mat.todense())

    repr_matrix = _reproduce_input_matrix(glove_model)

    assert ((repr_matrix - log_cooc_mat) ** 2).sum() > 30000.0

    # Check that it is good with fitting
    glove_model = Glove(no_components=100, learning_rate=0.05)
    glove_model.fit(corpus.matrix,
                    epochs=500,
                    no_threads=2)

    repr_matrix = _reproduce_input_matrix(glove_model)

    assert ((repr_matrix - log_cooc_mat) ** 2).sum() < 1500.0
Example #52
import argparse

from glove import Glove


if __name__ == '__main__':

    parser = argparse.ArgumentParser(description=('Load GloVe model and test similarity scores'))
    parser.add_argument('--model', '-m', action='store',
                        required=True,
                        help='The saved GloVe model object')
    args = parser.parse_args()


    # load the model
    glove = Glove.load_obj(args.model)

    # give me the 5 words most similar to each word in the words list in this
    # corpus and show me how similar the words are in this corpus to each word
    # in the words list in general
    words = ['sky', 'queen', 'car', 'run', 'medical']

    for word in words:

        # get most similar words
        similarities = glove.most_similar(word)

        # print out results
        print("Most similar to {}:".format(word))
        for match, score in similarities:
            print("\t{0:15} {1:.2f}".format(match, score))
Example #53
	def loadGloveModel(self, modelFile = MODEL_FILE):
		print("Loading pre-trained GloVe model \"{}\"...").format(modelFile)
		self.glove = Glove.load(modelFile)
		print("Done loading.")
		print("")
Example #54
mlp100 = mlp_model(100)
mlp100_accuracy = train_test(mlp100, x, y, folds)

mlp1000 = mlp_model(1000)
mlp1000_accuracy = train_test(mlp1000, x, y, folds)

print((mlp1_accuracy, mlp10_accuracy, mlp100_accuracy, mlp1000_accuracy))

#3CNN
#Glove Vectors from reviews
c = [review.split() for review in data.data]

corpus = Corpus()
corpus.fit(c, window=10)

glv = Glove(no_components=100, learning_rate=0.05)
glv.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)

glv.add_dictionary(corpus.dictionary)

embeddings_index = glv.dictionary

BASE_DIR = ''
GLOVE_DIR = BASE_DIR + '/glove.6B/'
TEXT_DATA_DIR = 'txt_sentoken/'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
Example #55
 def fit(self,X,y=None):
     self.model = Glove.load("glove.model")
     return self
Example #56
        print('Collocations: %s' % corpus_model.matrix.nnz)

    if args.train:
        # Train the GloVe model and save it to disk.

        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_model = Corpus.load('corpus.model')

            print('Dict size: %s' % len(corpus_model.dictionary))
            print('Collocations: %s' % corpus_model.matrix.nnz)

        print('Training the GloVe model')

        glove = Glove(no_components=100, learning_rate=0.05)
        glove.fit(corpus_model.matrix, epochs=int(args.train),
                  no_threads=args.parallelism, verbose=True)
        glove.add_dictionary(corpus_model.dictionary)

        glove.save('glove.model')

    if args.query:
        # Finally, query the model for most similar words.
        if not args.train:
            print('Loading pre-trained GloVe model')
            glove = Glove.load('glove.model')

        print('Querying for %s' % args.query)
        pprint.pprint(glove.most_similar(args.query, number=10))
Example #57
def embedding_func(gridded_words_overall, embedding_size):

    """***************
     GLOVE for Video
     ***************"""

    glove_bins = np.asarray(gridded_words_overall)
    print(glove_bins)
    glove_shape = glove_bins.shape
    glove_weights = np.ones(glove_shape)

    dictionary = {}
    rows = []
    cols = []
    data = array.array('f')

    # Build a weighted co-occurrence matrix over the words in each frame.
    k = 0
    for frame in glove_bins:
        for i, first_word in enumerate(frame):
            first_word_idx = dictionary.setdefault(first_word,
                                                   len(dictionary))
            w1 = glove_weights[k, i]
            for j, second_word in enumerate(frame):
                second_word_idx = dictionary.setdefault(second_word,
                                                        len(dictionary))
                w2 = glove_weights[k, j]
                distance = 1
                w = w1 * w2

                if first_word_idx == second_word_idx:
                    pass
                elif first_word_idx < second_word_idx:
                    rows.append(first_word_idx)
                    cols.append(second_word_idx)
                    data.append(np.double(w * np.double(1.0) / distance))
                else:
                    rows.append(second_word_idx)
                    cols.append(first_word_idx)
                    data.append(np.double(w * np.double(1.0) / distance))
        k = k + 1

    x = sp.coo_matrix((data, (rows, cols)),
                      shape=(len(dictionary),
                             len(dictionary)),
                      dtype=np.double).tocsr().tocoo()
    print(dictionary)

    # Normalize the counts and refit as a sparse matrix for Glove.
    xarr = x.toarray()
    xarr /= np.amax(xarr)
    print("co-occurrence matrix")
    print(xarr)
    xsparse = sp.coo_matrix(xarr)

    glove_model = Glove(no_components=embedding_size, learning_rate=0.05)
    glove_model.fit(xsparse,
                    epochs=500,
                    no_threads=2)

    new_word_representation = glove_model.word_vectors

    return new_word_representation, dictionary
Example #58
    for line in file:
        d = json.loads(line)

        uris.append(d[0])
        questions.append(d[1])
        answers.append(d[2])
        cats.append(d[3])

def get_lines():
    for a in answers:
        yield a.split()

# Build the corpus dictionary and cooccurence matrix
corpus_model = Corpus()
corpus_model.fit(get_lines(), window=8)

print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)

# Train GloVe model
#glove = Glove(no_components = no_comp, learning_rate=0.05)
glove = Glove.load_stanford('vectors.6B.100d.txt')
glove.fit(corpus_model.matrix, epochs=10, no_threads=4, verbose=True)
glove.add_dictionary(corpus_model.dictionary)

# Save
with open('model.glove', 'w+') as file:
    file.write('%i %i \n' % (len(glove.dictionary), no_comp))
    for (word, idx) in glove.dictionary.items():
        file.write('%s %s \n' % (word, ' '.join(str(n) for n in glove.word_vectors[idx])))
Example #59
                        required=True,
                        help='The filename of the stored GloVe model.')
    parser.add_argument('--encode', '-e', action='store_true',
                        default=False,
                        help=('If True, words from the '
                              'evaluation set will be utf-8 encoded '
                              'before looking them up in the '
                              'model dictionary'))
    parser.add_argument('--parallelism', '-p', action='store',
                        default=1,
                        help=('Number of parallel threads to use'))

    args = parser.parse_args()

    # Load the GloVe model
    glove = Glove.load(args.model)


    if args.encode:
        encode = lambda words: [x.lower().encode('utf-8') for x in words]
    else:
        encode = lambda words: [unicode(x.lower()) for x in words]


    # Load the analogy task dataset. One example can be obtained at
    # https://word2vec.googlecode.com/svn/trunk/questions-words.txt
    sections = defaultdict(list)
    evaluation_words = [sections[section].append(encode(words)) for section, words in
                        metrics.read_analogy_file(args.test)]

    section_ranks = []
Example #60
'''
@author: dannl
'''
from glove import Glove
from glove import Corpus
import time

cooc_file='/home/dannl/tmp/newstech/glove/word.cooc'
model_file='/home/dannl/tmp/newstech/glove/glove.model'

oldtime=time.time()
# get a cooccurrence matrix
corpus_cooc = Corpus.load(cooc_file)

# get a model
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus_cooc.matrix, epochs=5, no_threads=4, verbose=True)
glove.add_dictionary(corpus_cooc.dictionary)
glove.save(model_file)

# count=0
# for word,wid in corpus_cooc.dictionary.items():
#     count+=1
#     if count>100:
#         break
#     print word,wid
    
print('Dict size: %s' % len(corpus_cooc.dictionary))
print('Collocations: %s' % corpus_cooc.matrix.nnz)

print('time cost:%.2f' % (time.time() - oldtime))