Exemplo n.º 1
0
def parse_Word2Vec(full_content):
    """Train a GloVe model and record the nearest word for every article type.

    Despite the name, the embedding is trained with ``Corpus`` / ``Glove``.
    Each row of ``/home/ubuntu/corpus/article_types.csv`` (tab separated,
    first column used) is stripped of punctuation, lemmatized and looked up
    in the trained model.  The raw article type and its closest word are
    printed and written to ``/home/ubuntu/corpus/results.txt``.

    Args:
        full_content: Corpus handed unchanged to ``Corpus.fit``.

    Returns:
        None.
    """
    corpus = Corpus()
    corpus.fit(full_content, window=10)
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    # Loop-invariant helpers: build them once instead of once per CSV row.
    translator = str.maketrans({key: '' for key in string.punctuation})
    wordnet = WordNetLemmatizer()

    # Both files are managed by ``with`` so the results file is closed (and
    # flushed) even if reading the CSV fails part-way through.
    with open('/home/ubuntu/corpus/results.txt', 'w') as f2, \
            open('/home/ubuntu/corpus/article_types.csv', 'r') as f:
        reader = csv.reader(f, delimiter="\t")
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to look up.
                continue
            article_type = row[0]
            article_type_no_punctuation = article_type.translate(translator)
            article_type_clean = wordnet.lemmatize(article_type_no_punctuation)
            try:
                match = glove.most_similar(article_type_clean, number=10)
                matched_item = match[0][0]
            except Exception:
                # Best effort: a failed lookup (e.g. an unknown word or an
                # empty result) is reported and skipped.  Unlike a bare
                # ``except:``, Ctrl-C still interrupts the run.
                print('failed for: ' + article_type)
            else:
                print(article_type_clean + ' -> ' + str(matched_item))
                f2.write(article_type + '\n')
                f2.write(str(matched_item + '\n'))
Exemplo n.º 2
0
    def Myself_Model(self,
                     cropus_path,
                     save=None,
                     back_corpus=None,
                     epochs=10,
                     no_threads=8,
                     no_components=100,
                     learning_rate=0.05):
        """Fit a co-occurrence corpus from ``cropus_path`` and train GloVe on it.

        NOTE(review): the body contains ``yield``, so calling this method
        returns a generator and none of the work runs until that generator is
        iterated.  When ``back_corpus`` is given, the first ``next()`` produces
        the fitted ``Corpus``; the trained model is then only reachable via
        ``self.glove`` / ``self.model`` (or the ``StopIteration`` value) once
        the generator is exhausted.  This looks unintended -- confirm with
        callers before replacing ``yield`` with ``return``.

        Args:
            cropus_path: Passed to ``self.read_corpus``; its result is stored
                on ``self.get_data`` and fed to ``Corpus.fit``.
            save: Optional path; when given, the trained model is saved there.
            back_corpus: When not ``None`` the fitted corpus is yielded
                before training starts.
            epochs: Training epochs forwarded to ``Glove.fit``.
            no_threads: Worker threads forwarded to ``Glove.fit``.
            no_components: Embedding dimensionality.
            learning_rate: Learning rate forwarded to ``Glove``.
        """

        self.get_data = self.read_corpus(cropus_path)
        corpus_model = Corpus()
        corpus_model.fit(self.get_data, window=10)
        if back_corpus is not None:
            yield corpus_model

        self.glove = Glove(no_components=no_components,
                           learning_rate=learning_rate)
        self.glove.fit(corpus_model.matrix,
                       epochs=epochs,
                       no_threads=no_threads,
                       verbose=True)
        self.glove.add_dictionary(corpus_model.dictionary)

        if save is not None:
            self.glove.save(save)

        self.model = self.glove
        return self.glove
Exemplo n.º 3
0
def glove_embed(data, embed_dim, window_size, epochs_, step_size):
    '''
    DESCRIPTION : Train GloVe (Global Vectors) word embeddings for the tokens in the data set

    INPUT:
        |--- data: input handed to get_tokens() to obtain the tokenized sentences
        |--- embed_dim: [int] embedding dimension
        |--- window_size: [int] size of the window of tokens considered during training for each token
        |--- epochs_: [int] number of epochs for GloVe training
        |--- step_size: [float] learning rate for GloVe training

    OUTPUT:
        |--- embeddings: [np.ndarray] array of shape (vocabulary size, embed_dim); row i is the vector of the token with id i
        |--- vocab: [dict] dictionary with (whitespace-stripped) tokens as keys and the token id as values
        |--- glove: [Global Vectors Model] GloVe model trained on data
    '''
    sentences = get_tokens(data)

    model = Corpus()
    model.fit(sentences, window=window_size)

    glove = Glove(no_components=embed_dim, learning_rate=step_size)
    glove.fit(model.matrix, epochs=epochs_, no_threads=1, verbose=True)
    glove.add_dictionary(model.dictionary)

    embeddings = np.zeros((len(glove.dictionary), embed_dim))
    vocab = dict()
    for token, id_ in glove.dictionary.items():
        embeddings[id_, :] = glove.word_vectors[id_]
        # Use the model's own id (not the enumeration position) so that
        # vocab[token] always addresses the matching row of ``embeddings``.
        vocab[token.strip()] = id_

    return embeddings, vocab, glove
def get_embeddings(prepared_input):
    """Train 5-dimensional GloVe vectors on ``prepared_input`` and save them.

    The co-occurrence matrix is built with a window of 10 and the trained
    model (with its dictionary attached) is written to ``glove.model``.
    Nothing is returned.
    """
    cooccurrences = Corpus()
    cooccurrences.fit(prepared_input, window=10)

    training_options = dict(epochs=30, no_threads=4, verbose=True)
    model = Glove(no_components=5, learning_rate=0.05)
    model.fit(cooccurrences.matrix, **training_options)
    model.add_dictionary(cooccurrences.dictionary)

    model.save('glove.model')
Exemplo n.º 5
0
def train_glove(save_dir, size):
    """Train GloVe on the full corpus, pickle the vectors, then run an interactive check.

    Args:
        save_dir: Path of the file the ``{word: vector}`` dictionary is
            pickled to (opened directly with ``open(save_dir, "wb")``).
        size: Embedding dimensionality.

    After saving, words are read from standard input and their nearest
    neighbours and vectors are printed.  The loop ends on end-of-input or
    Ctrl-C; lookups that fail (e.g. unknown words) are skipped silently.
    """
    print('START')
    f_corpus = get_full_corpus()
    corpus = Corpus()
    print('CREATE CORPUS')
    corpus.fit(f_corpus, window=10)
    glove = Glove(no_components=size, learning_rate=0.05)
    print('START LEARNING')
    glove.fit(corpus.matrix, epochs=60, no_threads=8, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    print('START SAVE')
    dict_in_bin = {
        word: glove.word_vectors[word_indx]
        for word, word_indx in glove.dictionary.items()
    }
    with open(save_dir, "wb") as file:
        pickle.dump(dict_in_bin, file)
    print('COMMON TEST')
    while True:
        try:
            s = input("Введите строку: ")
        except (EOFError, KeyboardInterrupt):
            # The former bare ``except: continue`` swallowed these as well,
            # which made the loop impossible to leave (and spin on EOF).
            break
        try:
            print(glove.most_similar(s, number=10))
            word_indx = glove.dictionary[s]
            print(glove.word_vectors[word_indx])
        except Exception:
            # Deliberate best effort: ignore failed lookups and ask again.
            continue
Exemplo n.º 6
0
def build_model_glove(args):
    """Return a GloVe model trained on the (possibly cached) corpus statistics.

    The co-occurrence corpus is rebuilt from ``get_sentences(args)`` and saved
    to ``args.corpus_model`` when that file is missing or not newer than the
    most recently modified path in ``args.input``; otherwise it is loaded
    from disk.  Training parameters come from ``CONFIG['glove']``,
    ``args.workers`` and ``args.verbose``.
    """
    cache_path = args.corpus_model

    # Same short-circuit as a single ``or``: timestamps are only compared
    # when the cache file exists.
    stale = not os.path.exists(cache_path)
    if not stale:
        newest_input = max(map(os.path.getmtime, args.input))
        stale = newest_input >= os.path.getmtime(cache_path)

    if stale:
        logging.info('Pre-processing corpus')
        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(cache_path)
    else:
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(cache_path)

    # Statistics are reported identically for a fresh and a cached corpus.
    logging.info('Dict size: %s' % len(corpus_model.dictionary))
    logging.info('Collocations: %s' % corpus_model.matrix.nnz)

    logging.info('Training the GloVe model')
    settings = CONFIG['glove']
    glove = Glove(no_components=settings['size'],
                  learning_rate=settings['learning_rate'])
    glove.fit(corpus_model.matrix,
              epochs=CONFIG['glove']['epochs'],
              no_threads=args.workers,
              verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
Exemplo n.º 7
0
def train_glove(src_filename, dim=100):
    """Train ``dim``-dimensional GloVe vectors on the lines of ``src_filename``.

    The model is saved under ``DATA_DIR`` as ``glove.<dim>d.model``; nothing
    is returned.
    """
    cooccurrences = Corpus()
    cooccurrences.fit(get_lines(src_filename), window=10)

    model = Glove(no_components=dim, learning_rate=0.001)
    model.fit(cooccurrences.matrix, epochs=100, no_threads=20, verbose=True)
    model.add_dictionary(cooccurrences.dictionary)

    destination = DATA_DIR + 'glove.{}d.model'.format(dim)
    model.save(destination)
Exemplo n.º 8
0
def word_embedding(sentences,embedding_size,windows_len,epochs=0):
    """
    Build a GloVe model plus word<->index lookup tables for ``sentences``.

    Args:
        sentences: corpus handed unchanged to ``Corpus.fit``.
        embedding_size: number of components of the word vectors.
        windows_len: co-occurrence window size.
        epochs: training epochs forwarded to ``Glove.fit``.  The default of 0
            keeps the previous behaviour, i.e. no training passes are
            requested.  NOTE(review): this looks like a leftover from a unit
            test; pass a positive value to actually train.

    Returns:
        (glove_model, corpus_dict, corpus_inverse_dict) where ``corpus_dict``
        maps word -> index and ``corpus_inverse_dict`` maps index -> word.
        The dictionary is not attached to the model via ``add_dictionary``.
    """
    corpus_model = Corpus()
    corpus_model.fit(sentences, window=windows_len)

    glove_model = Glove(no_components=embedding_size, learning_rate=0.05)
    glove_model.fit(corpus_model.matrix,
                    epochs=epochs,
                    no_threads=2)

    # The dense log co-occurrence matrix that used to be built here was never
    # used; it only cost O(V^2) memory, so it has been dropped.
    corpus_dict = corpus_model.dictionary
    corpus_inverse_dict = {index: word for word, index in corpus_dict.items()}

    return glove_model, corpus_dict, corpus_inverse_dict
def generate_glove_corpus():
    """Build a GloVe co-occurrence corpus from article headers and save it.

    Reads the JSON file at the module-level ``article_info_path``, turns the
    header sentences of every article into one flat word list, fits a
    ``Corpus`` (window 10) on those lists and saves it to ``output_path``.
    """
    global article_info_path, output_path

    write_log('GloVe Load article info : Start')
    with open(article_info_path, 'r') as handle:
        article_info = json.load(handle)
    write_log('GloVe Load article info : End')

    write_log('GloVe Generate sentences : Start')
    sentences = []
    for info in article_info.values():
        header = info.get('sentence_header')
        body = info.get('sentence_body')

        # Articles missing either part are skipped, although only the header
        # sentences contribute words to the corpus.
        if header is None or body is None:
            continue

        sentences.append(
            [token for line in header for token in line.split(' ')])
    write_log('GloVe Generate sentences : End')

    write_log('GloVe Generate corpus : Start')
    cooccurrences = Corpus()
    cooccurrences.fit(sentences, window=10)
    write_log('GloVe Generate corpus : End')

    cooccurrences.save(output_path)
Exemplo n.º 10
0
def train_glove(target_group, glove_para, src_file, save_model_name):
    """
    Fit a co-occurrence corpus on ``src_file`` and train/save a GloVe model.

    example: train_glove(target_group='words', glove_para=glove_para_word,
                         src_file='corpus.txt', save_model_name='glove_words.model')
    after saving the model, you can use it by: glove_ana = Glove.load('glove_words.model')

    The fitted corpus is also saved as ``corpus_model_<target_group>.model``.

    :param target_group: 'words' or 'chars'
    :param glove_para: e.g. glove_para_word = {'window_size':4, 'no_components':300, 'learning_rate':0.05, 'no_epochs':2, 'parallelism':4}
    :param src_file: source file forwarded to ``read_corpus``
    :param save_model_name: path the trained GloVe model is saved to
    :return: None
    """
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(src_file=src_file,
                                 words_or_chars=target_group),
                     window=glove_para['window_size']
                     )  #avg word size is 6 for each sentence
    corpus_model.save('corpus_model_{}.model'.format(target_group))
    # Function-call form prints the same text on Python 2 and is valid
    # syntax on Python 3, matching the other print() calls below.
    print(target_group)
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    print('Training the GloVe model')

    glove = Glove(no_components=glove_para['no_components'],
                  learning_rate=glove_para['learning_rate'])
    glove.fit(corpus_model.matrix,
              epochs=glove_para['no_epochs'],
              no_threads=glove_para['parallelism'],
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save(save_model_name)
Exemplo n.º 11
0
class MyGloVe:
    """GloVe wrapper that scores text similarity via averaged word vectors."""

    def initiate_model(self, input_corpus):
        """Fit the co-occurrence corpus and train a 100-dimensional GloVe model.

        Args:
            input_corpus: Iterable whose items are streamed unchanged into
                ``Corpus.fit``.
        """
        self.corpus_model = Corpus()
        self.corpus_model.fit(self.__read_corpus(input_corpus), window=10)

        self.glove = Glove(no_components=100, learning_rate=0.05)
        self.glove.fit(self.corpus_model.matrix, epochs=200)
        self.glove.add_dictionary(self.corpus_model.dictionary)

    def cosine_similarity(self, first_text, second_text):
        """Return the cosine similarity of the two texts' averaged word vectors.

        NOTE(review): if a text contains no known word its average is the
        zero vector and the result is whatever ``scipy`` yields for that
        degenerate case -- confirm callers handle it.
        """
        first = self.__average_feature_vector(first_text)
        second = self.__average_feature_vector(second_text)

        return 1 - spatial.distance.cosine(first, second)

    def __read_corpus(self, input_corpus):
        """Yield the items of ``input_corpus`` one by one."""
        for line in input_corpus:
            yield line

    def __average_feature_vector(self, text):
        """Average the vectors of the whitespace-separated words known to the model.

        Unknown words are ignored; with no known word a zero vector is
        returned.
        """
        words = text.split()
        words_no = 0
        # Size the accumulator from the trained vectors instead of assuming
        # 100 components, so the class keeps working if the dimensionality
        # of the model changes.
        vector_size = self.glove.word_vectors.shape[1]
        feature_vector = numpy.zeros((vector_size, ), dtype="float32")

        for word in words:
            if word in self.glove.dictionary:
                word_idx = self.glove.dictionary[word]
                words_no += 1
                feature_vector = numpy.add(feature_vector,
                                           self.glove.word_vectors[word_idx])

        if words_no > 0:
            feature_vector = numpy.divide(feature_vector, words_no)

        return feature_vector
Exemplo n.º 12
0
def build_glove_embeddings(corpus):
    """
    DESCRIPTION:
             Applies the Glove python SGD algorithm given by glove_python library and builds the
             word embeddings from our training set.  Embeddings already stored in
             MY_GLOVE_PYTHON_EMBEDDINGS_TXT_FILE are returned directly; otherwise the
             freshly trained embeddings are stored there before being returned.
    INPUT:
            corpus: a list of lists where each sub-list represents a tweet. The outer list represents
                    the whole training dataset.
    OUTPUT:
            words: python dictionary of the form (word, [vector of embeddings])
    """
    cached = load_glove_embeddings_from_txt_file(
        MY_GLOVE_PYTHON_EMBEDDINGS_TXT_FILE)
    if cached is not None:
        return cached

    # All hyper-parameters live under the same configuration node.
    options = algorithm['options']['WE']

    model = Corpus()
    model.fit(corpus, window=options['window_size'])

    glove = Glove(no_components=options['we_features'],
                  learning_rate=options['learning_rate'])
    print('\nFitting Glove Python Embeddings')
    glove.fit(model.matrix, epochs=options['epochs'])
    glove.add_dictionary(model.dictionary)

    words = {
        w: np.array(glove.word_vectors[id_])
        for w, id_ in glove.dictionary.items()
    }

    store_embeddings_to_txt_file(words, MY_GLOVE_PYTHON_EMBEDDINGS_TXT_FILE)
    return words
Exemplo n.º 13
0
def build_model_glove(args):
    """Train a GloVe model, reusing the saved corpus statistics when fresh.

    The co-occurrence corpus at ``args.corpus_model`` is rebuilt from
    ``get_sentences(args)`` when it does not exist or is not newer than the
    latest-modified path in ``args.input``; otherwise it is loaded.  The
    trained model (dictionary attached) is returned.
    """

    from glove import Glove, Corpus

    cache_path = args.corpus_model

    # Equivalent to ``not exists or newest_input >= cache_mtime`` with the
    # same short-circuit: mtimes are only read when the cache exists.
    stale = not os.path.exists(cache_path)
    if not stale:
        newest_input = max(map(os.path.getmtime, args.input))
        stale = newest_input >= os.path.getmtime(cache_path)

    if stale:
        logging.info('Pre-processing corpus')
        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(cache_path)
    else:
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(cache_path)

    logging.info('Dict size: %s' % len(corpus_model.dictionary))
    logging.info('Collocations: %s' % corpus_model.matrix.nnz)

    logging.info('Training the GloVe model')
    settings = CONFIG['glove']
    glove = Glove(no_components=settings['size'],
                  learning_rate=settings['learning_rate'])
    glove.fit(corpus_model.matrix,
              epochs=CONFIG['glove']['epochs'],
              no_threads=args.workers,
              verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
Exemplo n.º 14
0
def train_glove(corpus,
                vocabulary,
                zero_init_indices=0,
                rand_init_indices=1,
                embedding_dim=300):
    """Train GloVe on ``corpus`` and return an embedding matrix for ``vocabulary``.

    A Python implementation of GloVe is used here; the official C
    implementation is also highly recommended:
    https://github.com/stanfordnlp/GloVe/blob/master/demo.sh

    Args:
        corpus: list of tokenized texts, corpus to train on
        vocabulary: dict, a mapping of words to indices
        zero_init_indices: int or a list, the indices which use zero-initialization. These
                           indices usually represent the padding token.
        rand_init_indices: int or a list, the indices which use random initialization. These
                           indices usually represent other special tokens, such as "unk".
        embedding_dim: int, dimensionality of embedding

    Returns: np.array, a word embedding matrix (as produced by ``filter_embeddings``).

    """
    cooccurrences = Corpus()
    cooccurrences.fit(corpus, window=10)

    model = Glove(no_components=embedding_dim, learning_rate=0.05)
    model.fit(cooccurrences.matrix, epochs=10, no_threads=4, verbose=True)
    model.add_dictionary(cooccurrences.dictionary)

    # word -> trained vector, looked up through the model's own index table
    word_vectors = {
        word: model.word_vectors[index]
        for word, index in model.dictionary.items()
    }
    return filter_embeddings(word_vectors, embedding_dim, vocabulary,
                             zero_init_indices, rand_init_indices)
Exemplo n.º 15
0
def train(path, freq, window, dim, lr, epochs):
    """Train GloVe on the text files in ``path`` and save it to ``glove.model``.

    Every entry of ``path`` is read as whitespace-separated text.  Words that
    occur ``freq`` times or fewer across all files are dropped, each file
    becomes one token list, and a GloVe model is trained on those lists.

    Args:
        path: Directory containing the text files.
        freq: Frequency threshold; only words with a count > ``freq`` are kept.
        window: Co-occurrence window size.
        dim: Embedding dimensionality.
        lr: Learning rate.
        epochs: Number of training epochs.

    Any failure is reported on the console (message plus traceback) instead
    of being raised, matching the original best-effort behaviour.
    """
    from collections import Counter
    import traceback

    print("Start of train method")
    try:
        # Read each file exactly once and close it again; the token lists are
        # reused for both the frequency count and the filtering step.
        documents = []
        for name in os.listdir(path):
            with open(os.path.join(path, name), 'r') as handle:
                # str.split() already treats newlines as separators.
                documents.append(handle.read().split())
        counts = Counter(word for doc in documents for word in doc)
        print("Created Dictionary for frequencies of words.")

        lines = [[word for word in doc if counts[word] > freq]
                 for doc in documents]
        print(
            "Converted preprocessed text data in input format of array of array of words."
        )
        corpus = Corpus()
        corpus.fit(lines, window=window)
        glove = Glove(no_components=dim, learning_rate=lr)
        glove.fit(corpus.matrix, epochs=epochs, verbose=True)
        glove.add_dictionary(corpus.dictionary)
        glove.save('glove.model')
        print("Saved the trained model to glove.model.")
    except Exception:
        # Still non-fatal, but no longer hides what went wrong and no longer
        # swallows KeyboardInterrupt / SystemExit.
        print("Error occured in training glove model")
        traceback.print_exc()
Exemplo n.º 16
0
def train_glove(path):
    """Train 300-dimensional GloVe vectors on a Text8-style corpus file.

    The model is saved to ``embeddings_models/model_glove_<TRAINING_SENTENCES>``
    and then passed through gensim's ``glove2word2vec`` to produce a
    ``..._modified`` file.  The trained model is returned.

    NOTE(review): ``TRAINING_SENTENCES`` only affects the file name here; all
    sentences of ``path`` are used for training -- confirm that is intended.
    NOTE(review): ``glove2word2vec`` is given the file written by
    ``Glove.save``; verify that this is the format the converter expects.
    """
    from gensim.models.word2vec import Text8Corpus
    from gensim.scripts.glove2word2vec import glove2word2vec
    from glove import Corpus, Glove

    # islice(..., None) imposed no limit, so simply materialise everything.
    sentences = list(Text8Corpus(path))
    corpus = Corpus()
    corpus.fit(sentences, window=10)
    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    file_name = 'embeddings_models/model_glove_' + str(TRAINING_SENTENCES)
    glove.save(file_name)
    glove2word2vec(file_name, file_name + '_modified')
    # Function-call form: same output on Python 2, valid on Python 3.
    print('Finished')
    return glove
Exemplo n.º 17
0
class GloVeFilter(object):
    """Trains a GloVe model restricted to a pre-built target vocabulary.

    All paths and hyper-parameters are read from the module-level ``args``.
    """

    def __init__(self):
        # Corpus model: restricted to the 'tgt' vocabulary (string -> index)
        # stored in the dialogue vocab file.
        vocab = dict(torch.load("../data/dialogue.vocab.pt", "text"))
        self.corpus_model = Corpus(dictionary=vocab['tgt'].stoi)
        # Model
        self.glove = Glove(no_components=args.no_components,
                           learning_rate=args.learning_rate)

    def load_corpus_from_txt(self):
        """Fit the co-occurrence corpus from ``args.data_path`` and save it.

        Each line is stripped, decoded as UTF-8 (undecodable bytes ignored)
        and split on single spaces; words missing from the supplied
        dictionary are ignored.
        """
        print('Reading corpus statistics...')
        # ``with`` guarantees the input file is closed; previously the handle
        # created inside the comprehension was never closed explicitly.
        with open(args.data_path) as handle:
            texts = [
                line.strip().decode("utf8", "ignore").split(" ")
                for line in handle
            ]
        self.corpus_model.fit(texts, window=args.window, ignore_missing=True)
        self.corpus_model.save(args.corpus_model_path)
        print('Dict size: %s' % len(self.corpus_model.dictionary))
        print('Collocations: %s' % self.corpus_model.matrix.nnz)

    def load_corpus_from_model(self):
        """Load previously saved corpus statistics from ``args.corpus_model_path``."""
        print('Reading corpus statistics...')
        self.corpus_model = Corpus.load(args.corpus_model_path)
        print('Dict size: %s' % len(self.corpus_model.dictionary))
        print('Collocations: %s' % self.corpus_model.matrix.nnz)

    def train(self):
        """Train GloVe on the current corpus and save it to ``args.model_path``."""
        print('Training the GloVe model...')
        self.glove.fit(self.corpus_model.matrix,
                       epochs=args.epochs,
                       verbose=True)
        self.glove.add_dictionary(self.corpus_model.dictionary)
        self.glove.save(args.model_path)
        print('Training finished')
def getWordEmbeddings(processed_text):
    """Train 500-dimensional GloVe vectors and print the neighbours of 'price'.

    The co-occurrence window is 3 and training is requested for 300000
    epochs.  Nothing is returned.
    """
    cooccurrences = Corpus()
    cooccurrences.fit(processed_text, window=3)

    model = Glove(no_components=500, learning_rate=0.001)
    model.fit(cooccurrences.matrix,
              epochs=300000,
              no_threads=4,
              verbose=True)
    model.add_dictionary(cooccurrences.dictionary)

    neighbours = model.most_similar('price')
    print(neighbours)
Exemplo n.º 19
0
def train_and_save_model(data_dir,
                         model_name='LeGlove',
                         num_epochs=10,
                         parallel_threads=1):
    '''
    Build a training corpus from ``data_dir`` and fit a GloVe model to it.

    Parameters:
        data_dir (string):          master directory containing all jurisdiction-level directories
        model_name (string):        name of model to be used for output
        num_epochs (int):           number of epochs for which to train model
        parallel_threads (int):     number of parallel threads to use for training

    The window size, dimensionality and learning rate come from the
    module-level CONTEXT_WINDOW, NUM_COMPONENTS and LEARNING_RATE.
    The trained model is saved as "[model_name].model".
    '''
    cooccurrences = Corpus()
    cooccurrences.fit(read_corpus(data_dir), window=CONTEXT_WINDOW)

    model = Glove(no_components=NUM_COMPONENTS, learning_rate=LEARNING_RATE)
    model.fit(cooccurrences.matrix,
              epochs=num_epochs,
              no_threads=parallel_threads,
              verbose=True)
    model.add_dictionary(cooccurrences.dictionary)

    output_file = model_name + '.model'
    model.save(output_file)
def feature_extract(path_dataset):
    """Extract co-occurrence-graph and LSA features for every speech in a dataset.

    Args:
        path_dataset: Path of a ``|``-separated CSV with (at least) the
            columns ``Fala`` (speech text), ``Classe`` (label) and
            ``comprimento``.

    Returns:
        A tuple ``(features, labels)``: a DataFrame with one row per speech
        (average clustering, average shortest path length, ``comprimento``,
        mean and std of the 1-component LSA projection, token count) and the
        label array in which class 0 has been recoded to -1.
    """
    feature_extract_dataset = []
    speeches = read_csv(path_dataset, sep="|")
    # Class 1 is kept as is; class 0 (mixed speeches of Estamira and her
    # family) becomes -1.
    speeches['Classe'] = speeches['Classe'].replace(0, -1)

    # For each speech
    for indice, fala in enumerate(speeches.Fala):
        # Fresh co-occurrence corpus and 1-component LSA for this speech.
        dataset = Corpus()
        lsa = TruncatedSVD(n_components=1)

        tokens = simple_preprocess(str(fala), deacc=True)
        quantas_palavras = len(tokens)

        # The speech is fitted as a single document.
        dataset.fit([tokens], window=79)

        graph = Graph(dataset.matrix)
        values_lsa = lsa.fit_transform(dataset.matrix)

        values_mean = mean(values_lsa, axis=0)
        values_std = std(values_lsa, axis=0)

        feature_extract_dataset.append([
            average_clustering(G=graph),
            average_shortest_path_length(G=graph),
            speeches.comprimento[indice],
            values_mean.item(),
            values_std.item(), quantas_palavras
        ])

    return DataFrame(feature_extract_dataset), speeches['Classe'].values
Exemplo n.º 21
0
def test_fitting():
    """
    Check that training lowers the squared error between the model's
    reconstruction and the log co-occurrence matrix.
    """
    sentence_count = 5000
    rng_seed = 10

    cooccurrences = Corpus()
    cooccurrences.fit(
        generate_training_corpus(sentence_count,
                                 vocabulary_size=50,
                                 seed=rng_seed))

    # Baseline: a model that has not been trained at all (zero epochs).
    untrained = Glove(no_components=100, learning_rate=0.05)
    untrained.fit(cooccurrences.matrix, epochs=0, no_threads=2)

    # Dense log co-occurrence matrix used as the reconstruction target.
    target = cooccurrences.matrix.copy()
    target.data = np.log(target.data)
    target = np.asarray(target.todense())

    def squared_error(model):
        return ((_reproduce_input_matrix(model) - target) ** 2).sum()

    assert squared_error(untrained) > 30000.0

    # After 500 epochs the reconstruction must be far better.
    trained = Glove(no_components=100, learning_rate=0.05)
    trained.fit(cooccurrences.matrix, epochs=500, no_threads=2)

    assert squared_error(trained) < 1500.0
Exemplo n.º 22
0
def train_glove_fashionrec(dimensionality, context, epochs):
    """Train GloVe on the IG corpora and save corpus, model, vectors and notes.

    All output names share the stem
    ``glove_fashion_epochs<epochs>_d<dimensionality>_c<context>``.
    """
    total_count, vocab_size = corpus_stats("data/clean2_corpus.txt")
    print("total word count: {}, vocabulary size: {}".format(
        total_count, vocab_size))

    # Common stem of every artefact written by this run.
    run_tag = ("glove_fashion_epochs" + str(epochs) + "_d" +
               str(dimensionality) + "_c" + str(context))
    notes_path = "results/training/" + run_tag + "_" + ".txt"
    corpus_path = "trained/" + run_tag + "_corpus" + ".model"
    model_path = "trained/" + run_tag + "_vecs" + ".model"
    vectors_path = "trained/" + run_tag + "_vecs" + ".vec"

    raw_text = readCorpus()
    tokenized = map(lambda line: line.split(" "), raw_text.split("\n"))

    corpus_model = Corpus()
    started = datetime.now()
    corpus_model.fit(tokenized, window=context)
    corpus_model.save(corpus_path)

    glove = Glove(no_components=dimensionality, learning_rate=0.05)
    glove.fit(corpus_model.matrix,
              epochs=int(epochs),
              no_threads=8,
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    # Covers corpus fitting, corpus saving and GloVe training.
    elapsed = datetime.now() - started

    glove.save(model_path)

    summary = "".join([
        "Glove Fashion Data,", str(dimensionality), " dim, ",
        str(context), " context, ",
        str(epochs), " epochs \n", "Training time: ", str(elapsed),
    ])
    save_to_file(notes_path, summary)

    save_glove_bin_to_vec(glove, vectors_path)
def glove_vectors(x,
                  embedding_size,
                  epochs=50,
                  lr=0.05,
                  alpha=0.75,
                  max_count=100,
                  tmp_loc='glove.w2vmodel'):
    """Train GloVe embeddings for categorical values and return them as KeyedVectors.

    Args:
        x: Tabular data convertible to a ``pandas.DataFrame``; each row is
            treated as one "sentence" of categorical values.
        embedding_size: Number of embedding components.
        epochs: Training epochs (GloVe paper: 50 for dimensionality < 300,
            100 otherwise).
        lr: Learning rate.
        alpha: Weighting exponent of the loss, based on how likely a
            co-occurrence is (Xij); less likely = less weight.
        max_count: ``max_count`` forwarded to ``Glove``.
        tmp_loc: Temporary word2vec-format file used to hand the vectors to
            ``KeyedVectors``; it is removed again before returning.

    Returns:
        The ``KeyedVectors`` loaded from the temporary file.
    """
    # Create the dict ourselves so that the ids correspond to their location
    # in the df, counting from the first column downwards.
    df = pd.DataFrame(x)
    word_id_dict = create_vocab_dict(df)
    corpus = Corpus(dictionary=word_id_dict)
    # Distance scaling: standard GloVe reduces the occurrence count based on
    # how far a context word is from the focus word.  It is disabled because
    # distance has no meaning for purely categorical variables.
    corpus.fit(df.values.tolist(),
               window=len(df.columns),
               distance_scaling=False)
    glove = Glove(no_components=embedding_size,
                  learning_rate=lr,
                  alpha=alpha,
                  max_count=max_count)
    glove.fit(corpus.matrix, epochs=epochs, no_threads=1, verbose=True)
    glove.add_dictionary(corpus.dictionary)

    try:
        glove.save_word2vec_format(tmp_loc)
        model = KeyedVectors.load_word2vec_format(tmp_loc)
    finally:
        # Always clean up the temporary file, even when writing or loading
        # it fails; previously a failure left it behind.
        if os.path.exists(tmp_loc):
            os.remove(tmp_loc)
    return model
Exemplo n.º 24
0
def train_glove(corpus, params, exp_id, save_dir, save_dict=False):
    """Train a 100-dimensional GloVe model for experiment ``exp_id``.

    Corpus statistics are loaded from ``glove_dict_<exp_id>.model`` inside
    ``save_dir`` when that file exists; otherwise they are computed from
    ``corpus`` with the dictionary returned by ``load_glove_dictionary``
    (window ``2 * params['window']``, unknown words ignored) and saved only
    if ``save_dict`` is true.  ``params['alpha']`` is used as the learning
    rate and ``params['workers']`` as the thread count.
    """
    dictionary = load_glove_dictionary(exp_id, save_dir)

    print('Pre-processing corpus')
    cache_name = 'glove_dict_{}.model'.format(exp_id)
    dict_path = os.path.join(save_dir, cache_name)

    if not os.path.exists(dict_path):
        # No cached statistics: build the co-occurrence matrix from scratch.
        corpus_model = Corpus(dictionary)
        corpus_model.fit(corpus,
                         window=params['window'] * 2,
                         ignore_missing=True)
        if save_dict:
            corpus_model.save(dict_path)
    else:
        corpus_model = Corpus.load(dict_path)

    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    model = Glove(no_components=100, learning_rate=params['alpha'])
    model.fit(corpus_model.matrix,
              epochs=50,
              no_threads=params['workers'],
              verbose=True)
    model.add_dictionary(corpus_model.dictionary)
    return model
Exemplo n.º 25
0
def main(args):
    """Sum bias-weighted per-document co-occurrence matrices and pickle the result.

    For every document returned by ``read_corpus(args.corpus)`` the
    co-occurrence matrix is computed against the shared vocabulary stored in
    ``global_vocab.pkl``, multiplied by the weight in column 2 of the
    ``args.diff_bias`` CSV (row i belongs to document i) and added to a
    running total.  The total is written to ``doc_matrices_weighted.pkl`` as
    a COO sparse matrix.

    Behaviour changes compared with the previous implementation:
      * the total is accumulated with sparse-matrix addition instead of
        ``Counter + Counter``, which silently dropped every entry whose sum
        was not strictly positive and was quadratic in the number of entries;
      * the result keeps the shape of the per-document matrices instead of a
        shape derived from the number of stored entries;
      * an empty corpus yields an empty matrix instead of an ``IndexError``.

    NOTE(review): assumes every ``doc_model.fit`` call produces a matrix of
    the same shape (expected with a fixed supplied dictionary) -- confirm.
    """
    # Shared vocabulary, previously produced by fitting a Corpus on the whole
    # data set.  NOTE(review): pickle must only be used on trusted files.
    with open('global_vocab.pkl', 'rb') as f:
        vocab_dict = pickle.load(f)
    doc_model = Corpus(dictionary=vocab_dict)

    texts = list(read_corpus(args.corpus))

    # Weight CSV: col 2 is science/arts, col 3 is weapons/instruments.
    diff_bias = pd.read_csv(args.diff_bias, header=0)

    total = None
    for i, text in enumerate(tqdm(texts)):
        # Each fit() recomputes the matrix for this single document.
        doc_model.fit([text])
        weighted = doc_model.matrix.tocsr() * diff_bias.iloc[i, 2]
        total = weighted if total is None else total + weighted

    if total is None:
        # No documents: emit an empty matrix over the vocabulary.
        size = len(vocab_dict)
        total = sp.csr_matrix((size, size))

    total = total.tocoo()
    print(total.get_shape())

    with open('doc_matrices_weighted.pkl', 'wb') as handle:
        pickle.dump(total, handle, protocol=pickle.HIGHEST_PROTOCOL)
Exemplo n.º 26
0
 def trainShake2(self):
     """Train a 100-dimensional GloVe model on ``self.shakespeare_lines()``.

     The trained model, with its dictionary attached, is stored on
     ``self.glove``.
     """
     cooccurrences = Corpus()
     lines = self.shakespeare_lines()
     cooccurrences.fit(lines, window=10)

     model = Glove(no_components=100, learning_rate=0.05)
     self.glove = model
     model.fit(cooccurrences.matrix, epochs=30, no_threads=4, verbose=True)
     model.add_dictionary(cooccurrences.dictionary)
Exemplo n.º 27
0
def getGloveEmbedding(seqs, size=300, window=10, epochs=20):
    """Train GloVe on ``seqs`` and return ``(dictionary, word_vectors)``.

    ``dictionary`` is the fitted corpus' word -> index mapping and
    ``word_vectors`` the trained vector array; the dictionary is not
    attached to the model itself.
    """
    cooccurrences = Corpus()
    cooccurrences.fit(seqs, window=window)

    model = Glove(no_components=size, learning_rate=0.05)
    model.fit(cooccurrences.matrix, epochs=epochs, verbose=True)

    word_index = cooccurrences.dictionary
    vectors = model.word_vectors
    return word_index, vectors
Exemplo n.º 28
0
def train_model(line):
    """Train a 5-dimensional GloVe model on ``line``, save and return it.

    The corpus is fitted with the library's default window, the model is
    seeded with ``random_state=0`` and written to ``glove.model``.
    """
    cooccurrences = Corpus()
    cooccurrences.fit(line)

    model = Glove(no_components=5, learning_rate=0.05, random_state=0)
    model.fit(cooccurrences.matrix, epochs=10, no_threads=100, verbose=True)
    model.add_dictionary(cooccurrences.dictionary)

    model.save('glove.model')
    return model
Exemplo n.º 29
0
def build_glove_word_vectors(data_frame, vec_dim, vectorizer, window_size,
                             niter):
    """Train GloVe on ``data_frame.post`` restricted to the vectorizer's vocabulary.

    Words absent from ``vectorizer.vocabulary_`` are ignored while counting
    co-occurrences.  Returns the trained model with the dictionary attached.
    """
    cooccurrences = Corpus(vectorizer.vocabulary_)
    cooccurrences.fit(data_frame.post,
                      window=window_size,
                      ignore_missing=True)

    model = Glove(no_components=vec_dim, learning_rate=0.01)
    model.fit(cooccurrences.matrix, epochs=niter, no_threads=4, verbose=True)
    model.add_dictionary(cooccurrences.dictionary)
    return model
Exemplo n.º 30
0
def buildCorpus(data_path=None, context_window=5):
    """Fit and return a co-occurrence ``Corpus`` for the text at ``data_path``.

    The text is streamed through ``textGenerator(data_path)`` and counted
    with a window of ``context_window``.
    """
    print('Fitting data...')

    fitted = Corpus()
    text_stream = textGenerator(data_path)
    fitted.fit(text_stream, window=context_window)
    return fitted
Exemplo n.º 31
0
def test_supplied_dict_missing():
    """Fit a corpus containing a word absent from the supplied dictionary.

    NOTE(review): 'fox' is not in the dictionary and ``ignore_missing`` is
    not set; the test presumably expects ``fit`` to raise, but no such
    assertion is visible here -- confirm.
    """
    known_words = {'a': 1, 'naïve': 0}
    documents = [['a', 'naïve', 'fox']]

    model = Corpus(dictionary=known_words)
    model.fit(documents, max_map_size=0, window=10)
    def glove_feat(df, feat, length):
        """Train ``length``-dimensional GloVe vectors on column ``feat`` of ``df``.

        Co-occurrences are counted with a window of 20; the trained model,
        with its dictionary attached, is returned.
        """
        cooccurrences = Corpus()
        cooccurrences.fit(df[feat], window=20)

        model = Glove(no_components=length, learning_rate=0.05)
        model.fit(cooccurrences.matrix,
                  epochs=10,
                  no_threads=10,
                  verbose=True)
        model.add_dictionary(cooccurrences.dictionary)
        return model
Exemplo n.º 33
0
 def pretrain(self,data_src):
     """Train and save ``glove.model`` unless that file already exists.

     The data is first cleaned with ``DataClean`` (non-letters replaced by
     spaces, repeated spaces collapsed, HTML cleaning and word splitting
     enabled) and then used to fit the co-occurrence corpus and the GloVe
     model configured by ``self.window``, ``self.num_features``,
     ``self.learning_rate`` and ``self.epochs``.
     """
     if os.path.isfile("glove.model"):
         # A model from an earlier run is present; nothing to do.
         return

     cleaning_rules = [
         ["[^a-z]"," "],  # only letters
         [" [ ]+", " "],  # remove extra spaces
     ]
     cleaner = DataClean(cleaning_rules,html_clean=True,split_words=True)
     cleaned = cleaner.fit(data_src).transform(data_src)

     cooccurrences = Corpus()
     cooccurrences.fit(cleaned,window=self.window)

     model = Glove(no_components=self.num_features,learning_rate=self.learning_rate)
     model.fit(cooccurrences.matrix,epochs=self.epochs,verbose=True)
     model.add_dictionary(cooccurrences.dictionary)
     model.save("glove.model")
def train_glove(sentences, num_features=300, context=5, learning_rate=0.05,
                epochs=30, no_threads=8):
    """Train a GloVe model on *sentences* and print how long it took.

    The defaults reproduce the values that used to be hard-coded, so
    ``train_glove(sentences)`` behaves as before.

    Args:
        sentences: passed straight to ``Corpus.fit``.
        num_features: word vector dimensionality.
        context: context window size.
        learning_rate: learning rate handed to ``Glove``.
        epochs: number of training epochs.
        no_threads: number of training threads.

    Returns:
        The fitted ``Glove`` model with the corpus dictionary attached.
    """
    # Single-argument parenthesised prints behave the same on Python 2 and 3.
    print('training glove model...')
    t0 = time()

    corpus = Corpus()
    corpus.fit(sentences, window=context)

    glove = Glove(no_components=num_features, learning_rate=learning_rate)
    glove.fit(corpus.matrix, epochs=epochs, no_threads=no_threads,
              verbose=True)
    glove.add_dictionary(corpus.dictionary)

    print('took %0.5fs.' % (time() - t0))
    return glove
Exemplo n.º 35
0
    def run_glove(self):
        """Train GloVe on ``self.get_sentences()`` and return words with vectors.

        A co-occurrence corpus is fitted with a window of 10, then a
        200-component model is trained for 5 epochs on 10 threads.
        Progress is written to stdout while the results are collected.

        Returns:
            A ``(unique_words, processed_vectors200)`` tuple of two parallel
            lists: each word of the model's inverse dictionary, and that
            word's vector as its own independent list of numbers.
        """
        # Per the original example comment, sentences look like
        # [["hi", "good", "to"], ["see", "u"]].
        sentences = self.get_sentences()

        print('\n' + '-'*80)
        print("Fitting words into corpus")
        corpus = Corpus()
        corpus.fit(sentences, window=10)

        print("Running Glove")
        glove = Glove(no_components=200, learning_rate=0.05)
        glove.fit(corpus.matrix, epochs=5, no_threads=10, verbose=True)
        glove.add_dictionary(corpus.dictionary)

        print("Fitting words and vectors into unique_words and vectors200")
        unique_words = []
        vectors200 = []

        length1 = len(glove.inverse_dictionary)
        for cnt1, (word_id, word) in enumerate(
                glove.inverse_dictionary.items(), 1):
            unique_words.append(word)
            vectors200.append(glove.word_vectors[word_id])

            sys.stdout.write("\rStatus: %s / %s"%(cnt1, length1))
            sys.stdout.flush()

        print('\n' + "Processing vectors200")
        processed_vectors200 = []

        length2 = len(vectors200)
        for cnt2, vector in enumerate(vectors200, 1):
            # Build a NEW list for every vector. Reusing one accumulator made
            # every entry alias the same list holding all vectors concatenated.
            processed_vectors200.append(list(vector))

            sys.stdout.write("\rStatus: %s / %s"%(cnt2, length2))
            sys.stdout.flush()

        return unique_words, processed_vectors200
Exemplo n.º 36
0
def test_supplied_dictionary():
    """A fully covering supplied dictionary is kept and fixes the matrix size."""

    sentences = [['a', 'naïve', 'fox']]
    vocab = {'a': 2,
             'naïve': 1,
             'fox': 0}

    fitted = Corpus(dictionary=vocab)
    fitted.fit(sentences, max_map_size=0, window=10)

    # The supplied mapping must survive fitting unchanged.
    assert fitted.dictionary == vocab

    size = len(vocab)
    assert fitted.matrix.shape == (size, size)

    # Row 2 is the id assigned to 'a'; it must hold no co-occurrence weight.
    row_for_a = fitted.matrix.tocsr()[2]
    assert row_for_a.sum() == 0
Exemplo n.º 37
0
def test_corpus_construction():
    """Fitting one three-word sentence yields the expected dictionary and matrix."""

    tokens = ['a', 'naïve', 'fox']

    fitted = Corpus()
    fitted.fit([tokens], max_map_size=0, window=10)

    # Every token must have received a dictionary entry.
    assert all(token in fitted.dictionary for token in tokens)

    vocab_size = len(tokens)
    assert fitted.matrix.shape == (vocab_size, vocab_size)

    # Expected co-occurrence weights: 1.0 for adjacent words and 0.5 for
    # words two positions apart, stored in the upper triangle only.
    expected = [[0.0, 1.0, 0.5],
                [0.0, 0.0, 1.0],
                [0.0, 0.0, 0.0]]
    dense_rows = fitted.matrix.todense().tolist()

    assert dense_rows == expected
def build_glove_embeddings(training, testing, args):

    ''' Trains a GloVe model on the texts of both splits and embeds them

    @Arguments:
        training: iterable of (text, label) pairs
        testing:  iterable of (text, label) pairs
        args: settings object; the attributes read here are
            vecsize (number of components), learningRate, window
            (co-occurrence window size), epochs, parallelism (number of
            threads), verbose and pca (forwarded as use_pca to
            transform_paragraph)

    @Return:
        A (fromTraining, fromTesting) tuple: the results of
        to_sklearn_format for the training and the testing split

    NOTE(review): training and testing are iterated here and handed to
    to_sklearn_format afterwards, so they presumably need to be
    re-iterable (e.g. lists rather than one-shot generators) - confirm.
    '''

    # initialize model
    glove = Glove(no_components=args.vecsize, learning_rate=args.learningRate)

    # Tuple-parameter lambdas (``lambda (txt, lbl): txt``) were removed by
    # PEP 3113; generator expressions unpack the pairs just as lazily and
    # work on both Python 2 and Python 3.
    txtSource = chain((txt for txt, lbl in training),
                      (txt for txt, lbl in testing))

    # read in the data to train on
    corpus_model = Corpus()
    corpus_model.fit((preprocess.tokenize(txt) for txt in txtSource),
                     window=args.window)

    # fit the model using the given parameters
    logging.info("Training GloVe")
    glove.fit(corpus_model.matrix, epochs=args.epochs,
              no_threads=args.parallelism, verbose=args.verbose)

    # add a dictionary just to make it easier for similarity queries
    glove.add_dictionary(corpus_model.dictionary)

    transformer = lambda words: glove.transform_paragraph(words, use_pca=args.pca)

    fromTraining = to_sklearn_format(transformer, training, args.vecsize)
    fromTesting = to_sklearn_format(transformer, testing, args.vecsize)

    return fromTraining, fromTesting
Exemplo n.º 39
0
def test_supplied_dict_missing_ignored():
    """With ``ignore_missing=True`` unknown words are skipped but still count for distance."""

    sentences = [['a', 'naïve', 'fox']]
    vocab = {'a': 0,
             'fox': 1}

    restricted = Corpus(dictionary=vocab)
    restricted.fit(sentences, max_map_size=0, window=10, ignore_missing=True)

    assert restricted.dictionary == vocab

    size = len(vocab)
    assert restricted.matrix.shape == (size, size)

    # Ensure that context windows and context window weights are
    # preserved: the 'a'/'fox' weight must equal the one obtained when
    # 'naïve' is part of the vocabulary.
    unrestricted = Corpus()
    unrestricted.fit(sentences, window=10)

    full_weight = unrestricted.matrix.todense()[0, 2]
    restricted_weight = restricted.matrix.todense()[0, 1]

    assert full_weight == restricted_weight == 0.5
Exemplo n.º 40
0
def test_fitting():
    """
    Check that training shrinks the squared reconstruction error.
    """

    seed = 10
    num_sentences = 5000

    cooccurrences = Corpus()
    cooccurrences.fit(generate_training_corpus(num_sentences,
                                               vocabulary_size=50,
                                               seed=seed))

    def squared_error(model, target):
        # Sum of squared differences between the reproduced matrix and target.
        return ((_reproduce_input_matrix(model) - target) ** 2).sum()

    # A model fitted for zero epochs must reproduce the input poorly.
    untrained = Glove(no_components=100, learning_rate=0.05)
    untrained.fit(cooccurrences.matrix, epochs=0, no_threads=2)

    # Target: element-wise log of the stored co-occurrence values, densified.
    log_counts = cooccurrences.matrix.copy()
    log_counts.data = np.log(log_counts.data)
    log_counts = np.asarray(log_counts.todense())

    assert squared_error(untrained, log_counts) > 30000.0

    # After 500 epochs the error must be far smaller.
    trained = Glove(no_components=100, learning_rate=0.05)
    trained.fit(cooccurrences.matrix, epochs=500, no_threads=2)

    assert squared_error(trained, log_counts) < 1500.0
def main():
    """Fit a co-occurrence corpus over ``itertexts()`` and save it to disk."""
    cooccurrences = Corpus()
    cooccurrences.fit(
        itertexts(),
        max_map_size=1000000,
        window=10,
    )
    cooccurrences.save('bioc-corpus-AZ2.model')
Exemplo n.º 42
0
                        help='Get closes words to this word.')
    args = parser.parse_args()


    # Stage 1 (args.create): build the co-occurrence statistics from the
    # path given in args.create and persist them to 'corpus.model'.
    if args.create:
        # Build the corpus dictionary and the cooccurrence matrix.
        print('Pre-processing corpus')

        # Choose the reader; it is called below with args.create as argument.
        if args.wiki:
            print('Using wikipedia corpus')
            get_data = read_wikipedia_corpus
        else:
            get_data = read_corpus

        corpus_cooc = Corpus()
        corpus_cooc.fit(get_data(args.create), window=10)
        corpus_cooc.save('corpus.model')
        
        print('Dict size: %s' % len(corpus_cooc.dictionary))
        print('Collocations: %s' % corpus_cooc.matrix.nnz)

    # Stage 2 (args.train): needs corpus statistics, either just built
    # above or loaded from 'corpus.model'.
    if args.train:
        # Train the GloVe model and save it to disk.

        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_cooc = Corpus.load('corpus.model')

            print('Dict size: %s' % len(corpus_cooc.dictionary))
            print('Collocations: %s' % corpus_cooc.matrix.nnz)
Exemplo n.º 43
0
                        help='Get closes words to this word.')
    args = parser.parse_args()


    # Stage 1 (args.create): build the co-occurrence statistics from the
    # path given in args.create and persist them to 'corpus.model'.
    if args.create:
        # Build the corpus dictionary and the cooccurrence matrix.
        print('Pre-processing corpus')

        # Choose the reader; it is called below with args.create as argument.
        if args.wiki:
            print('Using wikipedia corpus')
            get_data = read_wikipedia_corpus
        else:
            get_data = read_corpus

        corpus_model = Corpus()
        corpus_model.fit(get_data(args.create), window=10)
        corpus_model.save('corpus.model')
        
        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)

    # Stage 2 (args.train): needs corpus statistics, either just built
    # above or loaded from 'corpus.model'.
    if args.train:
        # Train the GloVe model and save it to disk.

        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_model = Corpus.load('corpus.model')

            print('Dict size: %s' % len(corpus_model.dictionary))
            print('Collocations: %s' % corpus_model.matrix.nnz)
Exemplo n.º 44
0
# Evaluate the remaining MLP variants on the same x, y and folds.
mlp10_accuracy = train_test(mlp10, x, y, folds)

mlp100 = mlp_model(100)
mlp100_accuracy = train_test(mlp100, x, y, folds)

mlp1000 = mlp_model(1000)
mlp1000_accuracy = train_test(mlp1000, x, y, folds)

# Report the four accuracies together as one tuple.
print((mlp1_accuracy, mlp10_accuracy, mlp100_accuracy, mlp1000_accuracy))

#3CNN
#Glove Vectors from reviews
# Split every review on whitespace: one token list per review.
c = [review.split() for review in data.data]

# Co-occurrence statistics with a 10-token context window.
corpus = Corpus()
corpus.fit(c, window=10)

# 100-component GloVe model trained for 30 epochs on 4 threads.
glv = Glove(no_components=100, learning_rate=0.05)
glv.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)

glv.add_dictionary(corpus.dictionary)

# NOTE(review): glv.dictionary is the corpus dictionary attached just above
# (presumably word -> row id), not word -> vector - confirm that the code
# consuming embeddings_index expects ids rather than embedding vectors.
embeddings_index = glv.dictionary

# NOTE(review): with BASE_DIR empty, GLOVE_DIR evaluates to '/glove.6B/',
# i.e. a path at the filesystem root - confirm this is intended.
BASE_DIR = ''
GLOVE_DIR = BASE_DIR + '/glove.6B/'
TEXT_DATA_DIR = 'txt_sentoken/'
MAX_SEQUENCE_LENGTH = 1000
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
Exemplo n.º 45
0
    delchars = ''.join(delchars)

    with open(filename, 'r') as datafile:
        for line in datafile:
            # list of tokenized words
            yield line.lower().translate(None, delchars).split(' ')


# Script entry point: train a 100-component GloVe model on data/input.txt,
# save it, and set up a few query words.
if __name__ == '__main__':

    # initialize glove object
    glove = Glove(no_components=100, learning_rate=0.05)
    
    # read in the data to train on; this file is shakespeare text
    corpus_model = Corpus()
    corpus_model.fit(read_corpus("data/input.txt"), window=10)
        
    # fit the model using the given parameters
    glove.fit(corpus_model.matrix, epochs=10, no_threads=1, verbose=True)
              
    # add a dictionary just to make it easier for similarity queries
    glove.add_dictionary(corpus_model.dictionary)

    # save glove object to file
    # NOTE(review): save_obj is presumably provided by the Glove class
    # imported in this file - confirm it exists on that class.
    glove.save_obj('glove.model.obj')
    
    # give me the 5 words most similar to each word in the words list in this 
    # corpus and show me how similar the words are in this corpus to each word
    # in the words list in general
    words = ['sky', 'queen', 'car']
    
Exemplo n.º 46
0
'''
from glove import Glove
from glove import Corpus
from gensim import corpora
import time

dic_file=r'/home/dannl/tmp/newstech/glove/news.dic'
corpus_file='/home/dannl/tmp/newstech/news.txt'
cooc_file='/home/dannl/tmp/newstech/glove/word.cooc'

def read_corpus(filename):
    """Yield, for each line of *filename*, its tokens minus the first one.

    Lines are split on whitespace. The leading token is dropped, so an
    empty or single-token line yields an empty list.
    """
    with open(filename, 'r') as handle:
        for raw_line in handle:
            tokens = raw_line.split()
            yield tokens[1:]

# Build the cooccurrence matrix and time the whole step.
oldtime=time.time()
dictionary = corpora.Dictionary.load(dic_file)

# Reuse the gensim vocabulary (token2id) instead of letting Corpus build
# its own, i.e. instead of Corpus().fit(read_corpus(corpus_file), window=10).
# ignore_missing=True skips tokens that are absent from that vocabulary.
corpus_cooc = Corpus(dictionary=dictionary.token2id)
corpus_cooc.fit(read_corpus(corpus_file), window=10, ignore_missing=True)
corpus_cooc.save(cooc_file)

print('Dict size: %s' % len(corpus_cooc.dictionary))
print('Collocations: %s' % corpus_cooc.matrix.nnz)

# Function-call form: same output on Python 2, and valid on Python 3 like
# the two print calls above.
print('time cost:%.2f' % (time.time() - oldtime,))
Exemplo n.º 47
0
def fit_corpus(corpus):
    """Fit a co-occurrence ``Corpus`` on *corpus* and return the fitted model.

    Args:
        corpus: passed straight to ``Corpus.fit``.

    Returns:
        The fitted ``Corpus`` instance. The previous version returned the
        input argument, which discarded the model it had just built.
    """
    model = Corpus()
    model.fit(corpus)

    return model
Exemplo n.º 48
0
# Load the training file line by line; each line is parsed as JSON and its
# first four positional fields are collected into parallel lists.
# (Presumably every line is a JSON array of at least four items - confirm.)
with open('yahoo_train.txt', 'r') as file:
    for line in file:
        d = json.loads(line)

        # Field order: uri, question, answer, category.
        uris.append(d[0])
        questions.append(d[1])
        answers.append(d[2])
        cats.append(d[3])

def get_lines():
    """Yield each entry of the module-level ``answers`` split on whitespace."""
    for answer in answers:
        tokens = answer.split()
        yield tokens

# Build the corpus dictionary and cooccurence matrix
corpus_model = Corpus()
corpus_model.fit(get_lines(), window=8)

print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)

# Train GloVe model, starting from the Stanford vectors file rather than
# a fresh Glove(no_components=no_comp, learning_rate=0.05).
# NOTE(review): confirm that fit() keeps the loaded vectors instead of
# re-initialising them, and that the loaded dictionary matching the corpus
# dictionary is not required before add_dictionary below.
glove = Glove.load_stanford('vectors.6B.100d.txt')
glove.fit(corpus_model.matrix, epochs=10, no_threads=4, verbose=True)
glove.add_dictionary(corpus_model.dictionary)

# Save as text: a "<vocabulary size> <dimensions>" header line, then one
# "<word> <v1> <v2> ..." line per word.
# NOTE(review): no_comp is not defined in this part of the script;
# presumably it is set earlier - confirm it matches the vector width.
with open('model.glove', 'w+') as out:
    out.write('%i %i \n' % (len(glove.dictionary), no_comp))
    # items() works on both Python 2 and 3; iteritems() is Python 2 only.
    for word, idx in glove.dictionary.items():
        out.write('%s %s \n' % (word, ' '.join(str(n) for n in glove.word_vectors[idx])))
import itertools
from gensim.models.word2vec import Text8Corpus
from glove import Corpus, Glove

# To install the text8 corpus, run the following commands:

# wget http://mattmahoney.net/dc/text8.zip -P /tmp
# unzip /tmp/text8.zip -d /tmp


# Materialise the whole text8 corpus; islice with a stop of None imposes
# no limit (replace None with a number to train on a prefix only).
sentences = list(itertools.islice(Text8Corpus('/tmp/text8'), None))

# Co-occurrence statistics with a 10-token context window.
corpus = Corpus()
corpus.fit(sentences, window=10)

# 100-component GloVe model trained for 30 epochs on 4 threads.
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

# Show the nearest neighbours of a few probe words. The function-call form
# of print gives the same output on Python 2 and is valid on Python 3.
print(glove.most_similar('frog', number=10))
print(glove.most_similar('girl', number=10))
print(glove.most_similar('car', number=10))
Exemplo n.º 50
0
# Column 3 of each row is the message text (cleaned and split on
# whitespace); column 0 is its class label.
for row in csvsequence:
    texts.append(clean(row[3]).split())
    classes.append(row[0])

# Calculate distribution, to account for 95th percentile of messages:
# mean length plus norm.ppf(0.95) standard deviations.
sentence_lengths = [len(x) for x in texts]
max_sentence_length = int(np.mean(sentence_lengths) + (norm.ppf(0.95) * np.std(sentence_lengths)))

print("Max sentence length: {}, put that in settings.json.".format(max_sentence_length))

# Use the cached corpus when it can be loaded, otherwise build and cache it.
corpus = Corpus()
try:
    print("Loading pretrained corpus...")
    corpus = Corpus.load("cache/corpus.p")
except Exception:
    # Best-effort cache: any load failure falls back to training.
    # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit propagate
    # instead of silently starting a long training run.
    print("Training corpus...")
    corpus.fit(texts, window=max_sentence_length)
    corpus.save("cache/corpus.p")

# Same load-or-train pattern for the GloVe vectors.
glove = Glove(no_components=number_components, learning_rate=0.05)
try:
    print("Loading pretrained GloVe vectors...")
    glove = Glove.load("cache/glove.p")
except Exception:
    print("Training GloVe vectors...")
    # More epochs seems to make it worse
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save("cache/glove.p")

# Convert input text
print("Vectorizing input sentences...")