Exemplo n.º 1
0
def train_glove(corpus, params, exp_id, save_dir, save_dict=False):
    """Fit a 100-component GloVe model on the co-occurrence statistics of ``corpus``.

    The co-occurrence corpus is loaded from
    ``<save_dir>/glove_dict_<exp_id>.model`` when that file exists. Otherwise
    it is built from ``corpus`` on top of the dictionary returned by
    ``load_glove_dictionary`` and, if ``save_dict`` is true, saved to that path.

    :param corpus: input handed to ``Corpus.fit``; only consumed when no cached
        corpus file exists.
    :param params: mapping read for the keys ``'window'``, ``'alpha'`` and
        ``'workers'``.
    :param exp_id: experiment identifier used in the cache file name.
    :param save_dir: directory that holds the cached corpus file.
    :param save_dict: whether a freshly built corpus is written to disk.
    :return: the fitted ``Glove`` model with the corpus dictionary attached.
    """
    vocab = load_glove_dictionary(exp_id, save_dir)
    print('Pre-processing corpus')

    cache_file = os.path.join(save_dir, 'glove_dict_{}.model'.format(exp_id))
    if not os.path.exists(cache_file):
        cooc = Corpus(vocab)
        # The co-occurrence window is twice the configured 'window' value.
        cooc.fit(corpus, window=params['window'] * 2, ignore_missing=True)
        if save_dict:
            cooc.save(cache_file)
    else:
        cooc = Corpus.load(cache_file)

    print('Dict size: %s' % len(cooc.dictionary))
    print('Collocations: %s' % cooc.matrix.nnz)

    model = Glove(no_components=100, learning_rate=params['alpha'])
    model.fit(cooc.matrix, epochs=50, no_threads=params['workers'], verbose=True)
    model.add_dictionary(cooc.dictionary)
    return model
def main():
    """Train a GloVe model on the saved 'bioc-corpus-AZ2.model' corpus and save it.

    Loads the co-occurrence corpus from disk, fits a 100-component model for
    10 epochs on 16 threads, attaches the corpus dictionary and writes the
    result to 'bioc-glove-AZ2.model'.
    """
    # Corpus.load is called on the class, so no throwaway Corpus() instance
    # has to be created first.
    corpus_model = Corpus.load('bioc-corpus-AZ2.model')
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=16, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('bioc-glove-AZ2.model')
Exemplo n.º 3
0
def build_model_glove(args):
    """Build or load the co-occurrence corpus, then fit and return a GloVe model.

    The corpus file ``args.corpus_model`` is rebuilt from ``get_sentences(args)``
    when it does not exist or when any path in ``args.input`` has a
    modification time at least as recent as the corpus file; otherwise it is
    loaded from disk. Model hyper-parameters come from ``CONFIG['glove']``.

    :param args: object providing ``corpus_model``, ``input``, ``workers`` and
        ``verbose``.
    :return: the fitted ``Glove`` model with the corpus dictionary attached.
    """
    corpus_file = args.corpus_model
    needs_rebuild = not os.path.exists(corpus_file) or \
        max(map(os.path.getmtime, args.input)) >= os.path.getmtime(corpus_file)

    if needs_rebuild:
        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')
        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(corpus_file)
    else:
        # Reuse the corpus that is already on disk.
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(corpus_file)

    # Both branches report the same statistics once the corpus is available.
    logging.info('Dict size: %s' % len(corpus_model.dictionary))
    logging.info('Collocations: %s' % corpus_model.matrix.nnz)

    # Train the GloVe model.
    logging.info('Training the GloVe model')

    glove = Glove(no_components=CONFIG['glove']['size'],
                  learning_rate=CONFIG['glove']['learning_rate'])
    glove.fit(corpus_model.matrix,
              epochs=CONFIG['glove']['epochs'],
              no_threads=args.workers,
              verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
Exemplo n.º 4
0
def build_model_glove(args):
    """Return a GloVe model fitted on a cached or freshly built corpus.

    The corpus stored at ``args.corpus_model`` is reused only when it exists
    and is strictly newer than every path in ``args.input``; in every other
    case it is rebuilt from ``get_sentences(args)`` and saved again.
    Hyper-parameters are read from ``CONFIG['glove']``.

    :param args: object providing ``corpus_model``, ``input``, ``workers`` and
        ``verbose``.
    :return: the fitted ``Glove`` model with the corpus dictionary attached.
    """
    from glove import Glove, Corpus

    corpus_path = args.corpus_model
    cache_is_fresh = os.path.exists(corpus_path) and \
        max(map(os.path.getmtime, args.input)) < os.path.getmtime(corpus_path)

    if cache_is_fresh:
        # Load the corpus statistics from disk.
        logging.info('Reading corpus statistics')
        cooc = Corpus.load(corpus_path)
    else:
        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')
        cooc = Corpus()
        cooc.fit(get_sentences(args), window=CONFIG['glove']['window'])
        cooc.save(corpus_path)

    logging.info('Dict size: %s' % len(cooc.dictionary))
    logging.info('Collocations: %s' % cooc.matrix.nnz)

    # Train the GloVe model.
    logging.info('Training the GloVe model')

    model = Glove(no_components=CONFIG['glove']['size'],
                  learning_rate=CONFIG['glove']['learning_rate'])
    model.fit(cooc.matrix,
              epochs=CONFIG['glove']['epochs'],
              no_threads=args.workers,
              verbose=args.verbose)
    model.add_dictionary(cooc.dictionary)
    return model
def main():
    """Fit GloVe vectors on the stored 'bioc-corpus-AZ2.model' corpus.

    The corpus is read from disk, a 100-component model is trained for
    10 epochs using 16 threads, the corpus dictionary is attached, and the
    model is saved as 'bioc-glove-AZ2.model'.
    """
    # Corpus.load is invoked on the class itself; constructing an empty
    # Corpus() beforehand only created an object that was discarded.
    corpus_model = Corpus.load('bioc-corpus-AZ2.model')
    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=10, no_threads=16, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('bioc-glove-AZ2.model')
Exemplo n.º 6
0
def generate_glove_map():
    """
        Train GloVe on the saved corpus and dump a {word: vector} map.

        Reads the module-level settings article_info_path, corpus_path,
        embedding_dimension and output_path. The vocabulary is the set of
        space-separated tokens found in the 'sentence_header' entries of the
        article info JSON; articles missing either 'sentence_header' or
        'sentence_body' are skipped. The resulting dict (word -> list of
        floats) is pickled to output_path.

        :return: none
    """
    write_log('GloVe Load article info : Start')
    with open(article_info_path, 'r') as f_art:
        article_info = json.load(f_art)
    write_log('GloVe Load article info : End')

    write_log('GloVe Generate set of words : Start')
    words = set()
    for dict_info in article_info.values():
        sentence_header = dict_info.get('sentence_header')
        sentence_body = dict_info.get('sentence_body')

        # Identity check: only a genuinely missing entry skips the article.
        if sentence_header is None or sentence_body is None:
            continue

        # Only header sentences contribute to the vocabulary; the body is
        # merely required to be present.
        for sentence in sentence_header:
            words.update(sentence.split(' '))

    write_log('GloVe Generate set of words - {}  : End'.format(len(words)))

    write_log('GloVe Load corpus from {}: Start'.format(corpus_path))
    corpus = Corpus.load(corpus_path)
    write_log('GloVe Load corpus : End')

    write_log('GloVe learning : Start')
    glove = Glove(no_components=embedding_dimension, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=400, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    write_log('GloVe learning : End')

    dict_a2g = {}
    for word in words:
        # NOTE(review): this lookup raises KeyError for a header word that is
        # not in the corpus dictionary -- confirm the corpus covers all headers.
        word_vector = glove.word_vectors[glove.dictionary[word]].tolist()
        assert (len(word_vector) == embedding_dimension)
        dict_a2g[word] = word_vector

    write_log('GloVe result dump : Start')
    with open(output_path, 'wb') as f_out:
        pickle.dump(dict_a2g, f_out)
    write_log('GloVe result dump : End')
Exemplo n.º 7
0
def create_glove_corpus():
    """Return the GloVe co-occurrence corpus, building and caching it if needed.

    The corpus lives at '<folder>/Embedding/glove/glove.corpus' (``folder`` is
    a module-level name). When the file exists it is loaded; otherwise a new
    corpus is fitted on the module-level ``embedding_corpus`` with a window of
    10 and saved to that path.

    :return: the loaded or newly built ``Corpus``.
    """
    corpus_path = '%s/Embedding/glove/glove.corpus'%(folder)
    if os.path.exists(corpus_path):
        corpus = Corpus.load(corpus_path)
        print('Already get glove corpus')
        return corpus

    # Create the GloVe corpus object and fit it once with the matrix scan
    # window size. The original fitted twice, which repeated the whole pass
    # and would refit on an exhausted input if embedding_corpus is a
    # one-shot iterator.
    corpus = Corpus()
    corpus.fit(embedding_corpus, window=10)
    print('Dict size: %s' % len(corpus.dictionary))
    print('Collocations: %s' % corpus.matrix.nnz)
    corpus.save(corpus_path)  # save the dictionary / co-occurrence corpus
    return corpus
Exemplo n.º 8
0
def get_glove_corpus_model(setting):
    """Load the cached GloVe corpus model for ``setting`` or build and cache it.

    The cache file is 'models/<setting_string>__glove_corpus_model'. It is
    reused unless the module-level ``force_gen`` flag is set or the file is
    missing. When building, tokens that are not keys of the processed
    token-to-index map are dropped from every item before fitting.

    :param setting: mapping expanded into ``setting_string(**setting)`` and read
        for ``'granularity'``, ``'token_method'`` and ``'data_basis'``.
    :return: the loaded or newly fitted ``Corpus``.
    :raises ValueError: if ``setting['granularity']`` is neither ``'documents'``
        nor ``'paragraphs'``.
    """
    prefix = setting_string(**setting)
    model_path = "models/" + prefix + "__glove_corpus_model"

    if not force_gen and os.path.isfile(model_path):
        return Corpus.load(model_path)

    # Close the JSON file deterministically instead of leaking the handle.
    with open("derived_data/" + prefix + "__processed_token2index_map.json") as map_file:
        token2index_map = json.load(map_file)

    granularity = setting['granularity']
    if granularity == 'documents':
        item_generator = get_all_documents_as_token_list(setting['token_method'], setting['data_basis'])
    elif granularity == 'paragraphs':
        item_generator = get_all_docs_paragrahps_as_token_list(setting['token_method'], setting['data_basis'])
    else:
        # A bare `raise` here had no active exception to re-raise and only
        # produced an unrelated, message-less error.
        raise ValueError("unsupported granularity: {!r}".format(granularity))

    # Element 1 of each generated item is the sequence of tokens to filter.
    corpus = ([token for token in doc[1] if token in token2index_map]
              for doc in item_generator)
    corpus_model = Corpus(dictionary=token2index_map)
    corpus_model.fit(corpus)
    corpus_model.save(model_path)

    return corpus_model
# Tokenised question pairs: qs/ts hold the tokens of question1/question2,
# and sentences holds both interleaved as the GloVe training input.
qs = []
ts = []
ds = []
sentences = []
for q, t in zip(data_all['question1'].values.tolist(),
                data_all['question2'].values.tolist()):
    sentences.append(q.split(' '))
    sentences.append(t.split(' '))
    qs.append(q.split(' '))
    ts.append(t.split(' '))

# Build the co-occurrence corpus and persist it.
corpus_model = Corpus()
corpus_model.fit(sentences, window=10)
corpus_model.save(path + 'corpus.mdl')

# Reload from disk so the rest of the script works on the saved artefact.
corpus_model = Corpus.load(path + 'corpus.mdl')

# Train 200-component GloVe vectors, save them, then reload the saved model.
glove = Glove(no_components=200, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=10, no_threads=7, verbose=True)
glove.add_dictionary(corpus_model.dictionary)
glove.save(path + 'glove.glv')
glove = Glove.load(path + 'glove.glv')
# Call form of print: same output on Python 2, and valid syntax on Python 3.
print(glove)

qt_sims_dists = []
qt_diff = []


def calc_cosine_dist(text_a, text_b, metric='euclidean'):
    """Return the distance between two single vectors under ``metric``.

    Each argument is wrapped in a one-row list, passed to
    ``pairwise_distances``, and the only entry of the result is returned.

    NOTE(review): despite the function name, the default metric is
    'euclidean'; pass ``metric='cosine'`` explicitly for a cosine distance.
    """
    distance_matrix = pairwise_distances([text_a], [text_b], metric=metric)
    first_row = distance_matrix[0]
    return first_row[0]
Exemplo n.º 10
0
def train_glove(inst, meta_data=None):
    """Train one GloVe model per parameter combination and record the results.

    For every combination in ``ParameterGrid(settings.GLOVE_PARAMS)`` a
    co-occurrence corpus is loaded from (or built and saved to)
    ``settings.WVEC_OPT_DIRP``, a GloVe model is fitted and saved there, and
    the model is queried for the words in ``settings.TESTSIM_WORDS``.

    :param inst: input handed to ``Corpus.fit`` whenever a corpus for a given
        window size is not cached on disk yet.
    :param meta_data: optional dict to record results in; a new dict is created
        when omitted. The keys written are ``"glove_params"``,
        ``"models"[model_name]``, ``"most_similar"[word][model_name]`` and
        ``"glove_duration_training"``.
    :return: the updated ``meta_data`` dict.
    """
    # A mutable default would be shared between calls, so create the dict here.
    if meta_data is None:
        meta_data = {}
    # The loop writes into these sub-dicts; make sure they exist so a caller
    # does not have to pre-populate them.
    meta_data.setdefault("models", {})
    meta_data.setdefault("most_similar", {})

    start_total = datetime.now()

    meta_data["glove_params"] = settings.GLOVE_PARAMS

    glove_paramgrid = ParameterGrid(settings.GLOVE_PARAMS)

    for params in glove_paramgrid:

        start = datetime.now()
        # MAKE CORPUS
        # The corpus only depends on the window size, so it is cached per window.
        corpus_fp = os.path.join(settings.WVEC_OPT_DIRP, '{}_window{}.glovecorpus'.format(
            settings.DATASET,
            params["window"]))
        # load if corpus exists
        if os.path.isfile(corpus_fp):
            logging.info("Loading existing corpus {}.".format(corpus_fp))
            corpus_model = Corpus.load(corpus_fp)
            logging.info("Successfully loaded existing corpus {}.".format(corpus_fp))
        # make a new cooccurrence corpus if it does not exist
        else:
            logging.info("Creating new corpus at {}.".format(corpus_fp))
            corpus_model = Corpus()
            corpus_model.fit(inst, window=params["window"])
            os.makedirs(settings.WVEC_OPT_DIRP, exist_ok=True)
            corpus_model.save(corpus_fp)

        logging.info("Dict size: {}.".format(len(corpus_model.dictionary)))
        logging.info("Collocations: {}.".format(corpus_model.matrix.nnz))

        # GLOVE VECTOR TRAINING
        glove = Glove(no_components=params["dims"], learning_rate=params["lr"])

        logging.info("Start fitting GloVe with parameters: {}.".format(params))
        glove.fit(corpus_model.matrix, epochs=params["epochs"],
                  no_threads=params["njobs"], verbose=False)
        glove.add_dictionary(corpus_model.dictionary)

        os.makedirs(settings.WVEC_OPT_DIRP, exist_ok=True)
        model_name = 'glove.{}_w{}_lr{}_ep{}.{}d.glovemodel'.format(settings.DATASET,
                                                                    params["window"],
                                                                    params["lr"],
                                                                    params["epochs"],
                                                                    params["dims"])
        glove.save(os.path.join(settings.WVEC_OPT_DIRP, model_name))

        duration = (datetime.now() - start).total_seconds()
        # The params dict itself is stored, so the duration added on the next
        # line also shows up in the log message below.
        meta_data["models"][model_name] = params
        meta_data["models"][model_name]["duration_training"] = duration

        logging.info("Finished fitting GloVe {} in {}s with parameters: {}.".format(
            model_name,
            duration,
            params))
        # SIMILARITY TEST
        for test_word in settings.TESTSIM_WORDS:
            if test_word not in meta_data["most_similar"]:
                meta_data["most_similar"][test_word] = {}

            logging.info("Querying model {} for {} most similar to \'{}\':".format(
                model_name,
                settings.N_TESTSIM,
                test_word))
            sim = glove.most_similar(test_word, number=settings.N_TESTSIM)
            meta_data["most_similar"][test_word][model_name] = sim

            logging.info(pprint.pformat(sim))

    total_duration = (datetime.now() - start_total).total_seconds()
    meta_data["glove_duration_training"] = total_duration

    return meta_data
Exemplo n.º 11
0
            get_data = read_corpus

        corpus_cooc = Corpus()
        corpus_cooc.fit(get_data(args.create), window=10)
        corpus_cooc.save('corpus.model')
        
        print('Dict size: %s' % len(corpus_cooc.dictionary))
        print('Collocations: %s' % corpus_cooc.matrix.nnz)

    if args.train:
        # Train the GloVe model and save it to disk.

        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_cooc = Corpus.load('corpus.model')

            print('Dict size: %s' % len(corpus_cooc.dictionary))
            print('Collocations: %s' % corpus_cooc.matrix.nnz)

        print('Training the GloVe model')

        glove = Glove(no_components=100, learning_rate=0.05)
        glove.fit(corpus_cooc.matrix, epochs=int(args.train),
                  no_threads=args.parallelism, verbose=True)
        glove.add_dictionary(corpus_cooc.dictionary)

        glove.save('glove.model')

    if args.query:
        # Finally, query the model for most similar words.
def build_model():
    """Command-line entry point: pre-process a corpus, train GloVe, query it.

    Options (all optional):
      --create/-c FILE   read FILE with ``read_data``, build the co-occurrence
                         corpus and save it to 'corpus.model'.
      --train/-t N       train a 50-component GloVe model for N epochs and save
                         it to 'glove.model'; the corpus is loaded from
                         'corpus.model' unless --create was also given.
      --parallelism/-p N number of training threads.
      --query/-q WORD    print the 10 words most similar to WORD, loading
                         'glove.model' unless --train was also given.
    """
    # Set up command line parameters.
    parser = argparse.ArgumentParser(description='Fit a GloVe model.')

    parser.add_argument('--create',
                        '-c',
                        action='store',
                        default=None,
                        help=('The filename of the corpus to pre-process. '
                              'The pre-processed corpus will be saved '
                              'and will be ready for training.'))
    # type=int: command-line values arrive as strings; parsing them here
    # rejects non-numeric input with a proper usage error.
    parser.add_argument(
        '--train',
        '-t',
        action='store',
        type=int,
        default=0,
        help=('Train the GloVe model with this number of epochs.'
              'If not supplied, '
              'We\'ll attempt to load a trained model'))

    # type=int: without it a value given on the command line reached
    # Glove.fit(no_threads=...) as a string.
    parser.add_argument(
        '--parallelism',
        '-p',
        action='store',
        type=int,
        default=1,
        help=('Number of parallel threads to use for training'))

    parser.add_argument('--query',
                        '-q',
                        action='store',
                        default='',
                        help='Get closes words to this word.')
    args = parser.parse_args()

    if args.create:
        # Build the corpus dictionary and the cooccurrence matrix.

        print('Pre-processing corpus')
        data = read_data(args.create)
        corpus_model = Corpus()
        corpus_model.fit(data, window=10)
        corpus_model.save('corpus.model')

        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)

    if args.train:

        # Train the GloVe model and save it to disk.
        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_model = Corpus.load('corpus.model')

            print('Dict size: %s' % len(corpus_model.dictionary))
            print('Collocations: %s' % corpus_model.matrix.nnz)

        print('Training the GloVe model')

        glove = Glove(no_components=50, learning_rate=0.05)
        glove.fit(corpus_model.matrix,
                  epochs=args.train,
                  no_threads=args.parallelism,
                  verbose=True)
        glove.add_dictionary(corpus_model.dictionary)
        glove.save('glove.model')

    if args.query:
        # Finally, query the model for most similar words.
        if not args.train:
            print('Loading pre-trained GloVe model')
            glove = Glove.load('glove.model')

        print('Querying for %s' % args.query)
        pprint.pprint(glove.most_similar(args.query, number=10))
Exemplo n.º 13
0
# Command line: only the number of training epochs is configurable.
parser = argparse.ArgumentParser()
parser.add_argument('--epochs', default=25, type=int)
args = parser.parse_args()

DATA = 'data/articles.csv'
# Plain string literal: there is nothing to interpolate, so no f-string.
PATH = 'custom'
CORPUS_PATH = os.path.join(PATH, 'corpus.pkl')
SEED = 34985734958
epochs = args.epochs

# Create the output directory without a separate check-then-create step.
os.makedirs(PATH, exist_ok=True)

if os.path.exists(CORPUS_PATH):
    print('Found existing corpus.')
    corpus = Corpus.load(CORPUS_PATH)
else:
    print('Could not find existing corpus. Creating new one.')

    class Iterable:
        """Re-iterable view over one DataFrame column, yielding preprocess(value)."""

        def __init__(self, df, col='text'):
            self.df = df
            self.col = col

        def __iter__(self):
            for article in self.df[self.col].values:
                yield preprocess(article)

    corpus = Corpus()
    start = time.time()
    corpus.fit(Iterable(pd.read_csv(DATA)))
Exemplo n.º 14
0
# Column 3 of each row is the message text, column 0 its class label.
texts = []
classes = []
for row in csvsequence:
    texts.append(clean(row[3]).split())
    classes.append(row[0])

# Calculate distribution, to account for 95th percentile of messages:
# mean length plus the 0.95 normal quantile times the standard deviation.
text_lengths = [len(x) for x in texts]
max_sentence_length = int(np.mean(text_lengths) + (norm.ppf(0.95) * np.std(text_lengths)))

print("Max sentence length: {}, put that in settings.json.".format(max_sentence_length))

# Best-effort cache: any ordinary failure to load falls back to retraining.
# `except Exception` (rather than a bare except) lets Ctrl-C and SystemExit
# propagate instead of silently starting a training run.
try:
    print("Loading pretrained corpus...")
    corpus = Corpus.load("cache/corpus.p")
except Exception:
    print("Training corpus...")
    corpus = Corpus()
    corpus.fit(texts, window=max_sentence_length)
    corpus.save("cache/corpus.p")

try:
    print("Loading pretrained GloVe vectors...")
    glove = Glove.load("cache/glove.p")
except Exception:
    print("Training GloVe vectors...")
    glove = Glove(no_components=number_components, learning_rate=0.05)
    # More epochs seems to make it worse
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save("cache/glove.p")
Exemplo n.º 15
0
            get_data = read_corpus

        corpus_model = Corpus()
        corpus_model.fit(get_data(args.create), window=10)
        corpus_model.save('corpus.model', 'corpus.pmi')

        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)

    if args.train:
        # Train the GloVe model and save it to disk.

        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_model = Corpus.load('corpus.model')

            print('Dict size: %s' % len(corpus_model.dictionary))
            print('Collocations: %s' % corpus_model.matrix.nnz)
            # import pdb; pdb.set_trace()

        print('Training the GloVe model')

        glove = Glove(no_components=100, learning_rate=0.05)
        glove.fit(corpus_model.matrix,
                  epochs=int(args.train),
                  no_threads=args.parallelism,
                  verbose=True)
        glove.add_dictionary(corpus_model.dictionary)

        glove.save('glove.model')
Exemplo n.º 16
0
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=100, no_threads=10, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save('%s/Embedding/glove/glove.model' % (folder))  # 存模型
    corpus_model.save('%s/Embedding/glove/corpus.model' % (folder))  # 存字典

    #透過gensim以text_data建立字典
    dictionary = corpora.Dictionary(embedding_corpus)
    dictionary.save('%s/Embedding/glove/dictionary.gensim' % (folder))

glove = Glove.load('%s/Embedding/glove/glove.model' % (folder))
corpus_model = Corpus.load('%s/Embedding/glove/corpus.model' % (folder))
dictionary = gensim.corpora.Dictionary.load(
    '%s/Embedding/glove/dictionary.gensim' % (folder))

# write vocab to file
vocab_file = "%s/Embedding/glove/word.vocab" % (folder)
if not os.path.exists(vocab_file):
    #     vocab_count = len(glove.dictionary)
    vocab_count = 0
    print("Writing vocab file...")
    with open(vocab_file, 'w', encoding='utf-8') as writer:
        for word, idx in glove.dictionary.items():
            try:
                word_id = dictionary.token2id[word]
                word_freq = dictionary.dfs[word_id]
                if word_freq < 2: continue
Exemplo n.º 17
0
    corpus_model.fit(sentences, window=10)
    #corpus_model.save('corpus.model')
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    
    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=100,
              no_threads=10, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    
    glove.save('Embedding/main_cat/glove/glove.model') # 存模型
    corpus_model.save('Embedding/main_cat/glove/corpus.model') # 存字典


glove = Glove.load('Embedding/main_cat/glove/glove.model')
corpus_model = Corpus.load('Embedding/main_cat/glove/corpus.model')


# In[ ]:


vocab_file = "Embedding/main_cat/glove/word.vocab"

if not os.path.exists(vocab_file):
#     vocab_count = len(glove.dictionary)    
    vocab_count = 0
    print("Writing vocab file...")
    with open(vocab_file, 'w',encoding='utf-8') as writer:
        for word,idx in glove.dictionary.items():
            if word in vocab._word_to_id.keys():
                vocab_count += 1
Exemplo n.º 18
0
#-*- coding:utf-8 -*-
'''
Created on 2016-3-12

@author: dannl
'''
from glove import Glove
from glove import Corpus
import time

# Input: saved co-occurrence corpus. Output: trained GloVe model.
cooc_file = '/home/dannl/tmp/newstech/glove/word.cooc'
model_file = '/home/dannl/tmp/newstech/glove/glove.model'

# Timestamp taken before any work starts.
oldtime = time.time()

# Load the co-occurrence matrix together with its dictionary.
corpus_cooc = Corpus.load(cooc_file)

# Fit 100-component vectors for 5 epochs on 4 threads, attach the dictionary
# and write the model to disk.
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(
    corpus_cooc.matrix,
    epochs=5,
    no_threads=4,
    verbose=True,
)
glove.add_dictionary(corpus_cooc.dictionary)
glove.save(model_file)

print('Dict size: %s' % len(corpus_cooc.dictionary))
Exemplo n.º 19
0
# Command line: the artist to query, plus optional corpus / model file overrides.
parser = argparse.ArgumentParser(description='Related artists PCA demo')
parser.add_argument('QUERY', action='store', default='', help='Demo PCA using this artist')
parser.add_argument('--corpus', '-c', default=CORPUS_FILE, help='Specify corpus file to read')
parser.add_argument('--glove', '-g', default=GLOVE_MODEL_FILE, help='Specify glove model file to read')
args = parser.parse_args()

CORPUS_FILE = args.corpus
GLOVE_MODEL_FILE = args.glove

if not os.path.exists(RESULT_DIR):
    os.mkdir(RESULT_DIR)

# MAIN
# A missing input file is an error, so exit with a non-zero status; the
# original exited with 0, which reports success to the calling shell.
if os.path.exists(CORPUS_FILE):
    print('[{}] Reading corpus from file...'.format(chalk.yellow(CORPUS_FILE)))
    corpus = Corpus.load(CORPUS_FILE)
else:
    print('[{}] Error reading corpus file.'.format(chalk.red(CORPUS_FILE)))
    raise SystemExit(1)

if os.path.exists(GLOVE_MODEL_FILE):
    print('[{}] Reading glove model from file...'.format(chalk.yellow(GLOVE_MODEL_FILE)))
    glove = Glove.load(GLOVE_MODEL_FILE)
else:
    print('[{}] Error reading glove file.'.format(chalk.red(GLOVE_MODEL_FILE)))
    raise SystemExit(1)

# Word-vector matrix and word -> row-index dictionary of the loaded model.
matrix = glove.word_vectors
dictionary = glove.dictionary

if args.QUERY not in dictionary:
Exemplo n.º 20
0
    corpus_model = Corpus()
    corpus_model.fit(sentences, window=10)
    #corpus_model.save('corpus.model')
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=100, no_threads=10, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save('Embedding/category/glove/glove.model')  # 存模型
    corpus_model.save('Embedding/category/glove/corpus.model')  # 存字典

glove = Glove.load('Embedding/category/glove/glove.model')
corpus_model = Corpus.load('Embedding/category/glove/corpus.model')

# In[56]:

vocab_file = "Embedding/category/glove/word.vocab"

if not os.path.exists(vocab_file):
    #     vocab_count = len(glove.dictionary)
    vocab_count = 0
    print("Writing vocab file...")
    with open(vocab_file, 'w', encoding='utf-8') as writer:
        for word, idx in glove.dictionary.items():
            if word in vocab._word_to_id.keys():
                vocab_count += 1
                writer.write(word + ' ' + str(idx) +
                             '\n')  # Output vocab count
Exemplo n.º 21
0
 def load_corpus_from_model(self):
     """Load the corpus at ``args.corpus_model_path`` into ``self.corpus_model``.

     Prints the dictionary size and the number of stored co-occurrence
     entries of the loaded corpus. ``args`` is read from the enclosing scope.
     """
     print('Reading corpus statistics...')
     self.corpus_model = Corpus.load(args.corpus_model_path)
     dict_size = len(self.corpus_model.dictionary)
     print('Dict size: %s' % dict_size)
     collocations = self.corpus_model.matrix.nnz
     print('Collocations: %s' % collocations)
Exemplo n.º 22
0
            get_data = read_corpus

        corpus_model = Corpus()
        corpus_model.fit(get_data(args.create), window=10)
        corpus_model.save('corpus.model')
        
        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)

    if args.train:
        # Train the GloVe model and save it to disk.

        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_model = Corpus.load('corpus.model')

            print('Dict size: %s' % len(corpus_model.dictionary))
            print('Collocations: %s' % corpus_model.matrix.nnz)

        print('Training the GloVe model')

        glove = Glove(no_components=100, learning_rate=0.05)
        glove.fit(corpus_model.matrix, epochs=int(args.train),
                  no_threads=args.parallelism, verbose=True)
        glove.add_dictionary(corpus_model.dictionary)

        glove.save('glove.model')

    if args.query:
        # Finally, query the model for most similar words.
Exemplo n.º 23
0
from glove import Glove, Corpus

# Path of the raw input text (presumably one space-separated sentence per
# line, as get_text expects -- TODO confirm).
inputFile = "/media/charles/data/nlp/zzz1000"
# Saved co-occurrence corpus that the training step loads.
corpusModelFile = "/media/charles/data/nlp/corpus_wiki.model"
# Destination of the trained GloVe model.
outputFile = "/media/charles/data/nlp/glove_wiki.model"
# Training epochs and thread count passed to Glove.fit.
epochs = 10
nb_threads = 4


def get_text(fin):
    """Yield each line of the file ``fin`` as a list of space-separated tokens.

    Only the trailing newline is removed before splitting, so a final line
    without a newline keeps its last character. The file is closed once the
    generator is exhausted or discarded.

    :param fin: path of the text file to read.
    :return: generator of token lists, one per line.
    """
    with open(fin) as f:
        for line in f:
            # rstrip('\n') instead of line[:-1]: the latter cut a real
            # character off a last line that has no trailing newline.
            yield line.rstrip('\n').split(' ')
        
# The co-occurrence corpus is expected to exist already. It was produced by the
# (now disabled) step: Corpus().fit(get_text(inputFile), window=10) followed by
# save(corpusModelFile).
corpus_model = Corpus.load(corpusModelFile)

# Fit 200-component vectors on the stored co-occurrence matrix.
print("fitting model...")
glove = Glove(no_components=200, learning_rate=0.05)
glove.fit(
    corpus_model.matrix,
    epochs=epochs,
    no_threads=nb_threads,
    verbose=True,
)

# Attach the word dictionary so the saved model can be queried by word.
glove.add_dictionary(corpus_model.dictionary)
print("saving model to "+outputFile+" ...")
glove.save(outputFile)
Exemplo n.º 24
0
from __future__ import print_function
import argparse
import pprint
import gensim

from glove import Glove
from glove import Corpus

def _train_from_saved_corpus():
    """Load 'corpus.model', fit a 100-component GloVe model and save 'glove.model'."""
    print('Reading corpus statistics')
    cooc = Corpus.load('corpus.model')
    print('Dict size: %s' % len(cooc.dictionary))
    print('Collocations: %s' % cooc.matrix.nnz)

    print('Training the GloVe model')
    model = Glove(no_components=100, learning_rate=0.05)
    model.fit(cooc.matrix, epochs=10, verbose=True)
    model.add_dictionary(cooc.dictionary)
    model.save('glove.model')
    print('Training finished')


if __name__ == '__main__':
    _train_from_saved_corpus()
Exemplo n.º 25
0
 def load(glove_corpus_filename, d, p=None):
     """Build an ``SVDModelFromGloVeCorpus`` from a saved GloVe corpus file.

     The stored co-occurrence matrix is densified and added to its own
     transpose before being handed over as a plain array, together with the
     corpus dictionary and the ``d`` and ``p`` arguments.
     """
     cooc = Corpus.load(glove_corpus_filename)
     # Per the original author's note the stored matrix is upper triangular
     # with a zero diagonal, so adding the transpose makes it symmetric.
     upper = cooc.matrix.todense()
     symmetric = upper + upper.T
     dense_array = np.asarray(symmetric)
     return SVDModelFromGloVeCorpus(dense_array, cooc.dictionary, d, p)