Example #1
def build_model_glove(args):

    from glove import Glove, Corpus

    if not os.path.exists(args.corpus_model) or \
            max(map(os.path.getmtime, args.input)) >= os.path.getmtime(args.corpus_model):

        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')

        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(args.corpus_model)

        logging.info('Dict size: %s' % len(corpus_model.dictionary))
        logging.info('Collocations: %s' % corpus_model.matrix.nnz)
    else:
        # Try to load a corpus from disk.
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(args.corpus_model)

        logging.info('Dict size: %s' % len(corpus_model.dictionary))
        logging.info('Collocations: %s' % corpus_model.matrix.nnz)

    # Train the GloVe model and save it to disk.
    logging.info('Training the GloVe model')

    glove = Glove(no_components=CONFIG['glove']['size'], learning_rate=CONFIG['glove']['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=CONFIG['glove']['epochs'],
              no_threads=args.workers, verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
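
For reference, the returned model can be queried directly once add_dictionary has been attached; a minimal usage sketch (args and the query token are assumptions):

glove = build_model_glove(args)
# nearest neighbours in the embedding space
print(glove.most_similar('dog', number=5))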
Example #2
def train_glove(target_group, glove_para, src_file, save_model_name):
    """
    example: train_glove(target_group='words', glove_para=glove_para_word)
    after save the mode, u can use it by : glove_ana = Glove.load('glove_words.model')
    :param target_group: 'words' or 'chars'
    :param glove_para: glove_para_word = {'window_size':4, 'no_components':300, 'learning_rate':0.05, 'no_epochs':2, 'parallelism':4}
    :return:
    """
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(src_file=src_file,
                                 words_or_chars=target_group),
                     window=glove_para['window_size']
                     )  # avg word size is 6 for each sentence
    corpus_model.save('corpus_model_{}.model'.format(target_group))
    print(target_group)
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    print('Training the GloVe model')

    glove = Glove(no_components=glove_para['no_components'],
                  learning_rate=glove_para['learning_rate'])
    glove.fit(corpus_model.matrix,
              epochs=glove_para['no_epochs'],
              no_threads=glove_para['parallelism'],
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save(save_model_name)
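
As the docstring notes, the saved model can be reloaded later; a small sketch (the token 'word' is a placeholder):

from glove import Glove

glove_ana = Glove.load('glove_words.model')
vec = glove_ana.word_vectors[glove_ana.dictionary['word']]  # look up one embedding
print(glove_ana.most_similar('word', number=5))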
Example #3
def train_glove(corpus, params, exp_id, save_dir, save_dict=False):
    dictionary = load_glove_dictionary(exp_id, save_dir)
    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')

    dict_path = os.path.join(save_dir, 'glove_dict_{}.model'.format(exp_id))
    if os.path.exists(dict_path):
        corpus_model = Corpus.load(dict_path)
    else:
        corpus_model = Corpus(dictionary)
        corpus_model.fit(corpus,
                         window=params['window'] * 2,
                         ignore_missing=True)
        if save_dict:
            corpus_model.save(dict_path)

    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    glove = Glove(no_components=100, learning_rate=params['alpha'])
    glove.fit(corpus_model.matrix,
              epochs=50,
              no_threads=params['workers'],
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
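
load_glove_dictionary is not shown in this example; a plausible stub, assuming the vocabulary was saved as a JSON token-to-index map (file name and format are assumptions):

import json
import os

def load_glove_dictionary(exp_id, save_dir):
    # hypothetical helper: read a token -> index mapping saved as JSON
    with open(os.path.join(save_dir, 'glove_vocab_{}.json'.format(exp_id))) as f:
        return json.load(f)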
Example #4
class GloVeFilter(object):
    def __init__(self):
        # Corpus model
        vocab = dict(torch.load("../data/dialogue.vocab.pt"))
        self.corpus_model = Corpus(dictionary=vocab['tgt'].stoi)
        # Model
        self.glove = Glove(no_components=args.no_components,
                           learning_rate=args.learning_rate)

    def load_corpus_from_txt(self):
        print('Reading corpus statistics...')
        #texts = [self.pp.preprocessing(l.strip().decode("utf8", "ignore")) for l in open(args.data_path)]
        texts = [
            l.strip().decode("utf8", "ignore").split(" ")
            for l in open(args.data_path)
        ]
        self.corpus_model.fit(texts, window=args.window, ignore_missing=True)
        self.corpus_model.save(args.corpus_model_path)
        print('Dict size: %s' % len(self.corpus_model.dictionary))
        print('Collocations: %s' % self.corpus_model.matrix.nnz)

    def load_corpus_from_model(self):
        print('Reading corpus statistics...')
        self.corpus_model = Corpus.load(args.corpus_model_path)
        print('Dict size: %s' % len(self.corpus_model.dictionary))
        print('Collocations: %s' % self.corpus_model.matrix.nnz)

    def train(self):
        print('Training the GloVe model...')
        self.glove.fit(self.corpus_model.matrix,
                       epochs=args.epochs,
                       verbose=True)
        self.glove.add_dictionary(self.corpus_model.dictionary)
        self.glove.save(args.model_path)
        print('Training finished')
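
A hypothetical driver for the class above (the module-level args object is assumed to be configured as in the snippet):

flt = GloVeFilter()
flt.load_corpus_from_txt()   # or flt.load_corpus_from_model() to reuse saved statistics
flt.train()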
Example #5
def train_glove_fashionrec(dimensionality, context, epochs):
    """ Train with Glove on IG corpora"""
    total_count, vocab_size = corpus_stats("data/clean2_corpus.txt")
    print("total word count: {}, vocabulary size: {}".format(
        total_count, vocab_size))
    fileName = "results/training/glove_fashion_epochs" + str(
        epochs) + "_d" + str(dimensionality) + "_c" + str(
            context) + "_" + ".txt"
    corpus = readCorpus()
    lines = corpus.split("\n")
    linessplit = map(lambda x: x.split(" "), lines)
    corpus_model = Corpus()
    start_time = datetime.now()
    corpus_model.fit(linessplit, window=context)
    corpusModelFile = "trained/glove_fashion_epochs" + str(
        epochs) + "_d" + str(dimensionality) + "_c" + str(
            context) + "_corpus" + ".model"
    corpus_model.save(corpusModelFile)
    glove = Glove(no_components=dimensionality, learning_rate=0.05)
    glove.fit(corpus_model.matrix,
              epochs=int(epochs),
              no_threads=8,
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    time_elapsed = datetime.now() - start_time
    gloveModelFile = "trained/glove_fashion_epochs" + str(epochs) + "_d" + str(
        dimensionality) + "_c" + str(context) + "_vecs" + ".model"
    glove.save(gloveModelFile)
    notes = "Glove Fashion Data," + str(dimensionality) + " dim, " + str(
        context) + " context, " + str(
            epochs) + " epochs \n" + "Training time: " + str(time_elapsed)
    save_to_file(fileName, notes)
    gloveVecFile = "trained/glove_fashion_epochs" + str(epochs) + "_d" + str(
        dimensionality) + "_c" + str(context) + "_vecs" + ".vec"
    save_glove_bin_to_vec(glove, gloveVecFile)
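
save_glove_bin_to_vec is not defined in this snippet; a minimal sketch of such a converter, writing one 'word v1 v2 ...' line per token in GloVe text format (the author's actual format may differ):

def save_glove_bin_to_vec(glove, vec_file):
    # write each token followed by its vector components, one line per token
    with open(vec_file, 'w') as f:
        for word, idx in glove.dictionary.items():
            vector = ' '.join(str(x) for x in glove.word_vectors[idx])
            f.write('{} {}\n'.format(word, vector))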
Example #6
def build_model_glove(args):

    if not os.path.exists(args.corpus_model) or \
            max(map(os.path.getmtime, args.input)) >= os.path.getmtime(args.corpus_model):

        # Build the corpus dictionary and the cooccurrence matrix.
        logging.info('Pre-processing corpus')

        corpus_model = Corpus()
        corpus_model.fit(get_sentences(args), window=CONFIG['glove']['window'])
        corpus_model.save(args.corpus_model)

        logging.info('Dict size: %s' % len(corpus_model.dictionary))
        logging.info('Collocations: %s' % corpus_model.matrix.nnz)
    else:
        # Try to load a corpus from disk.
        logging.info('Reading corpus statistics')
        corpus_model = Corpus.load(args.corpus_model)

        logging.info('Dict size: %s' % len(corpus_model.dictionary))
        logging.info('Collocations: %s' % corpus_model.matrix.nnz)

    # Train the GloVe model and save it to disk.
    logging.info('Training the GloVe model')

    glove = Glove(no_components=CONFIG['glove']['size'], learning_rate=CONFIG['glove']['learning_rate'])
    glove.fit(corpus_model.matrix, epochs=CONFIG['glove']['epochs'],
              no_threads=args.workers, verbose=args.verbose)
    glove.add_dictionary(corpus_model.dictionary)
    return glove
Example #7
def generate_glove_corpus():
    global article_info_path, output_path

    write_log('GloVe Load article info : Start')
    with open(article_info_path, 'r') as f_art:
        article_info = json.load(f_art)
    write_log('GloVe Load article info : End')

    write_log('GloVe Generate sentences : Start')
    sentences = []
    for url, dict_info in article_info.items():
        sentence_header = dict_info.get('sentence_header', None)
        sentence_body = dict_info.get('sentence_body', None)

        if (sentence_header is None) or (sentence_body is None):
            continue

        words = []
        #for sentence in sentence_header + sentence_body:
        for sentence in sentence_header:
            for word in sentence.split(' '):
                words.append(word)

        sentences.append(words)
    write_log('GloVe Generate sentences : End')

    write_log('GloVe Generate corpus : Start')
    corpus = Corpus()
    corpus.fit(sentences, window=10)
    write_log('GloVe Generate corpus : End')

    corpus.save(output_path)
Example #8
def glove_single(domain_name):
    corpus_model = Corpus()
    corpus_model.fit(labeled_reviews(domain_name), window=10)
    corpus_model.save('../work/%s/corpus.model' % domain_name)
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    print('Training the GloVe model')
    model = Glove(no_components=300, learning_rate=0.05)
    model.fit(corpus_model.matrix, epochs=10,
              no_threads=6, verbose=True)
    model.add_dictionary(corpus_model.dictionary)
    model.save('../work/%s/glove.model' % domain_name)
Example #9
def train_glove(sequence_file_path, output_folder, no_components, epochs):
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(sequence_file_path), window=10)
    corpus_model.save(get_corpus_model_path(output_folder))

    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    glove = Glove(no_components=int(no_components), learning_rate=0.05)
    glove.fit(corpus_model.matrix,
              epochs=int(epochs),
              no_threads=50,
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save(get_glove_model_path(output_folder))
Example #10
def create_glove_corpus():
    corpus_path = '%s/Embedding/glove/glove.corpus' % folder
    if not os.path.exists(corpus_path):
        corpus = Corpus()  # create the glove Corpus object and set the matrix scan window size
        corpus.fit(embedding_corpus, window=10)

        print('Dict size: %s' % len(corpus.dictionary))
        print('Collocations: %s' % corpus.matrix.nnz)
        corpus.save(corpus_path)  # save the dictionary

    else:
        corpus = Corpus.load(corpus_path)
        print('Loaded existing glove corpus')
    return corpus
Example #11
def get_glove_corpus_model(setting):
    if not force_gen and os.path.isfile("models/" + setting_string(**setting) + "__glove_corpus_model"):
        return Corpus.load("models/" + setting_string(**setting) + "__glove_corpus_model")
    else:
        token2index_map = json.load(open("derived_data/" + setting_string(**setting) + "__processed_token2index_map.json"))

        if setting['granularity'] == 'documents':
            item_generator = get_all_documents_as_token_list(setting['token_method'], setting['data_basis'])
        elif setting['granularity'] == 'paragraphs':
            item_generator = get_all_docs_paragrahps_as_token_list(setting['token_method'], setting['data_basis'])
        else:
            raise ValueError('Unknown granularity: {}'.format(setting['granularity']))

        corpus = (filter(lambda token: token in token2index_map, doc[1]) for doc in item_generator)
        corpus_model = Corpus(dictionary=token2index_map)
        corpus_model.fit(corpus)
        corpus_model.save("models/" + setting_string(**setting) + "__glove_corpus_model")

        return corpus_model
Example #12
def prepare_corpus(args):
    logging.info('Preparing corpus')
    word_counts = Counter()
    for tokens in map(str.split, open(args.data_path)):
        word_counts.update(tokens)
    logging.info('Counted {} unique words.'.format(len(word_counts)))
    logging.info('Truncating vocabulary at min_count {}, max_tokens {}'.format(
        args.min_count, args.max_tokens))
    tokens = {
        token
        for token, count in word_counts.most_common(args.max_tokens)
        if count >= args.min_count
    }
    dictionary = {token: i for i, token in enumerate(tokens)}
    logging.info('Using vocabulary of size {}'.format(len(dictionary)))
    corpus = Corpus(dictionary)
    logging.info('Counting co-occurrences. Window size {}'.format(args.window))
    corpus.fit(map(str.split, open(args.data_path)),
               window=args.window,
               ignore_missing=True)
    corpus.save(args.co_path)
    return corpus
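
The co-occurrence statistics saved by prepare_corpus can be fed straight to GloVe, as in the other examples; a minimal follow-up sketch (hyperparameter values are illustrative assumptions):

corpus = prepare_corpus(args)
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=10, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

Example #13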
len_train = train.shape[0]

qs = []
ts = []
ds = []
sentences = []
for q, t in zip(data_all['question1'].values.tolist(),
                data_all['question2'].values.tolist()):
    sentences.append(q.split(' '))
    sentences.append(t.split(' '))
    qs.append(q.split(' '))
    ts.append(t.split(' '))

corpus_model = Corpus()
corpus_model.fit(sentences, window=10)
corpus_model.save(path + 'corpus.mdl')

corpus_model = Corpus.load(path + 'corpus.mdl')

glove = Glove(no_components=200, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=10, no_threads=7, verbose=True)
glove.add_dictionary(corpus_model.dictionary)
glove.save(path + 'glove.glv')
glove = Glove.load(path + 'glove.glv')
print(glove)

qt_sims_dists = []
qt_diff = []


def calc_cosine_dist(text_a, text_b, metric='euclidean'):
    # The original snippet is truncated here; a plausible body compares the
    # two (already vectorised) texts with scipy's distance routines:
    from scipy.spatial.distance import cdist
    return cdist([text_a], [text_b], metric=metric)[0][0]
Example #14
def train_glove(inst, meta_data={}):

    start_total = datetime.now()

    meta_data["glove_params"] = settings.GLOVE_PARAMS

    glove_paramgrid = ParameterGrid(settings.GLOVE_PARAMS)

    for params in glove_paramgrid:

        start = datetime.now()
        # MAKE CORPUS
        # set corpus filepath
        corpus_fp = os.path.join(settings.WVEC_OPT_DIRP, '{}_window{}.glovecorpus'.format(
            settings.DATASET,
            params["window"]))
        # load if corpus exists
        if os.path.isfile(corpus_fp):
            logging.info("Loading existing corpus {}.".format(corpus_fp))
            corpus_model = Corpus.load(corpus_fp)
            logging.info("Successfully loaded existing corpus {}.".format(corpus_fp))
        # make a new coocurrence corpus if it does not exist
        else:
            logging.info("Creating new corpus at {}.".format(corpus_fp))
            corpus_model = Corpus()
            corpus_model.fit(inst, window=params["window"])
            os.makedirs(settings.WVEC_OPT_DIRP, exist_ok=True)
            corpus_model.save(corpus_fp)

        logging.info("Dict size: {}.".format(len(corpus_model.dictionary)))
        logging.info("Collocations: {}.".format(corpus_model.matrix.nnz))

        # GLOVE VECTOR TRAINING
        glove = Glove(no_components=params["dims"], learning_rate=params["lr"])

        logging.info("Start fitting GloVe with parameters: {}.".format(params))
        glove.fit(corpus_model.matrix, epochs=params["epochs"],
                  no_threads=params["njobs"], verbose=False)
        glove.add_dictionary(corpus_model.dictionary)

        os.makedirs(settings.WVEC_OPT_DIRP, exist_ok=True)
        model_name = 'glove.{}_w{}_lr{}_ep{}.{}d.glovemodel'.format(settings.DATASET,
                                                                    params["window"],
                                                                    params["lr"],
                                                                    params["epochs"],
                                                                    params["dims"])
        glove.save(os.path.join(settings.WVEC_OPT_DIRP, model_name))

        duration = (datetime.now() - start).total_seconds()
        meta_data["models"][model_name] = params
        meta_data["models"][model_name]["duration_training"] = duration

        logging.info("Finished fitting GloVe {} in {}s with parameters: {}.".format(
            model_name,
            duration,
            params))
        # SIMILARITY TEST
        for test_word in settings.TESTSIM_WORDS:
            if test_word not in meta_data["most_similar"]:
                meta_data["most_similar"][test_word] = {}

            logging.info("Querying model {} for {} most similar to \'{}\':".format(
                model_name,
                settings.N_TESTSIM,
                test_word))
            sim = glove.most_similar(test_word, number=settings.N_TESTSIM)
            meta_data["most_similar"][test_word][model_name] = sim

            logging.info(pprint.pformat(sim))

    total_duration = (datetime.now() - start_total).total_seconds()
    meta_data["glove_duration_training"] = total_duration

    return meta_data
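
For context, settings.GLOVE_PARAMS is consumed by sklearn's ParameterGrid, so it maps each hyperparameter to a list of candidate values; a plausible shape (the values are illustrative assumptions):

GLOVE_PARAMS = {
    'window': [5, 10],
    'dims': [100, 300],
    'lr': [0.05],
    'epochs': [30],
    'njobs': [4],
}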
Example #15
def main():
    corpus_model = Corpus()
    corpus_model.fit(itertexts(), window=10, max_map_size=1000000)
    corpus_model.save('bioc-corpus-AZ2.model')
Example #16
    args = parser.parse_args()


    if args.create:
        # Build the corpus dictionary and the cooccurrence matrix.
        print('Pre-processing corpus')

        if args.wiki:
            print('Using wikipedia corpus')
            get_data = read_wikipedia_corpus
        else:
            get_data = read_corpus

        corpus_cooc = Corpus()
        corpus_cooc.fit(get_data(args.create), window=10)
        corpus_cooc.save('corpus.model')
        
        print('Dict size: %s' % len(corpus_cooc.dictionary))
        print('Collocations: %s' % corpus_cooc.matrix.nnz)

    if args.train:
        # Train the GloVe model and save it to disk.

        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_cooc = Corpus.load('corpus.model')

            print('Dict size: %s' % len(corpus_cooc.dictionary))
            print('Collocations: %s' % corpus_cooc.matrix.nnz)
Example #17
from glove import Glove
from glove import Corpus
from gensim import corpora
import time

dic_file = r'/home/dannl/tmp/newstech/glove/news.dic'
corpus_file = '/home/dannl/tmp/newstech/news.txt'
cooc_file = '/home/dannl/tmp/newstech/glove/word.cooc'

def read_corpus(filename):
    with open(filename, 'r') as datafile:
        for line in datafile:
            yield line.split()[1:]

# get a cooccurrence matrix
oldtime = time.time()
dictionary = corpora.Dictionary.load(dic_file)

# corpus_cooc = Corpus()
# corpus_cooc.fit(read_corpus(corpus_file), window=10)

corpus_cooc = Corpus(dictionary=dictionary.token2id)
corpus_cooc.fit(read_corpus(corpus_file), window=10, ignore_missing=True)
corpus_cooc.save(cooc_file)

print('Dict size: %s' % len(corpus_cooc.dictionary))
print('Collocations: %s' % corpus_cooc.matrix.nnz)

print('time cost: %.2f' % (time.time() - oldtime))
Example #18
        action='store',
        type=int,
        default=10,
        help=
        'The length of the (symmetric) context window used for co-occurrence.')
    parser.add_argument('--max_count',
                        '-m',
                        action='store',
                        type=int,
                        default=100,
                        help='The max co-occurrence count.')
    args = parser.parse_args()

    print('Pre-processing corpus')
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(args.corpus), window=args.window)
    corpus_model.save('%s.corpus.model' % args.out)
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    print('Training the GloVe model')
    glove = Glove(no_components=args.components,
                  learning_rate=args.learning_rate,
                  max_count=args.max_count)
    glove.fit(corpus_model.matrix,
              epochs=int(args.train),
              no_threads=args.parallelism,
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('%s.glove.model' % args.out)
Example #19
    args = parser.parse_args()


    if args.create:
        # Build the corpus dictionary and the cooccurrence matrix.
        print('Pre-processing corpus')

        if args.wiki:
            print('Using wikipedia corpus')
            get_data = read_wikipedia_corpus
        else:
            get_data = read_corpus

        corpus_model = Corpus()
        corpus_model.fit(get_data(args.create), window=10)
        corpus_model.save('corpus.model')
        
        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)

    if args.train:
        # Train the GloVe model and save it to disk.

        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_model = Corpus.load('corpus.model')

            print('Dict size: %s' % len(corpus_model.dictionary))
            print('Collocations: %s' % corpus_model.matrix.nnz)
Example #20
    texts.append(clean(row[3]).split())
    classes.append(row[0])

# Calculate distribution, to account for 95th percentile of messages.
max_sentence_length = int(np.mean([len(x) for x in texts]) + (norm.ppf(0.95) * np.std([len(x) for x in texts])))

print("Max sentence length: {}, put that in settings.json.".format(max_sentence_length))

corpus = Corpus()
try:
    print("Loading pretrained corpus...")
    corpus = Corpus.load("cache/corpus.p")
except Exception:
    print("Training corpus...")
    corpus.fit(texts, window=max_sentence_length)
    corpus.save("cache/corpus.p")

glove = Glove(no_components=number_components, learning_rate=0.05)
try:
    print("Loading pretrained GloVe vectors...")
    glove = Glove.load("cache/glove.p")
except Exception:
    print("Training GloVe vectors...")
    # More epochs seems to make it worse
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    glove.save("cache/glove.p")

# Convert input text
print("Vectorizing input sentences...")
X = vectify(texts, previous_message, glove.dictionary, max_sentence_length, contextual)
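
vectify is project-specific and not shown; in spirit it maps each token to its GloVe dictionary index, padding or truncating to max_sentence_length. A rough stand-in that ignores the context-related arguments (all names here are assumptions):

import numpy as np

def vectify_simple(texts, dictionary, max_len):
    # map tokens to dictionary indices; pad with 0 / truncate to max_len
    X = np.zeros((len(texts), max_len), dtype=np.int64)
    for i, tokens in enumerate(texts):
        ids = [dictionary[t] for t in tokens if t in dictionary][:max_len]
        X[i, :len(ids)] = ids
    return X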
Example #21
                        help='Get closest words to this word.')
    args = parser.parse_args()

    if args.create:
        # Build the corpus dictionary and the cooccurrence matrix.
        print('Pre-processing corpus')

        if args.wiki:
            print('Using wikipedia corpus')
            get_data = read_wikipedia_corpus
        else:
            get_data = read_corpus

        corpus_model = Corpus()
        corpus_model.fit(get_data(args.create), window=10)
        corpus_model.save('corpus.model', 'corpus.pmi')

        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)

    if args.train:
        # Train the GloVe model and save it to disk.

        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_model = Corpus.load('corpus.model')

            print('Dict size: %s' % len(corpus_model.dictionary))
            print('Collocations: %s' % corpus_model.matrix.nnz)
            # import pdb; pdb.set_trace()
Example #22
#importing the glove library
from glove import Corpus, Glove
import pandas as pd
from tqdm import tqdm
pruned_tagset = pd.read_csv("termstr_all.csv", index_col=0)
pruned_tagset = pruned_tagset[pruned_tagset['termstr'].notnull()]
tqdm.pandas(desc="split tagset string")
pruned_tagset = list(
    pruned_tagset['termstr'].progress_apply(lambda x: x.split(';')))
#creating a corpus object
corpus = Corpus()

# fit the corpus to generate the co-occurrence matrix used by GloVe
corpus.fit(pruned_tagset, window=3)
corpus.save('corpus.model')

# create a Glove object that will use the matrix above to learn embeddings;
# learning rate and number of components are configurable (it uses gradient descent)
glove = Glove(no_components=150, learning_rate=0.05)

glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')

#print(glove.dictionary)
termvec = glove.word_vectors
termdic = glove.dictionary
temp1 = glove.most_similar('rock', number=10)
print(temp1)
Example #23
import pickle
def main():
    corpus_model = Corpus()
    corpus_model.fit(itertexts(), window=10, max_map_size=1000000)
    corpus_model.save('bioc-corpus-AZ2.model')
Example #24
if not os.path.exists("Embedding/main_cat/glove/glove.model"):

    corpus_model = Corpus()
    corpus_model.fit(sentences, window=10)
    #corpus_model.save('corpus.model')
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)
    
    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=100,
              no_threads=10, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    
    glove.save('Embedding/main_cat/glove/glove.model') # save the model
    corpus_model.save('Embedding/main_cat/glove/corpus.model') # save the dictionary


glove = Glove.load('Embedding/main_cat/glove/glove.model')
corpus_model = Corpus.load('Embedding/main_cat/glove/corpus.model')




vocab_file = "Embedding/main_cat/glove/word.vocab"

if not os.path.exists(vocab_file):
#     vocab_count = len(glove.dictionary)    
    vocab_count = 0
    print("Writing vocab file...")
Example #25
"""
Created on Fri Sep 14 12:45:30 2018

@author: charlie
"""

import itertools
from gensim.models.word2vec import Text8Corpus
from glove import Corpus, Glove
import os

cur_dir = os.getcwd()
glove_fname = '/glove.model'
corpus_fname = "/corpus.model"
if os.path.exists(cur_dir + glove_fname):
    glove = Glove.load(cur_dir+glove_fname)
#    corpus = Corpus.load(cur_dir+corpus_fname)
else:
    sentences = list(itertools.islice(Text8Corpus('text/text8'), None))
    corpus = Corpus()
    corpus.fit(sentences, window=10)

    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
    glove.add_dictionary(corpus.dictionary)
    
    glove.save(cur_dir + glove_fname)
    corpus.save(cur_dir+corpus_fname)

glove.most_similar('men')  # the query must be a single word (string), not a list
glove.word_vectors[glove.dictionary['perfect']]
Example #26
    print('Could not find existing corpus. Creating new one.')

    class Iterable:
        def __init__(self, df, col='text'):
            self.df = df
            self.col = col

        def __iter__(self):
            for article in self.df[self.col].values:
                yield preprocess(article)

    corpus = Corpus()
    start = time.time()
    corpus.fit(Iterable(pd.read_csv(DATA)))
    print(f'finished co_occur in {int(time.time() - start)} seconds.')
    corpus.save(CORPUS_PATH)


def train_dim(size):
    """
    Trains and saves a SIZE-dimensional glove embedding.
    """
    random.seed(SEED)
    glove = Glove(no_components=size, random_state=SEED)
    start = time.time()
    glove.fit(corpus.matrix, epochs=epochs, no_threads=12, verbose=True)
    print(
        f'finished {size}d vectors in {(time.time() - start)/60:.2f} minutes.')

    with open(f'{PATH}/custom.{size}d.txt', 'w') as f:
        for word, i in corpus.dictionary.items():
            word += ' '
Example #27
def build_model():

    # Set up command line parameters.
    parser = argparse.ArgumentParser(description='Fit a GloVe model.')

    parser.add_argument('--create',
                        '-c',
                        action='store',
                        default=None,
                        help=('The filename of the corpus to pre-process. '
                              'The pre-processed corpus will be saved '
                              'and will be ready for training.'))
    parser.add_argument(
        '--train',
        '-t',
        action='store',
        default=0,
        help=('Train the GloVe model with this number of epochs. '
              'If not supplied, '
              'we\'ll attempt to load a trained model'))

    parser.add_argument(
        '--parallelism',
        '-p',
        action='store',
        default=1,
        help=('Number of parallel threads to use for training'))

    parser.add_argument('--query',
                        '-q',
                        action='store',
                        default='',
                        help='Get closest words to this word.')
    args = parser.parse_args()

    if args.create:
        # Build the corpus dictionary and the cooccurrence matrix.

        print('Pre-processing corpus')
        data = read_data(args.create)
        corpus_model = Corpus()
        corpus_model.fit(data, window=10)
        corpus_model.save('corpus.model')

        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)

    if args.train:

        # Train the GloVe model and save it to disk.
        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_model = Corpus.load('corpus.model')

            print('Dict size: %s' % len(corpus_model.dictionary))
            print('Collocations: %s' % corpus_model.matrix.nnz)

        print('Training the GloVe model')

        glove = Glove(no_components=50, learning_rate=0.05)
        glove.fit(corpus_model.matrix,
                  epochs=int(args.train),
                  no_threads=args.parallelism,
                  verbose=True)
        glove.add_dictionary(corpus_model.dictionary)
        glove.save('glove.model')

    if args.query:
        # Finally, query the model for most similar words.
        if not args.train:
            print('Loading pre-trained GloVe model')
            glove = Glove.load('glove.model')

        print('Querying for %s' % args.query)
        pprint.pprint(glove.most_similar(args.query, number=10))
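
A hypothetical invocation of the script built around this function (the file name is an assumption):

python glove_cli.py --create corpus.txt --train 10 --parallelism 4 --query physics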
Example #28
if not os.path.exists('%s/Embedding/glove' % (folder)):
    os.makedirs('%s/Embedding/glove' % (folder))

if not os.path.exists("%s/Embedding/glove/glove.model" % (folder)):
    corpus_model = Corpus()
    corpus_model.fit(embedding_corpus, window=10)
    #corpus_model.save('corpus.model')
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=100, no_threads=10, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save('%s/Embedding/glove/glove.model' % (folder))  # save the model
    corpus_model.save('%s/Embedding/glove/corpus.model' % (folder))  # save the dictionary

    # build a gensim dictionary from the embedding corpus
    dictionary = corpora.Dictionary(embedding_corpus)
    dictionary.save('%s/Embedding/glove/dictionary.gensim' % (folder))

glove = Glove.load('%s/Embedding/glove/glove.model' % (folder))
corpus_model = Corpus.load('%s/Embedding/glove/corpus.model' % (folder))
dictionary = gensim.corpora.Dictionary.load(
    '%s/Embedding/glove/dictionary.gensim' % (folder))

# write vocab to file
vocab_file = "%s/Embedding/glove/word.vocab" % (folder)
if not os.path.exists(vocab_file):
    #     vocab_count = len(glove.dictionary)
    vocab_count = 0
Example #29

if not os.path.exists("Embedding/category/glove/glove.model"):

    corpus_model = Corpus()
    corpus_model.fit(sentences, window=10)
    #corpus_model.save('corpus.model')
    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    glove = Glove(no_components=300, learning_rate=0.05)
    glove.fit(corpus_model.matrix, epochs=100, no_threads=10, verbose=True)
    glove.add_dictionary(corpus_model.dictionary)

    glove.save('Embedding/category/glove/glove.model')  # save the model
    corpus_model.save('Embedding/category/glove/corpus.model')  # save the dictionary

glove = Glove.load('Embedding/category/glove/glove.model')
corpus_model = Corpus.load('Embedding/category/glove/corpus.model')


vocab_file = "Embedding/category/glove/word.vocab"

if not os.path.exists(vocab_file):
    #     vocab_count = len(glove.dictionary)
    vocab_count = 0
    print("Writing vocab file...")
    with open(vocab_file, 'w', encoding='utf-8') as writer:
        for word, idx in glove.dictionary.items():
            if word in vocab._word_to_id:
Example #30
GLOVE_MODEL_FILE = args.glove

if not os.path.exists(RESULT_DIR):
    os.mkdir(RESULT_DIR)

# MAIN
if os.path.exists(CORPUS_FILE):
    print('[{}] Reading corpus from file...'.format(chalk.yellow(CORPUS_FILE)))
    corpus = Corpus.load(CORPUS_FILE)
else:
    nx_G = util.get_nx_graph()
    walks = util.get_node2vec_walks(nx_G)
    corpus = Corpus()
    corpus.fit(walks, window=WINDOW_SIZE)
    print('[{}] Writing corpus file...'.format(chalk.green(CORPUS_FILE)))
    corpus.save(CORPUS_FILE)

if os.path.exists(GLOVE_MODEL_FILE) and not args.train:
    print('[{}] Reading glove model from file...'.format(
        chalk.yellow(GLOVE_MODEL_FILE)))
    glove = Glove.load(GLOVE_MODEL_FILE)
else:
    glove = Glove(no_components=VECTOR_DIMENSION, learning_rate=0.05)
    glove.fit(corpus.matrix,
              epochs=GLOVE_EPOCHS,
              no_threads=PARALLEL_WORKER_COUNT,
              verbose=True)
    glove.add_dictionary(corpus.dictionary)
    print('[{}] Writing glove file...'.format(chalk.green(GLOVE_MODEL_FILE)))
    glove.save(GLOVE_MODEL_FILE)
if args.query:
Example #31
    return sents_token


if __name__ == '__main__':

    # Set up parameters.
    train = 1
    parallelism = 1
    query = 'brave'

    # Build the corpus dictionary and the cooccurrence matrix.
    print('Pre-processing corpus')
    corpus_model = Corpus()
    corpus_model.fit(read_corpus(), window=10)
    corpus_model.save('corpus.model')

    print('Dict size: %s' % len(corpus_model.dictionary))
    print('Collocations: %s' % corpus_model.matrix.nnz)

    # Train the GloVe model and save it to disk.
    print('Training the GloVe model')

    glove = Glove(no_components=100, learning_rate=0.05)
    glove.fit(corpus_model.matrix,
              epochs=train,
              no_threads=parallelism,
              verbose=True)
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('glove.model')
Example #32
                text = text.replace('Trump', 'Trump_Pre_Election')
            elif date and date >= ELECTION_DATE:
                text = text.replace('Trump', 'Trump_Post_Election')

            text = text.replace("\xa0", " ").replace('“',
                                                     '"').replace('”', '"')
            sents = sent_tokenize(text)
            for sent in sents:
                yield self.tokenizer.tokenize(sent)


dirname = os.path.expanduser('./output/articles')

sentences = SentencesIterator(dirname)
print('Building Corpus...')
corpus_model = Corpus()
corpus_model.fit(sentences, window=10)
corpus_model.save(OUTPUT_DIR + 'corpus.model')
print('Build and saved!')
print('Dict size: %s' % len(corpus_model.dictionary))
print('Collocations: %s' % corpus_model.matrix.nnz)
print('Training the GloVe model')

glove = Glove(no_components=300, learning_rate=0.05)
glove.fit(corpus_model.matrix, epochs=25, no_threads=10, verbose=True)
glove.add_dictionary(corpus_model.dictionary)

glove.save(OUTPUT_DIR + 'glove.model')
# model = gensim.models.Word2Vec(sentences, size=300, min_count=5, iter=10, workers=10, sg=1)
# model.save('./vectors/trump_preprocess_skipgram/w2v_foxnews')