Example #1
File: main.py Project: strategist922/MORE
def main(_):
    # Train a word2vec model.
    if FLAGS.train_model:
        if not FLAGS.train_data or not FLAGS.save_path:
            print("--train_data and --save_path must be specified.")
            sys.exit(1)
        opts = Options(FLAGS)
        with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as session:
            with tf.device("/gpu:2"):
                model = Word2Vec(opts, session)
                for i in range(opts.epochs_to_train):
                    print("Beginning epoch {}".format(i))
                    model.train()  # Process one epoch
                #  Perform a final save.
                model.saver.save(session,
                                 os.path.join(opts.save_path, "model.ckpt"),
                                 global_step=model.global_step)
    else:
        opts = Options(FLAGS)
        with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as session:
            with tf.device("/cpu:0"):
                model = Word2Vec(opts, session)
                model.get_embeddings_from_ckpt(
                    './Results/wup_lch_nam/com_90_p/')
                model.get_eval_sims(
                    "./Results/wup_lch_nam/com_90_p_report.csv",
                    eval_ds=FLAGS.eval_ds)
Example #2
def main():

    # ----DATA PREPARATION---- #
    corpus = np.array(['the quick brown fox jumped over the lazy dog'])

    # ----MODEL TRAINING---- #
    hyperparameters = {
        'method': "cbow",
        'window_size': 2,
        'n': 100,  # typically ranges from 100 to 300
        'epochs': 10000,
        'learning_rate': 0.01
    }

    cbow = Word2Vec(**hyperparameters)

    training_data = cbow.generate_training_data(corpus)

    cbow.train(training_data)

    # get the embedding of a single word
    print(cbow.word_vec("fox"))

    # get similar words
    print(cbow.similar_words("fox"))
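A note on Example #2: with window_size=2, generate_training_data turns the one-sentence corpus into (context, target) pairs. The sketch below is a plain-Python illustration of that CBOW windowing idea, not the actual implementation of this Word2Vec class.

# Illustration only: CBOW-style (context, target) pairs for the corpus above
# with window_size=2; the example's generate_training_data may differ in detail.
tokens = 'the quick brown fox jumped over the lazy dog'.split()
window_size = 2

pairs = []
for i, target in enumerate(tokens):
    context = [tokens[j]
               for j in range(max(0, i - window_size), min(len(tokens), i + window_size + 1))
               if j != i]
    pairs.append((context, target))

for context, target in pairs[:3]:
    print(context, '->', target)
# ['quick', 'brown'] -> the
# ['the', 'brown', 'fox'] -> quick
# ['the', 'quick', 'fox', 'jumped'] -> brown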
Example #3
def do_infer_sent(args):
    if not os.path.exists(args.name + '.token'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.token'))
        sys.exit()
    if not os.path.exists(args.name + '.vocab'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.vocab'))
        sys.exit()
    if len(glob.glob(args.name + '.model.?????????.pth')) == 0:
        logging.error('no model available: {}'.format(args.name + '.model.?????????.pth'))
        sys.exit()

    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    args.embedding_size, args.pooling = read_params(args)
    model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_unk)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(args.beta1,args.beta2), eps=args.eps)
    n_steps, model, optimizer = load_model_optim(args.name, args.embedding_size, vocab, model, optimizer)
    if args.cuda:
        model.cuda()

    dataset = Dataset(args, token, vocab, 'infer_sent', skip_subsampling=True)
    with torch.no_grad():
        model.eval()
        for batch in dataset:
            snts = model.SentEmbed(batch[0], batch[1], 'iEmb').cpu().detach().numpy().tolist()
            for i in range(len(snts)):
                sentence = ["{:.6f}".format(w) for w in snts[i]]
                print('{}\t{}'.format(batch[2][i]+1, ' '.join(sentence) ))
Example #4
def train(args):
    idx2word = pickle.load(open(os.path.join(args.data_dir, 'idx2word.dat'), 'rb'))
    wc = pickle.load(open(os.path.join(args.data_dir, 'wc.dat'), 'rb'))
    wf = np.array([wc[word] for word in idx2word])
    wf = wf / wf.sum()
    ws = 1 - np.sqrt(args.ss_t / wf)
    ws = np.clip(ws, 0, 1)
    vocab_size = len(idx2word)
    weights = wf if args.weights else None
    word2vec = Word2Vec(vocab_size=vocab_size, embedding_size=args.e_dim)
    sgns = SkipGramNegSampling(embedding=word2vec, vocab_size=vocab_size, n_negs=args.n_negs, weights=weights)
    optim = Adam(sgns.parameters())
    if args.cuda:
        sgns = sgns.cuda()
    if not os.path.isdir(args.save_dir):
        os.mkdir(args.save_dir)
    if args.conti:
        sgns.load_state_dict(t.load(os.path.join(args.save_dir, '{}.pt'.format(args.name))))
        optim.load_state_dict(t.load(os.path.join(args.save_dir, '{}.optim.pt'.format(args.name))))
    for epoch in range(1, args.epoch + 1):
        dataset = PermutedSubsampledCorpus(os.path.join(args.data_dir, 'train.dat'))
        dataloader = DataLoader(dataset, batch_size=args.mb, shuffle=True)
        total_batches = int(np.ceil(len(dataset) / args.mb))
        for batch, (iword, owords) in enumerate(dataloader):
            loss = sgns(iword, owords)
            optim.zero_grad()
            loss.backward()
            optim.step()
            print("[e{:2d}][b{:5d}/{:5d}] loss: {:7.4f}\r".format(epoch, batch + 1, total_batches, loss.data[0]), end='\r')
        print("")
    idx2vec = word2vec.ivectors.weight.data.cpu().numpy()
    pickle.dump(idx2vec, open(os.path.join(args.data_dir, 'idx2vec.dat'), 'wb'))
    t.save(sgns.state_dict(), os.path.join(args.save_dir, '{}.pt'.format(args.name)))
    t.save(optim.state_dict(), os.path.join(args.save_dir, '{}.optim.pt'.format(args.name)))
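Examples #4, #6, #9, #11, #16 and #19 all compute the same subsampling weights: wf holds normalized word frequencies and ws = clip(1 - sqrt(ss_t / wf), 0, 1) is the probability of discarding each word, the standard word2vec subsampling heuristic (frequent words are dropped more often). A minimal standalone sketch with made-up counts:

# Standalone sketch of the subsampling weights computed above (hypothetical counts).
import numpy as np

wc = {'the': 1000, 'fox': 10, 'zygote': 1}   # made-up word counts
idx2word = ['the', 'fox', 'zygote']
ss_t = 1e-3                                  # subsampling threshold (args.ss_t)

wf = np.array([wc[w] for w in idx2word], dtype=float)
wf = wf / wf.sum()                           # relative frequencies
ws = np.clip(1 - np.sqrt(ss_t / wf), 0, 1)   # per-word discard probabilities

print(idx2word, ws.round(3))                 # roughly [0.968, 0.682, 0.0]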
Example #5
def do_infer_word(args):
    if not os.path.exists(args.name + '.token'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.token'))
        sys.exit()
    if not os.path.exists(args.name + '.vocab'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.vocab'))
        sys.exit()
    if len(glob.glob(args.name + '.model.?????????.pth')) == 0:
        logging.error('no model available: {}'.format(args.name + '.model.?????????.pth'))
        sys.exit()

    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    args.embedding_size, args.pooling = read_params(args)
    model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_unk)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(args.beta1,args.beta2), eps=args.eps)
    n_steps, model, optimizer = load_model_optim(args.name, args.embedding_size, vocab, model, optimizer)
    if args.cuda:
        model.cuda()

    if args.sim == 'cos':
        distance = nn.CosineSimilarity(dim=1, eps=1e-6)
    elif args.sim == 'pairwise':
        distance = nn.PairwiseDistance(eps=1e-6)
    else:
        logging.error('bad -sim option {}'.format(args.sim))
        sys.exit()

    dataset = Dataset(args, token, vocab, 'infer_word', skip_subsampling=True)
    with torch.no_grad():
        model.eval()
        voc_i = [i for i in range(0,len(vocab))]
        voc_e = model.Embed(voc_i,'iEmb')
        for batch in dataset:
            #batch[0] batch_wrd
            #batch[1] batch_isnt
            #batch[2] batch_iwrd
            wrd_i = batch[0]
            wrd_e = model.Embed(wrd_i, 'iEmb') #.cpu().detach().numpy().tolist()

            for i in range(len(wrd_i)):  # for each word, find its closest neighbours
                ind_snt = batch[1][i]
                ind_wrd = batch[2][i]
                wrd = vocab[wrd_i[i]]
                out = []
                out.append("{}:{}:{}".format(ind_snt,ind_wrd,wrd))

                dist_wrd_voc = distance(wrd_e[i].unsqueeze(0),voc_e)
                mininds = torch.argsort(dist_wrd_voc,dim=0,descending=True)
                for k in range(1,len(mininds)):
                    ind = mininds[k].item() #cpu().detach().numpy()
                    if i != ind:
                        dis = dist_wrd_voc[ind].item()
                        wrd = vocab[ind]
                        out.append("{:.6f}:{}".format(dis,wrd))
                        if len(out)-1 == args.k:
                            break
                print('\t'.join(out))
Example #6
def train(args):
    idx2word = pickle.load(open(os.path.join(args.data_dir, 'idx2word.dat'), 'rb'))

    wc = pickle.load(open(os.path.join(args.data_dir, 'wc.dat'), 'rb'))

    wf = np.array([wc[word] for word in idx2word])

    # norm
    wf = wf / wf.sum()

    ws = 1 - np.sqrt(args.ss_t / wf)

    # Clip (limit) the values in an array
    ws = np.clip(ws, 0, 1)

    vocab_size = len(idx2word)
    weights = wf if args.weights else None

    if not os.path.isdir(args.save_dir):
        os.mkdir(args.save_dir)

    word2vec = Word2Vec(vocab_size=vocab_size, embedding_size=args.e_dim)
    model_path = os.path.join(args.save_dir, '{}.pt'.format(args.name))
    sgns = SGNS(embedding=word2vec, vocab_size=vocab_size, n_negs=args.n_negs, weights=weights)

    if os.path.isfile(model_path) and args.conti:
        sgns.load_state_dict(t.load(model_path))

    if args.cuda:
        sgns = sgns.cuda()

    optim = Adam(sgns.parameters())
    optimpath = os.path.join(args.save_dir, '{}.optim.pt'.format(args.name))
    if os.path.isfile(optimpath) and args.conti:
        optim.load_state_dict(t.load(optimpath))

    for epoch in range(1, args.epoch + 1):
        #  dataset = PermutedSubsampledCorpus(os.path.join(args.data_dir, 'train.dat'))
        dataset = PermutedSubsampledCorpus(os.path.join(args.data_dir, 'train.dat'), ws=ws)

        dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)
        total_batches = int(np.ceil(len(dataset) / args.batch_size))
        pbar = tqdm(dataloader)
        pbar.set_description("[Epoch {}]".format(epoch))

        for iword, owords in pbar:
            loss = sgns(iword, owords)
            optim.zero_grad()
            loss.backward()
            optim.step()
            pbar.set_postfix(loss=loss.item())
    idx2vec = word2vec.ivectors.weight.data.cpu().numpy()
    pickle.dump(idx2vec, open(os.path.join(args.data_dir, 'idx2vec.dat'), 'wb'))
    t.save(sgns.state_dict(), os.path.join(args.save_dir, '{}.pt'.format(args.name)))
    t.save(optim.state_dict(), os.path.join(args.save_dir, '{}.optim.pt'.format(args.name)))
Example #7
File: train.py Project: LeeWooJung/Pytorch
def train():

    vocab = None
    word2idx = None
    idx2word = None
    wordcount = None
    data = None
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    if args.preprocess:
        print("Preprocess step start...")
        preprocess = Preprocess(args.data_path, args.window_size)
        preprocess.build_data(args.max_vocab)
        preprocess.build_training_data()

    vocab, word2idx, idx2word, wordcount, data = LoadData()

    wordfreq = np.array([wordcount[word] for word in idx2word])
    wordfreq = wordfreq / wordfreq.sum()

    dataset = SubsampleData(data, wordfreq)

    word2vec = Word2Vec(args.max_vocab, args.emb_dim, device).to(device)
    model = SkipGram_with_NS(word2vec, args.max_vocab, args.num_negs,
                             wordfreq).to(device)

    optimizer = optim.Adam(model.parameters())

    print("Start training word2vec model...")
    for epoch in range(1, args.n_epochs + 1):
        dataloader = DataLoader(dataset,
                                batch_size=args.batch_size,
                                shuffle=True)
        model.train()
        epoch_loss = 0
        total_batches = int(np.ceil(len(dataset) / args.batch_size))
        pbar = tqdm(dataloader)
        pbar.set_description("[Epoch {}]".format(epoch))
        for center, context in pbar:
            centerV, contextV, negativeV = model(center, context)
            loss = getLoss(centerV, contextV, negativeV)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            pbar.set_postfix(loss=loss.item())
            epoch_loss += loss.item()

        print("Average loss: {0:.4f}".format(epoch_loss / total_batches))
    print("Save the model...", end=' ')
    idx2vec = word2vec.input.weight.data.cpu().numpy()
    pickle.dump(idx2vec, open('idx2vec.dat', 'wb'))
    torch.save(model.state_dict(), 'skipgram_with_negative_sampling.pt')
    torch.save(optimizer.state_dict(), 'optimization.pt')
    print("DONE")
Example #8
def do_train(args):
    if not os.path.exists(args.name + '.token'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.token'))
        sys.exit()
    if not os.path.exists(args.name + '.vocab'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.vocab'))
        sys.exit()

    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    if os.path.exists(args.name + '.param'):
        args.embedding_size, args.pooling = read_params(args)
    else:
        write_params(args)        

    model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_unk)
    if args.cuda:
        model.cuda()
#    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(args.beta1,args.beta2), eps=args.eps)
#    optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps, weight_decay=0.01, amsgrad=False)
    n_steps, model, optimizer = load_model_optim(args.name, args.embedding_size, vocab, model, optimizer)
    dataset = Dataset(args, token, vocab, args.method)

    n_epochs = 0
    losses = []
    while True:
        n_epochs += 1
        for batch in dataset:
            model.train()
            if args.method == 'skipgram':
                loss = model.forward_skipgram(batch)
            elif args.method == 'cbow':
                loss = model.forward_cbow(batch)
            elif args.method == 'sbow':
                loss = model.forward_sbow(batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            n_steps += 1
            losses.append(loss.data.cpu().detach().numpy())
            if n_steps % args.report_every_n_steps == 0:
                accum_loss = np.mean(losses)
                logging.info('{} n_epoch={} n_steps={} Loss={:.6f}'.format(args.method, n_epochs,n_steps,accum_loss))
                losses = []
            if n_steps % args.save_every_n_steps == 0:
                save_model_optim(args.name, model, optimizer, n_steps, args.keep_last_n)
        if n_epochs >= args.max_epochs:
            logging.info('Stop (max epochs reached)')
            break
    save_model_optim(args.name, model, optimizer, n_steps, args.keep_last_n)
Example #9
def train(args):
    idx2word = pickle.load(open(os.path.join(args.data_dir, 'idx2word.dat'), 'rb'))
    wc = pickle.load(open(os.path.join(args.data_dir, 'wc.dat'), 'rb'))
    wf = np.array([wc[word] for word in idx2word])
    wf = wf / wf.sum()
    ws = 1 - np.sqrt(args.ss_t / wf)
    ws = np.clip(ws, 0, 1)
    vocab_size = len(idx2word)
    if args.sample_within:
        fake_indices = set([i for i, w in enumerate(idx2word) if w.startswith("::")])
    else:
        fake_indices = None
    weights = wf if args.weights else None
    if not os.path.isdir(args.save_dir):
        os.mkdir(args.save_dir)
    if args.multilingual:
        model = Word2VecHidden(vocab_size=vocab_size, embedding_size=args.e_dim, hidden_size=args.hidden)
    else:
        model = Word2Vec(vocab_size=vocab_size, embedding_size=args.e_dim)
    modelpath = os.path.join(args.save_dir, '{}.pt'.format(args.name))
    sgns = SGNS(embedding=model, vocab_size=vocab_size, n_negs=args.n_negs, weights=weights, tie_weights=args.tie_weights, fake_indices=fake_indices)
    if os.path.isfile(modelpath) and args.conti:
        sgns.load_state_dict(t.load(modelpath))
    if args.cuda:
        sgns = sgns.cuda()
    optim = Adam(sgns.parameters(), lr=args.lr)
    optimpath = os.path.join(args.save_dir, '{}.optim.pt'.format(args.name))
    if os.path.isfile(optimpath) and args.conti:
        optim.load_state_dict(t.load(optimpath))
    for epoch in range(1, args.epoch + 1):
        dataset = PermutedSubsampledCorpus(os.path.join(args.data_dir, 'train.dat'))
        dataloader = DataLoader(dataset, batch_size=args.mb, shuffle=True)
        total_batches = int(np.ceil(len(dataset) / args.mb))
        pbar = tqdm(dataloader)
        pbar.set_description("[Epoch {}]".format(epoch))
        for iword, owords in pbar:
            loss = sgns(iword, owords)
            optim.zero_grad()
            loss.backward()
            optim.step()
            pbar.set_postfix(loss=loss.item())
    idx2vec = model.ivectors.weight.data.cpu().numpy()
    pickle.dump(idx2vec, open(os.path.join(args.data_dir, 'idx2vec.dat'), 'wb'))
    t.save(sgns.state_dict(), os.path.join(args.save_dir, '{}.pt'.format(args.name)))
    t.save(optim.state_dict(), os.path.join(args.save_dir, '{}.optim.pt'.format(args.name)))
Example #10
def evaluate(target_words, top_k=10, synaware_w2v=True):
    """
    Provides a qualitative measure for the embeddings the model has learned by printing the most similar words to the
    ones provided as test words.
    :param target_words: Test words to discover the closest words to them, as List
    :param top_k: Number of closest words
    :param synaware_w2v: True: use a SynsetAwareWord2Vec model (default); False: use a basic Word2Vec model
    :return: None
    """

    print("Loading vocabularies...")
    word_to_ix, ix_to_word, subsampled_words = u.get_vocab(vocab_path="../resources/vocab.txt",
                                                           antivocab_path="../resources/antivocab.txt")

    print("Creating model...")
    if not synaware_w2v:
        model = Word2Vec(subsampled_words=subsampled_words,
                         vocabulary_size=len(word_to_ix),
                         embedding_size=EMBEDDING_SIZE,
                         learning_rate=LEARNING_RATE,
                         window_size=WINDOW_SIZE,
                         neg_samples=NEG_SAMPLES)
    else:
        model = SynsetAwareWord2Vec(subsampled_words=subsampled_words,
                                    vocabulary_size=len(word_to_ix),
                                    embedding_size=EMBEDDING_SIZE,
                                    learning_rate=LEARNING_RATE,
                                    window_size=WINDOW_SIZE,
                                    neg_samples=NEG_SAMPLES)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        print("Loading model...")
        saver.restore(sess, MODEL_PATH_SYN_W2V if synaware_w2v else MODEL_PATH_W2V)

        target_words = [word_to_ix[w] for w in target_words if w in word_to_ix]
        sim_val = sess.run(model.similarity,
                           feed_dict={model.data["sim_test"]: target_words})

        for i in range(len(target_words)):
            print("Closest %d words to %s:" % (top_k, ix_to_word[target_words[i]]))
            closest_words = (-sim_val[i, :]).argsort()[1:top_k + 1]
            for j in range(top_k):
                word = ix_to_word[closest_words[j]]
                print("\t%d. %s" % (j+1, word))
Example #11
def train(args):
    if LDAP.mimic0_movie1_wiki2 == 0:
        name = "MIMIC"
    elif LDAP.mimic0_movie1_wiki2 == 1:
        name = "MovieReview"
    else:
        name = "Wiki"
    idx2word = pickle.load(open(os.path.join(LDAP.output_path, name + '_idx2word.dat'), 'rb'))
    wc = pickle.load(open(os.path.join(LDAP.output_path, name + '_wc.dat'), 'rb'))
    wf = np.array([wc[word] for word in idx2word])
    wf = wf / wf.sum()
    ws = 1 - np.sqrt(args.ss_t / wf)
    ws = np.clip(ws, 0, 1)
    vocab_size = len(idx2word)
    weights = ws if args.weights else None

    model = Word2Vec(vocab_size=vocab_size, embedding_size=EMBEDP.veclen)
    time_code = '_#' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + '#'
    sgns = SGNS(embedding=model, vocab_size=vocab_size, n_negs=args.n_negs, weights=weights)
    if args.cuda:
        sgns = sgns.cuda()
    optim = Adam(sgns.parameters())
    test_data = pickle.load(open(os.path.join(LDAP.output_path, name + '_train.dat'), 'rb'))
    # for iword, oword in test_data:
    #     print(iword, type(iword))
    #     print(oword, type(oword))

    for epoch in range(1, args.epoch + 1):
        dataset = PermutedSubsampledCorpus(os.path.join(LDAP.output_path, name + '_train.dat'))
        dataloader = DataLoader(dataset, batch_size=args.mb, shuffle=True)
        total_batches = int(np.ceil(len(dataset) / args.mb))
        pbar = tqdm(dataloader)
        pbar.set_description("[Epoch {}]".format(epoch))
        for iword, owords in pbar:
            # print(iword.size(), owords.size())
            loss = sgns(iword, owords)
            optim.zero_grad()
            loss.backward()
            optim.step()
            pbar.set_postfix(loss=loss.item())
    idx2vec = model.ivectors.weight.data.cpu().numpy()
    pickle.dump(idx2vec, open(os.path.join(LDAP.output_path, name + '_idx2vec.dat'), 'wb'))
    t.save(sgns.state_dict(), os.path.join(LDAP.output_path, '{}.pt'.format(name + '_model')))
    t.save(optim.state_dict(), os.path.join(LDAP.output_path, '{}.optim.pt'.format(name + '_model')))
Example #12
def train(use_gpu=False):
    num_epochs = 2
    batch_size = 256
    every = 10
    vocab = pickle.load(open('./stat/vocab_set.dat', 'rb'))
    V = len(vocab)
    word2vec = Word2Vec(V=V, use_gpu=use_gpu)
    perm_dict = pickle.load(open('./stat/permutation_dict.dat', 'rb'))

    start = time.time()
    for l in perm_dict:
        print("training sets with size {}...".format(l))
        sgns = SGNS(V=V,
                    embedding=word2vec,
                    batch_size=batch_size,
                    window_size=l,
                    n_negatives=5)
        optimizer = SGD(sgns.parameters(), 5e-1)
        dataset = PermutedCorpus(perm_dict[l])
        dataloader = DataLoader(dataset,
                                batch_size=batch_size,
                                shuffle=True,
                                num_workers=4)
        total_batches = len(dataset) // batch_size
        for epoch in range(1, num_epochs + 1):
            for batch, (iword, owords) in enumerate(dataloader):
                if len(iword) != batch_size:
                    continue
                loss = sgns(iword, owords)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                if not batch % every:
                    print("\t[e{}][b{}/{}] loss: {:7.4f}\r".format(
                        epoch, batch, total_batches, loss.data[0]))
    end = time.time()
    print("training done in {:.4f} seconds".format(
        end -
        start))  # It takes about 3.5 minutes with GPU, loss less than 7.5
    idx2vec = word2vec.forward([idx for idx in range(V + 1)])
    if use_gpu:
        idx2vec = idx2vec.cpu()
    pickle.dump(idx2vec.data.numpy(),
                open('./stat/idx2vec_{}epochs.dat'.format(num_epochs), 'wb'))
Example #13
    def train():

        dimensionality_of_embeddings = model_config['dim_embedding']
        optimizer = optim_config['optimizer']
        epochs = optim_config['epochs']
        batch_size = optim_config['batch_size']

        model = Word2Vec(input_dim=data['vocabSize'],
                         units=int(dimensionality_of_embeddings))
        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=['accuracy'])
        model.fit(oneHotNumpy[:, 0, :],
                  oneHotNumpy[:, 1, :],
                  epochs=epochs,
                  batch_size=batch_size)

        emb = model.get_weights()[0]
        save("word2vec_embeddings.npy", emb)
Example #14
def train(use_gpu=False):
    num_epochs = 10
    batch_size = 1024
    every = 10
    vocab = pickle.load(open('./data/vocab.dat', 'rb'))
    V = len(vocab)
    word2vec = Word2Vec(V=V, gpu=use_gpu)  # pass the use_gpu argument; the name `gpu` was undefined here
    sgns = SGNS(
        # TODO(cipta): change
        max_firm=91924,  # Initial sample of the data
        embedding=word2vec,
        batch_size=batch_size,
        window_size=1,
        n_negatives=5)
    optimizer = SGD(sgns.parameters(), 5e-1)
    dataset = PermutedCorpus('./data/train.dat')
    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=4)
    start = time.time()
    total_batches = len(dataset) // batch_size
    for epoch in range(1, num_epochs + 1):
        for batch, (iword, owords) in enumerate(dataloader):
            if len(iword) != batch_size:
                continue
            loss = sgns(iword, owords)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if not batch % every:
                print("[e{}][b{}/{}] loss: {:7.4f}\r".format(
                    epoch, batch, total_batches, loss.data[0]))
    end = time.time()
    print("training done in {:.4f} seconds".format(
        end -
        start))  # It takes about 3.5 minutes with GPU, loss less than 7.5
    idx2vec = word2vec.forward([idx for idx in range(V + 1)])
    if use_gpu:
        idx2vec = idx2vec.cpu()
    pickle.dump(word2vec.state_dict(), open('./data/word2vec.pt', 'wb'))
    pickle.dump(idx2vec.data.numpy(), open('./data/idx2vec.dat', 'wb'))
Example #15
def do_train(args):
    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    args.voc_maxn = vocab.max_ngram
    args.use_bos_eos = vocab.use_bos_eos

    model, n_steps = load_model(args.name, vocab)
    if model is None:
        logging.info('start model from scratch')
        model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_pad)
    if args.cuda:
        model.cuda()

    optimizer = load_build_optim(args.name, model, args.learning_rate, args.beta1, args.beta2, args.eps)
    dataset = Dataset(args, token, vocab)
    n_epochs = 0
    losses = []
    while True:
        n_epochs += 1
        for batch in dataset:
            model.train()
            loss = model.forward(batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            n_steps += 1
            losses.append(loss.data.cpu().detach().numpy())
            if n_steps % args.report_every_n_steps == 0:
                accum_loss = np.mean(losses)
                logging.info('{} n_epoch={} n_steps={} Loss={:.6f}'.format(args.mode, n_epochs,n_steps,accum_loss))
                losses = []
            if n_steps % args.save_every_n_steps == 0:
                save_model(args.name, model, n_steps, args.keep_last_n)
                save_optim(args.name, optimizer)
        if n_epochs >= args.max_epochs:
            logging.info('Stop (max epochs reached)')
            break
    save_model(args.name, model, n_steps, args.keep_last_n)
    save_optim(args.name, optimizer)
Example #16
def train(name,
          data_dir_1,
          save_dir,
          e_dim,
          n_negs,
          epoch,
          mb,
          ss_t,
          conti,
          weights,
          cuda=True,
          data_dir_0=None):

    idx2word_1 = pickle.load(
        open(os.path.join(data_dir_1, 'idx2word.dat'), 'rb'))
    word2idx_1 = pickle.load(
        open(os.path.join(data_dir_1, 'word2idx.dat'), 'rb'))

    # create an idx2idx dict for the overlapping section of the vocabularies
    if data_dir_0 is not None:
        word2idx_0 = pickle.load(
            open(os.path.join(data_dir_0, 'word2idx.dat'), 'rb'))
        vocab_inters = set(word2idx_0.keys()) & set(word2idx_1.keys())
        idx2idx = {word2idx_1[word]: word2idx_0[word] for word in vocab_inters}
        with open(os.path.join(data_dir_0, 'idx2vec.dat'), 'rb') as handle:
            previous_model = pickle.load(handle)
    else:
        previous_model = None

    wc = pickle.load(open(os.path.join(data_dir_1, 'wc.dat'), 'rb'))
    wf = np.array([wc[word] for word in idx2word_1])
    wf = wf / wf.sum()
    ws = 1 - np.sqrt(ss_t / wf)
    ws = np.clip(ws, 0, 1)
    vocab_size = len(idx2word_1)
    weights = wf if weights else None
    if not os.path.isdir(save_dir):
        os.mkdir(save_dir)
    model = Word2Vec(vocab_size=vocab_size, embedding_size=e_dim)
    modelpath = os.path.join(save_dir, '{}.pt'.format(name))
    sgns = SGNS(embedding=model,
                vocab_size=vocab_size,
                n_negs=n_negs,
                weights=weights,
                previous_model=previous_model)
    if os.path.isfile(modelpath) and conti:
        sgns.load_state_dict(t.load(modelpath))
    if cuda:
        sgns = sgns.cuda()
    optim = Adam(sgns.parameters())
    optimpath = os.path.join(save_dir, '{}.optim.pt'.format(name))
    if os.path.isfile(optimpath) and conti:
        optim.load_state_dict(t.load(optimpath))
    for epoch in range(1, epoch + 1):
        dataset = PermutedSubsampledCorpus(
            os.path.join(data_dir_1, 'train.dat'))
        #dataloader converts input numpy data into long tensors
        dataloader = DataLoader(dataset, batch_size=mb, shuffle=True)
        total_batches = int(np.ceil(len(dataset) / mb))
        pbar = tqdm(dataloader)
        pbar.set_description("[Epoch {}]".format(epoch))
        for iword, owords in pbar:
            if data_dir_0 is not None:
                # here we need to create a idx2idx dict
                vocab_present = list(
                    set(iword.cpu().numpy()) & set(idx2idx.keys()))
                if len(vocab_present) != 0:
                    rwords_dict = {
                        word: idx2idx[word]
                        for word in vocab_present
                    }
                else:
                    rwords_dict = None
            else:
                rwords_dict = None
            loss = sgns(iword, owords, rwords_dict)
            optim.zero_grad()
            loss.backward()
            optim.step()
            pbar.set_postfix(loss=loss.item())
    idx2vec = model.ivectors.weight.data.cpu().numpy()
    pickle.dump(idx2vec, open(os.path.join(data_dir_1, 'idx2vec.dat'), 'wb'))
    t.save(sgns.state_dict(), os.path.join(save_dir, '{}.pt'.format(name)))
    t.save(optim.state_dict(),
           os.path.join(save_dir, '{}.optim.pt'.format(name)))
Example #17
def train_basic_w2v(dataset,
                    word_to_ix,
                    subsampled_words,
                    model_path,
                    model_ID,
                    epochs,
                    batch_size,
                    embedding_size,
                    lr,
                    window_size,
                    neg_samples,
                    retrain=False,
                    csv_export=True,
                    ix_to_word=None,
                    emb_export=False,
                    emb_path=None):
    """
    Trains a basic Word2Vec model with the given parameters, exporting embeddings in the end.
    :param dataset: Path to the dataset
    :param word_to_ix: Vocabulary to map words to integer IDs, as Dict
    :param subsampled_words: Words not to be considered since too frequent, as List
    :param model_path: Path to the outermost model folder
    :param model_ID: Name of the model being trained
    :param epochs: Number of epochs to use for training
    :param batch_size: Size of batches to be built
    :param embedding_size: Size of embeddings to be built
    :param lr: Learning rate for the training algorithm
    :param window_size: Number of words to consider as context; the full context is 2 * window_size large
    :param neg_samples: Number of negative samples to be used in the Noise Contrastive Estimation loss function
    :param retrain: True: loads the model and starts training again; False: starts a new training (default)
    :param csv_export: True: exports train and dev loss values per epoch to a CSV file (default); False: nothing
    :param ix_to_word: Reverse vocabulary (w.r.t. word_to_ix); MUST be provided if embeddings are to be exported
    :param emb_export: True: exports embeddings to a W2V textual format; False: nothing
    :param emb_path: Path to the embedding export file; MUST be provided if embeddings are to be exported
    :return: None
    """

    assert not emb_export or (
        emb_export and ix_to_word is not None and emb_path is not None
    ), "Embeddings export enabled but no reverse dictionary or embedding file path provided"

    with \
            tf.Session() as sess, \
            tf.summary.FileWriter("../logging/%s" % model_ID, sess.graph) as tf_logger, \
            open("../logs/training_%s.log" % model_ID, mode="w") as log:

        if csv_export:
            csv = open("../csv/%s.csv" % model_ID, mode="w")
            u.log_message(csv,
                          "epoch,train loss,dev loss",
                          to_stdout=False,
                          with_time=False)

        u.log_message(log, "Creating model...")
        model = Word2Vec(subsampled_words=subsampled_words,
                         vocabulary_size=len(word_to_ix),
                         embedding_size=embedding_size,
                         learning_rate=lr,
                         window_size=window_size,
                         neg_samples=neg_samples)

        u.log_message(log, "\tModel ID: %s" % model_ID)
        u.log_message(
            log, "\tModel path: %s/%s/model.ckpt" % (model_path, model_ID))
        u.log_message(log, "\tEmbedding size: %d" % embedding_size)
        u.log_message(log, "\tLearning rate: %.3f" % lr)
        u.log_message(log, "\tWindow size: %d" % window_size)
        u.log_message(log, "\tNegative samples: %d" % neg_samples)

        saver = tf.train.Saver()
        sess.run(tf.global_variables_initializer())

        if retrain:
            u.log_message(log, "Loading model...")
            saver.restore(sess, "%s/%s/model.ckpt" % (model_path, model_ID))

        u.log_message(log, "Starting training...")
        for epoch in range(1, epochs + 1):
            u.log_message(log, "Epoch: %d" % epoch)
            accumulated_loss = 0
            iterations = 0

            # training
            for batch_inputs, batch_labels in model.batch_generator(
                    dataset=dataset,
                    word_to_ix=word_to_ix,
                    batch_size=batch_size,
                    sent_limit=1720588):  # around 90% of 1_911_765

                _, loss_val = sess.run(
                    [model.train, model.loss],
                    feed_dict={
                        model.data["inputs"]: batch_inputs,
                        model.data["labels"]: batch_labels
                    })

                accumulated_loss += loss_val
                iterations += 1

            accumulated_loss /= iterations
            train_loss = accumulated_loss

            u.log_message(log, "\tTrain loss: %.5f" % train_loss)
            add_summary(tf_logger, "train loss", train_loss, epoch)

            # dev evaluation
            accumulated_loss = 0
            iterations = 0

            for batch_inputs, batch_labels in model.batch_generator(
                    dataset=dataset,
                    word_to_ix=word_to_ix,
                    batch_size=batch_size,
                    skip_first=1720589
            ):  # use the remaining 191_177 as dev set

                loss_val = sess.run(model.loss,
                                    feed_dict={
                                        model.data["inputs"]: batch_inputs,
                                        model.data["labels"]: batch_labels
                                    })

                accumulated_loss += loss_val
                iterations += 1

            accumulated_loss /= iterations
            dev_loss = accumulated_loss

            u.log_message(log, "\tDev loss: %.5f" % dev_loss)
            add_summary(tf_logger, "dev loss", dev_loss, epoch)

            if csv_export:
                u.log_message(csv,
                              "%d,%.5f,%.5f" % (epoch, train_loss, dev_loss),
                              to_stdout=False,
                              with_time=False)

            if epoch % SAVE_FREQUENCY == 0:
                saver.save(sess, "%s/%s/model.ckpt" % (model_path, model_ID))
                u.log_message(log, "\tModel saved")

        u.log_message(log, "Training ended.")
        saver.save(sess, "%s/%s/model.ckpt" % (model_path, model_ID))

        if emb_export:
            u.log_message(log, "Exporting embeddings...")
            emb_matrix = sess.run(model.embeddings)
            model.export_keyedvector(emb_matrix,
                                     embeddings_path=emb_path,
                                     ix_to_word=ix_to_word)
            u.log_message(log, "Embeddings exported")

        if csv_export:
            csv.close()
Example #18
import os

import tensorflow as tf  # needed for tf.Session below

from model import Word2Vec

config = {}
config['corpus'] = "corpus.txt"  # input data
config['window'] = 5  # (maximum) window size
config['embed_size'] = 100  # dimensionality of word embeddings
config['alpha'] = 0.75  # smooth out unigram frequencies
config['table_size'] = int(1E8)  # table size from which to sample neg samples
config[
    'neg_sample_size'] = 5  # number of negative samples for each positive sample
config['min_frequency'] = 10  #threshold for vocab frequency
config['lr'] = 0.025  # initial learning rate
config['min_lr'] = 0.001  # min learning rate
config['epochs'] = 3  # number of epochs to train
config['gpu'] = 0  # 1 = use gpu, 0 = use cpu
config['stream'] = 1  # 1 = stream from hard drive 0 = copy to memory first

with tf.Session() as sess:
    w2v = Word2Vec(config, sess)
    w2v.build_vocab(config['corpus'])
    w2v.build_table()

    for idx in range(config['epochs']):
        w2v.lr = config['lr']
        w2v.train_model(config['corpus'])

    w2v.get_sim_words(['the', 'he', 'can'], 5)
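In Example #18, config['alpha'] = 0.75 and config['table_size'] describe the usual word2vec negative-sampling table: unigram counts are raised to the power alpha, and each word fills a share of the table proportional to its smoothed probability, so negatives can be drawn by uniform indexing. A rough sketch of that idea (the actual build_table in model.py may differ):

# Rough sketch only; not the project's build_table implementation.
import numpy as np

def build_unigram_table(word_counts, alpha=0.75, table_size=1_000_000):
    words = list(word_counts)
    probs = np.array([word_counts[w] for w in words], dtype=float) ** alpha
    probs /= probs.sum()
    # each word occupies a slice of the table proportional to its smoothed probability
    slots = np.floor(probs * table_size).astype(int)
    return words, np.repeat(np.arange(len(words)), slots)

words, table = build_unigram_table({'the': 1000, 'dog': 30, 'fox': 10})
neg_idx = np.random.choice(table, size=5)   # indices of 5 negative samples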
Example #19
def train(data,
          idx2word,
          wc,
          e_dim=128,
          name='word2vec',
          n_negs=5,
          conti=False,
          cuda=False,
          epoch=1,
          ss_t=1e-5,
          mb=4096,
          weights=False,
          save_dir='./output'):
    #idx2word = pickle.load(open(os.path.join(data_dir, 'idx2word.dat'), 'rb'))
    #wc = pickle.load(open(os.path.join(data_dir, 'wc.dat'), 'rb'))
    wf = np.array([wc[word] for word in idx2word])
    wf = wf / wf.sum()
    ws = 1 - np.sqrt(ss_t / wf)
    ws = np.clip(ws, 0, 1)
    vocab_size = len(idx2word)
    weights = wf if weights else None
    if not os.path.isdir(save_dir):
        os.mkdir(save_dir)
    model = Word2Vec(vocab_size=vocab_size, embedding_size=e_dim)
    modelpath = os.path.join(save_dir, '{}.pt'.format(name))
    sgns = SGNS(embedding=model,
                vocab_size=vocab_size,
                n_negs=n_negs,
                weights=weights)
    if os.path.isfile(modelpath) and conti:
        sgns.load_state_dict(t.load(modelpath))
    if cuda:
        sgns = sgns.cuda()
    optim = Adam(sgns.parameters())
    optimpath = os.path.join(save_dir, '{}.optim.pt'.format(name))
    if os.path.isfile(optimpath) and conti:
        optim.load_state_dict(t.load(optimpath))
    for epoch in range(1, epoch + 1):
        flag = False
        dataset = PermutedSubsampledCorpus(data)
        dataloader = DataLoader(dataset, batch_size=mb, shuffle=True)
        total_batches = int(np.ceil(len(dataset) / mb))
        pbar = tqdm(dataloader)
        pbar.set_description("[Epoch {}]".format(epoch))
        losses = []
        prev_loss = 0
        for iword, owords in pbar:
            loss = sgns(iword, owords)
            losses.append(loss.item())
            prev_loss = loss.item()
            if np.mean(losses[-10:]) < sys.float_info.epsilon:  # convergence check; the original `mean`/`sys.epsilon` do not exist, machine epsilon assumed
                flag = True
                break
            optim.zero_grad()
            loss.backward()
            optim.step()
            pbar.set_postfix(loss=loss.item())
        if flag:
            break
    idx2vec = model.ivectors.weight.data.cpu().numpy()
    #pickle.dump(idx2vec, open(os.path.join(data_dir, 'idx2vec.dat'), 'wb'))
    t.save(sgns.state_dict(), os.path.join(save_dir, '{}.pt'.format(name)))
    t.save(optim.state_dict(),
           os.path.join(save_dir, '{}.optim.pt'.format(name)))
    return idx2vec
Example #20
            prototypes = pickle.load(
                open(
                    os.path.join(args.preprocess_dir,
                                 os.path.join('som_log',
                                              args.prototypes_path)), 'rb'))

        else:
            prototypes = pickle.load(
                open(
                    os.path.join(args.preprocess_dir,
                                 os.path.join('som', args.prototypes_path)),
                    'rb'))

        model = Word2Vec(prototypes=prototypes,
                         alpha=args.alpha,
                         vocab_size=vocab_size,
                         embedding_size=args.e_dim,
                         is_cuda=args.cuda,
                         log_space=args.log_space)

    elif args.scheme in ['GMM']:
        # we can still use custom collate prototype for dataloader
        custom_collate_fn = custom_collate_gmm
        # gmm instance path!
        if args.log_space:
            gmm_posterior = pickle.load(
                open(
                    os.path.join(args.preprocess_dir,
                                 os.path.join('gmm_log', args.gmms_path)),
                    'rb'))
        else:
            gmm_posterior = pickle.load(
Example #21
def do_train(args):
    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    dataset = Dataset(args,vocab,token)

    model, n_steps = load_model(args.name, vocab)

    if model is None:
        logging.info('start model from scratch')
        model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_pad)
    if args.cuda:
        model.cuda()

    optimizer = load_build_optim(args.name, model, args.learning_rate, args.beta1, args.beta2, args.eps)
    n_epochs = 0
    losses = []
    min_val_loss = 0.0
    n_valid_nogain = 0
    stop = False
    while not stop:

        n_epochs += 1
        for batch_idx, batch_neg, batch_ctx, batch_msk in dataset:

            model.train()
            loss = model.forward(batch_idx, batch_neg, batch_ctx, batch_msk)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            n_steps += 1
            losses.append(loss.data.cpu().detach().numpy())

            if n_steps % args.report_every_n_steps == 0:
                accum_loss = np.mean(losses)
                logging.info('{} n_epoch={} n_steps={} Loss={:.6f}'.format(args.mode, n_epochs,n_steps,accum_loss))
                losses = []

            if n_steps % args.save_every_n_steps == 0:
                save_model(args.name, model, n_steps, args.keep_last_n)
                save_optim(args.name, optimizer)

            if n_steps % args.valid_every_n_steps == 0:

                min_val_loss, n_valid_nogain = do_validation(args,token,vocab,model,n_steps,min_val_loss,n_valid_nogain)

                if args.early_stop > 0 and n_valid_nogain >= args.early_stop:
                    stop = True
                    logging.info('stop ({} valids without improving performance reached)'.format(n_valid_nogain))
                    break #go to end of dataset

            if args.max_steps > 0 and n_steps >= args.max_steps:
                stop = True
                logging.info('stop ({} steps reached)'.format(n_steps))
                break #go to end of dataset

        if args.max_epochs > 0 and n_epochs >= args.max_epochs:
            stop = True
            logging.info('stop ({} epochs reached)'.format(n_epochs))


    save_model(args.name, model, n_steps, args.keep_last_n)
    save_optim(args.name, optimizer)
Example #22
from model import Word2Vec

# optimizer_selection = "Adam" or "RMSP" or "SGD"
# weight_selection chooses which embedding to use: the encoder's or the decoder's
# With weight_sharing=True, the encoder's embedding_matrix is used even if weight_selection="decoder" is set.
Word2Vec(
    TEST=True,
    tSNE=True,
    model_name="Word2Vec",
    weight_selection="encoder",  # encoder or decoder
    vocabulary_size=50000,
    tSNE_plot=200,
    similarity_number=8,
    # similarity_number -> number of similar words to print
    # num_skips: generate num_skips samples per sentence
    validation_number=30,
    embedding_size=128,
    batch_size=128,
    num_skips=2,
    window_size=1,
    negative_sampling=64,
    optimizer_selection="SGD",
    learning_rate=0.1,
    training_epochs=1000,
    display_step=1,
    weight_sharing=False)
Example #23
    logger.info('Building negative sampling distribution')
    negative_sampler = HierarchicalSampler(
        vocab=vocab,
        alpha=config['negative_sampling']['alpha'],
        chunks_num=config['negative_sampling']['vocab_chunks_num'])

    logger.info('Building model computation graph')
    optimizer = tf.train.AdagradOptimizer(
        learning_rate=config['training_params']['initial_learning_rate'])

    negative_samples_num = config['sliding_window']['max_size'] * \
        config['negative_sampling']['samples_num']

    word2vec = Word2Vec(optimizer=optimizer,
                        embedding_size=config['embeddings']['size'],
                        vocabulary_size=vocabulary_size,
                        batch_size=config['training_params']['batch_size'],
                        negative_samples=negative_samples_num)

    tf_threads_num = config['training_process']['tensorflow_threads']
    session_cfg = tf.ConfigProto(inter_op_parallelism_threads=tf_threads_num,
                                 intra_op_parallelism_threads=tf_threads_num,
                                 use_per_session_threads=True)

    with tf.Session(graph=word2vec.model_graph, config=session_cfg) as session:
        workers_num = config['training_process']['workers_num']
        queue_size = workers_num * config['training_process']['queue_factor']
        task_queue = Queue.Queue(maxsize=queue_size)

        logger.info('Initializing model params')
        session.run(tf.initialize_all_variables())
Example #24
def main(argv):
    try:
        os.mkdir(FLAGS.weights_directory)
    except:
        pass

    # Load data
    logging.info(f"\n\nLoading data from {FLAGS.text_file}...\n\n")
    dm = DataManager.from_text_file(FLAGS.text_file,
                                    FLAGS.window_size,
                                    threshold=FLAGS.threshold)

    # metrics
    train_loss = tf.keras.metrics.Mean()
    VOCAB_SIZE = dm.skg.vocab_size + 1
    history = {'loss': []}

    # model
    logging.info(
        f"\n\nConstructing model. d_model: {FLAGS.d_model}. vocab_size: {VOCAB_SIZE}\n\n"
    )
    tf.keras.backend.clear_session()
    model = Word2Vec(vocab_size=VOCAB_SIZE, d_model=FLAGS.d_model)
    optimizer = tf.keras.optimizers.Adam(FLAGS.learning_rate)

    @tf.function
    def train_step(inp, ctxt, lbl, mask):
        with tf.GradientTape() as tape:
            pred = model((inp, ctxt))
            loss = loss_fn(lbl, pred, mask)
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        train_loss(loss)

    # Checkpointing
    ckpt_path = FLAGS.weights_directory + '/checkpoints'
    ckpt = tf.train.Checkpoint(model=model,
                               optimizer=optimizer,
                               step=tf.Variable(1))
    ckpt_manager = tf.train.CheckpointManager(ckpt, ckpt_path, max_to_keep=5)

    DS_SIZE = dm.num_tokens // FLAGS.batch_size
    train_ds = dm.batched_ds(FLAGS.batch_size, FLAGS.num_ns,
                             FLAGS.buffer_size).prefetch(1)

    # Restore checkpoints
    ckpt.restore(ckpt_manager.latest_checkpoint)
    if ckpt_manager.latest_checkpoint:
        logging.info(f"Restored from {ckpt_manager.latest_checkpoint}")
    else:
        logging.info("Initializing from scratch.")

    total_start = time.time()
    for epoch in range(FLAGS.epochs):
        logging.info(f"\n\n----- Epoch {epoch+1}/{FLAGS.epochs} -----")
        train_loss.reset_states()
        start = time.time()
        for step, ((inp, ctxt), lbl, mask) in enumerate(train_ds):

            train_step(inp, ctxt, lbl, mask)
            loss = train_loss.result().numpy()
            diff = (time.time() - start) / (step + 1)
            history['loss'].append(loss)
            print_bar(step, DS_SIZE, diff, loss)
            ckpt.step.assign_add(1)

            # we start the drop threshold off high to get some training for
            # common words, then decrease it over training to learn rare words
            if FLAGS.anneal_threshold and (
                    step + 1) % 500 == 0 and FLAGS.threshold > 1e-5:
                FLAGS.threshold /= 10
                dm.skg.set_threshold(FLAGS.threshold)

            if (step + 1) % 1000 == 0:
                logging.info(f"~~~~~~ Step: {step+1} ~~~~~~")
                test_common_analogies(model, dm)

            if (step + 1) % 5000 == 0:
                save_path = ckpt_manager.save()
                logging.info(
                    f"Saved checkpoint for step {step} to: {save_path}")

    logging.info(
        f"\n\n\nCompleted training. Total time: {time.time()-total_start:.2f}s\n\n\n"
    )
    filename = FLAGS.weights_directory + '/metrics.json'
    logging.info(f"\n\nSaving training metrics to: {filename}")
    with open(filename, 'w') as file:
        file.write(json.dumps(history))
    filename = FLAGS.weights_directory + '/weights.h5'
    logging.info(f"\n\nSaving model to: {filename}")
    model.save(filename)
Example #25
sentences = [
    "apple cat dog like", "dog fish milk like", "dog cat eyes like",
    "i like apple", "apple i hate", "apple i movie book music like",
    "cat dog hate", "cat dog like"
]

# sentences = [
#     "i like fish", "i like apple", "i like animal",
#     "dog hate milk", "dog hate fish", "dog hate apple",
# ]

word_list = " ".join(sentences).split()
word_list = list(set(word_list))

input_batch, output_batch = make_batch(sentences)

model = Word2Vec()

for epoch in range(20000):
    for i in range(len(input_batch)):
        model.forward(input_batch[i])
        model.backward(output_batch[i], lr)
    if (epoch + 1) % 1000 == 0:
        print('Epoch: ', '%04d' % (epoch + 1), ', Loss: ', model.loss[0][0])
        # print(model.embedding)
        # print(model.w)

for i, label in enumerate(word_list):
    W = model.embedding
    WT = model.embedding.T
    x, y = float(W[i][0]), float(W[i][1])
    plt.scatter(x, y)
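Example #25 is truncated: lr is assumed to be defined earlier in the original script, and the plotting loop stops before the points are labelled or the figure is shown. A hedged sketch of the usual continuation:

# Assumed continuation (not in the original snippet): annotate each 2-D
# embedding with its word and display the figure.
import matplotlib.pyplot as plt

for i, label in enumerate(word_list):
    x, y = float(model.embedding[i][0]), float(model.embedding[i][1])
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2),
                 textcoords='offset points', ha='right', va='bottom')
plt.show()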
Example #26
def train(args):
    if args.gpuid > -1:
        torch.cuda.set_device(args.gpuid)
        tmp = torch.ByteTensor([0])
        torch.backends.cudnn.enabled = True
        tmp.cuda()
        print("using GPU", args.gpuid)
        print('CUDNN  VERSION', torch.backends.cudnn.version())
    else:
        print("using CPU")
    idx2unigram_prob = pickle.load(
        open(os.path.join(args.data_dir, 'idx2unigram_prob.pkl'), 'rb'))
    idx, unigram_prob = zip(*sorted([(idx, p)
                                     for idx, p in idx2unigram_prob.items()]))
    unigram_prob = np.array(unigram_prob)
    if args.use_noise_weights:
        noise_unigram_prob = unigram_prob[:args.max_vocab]**0.75
        noise_unigram_prob = noise_unigram_prob / noise_unigram_prob.sum()
    else:
        noise_unigram_prob = None
    if args.model == 'Word2Vec':
        embedding_model = Word2Vec(word_vocab_size=args.max_vocab,
                                   embedding_size=args.embedding_size)
    elif args.model == 'Spell2Vec':
        char2idx = pickle.load(
            open(os.path.join(args.data_dir, 'char2idx.pkl'), 'rb'))
        wordidx2spelling, vocab_size, max_spelling_len = load_spelling(
            os.path.join(args.data_dir, 'wordidx2charidx.pkl'), )
        embedding_model = Spell2Vec(
            wordidx2spelling,
            word_vocab_size=args.max_vocab,
            noise_vocab_size=args.max_vocab,  # len(noise_weights) if noise_weights is not None else 20000,
            char_vocab_size=len(char2idx),
            embedding_size=args.embedding_size,
            char_embedding_size=args.char_embedding_size,
            dropout=args.dropout,
            char_composition=args.char_composition,
            bidirectional=True)
    elif args.model == 'SpellHybrid2Vec':
        char2idx = pickle.load(
            open(os.path.join(args.data_dir, 'char2idx.pkl'), 'rb'))
        wordidx2spelling, vocab_size, max_spelling_len = load_spelling(
            os.path.join(args.data_dir, 'wordidx2charidx.pkl'), )
        embedding_model = SpellHybrid2Vec(
            wordidx2spelling,
            word_vocab_size=args.max_vocab,
            noise_vocab_size=args.max_vocab,  # len(noise_weights) if noise_weights is not None else 20000,
            char_vocab_size=len(char2idx),
            embedding_size=args.embedding_size,
            char_embedding_size=args.char_embedding_size,
            dropout=args.dropout,
            char_composition=args.char_composition,
            bidirectional=True)

    else:
        raise NotImplementedError('unknown embedding model')
    dataset = LazyTextDataset(
        corpus_file=os.path.join(args.data_dir, 'corpus.txt'),
        word2idx_file=os.path.join(args.data_dir, 'word2idx.pkl'),
        unigram_prob=unigram_prob,
        window=args.window,
        max_vocab=args.max_vocab if args.model == 'Word2Vec' else 1e8)
    dataloader = DataLoader(dataset=dataset,
                            batch_size=args.batch_size,
                            shuffle=True,
                            collate_fn=my_collate)
    total_batches = int(np.ceil(len(dataset) / args.batch_size))
    sgns = SGNS(embedding_model=embedding_model,
                num_neg_samples=args.num_neg_samples,
                weights=noise_unigram_prob)
    optim = Adam(sgns.parameters())  # , lr = 0.5)
    if args.gpuid > -1:
        sgns.init_cuda()

    if not os.path.isdir(args.save_dir):
        os.mkdir(args.save_dir)
    print(sgns)
    for epoch in range(1, args.epoch + 1):
        ave_time = 0.
        s = time.time()
        for batch_idx, batch in enumerate(dataloader):
            iword, owords = batch
            nwords = sgns.sample_noise(iword.size()[0])
            loss = sgns(iword, owords, nwords)
            optim.zero_grad()
            loss.backward()
            optim.step()
            if batch_idx % 10 == 0 and batch_idx > 0:
                e = time.time()
                ave_time = (e - s) / 10.
                s = time.time()
            print("e{:d} b{:5d}/{:5d} loss:{:7.4f} ave_time:{:7.4f}\r".format(
                epoch, batch_idx + 1, total_batches, loss.data[0], ave_time))
        path = args.save_dir + '/' + embedding_model.__class__.__name__ + '_e{:d}_loss{:.4f}'.format(
            epoch, loss.data[0])
        embedding_model.save_model(path)
    if args.eval_dir != '':
        eval_vecs = open(os.path.join(args.save_dir, 'vocab_vec.txt'),
                         'w',
                         encoding='utf-8')
        eval_vocab = [
            ev.strip()
            for ev in open(os.path.join(args.eval_dir, 'fullVocab.txt'),
                           'r',
                           encoding='utf-8').readlines()
        ]
        word2idx = dataset.word2idx
        char2idx = pickle.load(
            open(os.path.join(args.data_dir, 'char2idx.pkl'), 'rb'))
        for ev in eval_vocab:
            ev_id = word2idx.get(ev, word2idx['<UNK>'])
            if isinstance(embedding_model, Word2Vec):
                ev_id = ev_id if args.max_vocab > ev_id else word2idx['<UNK>']
                vec = embedding_model.query(ev_id)
            else:
                ev_id = ev_id if args.max_vocab > ev_id else word2idx['<UNK>']
                spelling = [char2idx['<BOW>']] + [
                    char2idx.get(i, char2idx['<UNK>']) for i in ev
                ] + [char2idx['<EOW>']]
                spelling = spelling + [char2idx['<PAD>']] * (max_spelling_len - len(spelling))
                vec = embedding_model.query(ev_id, spelling)
            vec = ','.join(['%4f' % i for i in vec.flatten()])
            eval_vecs.write(ev + ' ' + vec + '\n')
        eval_vecs.close()