Example #1
def do_infer_sent(args):
    """Load a trained Word2Vec model and print one embedding vector per input sentence."""
    if not os.path.exists(args.name + '.token'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.token'))
        sys.exit()
    if not os.path.exists(args.name + '.vocab'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.vocab'))
        sys.exit()
    if len(glob.glob(args.name + '.model.?????????.pth')) == 0:
        logging.error('no model available: {}'.format(args.name + '.model.?????????.pth'))
        sys.exit()

    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    args.embedding_size, args.pooling = read_params(args)
    model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_unk)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(args.beta1,args.beta2), eps=args.eps)
    n_steps, model, optimizer = load_model_optim(args.name, args.embedding_size, vocab, model, optimizer)
    if args.cuda:
        model.cuda()

    dataset = Dataset(args, token, vocab, 'infer_sent', skip_subsampling=True)
    with torch.no_grad():
        model.eval()
        for batch in dataset:
            # one embedding vector per sentence in the batch
            snts = model.SentEmbed(batch[0], batch[1], 'iEmb').cpu().detach().numpy().tolist()
            for i in range(len(snts)):
                sentence = ["{:.6f}".format(w) for w in snts[i]]
                print('{}\t{}'.format(batch[2][i]+1, ' '.join(sentence)))
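The function above is driven by an argparse-style namespace: it reads args.name, args.cuda and the Adam hyper-parameters directly, while args.embedding_size and args.pooling are filled in by read_params() from the saved parameter file. A minimal, hypothetical driver sketch follows; the flag names and defaults are assumptions inferred from the attributes the function accesses (the real project's CLI, and whatever extra options Dataset expects, may differ).

import argparse

# Hypothetical driver: flag names are inferred from the attributes that
# do_infer_sent() reads; the real project may define its CLI differently.
parser = argparse.ArgumentParser()
parser.add_argument('-name', required=True, help='prefix of the .token/.vocab/.model files')
parser.add_argument('-cuda', action='store_true', help='run the model on GPU')
parser.add_argument('-learning_rate', type=float, default=0.001)
parser.add_argument('-beta1', type=float, default=0.9)
parser.add_argument('-beta2', type=float, default=0.999)
parser.add_argument('-eps', type=float, default=1e-8)
args = parser.parse_args(['-name', 'mymodel'])
# do_infer_sent(args)   # prints: "<sentence id>\t<space-separated embedding values>"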
Example #2
def do_infer_word(args):
    """Load a trained Word2Vec model and, for every input word, print its k closest vocabulary entries."""
    if not os.path.exists(args.name + '.token'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.token'))
        sys.exit()
    if not os.path.exists(args.name + '.vocab'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.vocab'))
        sys.exit()
    if len(glob.glob(args.name + '.model.?????????.pth')) == 0:
        logging.error('no model available: {}'.format(args.name + '.model.?????????.pth'))
        sys.exit()

    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    args.embedding_size, args.pooling = read_params(args)
    model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_unk)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(args.beta1,args.beta2), eps=args.eps)
    n_steps, model, optimizer = load_model_optim(args.name, args.embedding_size, vocab, model, optimizer)
    if args.cuda:
        model.cuda()

    if args.sim == 'cos':
        distance = nn.CosineSimilarity(dim=1, eps=1e-6)
    elif args.sim == 'pairwise':
        distance = nn.PairwiseDistance(eps=1e-6)
    else:
        logging.error('bad -sim option {}'.format(args.sim))
        sys.exit()

    dataset = Dataset(args, token, vocab, 'infer_word', skip_subsampling=True)
    with torch.no_grad():
        model.eval()
        voc_i = list(range(len(vocab)))     # all vocabulary indices
        voc_e = model.Embed(voc_i, 'iEmb')  # embeddings of the whole vocabulary
        for batch in dataset:
            #batch[0] batch_wrd
            #batch[1] batch_isnt
            #batch[2] batch_iwrd
            wrd_i = batch[0]
            wrd_e = model.Embed(wrd_i, 'iEmb') #.cpu().detach().numpy().tolist()

            for i in range(len(wrd_i)): # for each word in the batch, find its closest vocabulary entries
                ind_snt = batch[1][i]
                ind_wrd = batch[2][i]
                wrd = vocab[wrd_i[i]]
                out = []
                out.append("{}:{}:{}".format(ind_snt,ind_wrd,wrd))

                # score the current word against every vocabulary embedding,
                # then sort with the highest scores (most similar for cosine) first
                dist_wrd_voc = distance(wrd_e[i].unsqueeze(0), voc_e)
                inds = torch.argsort(dist_wrd_voc, dim=0, descending=True)
                for k in range(1, len(inds)): # index 0 is normally the query word itself
                    ind = inds[k].item()
                    if ind != wrd_i[i]: # skip the query word itself
                        dis = dist_wrd_voc[ind].item()
                        wrd = vocab[ind]
                        out.append("{:.6f}:{}".format(dis, wrd))
                        if len(out)-1 == args.k:
                            break
                print('\t'.join(out))
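The core of the loop above is a broadcasted comparison of one word embedding against the whole vocabulary matrix, followed by a sort of the scores. A self-contained toy version of that step (random embeddings in place of the trained model, cosine similarity so that larger means closer) might look like this:

import torch
import torch.nn as nn

voc_e = torch.randn(5, 4)             # toy embedding table: 5 "words", dimension 4
query_idx = 2
query = voc_e[query_idx]              # embedding of the query word

cos = nn.CosineSimilarity(dim=1, eps=1e-6)
scores = cos(query.unsqueeze(0), voc_e)                  # similarity to every vocabulary entry
order = torch.argsort(scores, dim=0, descending=True)    # most similar first

k = 2
neighbours = [i.item() for i in order if i.item() != query_idx][:k]
print(neighbours)                     # indices of the k closest other words

Note that nn.PairwiseDistance returns a distance rather than a similarity, so with -sim pairwise the sort would need to be ascending to keep the same "closest first" ordering.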
Example #3
def do_train(args):
    """Train the Word2Vec model (skipgram, cbow or sbow) and checkpoint it periodically."""
    if not os.path.exists(args.name + '.token'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.token'))
        sys.exit()
    if not os.path.exists(args.name + '.vocab'):
        logging.error('missing {} file (run preprocess mode)'.format(args.name + '.vocab'))
        sys.exit()

    token = OpenNMTTokenizer(args.name + '.token')
    vocab = Vocab()
    vocab.read(args.name + '.vocab')
    if os.path.exists(args.name + '.param'):
        args.embedding_size, args.pooling = read_params(args)
    else:
        write_params(args)        

    model = Word2Vec(len(vocab), args.embedding_size, args.pooling, vocab.idx_unk)
    if args.cuda:
        model.cuda()
#    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(args.beta1,args.beta2), eps=args.eps)
#    optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate, betas=(args.beta1, args.beta2), eps=args.eps, weight_decay=0.01, amsgrad=False)
    n_steps, model, optimizer = load_model_optim(args.name, args.embedding_size, vocab, model, optimizer)
    dataset = Dataset(args, token, vocab, args.method)

    n_epochs = 0
    losses = []
    while True:
        n_epochs += 1
        for batch in dataset:
            model.train()
            if args.method == 'skipgram':
                loss = model.forward_skipgram(batch)
            elif args.method == 'cbow':
                loss = model.forward_cbow(batch)
            elif args.method == 'sbow':
                loss = model.forward_sbow(batch)
            else:
                logging.error('unknown method {}'.format(args.method))
                sys.exit()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            n_steps += 1
            losses.append(loss.item())
            if n_steps % args.report_every_n_steps == 0:
                accum_loss = np.mean(losses)
                logging.info('{} n_epoch={} n_steps={} Loss={:.6f}'.format(args.method, n_epochs, n_steps, accum_loss))
                losses = []
            if n_steps % args.save_every_n_steps == 0:
                save_model_optim(args.name, model, optimizer, n_steps, args.keep_last_n)
        if n_epochs >= args.max_epochs:
            logging.info('Stop (max epochs reached)')
            break
    save_model_optim(args.name, model, optimizer, n_steps, args.keep_last_n)
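The loop above follows the standard PyTorch training cadence: pick the method-specific forward, zero the gradients, backpropagate, step the optimizer, and periodically log the mean loss and save a checkpoint. A stripped-down, self-contained sketch of that cadence, with a dummy model and random batches standing in for Word2Vec and Dataset, is shown below; the checkpoint name format is only an assumption based on the .model.?????????.pth glob pattern used elsewhere in the file.

import logging
import numpy as np
import torch

logging.basicConfig(level=logging.INFO)

model = torch.nn.Linear(16, 1)        # dummy stand-in for Word2Vec
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)

n_steps, losses = 0, []
report_every, save_every, max_epochs = 10, 50, 2
for n_epochs in range(1, max_epochs + 1):
    for _ in range(25):               # one "epoch" of fake batches
        model.train()
        batch = torch.randn(8, 16)
        loss = model(batch).pow(2).mean()     # placeholder loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        n_steps += 1
        losses.append(loss.item())
        if n_steps % report_every == 0:
            logging.info('n_epoch={} n_steps={} Loss={:.6f}'.format(n_epochs, n_steps, np.mean(losses)))
            losses = []
        if n_steps % save_every == 0:
            torch.save(model.state_dict(), 'toy.model.{:09d}.pth'.format(n_steps))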