Example #1
def test_padding():
    batch = [
        np.array([1.0, 2.0]),
        np.array([1.0, 2.0, 3.0, 4.0, 5.0]),
        np.array([3.0]),
        np.array([1.0, 2.0, 3.0]),
    ]
    # pad with value 0
    padded = pad_batch(batch, 0)
    np.testing.assert_allclose(
        padded,
        np.array([
            [1.0, 2.0, 0.0, 0.0, 0.0],
            [1.0, 2.0, 3.0, 4.0, 5.0],
            [3.0, 0.0, 0.0, 0.0, 0.0],
            [1.0, 2.0, 3.0, 0.0, 0.0],
        ]),
    )
    # pad with value 11
    padded = pad_batch(batch, 11)
    np.testing.assert_allclose(
        padded,
        np.array([
            [1.0, 2.0, 11.0, 11.0, 11.0],
            [1.0, 2.0, 3.0, 4.0, 5.0],
            [3.0, 11.0, 11.0, 11.0, 11.0],
            [1.0, 2.0, 3.0, 11.0, 11.0],
        ]),
    )
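The test pins down pad_batch's contract for 1-D arrays: right-pad each sequence with the given fill value up to the longest length in the batch. The implementation itself is not shown in these examples; a minimal sketch consistent with the assertions above (everything beyond the name and the two-argument signature is an assumption) could be:

import numpy as np

def pad_batch(batch, pad_value):
    # Right-pad each 1-D array with pad_value to the longest length in the batch.
    max_len = max(len(seq) for seq in batch)
    padded = np.full((len(batch), max_len), pad_value, dtype=float)
    for i, seq in enumerate(batch):
        padded[i, :len(seq)] = seq
    return padded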
Example #2
def test_pad_batch():

    a = np.asarray([[0, 1, 2, 3], [0, 1, 2, 3]])
    b = np.asarray([[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]])

    assert a.shape == (2, 4)
    assert b.shape == (3, 4)

    batch = [a, b]

    padded_batch = pad_batch(batch)

    assert padded_batch.shape[0] == 2
    assert padded_batch.shape[1] == 3
    assert padded_batch.shape[2] == 4
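This test exercises a different variant of pad_batch: it takes a list of 2-D arrays and stacks them into one 3-D array, padding along the first axis up to the largest row count. A sketch consistent with the shape assertions (the default fill value of 0 is an assumption; the test never inspects the padding):

import numpy as np

def pad_batch(batch, pad_value=0):
    # Stack 2-D arrays into a (batch, max_rows, cols) array,
    # filling the missing rows with pad_value.
    max_rows = max(arr.shape[0] for arr in batch)
    cols = batch[0].shape[1]
    out = np.full((len(batch), max_rows, cols), pad_value, dtype=batch[0].dtype)
    for i, arr in enumerate(batch):
        out[i, :arr.shape[0], :] = arr
    return out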
Example #3
def evaluate_epoch_csu(epoch, eval_type='valid'):
    label = np.array([[]])
    text = input('>>> ')
    if text.strip() == '': return
    text = np.array([[encoder.get(token, encoder['_unk_'])
                      for token in text.strip().split()[:params.cut_down_len]]])

    # initialize
    # logger.info('\n{} : Epoch {}'.format(eval_type.upper(), epoch))
    model.eval()
    
    # data without shuffle
    # if eval_type == 'train': text, label = train['text'], train['label']
    # elif eval_type == 'valid': text, label = valid['text'], valid['label']
    # else: text, label = test['text'], test['label']

    valid_preds, valid_labels = [], []

    for stidx in range(0, len(text), params.batch_size):
        # prepare batch
        text_batch = pad_batch(text[stidx: stidx + params.batch_size].tolist(), encoder, pad_start_end=True)
        label_batch = label[stidx: stidx + params.batch_size]
        
        b = Batch(text_batch, label_batch, encoder['_pad_'])

        # model forward
        clf_output = model(b, clf=True, lm=False)

        # evaluation
        pred = clf_output.max(1)[1].data.cpu().numpy().astype(float)
        valid_preds.extend(pred.tolist())
        valid_labels.extend(label_batch.tolist())
        
    valid_preds, valid_labels = np.array(valid_preds), np.array(valid_labels)

    # A = (valid_preds == valid_labels).astype(float)
    # acc = A.mean()
    # z = 1.96    # 95%
    # delta = z * np.sqrt(acc * (1 - acc) / len(A))
    # conf_interval = (acc - delta, acc + delta)
    # print('num instance', len(A))
    # print('delta', delta)
    # print('conf interval', '[%.3f , %.3f]' % (conf_interval[0], conf_interval[1]))

    id2label = {v: k for k, v in params.label2id.items()}
    label = id2label[int(valid_preds[0])]
    print(label)
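The commented-out block above computes a 95% Wald confidence interval around the accuracy. As a standalone version of the same arithmetic (the function name wald_interval is mine):

import numpy as np

def wald_interval(correct, z=1.96):
    # correct: 1-D array of per-instance 0/1 correctness; z=1.96 gives 95%.
    acc = correct.mean()
    delta = z * np.sqrt(acc * (1 - acc) / len(correct))
    return acc - delta, acc + delta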
Example #4
def evaluate_epoch_csu(epoch, eval_type='valid'):
    # initialize
    logger.info('\n{} : Epoch {}'.format(eval_type.upper(), epoch))
    model.eval()

    # data without shuffle
    if eval_type == 'train': text, label = train['text'], train['label']
    elif eval_type == 'valid': text, label = valid['text'], valid['label']
    else:
        if params.dataset == 'headline':
            text, label = valid['text'], valid['label']
        else:
            text, label = test['text'], test['label']

    valid_preds, valid_labels = [], []

    for stidx in range(0, len(text), params.batch_size):
        # prepare batch
        text_batch = pad_batch(text[stidx:stidx + params.batch_size].tolist(),
                               encoder,
                               pad_start_end=True)
        label_batch = label[stidx:stidx + params.batch_size]

        b = Batch(text_batch, label_batch, encoder['_pad_'])

        # model forward
        clf_output = model(b, clf=True, lm=False)

        # evaluation
        pred = clf_output.max(1)[1].data.cpu().numpy().astype(float)
        valid_preds.extend(pred.tolist())
        valid_labels.extend(label_batch.tolist())

    valid_preds, valid_labels = np.array(valid_preds), np.array(valid_labels)
    A = (valid_preds == valid_labels).astype(float)
    acc = A.mean()

    runid = params.inputdir.replace('exp/', '').replace('/', '-')
    save_path = 'exp/adv/acc.%s.npy' % runid
    print('Saved ACC to:', save_path)
    np.save(save_path, A)

    logger.info('{}; acc {}'.format(
        epoch,
        round(acc, 3),
    ))
Example #5
def batcher(params, batch):
    # batch contains list of words
    sentences = [' '.join(s) for s in batch]
    num_sents = []
    # numericalize into BPE format
    for sent in sentences:
        num_sent = text_encoder.encode([sent], verbose=False, lazy=True)[0]
        num_sents.append([encoder['_start_']] + num_sent + [encoder['_end_']])

    sent_batch = pad_batch(num_sents, encoder['_pad_'])
    sent_lengths = (sent_batch[:, :-1] != encoder['_pad_']).sum(axis=1) # numpy
    sent_batch = Variable(torch.from_numpy(sent_batch)).cuda(params.gpu_id)
    sent_mask = make_std_mask(sent_batch, encoder['_pad_'])

    embeddings = params.infersent.encode(sent_batch, sent_mask)
    embeddings = params.infersent.pick_h(embeddings, sent_lengths)

    return embeddings.data.cpu().numpy()
Example #6
def evaluate_epoch_csu(epoch, eval_type='valid'):
    # initialize
    logger.info('\n{} : Epoch {}'.format(eval_type.upper(), epoch))
    model.eval()

    # data without shuffle
    if eval_type == 'train': text, label = train['text'], train['label']
    elif eval_type == 'valid': text, label = valid['text'], valid['label']
    else: text, label = test['text'], test['label']

    valid_preds, valid_labels = [], []

    for stidx in range(0, len(text), params.batch_size):
        # prepare batch
        text_batch = pad_batch(text[stidx:stidx + params.batch_size].tolist(),
                               encoder,
                               pad_start_end=True)
        label_batch = label[stidx:stidx + params.batch_size]

        b = Batch(text_batch, label_batch, encoder['_pad_'])

        # model forward
        clf_output = model(b, clf=True, lm=False)

        # evaluation
        pred = clf_output.max(1)[1].data.cpu().numpy().astype(float)
        valid_preds.extend(pred.tolist())
        valid_labels.extend(label_batch.tolist())

    valid_preds, valid_labels = np.array(valid_preds), np.array(valid_labels)
    acc = (valid_preds == valid_labels).astype(float)
    np.save('/home/anonymous/acc.npy', acc)
    acc = acc.mean()

    logger.info('{}; acc {}'.format(
        epoch,
        round(acc, 3),
    ))
Example #7
def train_epoch_csu(epoch):
    # initialize
    logger.info('\nTRAINING : Epoch {}'.format(epoch))
    model.train()
    all_costs, all_accs = [], []

    # shuffle the data
    permutation = np.random.permutation(len(train['text']))
    text = train['text'][permutation]
    label = train['label'][permutation]
    print('TRAIN DATA', len(text))

    for stidx in range(0, len(text), params.batch_size):
        # prepare batch
        text_batch = pad_batch(text[stidx:stidx + params.batch_size].tolist(),
                               encoder,
                               pad_start_end=True)
        text_batch2 = text_batch[:, :-1]

        label_batch = label[stidx:stidx + params.batch_size]

        b = Batch(text_batch, label_batch, encoder['_pad_'])

        # model forward
        if params.lm_coef == 0.: clf_output = model(b, clf=True, lm=False)
        else:
            if params.sememe:
                clf_output, (text_y_hat, sememe_y_hat) = model(b,
                                                               clf=True,
                                                               lm=True)
            else:
                clf_output, text_y_hat = model(b, clf=True, lm=True)

        # evaluation
        pred = clf_output.max(1)[1].data.cpu().numpy().astype(float)
        acc = (pred == label_batch).astype(float).mean()

        loss = model.compute_clf_loss(clf_output, b.label)
        if params.lm_coef != 0.0:
            lm_loss = model.compute_lm_loss(text_y_hat, b.text_y,
                                            b.text_loss_mask)
            loss += params.lm_coef * lm_loss
            if params.sememe:
                sememe_y = torch.FloatTensor(
                    word2sememe[text_batch2.reshape(-1)].reshape(
                        [text_batch2.shape[0], text_batch2.shape[1],
                         -1])).cuda()
                sp_loss = model.compute_clf_loss(sememe_y_hat,
                                                 sememe_y,
                                                 multilabel=True)
                loss += params.lm_coef * sp_loss

        all_costs.append(loss.data.item())
        all_accs.append(acc)

        # backward
        model_opt.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), params.max_norm)

        # optimizer step
        model_opt.step()

        # log and reset
        if len(all_costs) == params.log_interval:
            logger.info('{}; loss {}; acc {}; lr {}; embed_norm {}'.format(
                stidx,
                round(np.mean(all_costs), 2),
                round(np.mean(all_accs), 3),
                params.lr,  #model_opt.rate(),
                model.tgt_embed[0].lut.weight.data.norm()))
            all_costs, all_accs = [], []

    # save
    torch.save(model,
               os.path.join(params.outputdir, "model-{}.pickle".format(epoch)))
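The sememe target above is built by fancy-indexing a (vocab_size, num_sememes) lookup table word2sememe with the flattened token ids, then reshaping back to (batch, seq_len, num_sememes). A small numpy illustration of that indexing pattern (all shapes here are made up):

import numpy as np

word2sememe = np.random.randint(0, 2, size=(10, 4)).astype(float)  # 10-word vocab, 4 sememes
text_batch2 = np.array([[1, 5, 2], [7, 0, 3]])                     # batch of 2 x 3 token ids

sememe_y = word2sememe[text_batch2.reshape(-1)].reshape(
    [text_batch2.shape[0], text_batch2.shape[1], -1])
assert sememe_y.shape == (2, 3, 4)  # one multi-hot sememe vector per token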
Example #8
def evaluate_epoch_csu(epoch, eval_type='valid'):
    # initialize
    logger.info('\n{} : Epoch {}'.format(eval_type.upper(), epoch))
    model.eval()
    
    # data without shuffle
    if eval_type == 'train': text, label = train['text'], train['label']
    elif eval_type == 'valid': text, label = valid['text'], valid['label']
    else: text, label = test['text'], test['label']

    valid_scores, valid_preds, valid_labels = [], [], []

    for stidx in range(0, len(text), params.batch_size):
        # prepare batch
        text_batch = pad_batch(text[stidx: stidx + params.batch_size].tolist(), encoder, pad_start_end=True)
        label_batch = label[stidx: stidx + params.batch_size]
        
        b = Batch(text_batch, label_batch, encoder['_pad_'])

        # model forward
        clf_output = model(b, clf=True, lm=False)

        # evaluation
        score = torch.sigmoid(clf_output).data.cpu().numpy()
        pred = (score > 0.5).astype(float)
        valid_scores.extend(score.tolist())
        valid_preds.extend(pred.tolist())
        valid_labels.extend(label_batch.tolist())
        
    valid_scores, valid_preds, valid_labels = np.array(valid_scores), np.array(valid_preds), np.array(valid_labels)
    np.save('{}/scores-{}.npy'.format(params.outputdir, epoch), valid_scores)
    
    if params.hierachical:
        parents = json.load(open('data/parents.json'))
        id2label = json.load(open('data/labels.json'))
        label2id = dict([(j, i) for i, j in enumerate(id2label)])
        for i in range(valid_preds.shape[0]): 
            last_pred_i = valid_preds[i].copy()
            while True:
                for j in range(valid_preds.shape[1]):
                    did = id2label[j]
                    flag = True
                    now = did
                    while now in parents:
                        now = parents[now]
                        if now not in label2id: break
                        if valid_preds[i, label2id[now]] == 0:
                            flag = False
                            break
                    if not flag: 
                        valid_preds[i, j] = 0.
                if (valid_preds[i] == last_pred_i).all(): break
                last_pred_i = valid_preds[i].copy()

    em = metrics.accuracy_score(valid_labels, valid_preds)
    p, r, f1, s = metrics.precision_recall_fscore_support(valid_labels, valid_preds, average='weighted')

    logger.info('{}; em {}; p {}; r {}; f1 {}'.format(
        epoch, 
        round(em, 3),
        round(p, 3),
        round(r, 3),
        round(f1, 3)
    ))
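The fixed-point loop above enforces hierarchical consistency: a predicted label is cleared whenever any ancestor along its parents chain is predicted 0, and the sweep repeats until the row stops changing. A self-contained rendering of the same rule on a toy hierarchy (the helper name and toy labels are mine):

def enforce_hierarchy(pred, parents, id2label, label2id):
    # Clear any label whose ancestor chain contains a 0 prediction,
    # iterating to a fixed point as in the loop above.
    changed = True
    while changed:
        changed = False
        for j, did in enumerate(id2label):
            now = did
            while now in parents:
                now = parents[now]
                if now not in label2id:
                    break
                if pred[label2id[now]] == 0:
                    if pred[j] != 0:
                        pred[j] = 0.
                        changed = True
                    break
    return pred

# 'b' is a child of 'a'; predicting 'b' without 'a' gets cleared
assert enforce_hierarchy([0., 1.], {'b': 'a'}, ['a', 'b'], {'a': 0, 'b': 1}) == [0., 0.]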
Example #9
def train_epoch_csu(epoch):
    # initialize
    logger.info('\nTRAINING : Epoch {}'.format(epoch))
    model.train()
    all_costs, all_em, all_p, all_r, all_f1 = [], [], [], [], []

    # shuffle the data
    permutation = np.random.permutation(len(train['text']))
    text = train['text'][permutation]
    label = train['label'][permutation]

    for stidx in range(0, len(text), params.batch_size):
        # prepare batch
        text_batch = pad_batch(text[stidx: stidx + params.batch_size].tolist(), encoder, pad_start_end=True)
        label_batch = label[stidx: stidx + params.batch_size]
        
        b = Batch(text_batch, label_batch, encoder['_pad_'])

        # model forward
        if params.lm_coef == 0.: clf_output = model(b, clf=True, lm=False)
        else: clf_output, text_y_hat = model(b, clf=True, lm=True)

        # evaluation
        pred = (torch.sigmoid(clf_output) > 0.5).data.cpu().numpy().astype(float)
        em = metrics.accuracy_score(label_batch, pred)
        p, r, f1, s = metrics.precision_recall_fscore_support(label_batch, pred, average='weighted')

        all_em.append(em)
        all_p.append(p)
        all_r.append(r)
        all_f1.append(f1)
 
        if params.hierachical: loss = model.compute_hierachical_loss(clf_output, b.label)
        else: loss = model.compute_clf_loss(clf_output, b.label)
        if params.lm_coef != 0.0:
            lm_loss = model.compute_lm_loss(text_y_hat, b.text_y, b.text_loss_mask)
            loss += params.lm_coef * lm_loss

        all_costs.append(loss.data.item())
        
        # backward
        model_opt.optimizer.zero_grad()
        loss.backward()

        # optimizer step
        model_opt.step()

        # log and reset 
        if len(all_costs) == params.log_interval:
            logger.info('{}; loss {}; em {}; p {}; r {}; f1 {}; lr {}; embed_norm {}'.format(
                stidx, 
                round(np.mean(all_costs), 2),
                round(np.mean(all_em), 3),
                round(np.mean(all_p), 3),
                round(np.mean(all_r), 3),
                round(np.mean(all_f1), 3),
                model_opt.rate(),
                model.tgt_embed[0].lut.weight.data.norm()
            ))
            all_costs, all_em, all_p, all_r, all_f1 = [], [], [], [], []

    # save
    torch.save(model, os.path.join(params.outputdir, "model-{}.pickle".format(epoch)))
Example #10
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    cuda_prompt = "you are using cuda." if args.cuda else "you are not using cuda."
    print("start model building, " + cuda_prompt)

    print("start data loading: train data at {}, test data at {}".format(
        args.train_path, args.test_path))
    English = Vocab("../data/translation/English")
    French = Vocab("../data/translation/French")
    train_data = ParallelData(French, English,
                              "../data/translation/French-train-source.txt",
                              "../data/translation/English-train-target.txt")
    test_data = ParallelData(French, English,
                             "../data/translation/French-val-source.txt",
                             "../data/translation/English-val-target.txt")
    collate = lambda x: pad_batch(x, train_data.source_vocab.PAD)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               collate_fn=collate,
                                               num_workers=args.num_workers)
    test_loader = torch.utils.data.DataLoader(test_data,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              collate_fn=collate,
                                              num_workers=args.num_workers)

    print("finish data loading.")
    print("preparing directory {}".format(args.dir))
    os.makedirs(args.dir, exist_ok=True)
    print("building model")