def test_padding():
    batch = [
        np.array([1.0, 2.0]),
        np.array([1.0, 2.0, 3.0, 4.0, 5.0]),
        np.array([3.0]),
        np.array([1.0, 2.0, 3.0]),
    ]

    # index 0
    padded = pad_batch(batch, 0)
    np.testing.assert_allclose(
        padded,
        np.array([
            [1.0, 2.0, 0.0, 0.0, 0.0],
            [1.0, 2.0, 3.0, 4.0, 5.0],
            [3.0, 0.0, 0.0, 0.0, 0.0],
            [1.0, 2.0, 3.0, 0.0, 0.0],
        ]),
    )

    # index 11
    padded = pad_batch(batch, 11)
    np.testing.assert_allclose(
        padded,
        np.array([
            [1.0, 2.0, 11.0, 11.0, 11.0],
            [1.0, 2.0, 3.0, 4.0, 5.0],
            [3.0, 11.0, 11.0, 11.0, 11.0],
            [1.0, 2.0, 3.0, 11.0, 11.0],
        ]),
    )

def test_pad_batch():
    a = np.asarray([[0, 1, 2, 3], [0, 1, 2, 3]])
    b = np.asarray([[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]])
    assert a.shape == (2, 4)
    assert b.shape == (3, 4)
    batch = [a, b]
    padded_batch = pad_batch(batch)
    assert padded_batch.shape[0] == 2
    assert padded_batch.shape[1] == 3
    assert padded_batch.shape[2] == 4

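# Neither test above includes the pad_batch implementation itself. A minimal
# sketch consistent with both tests (the parameter names and the default pad
# value are assumptions, not the repositories' actual code): pad every array
# along its first axis to the longest length in the batch, then stack.
import numpy as np

def pad_batch(batch, pad_value=0.0):
    # longest first-axis length in the batch
    max_len = max(a.shape[0] for a in batch)
    padded = []
    for a in batch:
        # pad only the first axis; leave any trailing axes untouched
        widths = [(0, max_len - a.shape[0])] + [(0, 0)] * (a.ndim - 1)
        padded.append(np.pad(a, widths, mode='constant', constant_values=pad_value))
    return np.stack(padded)
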
def evaluate_epoch_csu(epoch, eval_type='valid'):
    label = np.array([[]])
    text = input('>>> ')
    if text.strip() == '':
        return
    text = np.array([[encoder.get(token, encoder['_unk_'])
                      for token in text.strip().split()[:params.cut_down_len]]])

    # initialize
    # logger.info('\n{} : Epoch {}'.format(eval_type.upper(), epoch))
    model.eval()

    # data without shuffle
    # if eval_type == 'train':
    #     text, label = train['text'], train['label']
    # elif eval_type == 'valid':
    #     text, label = valid['text'], valid['label']
    # else:
    #     text, label = test['text'], test['label']

    valid_preds, valid_labels = [], []
    for stidx in range(0, len(text), params.batch_size):
        # prepare batch
        text_batch = pad_batch(text[stidx:stidx + params.batch_size].tolist(),
                               encoder, pad_start_end=True)
        label_batch = label[stidx:stidx + params.batch_size]
        b = Batch(text_batch, label_batch, encoder['_pad_'])
        # model forward
        clf_output = model(b, clf=True, lm=False)
        # evaluation
        pred = clf_output.max(1)[1].data.cpu().numpy().astype(float)
        valid_preds.extend(pred.tolist())
        valid_labels.extend(label_batch.tolist())

    valid_preds, valid_labels = np.array(valid_preds), np.array(valid_labels)

    # A = (valid_preds == valid_labels).astype(float)
    # acc = A.mean()
    # z = 1.96  # 95%
    # delta = z * np.sqrt(acc * (1 - acc) / len(A))
    # conf_interval = (acc - delta, acc + delta)
    # print('num instance', len(A))
    # print('delta', delta)
    # print('conf interval', '[%.3f , %.3f]' % (conf_interval[0], conf_interval[1]))

    # map the predicted class id back to its label string and print it
    id2label = {v: k for k, v in params.label2id.items()}
    label = id2label[int(valid_preds[0])]
    print(label)

def evaluate_epoch_csu(epoch, eval_type='valid'):
    # initialize
    logger.info('\n{} : Epoch {}'.format(eval_type.upper(), epoch))
    model.eval()

    # data without shuffle
    if eval_type == 'train':
        text, label = train['text'], train['label']
    elif eval_type == 'valid':
        text, label = valid['text'], valid['label']
    else:
        if params.dataset == 'headline':
            text, label = valid['text'], valid['label']
        else:
            text, label = test['text'], test['label']

    valid_preds, valid_labels = [], []
    for stidx in range(0, len(text), params.batch_size):
        # prepare batch
        text_batch = pad_batch(text[stidx:stidx + params.batch_size].tolist(),
                               encoder, pad_start_end=True)
        label_batch = label[stidx:stidx + params.batch_size]
        b = Batch(text_batch, label_batch, encoder['_pad_'])
        # model forward
        clf_output = model(b, clf=True, lm=False)
        # evaluation
        pred = clf_output.max(1)[1].data.cpu().numpy().astype(float)
        valid_preds.extend(pred.tolist())
        valid_labels.extend(label_batch.tolist())

    valid_preds, valid_labels = np.array(valid_preds), np.array(valid_labels)
    A = (valid_preds == valid_labels).astype(float)
    acc = A.mean()

    runid = params.inputdir.replace('exp/', '').replace('/', '-')
    save_path = 'exp/adv/acc.%s.npy' % runid
    print('Saved ACC to:', save_path)
    np.save(save_path, A)

    logger.info('{}; acc {}'.format(
        epoch,
        round(acc, 3),
    ))

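# The Batch helper used throughout these evaluation and training loops is not
# shown in any snippet. A hypothetical sketch of one plausible shape for it,
# inferred from the attributes used here (b.label, b.text_y, b.text_loss_mask)
# and patterned on the Annotated Transformer's Batch class; every detail below
# is an assumption. The real class likely also builds attention masks from the
# pad id for the model's forward pass.
import torch

class Batch:
    def __init__(self, text, label, pad_id):
        text = torch.as_tensor(text, dtype=torch.long)
        # inputs are all tokens but the last; LM targets are shifted by one
        self.text = text[:, :-1]
        self.text_y = text[:, 1:]
        # count LM loss only on non-pad target positions
        self.text_loss_mask = (self.text_y != pad_id).float()
        self.label = torch.as_tensor(label)
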
def batcher(params, batch):
    # batch contains list of words
    sentences = [' '.join(s) for s in batch]
    num_sents = []
    # numericalize into BPE format
    for sent in sentences:
        num_sent = text_encoder.encode([sent], verbose=False, lazy=True)[0]
        num_sents.append([encoder['_start_']] + num_sent + [encoder['_end_']])
    sent_batch = pad_batch(num_sents, encoder['_pad_'])
    sent_lengths = (sent_batch[:, :-1] != encoder['_pad_']).sum(axis=1)  # numpy
    sent_batch = Variable(torch.from_numpy(sent_batch)).cuda(params.gpu_id)
    sent_mask = make_std_mask(sent_batch, encoder['_pad_'])
    embeddings = params.infersent.encode(sent_batch, sent_mask)
    embeddings = params.infersent.pick_h(embeddings, sent_lengths)
    return embeddings.data.cpu().numpy()

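# make_std_mask is not defined in this snippet. A sketch matching the standard
# "Annotated Transformer" helper of the same name (that this codebase uses the
# same implementation is an assumption): keep non-pad positions and block
# attention to future positions.
import torch

def subsequent_mask(size):
    # entries above the diagonal mark "future" positions to be blocked
    attn_shape = (1, size, size)
    future = torch.triu(torch.ones(attn_shape, dtype=torch.uint8), diagonal=1)
    return future == 0

def make_std_mask(tgt, pad):
    # padding mask: True where the token is not the pad id
    tgt_mask = (tgt != pad).unsqueeze(-2)
    # combine with the causal mask so position i attends only to positions <= i
    tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(tgt_mask)
    return tgt_mask
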
def evaluate_epoch_csu(epoch, eval_type='valid'):
    # initialize
    logger.info('\n{} : Epoch {}'.format(eval_type.upper(), epoch))
    model.eval()

    # data without shuffle
    if eval_type == 'train':
        text, label = train['text'], train['label']
    elif eval_type == 'valid':
        text, label = valid['text'], valid['label']
    else:
        text, label = test['text'], test['label']

    valid_preds, valid_labels = [], []
    for stidx in range(0, len(text), params.batch_size):
        # prepare batch
        text_batch = pad_batch(text[stidx:stidx + params.batch_size].tolist(),
                               encoder, pad_start_end=True)
        label_batch = label[stidx:stidx + params.batch_size]
        b = Batch(text_batch, label_batch, encoder['_pad_'])
        # model forward
        clf_output = model(b, clf=True, lm=False)
        # evaluation
        pred = clf_output.max(1)[1].data.cpu().numpy().astype(float)
        valid_preds.extend(pred.tolist())
        valid_labels.extend(label_batch.tolist())

    valid_preds, valid_labels = np.array(valid_preds), np.array(valid_labels)
    acc = (valid_preds == valid_labels).astype(float)
    np.save('/home/anonymous/acc.npy', acc)
    acc = acc.mean()
    logger.info('{}; acc {}'.format(
        epoch,
        round(acc, 3),
    ))

def train_epoch_csu(epoch):
    # initialize
    logger.info('\nTRAINING : Epoch {}'.format(epoch))
    model.train()
    all_costs, all_accs = [], []

    # shuffle the data
    permutation = np.random.permutation(len(train['text']))
    text = train['text'][permutation]
    label = train['label'][permutation]
    print('TRAIN DATA', len(text))

    for stidx in range(0, len(text), params.batch_size):
        # prepare batch
        text_batch = pad_batch(text[stidx:stidx + params.batch_size].tolist(),
                               encoder, pad_start_end=True)
        text_batch2 = text_batch[:, :-1]
        label_batch = label[stidx:stidx + params.batch_size]
        b = Batch(text_batch, label_batch, encoder['_pad_'])

        # model forward
        if params.lm_coef == 0.:
            clf_output = model(b, clf=True, lm=False)
        else:
            if params.sememe:
                clf_output, (text_y_hat, sememe_y_hat) = model(b, clf=True, lm=True)
            else:
                clf_output, text_y_hat = model(b, clf=True, lm=True)

        # evaluation
        pred = clf_output.max(1)[1].data.cpu().numpy().astype(float)
        acc = (pred == label_batch).astype(float).mean()

        loss = model.compute_clf_loss(clf_output, b.label)
        if params.lm_coef != 0.0:
            lm_loss = model.compute_lm_loss(text_y_hat, b.text_y, b.text_loss_mask)
            loss += params.lm_coef * lm_loss
            if params.sememe:
                sememe_y = torch.FloatTensor(
                    word2sememe[text_batch2.reshape(-1)].reshape(
                        [text_batch2.shape[0], text_batch2.shape[1], -1])).cuda()
                sp_loss = model.compute_clf_loss(sememe_y_hat, sememe_y, multilabel=True)
                loss += params.lm_coef * sp_loss
        all_costs.append(loss.data.item())
        all_accs.append(acc)

        # backward
        model_opt.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), params.max_norm)

        # optimizer step
        model_opt.step()

        # log and reset
        if len(all_costs) == params.log_interval:
            logger.info('{}; loss {}; acc {}; lr {}; embed_norm {}'.format(
                stidx,
                round(np.mean(all_costs), 2),
                round(np.mean(all_accs), 3),
                params.lr,  # model_opt.rate(),
                model.tgt_embed[0].lut.weight.data.norm()))
            all_costs, all_accs = [], []

    # save
    torch.save(model, os.path.join(params.outputdir, "model-{}.pickle".format(epoch)))

def evaluate_epoch_csu(epoch, eval_type='valid'):
    # initialize
    logger.info('\n{} : Epoch {}'.format(eval_type.upper(), epoch))
    model.eval()

    # data without shuffle
    if eval_type == 'train':
        text, label = train['text'], train['label']
    elif eval_type == 'valid':
        text, label = valid['text'], valid['label']
    else:
        text, label = test['text'], test['label']

    valid_scores, valid_preds, valid_labels = [], [], []
    for stidx in range(0, len(text), params.batch_size):
        # prepare batch
        text_batch = pad_batch(text[stidx:stidx + params.batch_size].tolist(),
                               encoder, pad_start_end=True)
        label_batch = label[stidx:stidx + params.batch_size]
        b = Batch(text_batch, label_batch, encoder['_pad_'])
        # model forward
        clf_output = model(b, clf=True, lm=False)
        # evaluation
        score = torch.sigmoid(clf_output).data.cpu().numpy()
        pred = (score > 0.5).astype(float)
        valid_scores.extend(score.tolist())
        valid_preds.extend(pred.tolist())
        valid_labels.extend(label_batch.tolist())

    valid_scores, valid_preds, valid_labels = (
        np.array(valid_scores), np.array(valid_preds), np.array(valid_labels))
    np.save('{}/scores-{}.npy'.format(params.outputdir, epoch), valid_scores)

    if params.hierachical:
        # enforce label-hierarchy consistency: a predicted label is kept only
        # if all of its ancestors are also predicted; repeat until the row of
        # predictions stops changing
        parents = json.load(open('data/parents.json'))
        id2label = json.load(open('data/labels.json'))
        label2id = dict([(j, i) for i, j in enumerate(id2label)])
        for i in range(valid_preds.shape[0]):
            last_pred_i = valid_preds[i].copy()
            while True:
                for j in range(valid_preds.shape[1]):
                    did = id2label[j]
                    flag = True
                    now = did
                    while now in parents:
                        now = parents[now]
                        if now not in label2id:
                            break
                        if valid_preds[i, label2id[now]] == 0:
                            flag = False
                            break
                    if not flag:
                        valid_preds[i, j] = 0.
                if (valid_preds[i] == last_pred_i).all():
                    break
                last_pred_i = valid_preds[i].copy()

    em = metrics.accuracy_score(valid_labels, valid_preds)
    p, r, f1, s = metrics.precision_recall_fscore_support(valid_labels, valid_preds,
                                                          average='weighted')
    logger.info('{}; em {}; p {}; r {}; f1 {}'.format(
        epoch,
        round(em, 3),
        round(p, 3),
        round(r, 3),
        round(f1, 3)
    ))

def train_epoch_csu(epoch):
    # initialize
    logger.info('\nTRAINING : Epoch {}'.format(epoch))
    model.train()
    all_costs, all_em, all_p, all_r, all_f1 = [], [], [], [], []

    # shuffle the data
    permutation = np.random.permutation(len(train['text']))
    text = train['text'][permutation]
    label = train['label'][permutation]

    for stidx in range(0, len(text), params.batch_size):
        # prepare batch
        text_batch = pad_batch(text[stidx:stidx + params.batch_size].tolist(),
                               encoder, pad_start_end=True)
        label_batch = label[stidx:stidx + params.batch_size]
        b = Batch(text_batch, label_batch, encoder['_pad_'])

        # model forward
        if params.lm_coef == 0.:
            clf_output = model(b, clf=True, lm=False)
        else:
            clf_output, text_y_hat = model(b, clf=True, lm=True)

        # evaluation
        pred = (torch.sigmoid(clf_output) > 0.5).data.cpu().numpy().astype(float)
        em = metrics.accuracy_score(label_batch, pred)
        p, r, f1, s = metrics.precision_recall_fscore_support(label_batch, pred,
                                                              average='weighted')
        all_em.append(em)
        all_p.append(p)
        all_r.append(r)
        all_f1.append(f1)

        if params.hierachical:
            loss = model.compute_hierachical_loss(clf_output, b.label)
        else:
            loss = model.compute_clf_loss(clf_output, b.label)
        if params.lm_coef != 0.0:
            lm_loss = model.compute_lm_loss(text_y_hat, b.text_y, b.text_loss_mask)
            loss += params.lm_coef * lm_loss
        all_costs.append(loss.data.item())

        # backward
        model_opt.optimizer.zero_grad()
        loss.backward()

        # optimizer step
        model_opt.step()

        # log and reset
        if len(all_costs) == params.log_interval:
            logger.info('{}; loss {}; em {}; p {}; r {}; f1 {}; lr {}; embed_norm {}'.format(
                stidx,
                round(np.mean(all_costs), 2),
                round(np.mean(all_em), 3),
                round(np.mean(all_p), 3),
                round(np.mean(all_r), 3),
                round(np.mean(all_f1), 3),
                model_opt.rate(),
                model.tgt_embed[0].lut.weight.data.norm()
            ))
            all_costs, all_em, all_p, all_r, all_f1 = [], [], [], [], []

    # save
    torch.save(model, os.path.join(params.outputdir, "model-{}.pickle".format(epoch)))

torch.cuda.manual_seed(args.seed)
cuda_prompt = "you are using cuda." if args.cuda else "you are not using cuda."
print("start model building, " + cuda_prompt)

print("start data loading: train data at {}, test data at {}".format(
    args.train_path, args.test_path))
English = Vocab("../data/translation/English")
French = Vocab("../data/translation/French")
train_data = ParallelData(French, English,
                          "../data/translation/French-train-source.txt",
                          "../data/translation/English-train-target.txt")
test_data = ParallelData(French, English,
                         "../data/translation/French-val-source.txt",
                         "../data/translation/English-val-target.txt")
collate = lambda x: pad_batch(x, train_data.source_vocab.PAD)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
                                           shuffle=True, collate_fn=collate,
                                           num_workers=args.num_workers)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size,
                                          shuffle=True, collate_fn=collate,
                                          num_workers=args.num_workers)
print("finish data loading.")

print("preparing directory {}".format(args.dir))
os.makedirs(args.dir, exist_ok=True)

print("building model")
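# In this snippet pad_batch doubles as the DataLoader's collate_fn, so it
# receives a list of dataset items rather than a list of arrays. A minimal
# sketch, assuming ParallelData yields (source, target) pairs of 1-D token-id
# sequences; that item shape, and everything below, is an assumption rather
# than the repository's actual code.
import torch

def pad_batch(batch, pad):
    sources, targets = zip(*batch)

    def pad_stack(seqs):
        # right-pad each sequence with the pad id and stack into one LongTensor
        max_len = max(len(s) for s in seqs)
        out = torch.full((len(seqs), max_len), pad, dtype=torch.long)
        for i, s in enumerate(seqs):
            out[i, :len(s)] = torch.as_tensor(s, dtype=torch.long)
        return out

    return pad_stack(sources), pad_stack(targets)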