def getdata_matching(task, type, batch=4):
    Task, vocab, ind, index, Devset, Testset = preprocess(task, type)
    Train = []
    global Test
    for i in range(len(Task)):
        vocab.index_dataset(Task[i], field_name='words', new_field_name='words')
        if i in ind:
            # small tasks: sample 4 of the 10 instances as the support set
            indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
            sampled = random.sample(indices, 4)
            another = [x for x in indices if x not in sampled]
            train = Task[i][another]
            support = Task[i][sampled]
        else:
            # larger tasks: sample 20 instances as the support set
            length = len(Task[i])
            indices = [x - 1 for x in range(length)]  # index -1 refers to the last instance
            sampled = random.sample(indices, 20)
            another = [x for x in indices if x not in sampled]
            train, support = Task[i][another], Task[i][sampled]
        if i == index:
            Test = Pair(Testset, support)
            Dev = Pair(Devset, support)
        Train.append(Pair(train, support))
    for i in range(len(Train)):
        Train[i].batch.set_input('words')
        Train[i].support.set_input('words')
        Train[i].batch.set_target('onehot')
        Train[i].support.set_target('onehot')
        Train[i].batch.apply(lambda x: len(x['words']), new_field_name='seq_len')
        Train[i].support.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Test.batch.set_input('words')
    Test.support.set_input('words')
    Test.batch.set_target('onehot')
    Test.support.set_target('onehot')
    Test.batch.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Test.support.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Dev.batch.set_input('words')
    Dev.support.set_input('words')
    Dev.batch.set_target('onehot')
    Dev.support.set_target('onehot')
    Dev.batch.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Dev.support.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Train_batch = []
    for i in range(len(Train)):
        if i in ind:
            sampler = BucketSampler(num_buckets=1, batch_size=batch, seq_len_field_name='seq_len')
        else:
            sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
        Train_batch.append(Pair(Batch(batch_size=batch, dataset=Train[i].batch, sampler=sampler),
                                Train[i].support))
    sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
    Test_batch = Pair(Batch(batch_size=batch, dataset=Test.batch, sampler=sampler), Test.support)
    Dev_batch = Pair(Batch(batch_size=batch, dataset=Dev.batch, sampler=sampler), Dev.support)
    return Train_batch, Dev_batch, Test_batch, len(vocab)
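# The `Pair` container used above is not defined in this excerpt; the code only
# relies on it exposing `.batch` and `.support` attributes. A minimal sketch of
# the assumed (hypothetical) definition:
from collections import namedtuple

Pair = namedtuple('Pair', ['batch', 'support'])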
def dump_model_result(config, model):
    tag_vocab = pickle.load(open(os.path.join(config.data_path, config.tag_vocab_name), 'rb'))
    metrics = SpanFPreRecMetric(tag_vocab, pred='pred', seq_len='seq_len', target='tag')
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), "rb"))
    data_iterator = Batch(dev_data, config.batch_size, sampler=SequentialSampler(), as_numpy=False)
    model.cuda()
    eval_results = {}
    dev_data.set_input('tag')
    dev_data.set_target('seq_len')
    with torch.no_grad():
        for i, (batch_x, batch_y) in enumerate(data_iterator):
            print('batch', i)
            char = batch_x['char'].cuda()
            word = batch_x['word'].cuda()
            pos = batch_x['pos'].cuda()
            spo = batch_x['spo'].cuda()
            seq_len = batch_x['seq_len'].cuda()
            tag = batch_y['tag'].cuda()
            pred = model.predict(char, word, pos, spo, seq_len)
            metrics({'pred': pred['pred'].cuda(), 'seq_len': seq_len},
                    {'tag': batch_y['tag'].cuda()})
    eval_result = metrics.get_metric()
    metric_name = metrics.__class__.__name__
    eval_results[metric_name] = eval_result
    print("[tester] \n{}".format(_format_eval_results(eval_results)))
def cnn_train(epoch, data, model, batch_size=20):
    device = torch.device("cuda")
    optim = torch.optim.Adam(model.parameters(), lr=0.002)
    lossfunc = nn.CrossEntropyLoss()
    train_sampler = BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len')
    train_batch = Batch(batch_size=batch_size, dataset=data, sampler=train_sampler)
    for i in range(epoch):
        loss_list = []
        cnt = 0
        for batch_x, batch_y in train_batch:
            batch_x['words'] = batch_x['words'].long().contiguous().to(device)
            batch_y['target'] = batch_y['target'].long().contiguous().to(device)
            optim.zero_grad()
            output = model(batch_x['words'])
            loss = lossfunc(output['pred'], batch_y['target'])
            loss.backward()
            optim.step()
            loss_list.append(loss.item())
            print('[info] Epoch %d Iteration %d Loss : %f' % (i, cnt, loss_list[-1]))
            cnt += 1
        loss_list.clear()
    torch.save(model.state_dict(), './cnn_state.pth')
def train(model, dataset, optimizer, num_epoch=30):
    loss_history = []
    loss_fn = nn.CrossEntropyLoss().to(device)
    for i in range(num_epoch):
        start = time.time()
        print("Epoch: {0} start".format(i))
        model.train()
        print(model_status(model.training))
        losses = 0
        for batch_x, batch_y in Batch(dataset, sampler=RandomSampler(), batch_size=batch_size):
            x, lengths, y = pack(batch_x, batch_y)
            score = model(x, lengths)
            loss = loss_fn(score, y)
            losses += loss.item()  # accumulate a plain float so the computation graph is not kept alive
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        loss_history.append(losses)
        print("Epoch: {0} finish".format(i))
        acc = predict(model, val_dataset)
        end = time.time()
        print("Epoch: {0}, loss: {1}, accu: {2}, time: {3}\n".format(i, losses, acc, end - start))
    return loss_history
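# `pack` is referenced by train/predict/get_answer but not defined in this
# excerpt. A hypothetical sketch consistent with how it is called; the field
# names 'words', 'seq_len', 'target' and the module-level `device` are
# assumptions, not taken from the original source.
def pack(batch_x, batch_y, with_label=1):
    x = batch_x['words'].long().to(device)
    lengths = batch_x['seq_len'].long().to(device)
    if not with_label:
        return x, lengths
    y = batch_y['target'].long().to(device)
    return x, lengths, y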
def cnn_train(epoch, data, model, batch_size=32):
    device = torch.device("cuda")
    optim = torch.optim.Adam(model.parameters(), lr=0.001)
    lossfunc = nn.CrossEntropyLoss()
    train_sampler = BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len')
    train_batch = Batch(batch_size=batch_size, dataset=data, sampler=train_sampler)
    for i in range(epoch):
        loss_list = []
        cnt = 0
        for batch_x, batch_y in train_batch:
            optim.zero_grad()
            batch_x['words'] = batch_x['words'].long().contiguous().to(device)
            batch_y['target'] = batch_y['target'].long().contiguous().to(device)
            output = model(batch_x['words'])
            loss = lossfunc(output['pred'], batch_y['target'])
            loss.backward()
            optim.step()
            loss_list.append(loss.item())
            info_str = '[info] Epoch {:d} Iteration {:d} Loss : {:.2f}'.format(i, cnt, loss_list[-1])
            print(info_str)
            with open('./cnn_rec.txt', 'a') as fp:
                fp.write(info_str)
                fp.write('\n')
            cnt += 1
        loss_list.clear()
    torch.save(model.state_dict(), './cnn_state.pth')
def preprocess(batch=16):
    raw_data1 = []
    raw_data2 = []
    for i in range(len(traindata.data)):
        raw_data1.append(Instance(sentence=traindata.data[i], label=int(traindata.target[i])))
    trainset = DataSet(raw_data1)
    trainset.apply(lambda x: pre(x['sentence']), new_field_name='words')
    for i in range(len(testdata.data)):
        raw_data2.append(Instance(sentence=testdata.data[i], label=int(testdata.target[i])))
    testset = DataSet(raw_data2)
    testset.apply(lambda x: pre(x['sentence']), new_field_name='words')
    global vocab
    vocab = Vocabulary(min_freq=1).from_dataset(trainset, testset, field_name='words')
    vocab.index_dataset(trainset, testset, field_name='words', new_field_name='words')
    trainset.set_input('words')
    testset.set_input('words')
    trainset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
    testset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
    trainset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    testset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    global vocabsize
    vocabsize = len(vocab)
    sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
    train_batch = Batch(batch_size=batch, dataset=trainset, sampler=sampler)
    test_batch = Batch(batch_size=batch, dataset=testset, sampler=sampler)
    return train_batch, test_batch, vocabsize
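# Minimal sketch of consuming the Batch iterators returned by preprocess();
# the (batch_x, batch_y) dict interface and the 'words'/'target' field names
# follow the setup above.
train_batch, test_batch, vocabsize = preprocess(batch=32)
for batch_x, batch_y in train_batch:
    words = batch_x['words']    # padded tensor of token indices
    target = batch_y['target']  # integer class labels
    break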
def test_sequential_batch(self):
    batch_size = 32
    num_samples = 1000
    dataset = generate_fake_dataset(num_samples)
    batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler())
    for batch_x, batch_y in batch:
        pass
def dump_all_models_prob(config, models):
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), "rb"))
    bert_dev_data = pickle.load(open(os.path.join(config.bert_data_path, config.dev_name), "rb"))
    data_iterator = Batch(dev_data, config.ensemble_batch, sampler=SequentialSampler(), as_numpy=False)
    bert_data_iterator = Batch(bert_dev_data, config.ensemble_batch, sampler=SequentialSampler(), as_numpy=False)
    for i, model in enumerate(models[:-1]):
        dump_one_model_prob(config.prob_path, config.ensemble_models[i], dev_data, model, data_iterator)
    dump_bert_model_prob(config.prob_path, config.ensemble_models[-1], bert_dev_data, models[-1], bert_data_iterator)
def predict(model, dataset):
    model.eval()
    print(model_status(model.training))
    num_correct = torch.tensor(0.0)
    num_sample = torch.tensor(0.0)
    for batch_x, batch_y in Batch(dataset, sampler=SequentialSampler(), batch_size=batch_size):
        x, lengths, y = pack(batch_x, batch_y)
        score = model(x, lengths)
        y_predict = torch.argmax(score, dim=1)
        num_correct += torch.sum(y_predict == y)
        num_sample += x.shape[0]
    return 1.0 * num_correct / num_sample
def test_list_padding(self):
    ds = DataSet({
        "x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10,
        "y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10
    })
    ds.set_input("x")
    ds.set_target("y")
    iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
    for x, y in iter:
        self.assertEqual(x["x"].shape, (4, 4))
        self.assertEqual(y["y"].shape, (4, 4))
def test_simple(self):
    dataset = construct_dataset(
        [["FastNLP", "is", "the", "most", "beautiful", "tool", "in", "the", "world"]
         for _ in range(40)])
    dataset.set_target()
    batch = Batch(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
    cnt = 0
    for _, _ in batch:
        cnt += 1
    self.assertEqual(cnt, 10)
def test_list_of_numpy_to_tensor(self):
    ds = DataSet(
        [Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)] +
        [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)])
    ds.set_input("x")
    ds.set_target("y")
    iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for x, y in iter:
        print(x, y)
def test_list_of_list_to_tensor(self):
    ds = DataSet(
        [Instance(x=[1, 2], y=[3, 4]) for _ in range(2)] +
        [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)])
    ds.set_input("x")
    ds.set_target("y")
    iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for x, y in iter:
        self.assertTrue(isinstance(x["x"], torch.Tensor))
        self.assertEqual(tuple(x["x"].shape), (4, 4))
        self.assertTrue(isinstance(y["y"], torch.Tensor))
        self.assertEqual(tuple(y["y"].shape), (4, 4))
def test_dataset_batching(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
    ds.set_input("x")
    ds.set_target("y")
    iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
    for x, y in iter:
        self.assertTrue(isinstance(x["x"], np.ndarray) and isinstance(y["y"], np.ndarray))
        self.assertEqual(len(x["x"]), 4)
        self.assertEqual(len(y["y"]), 4)
        self.assertListEqual(list(x["x"][-1]), [1, 2, 3, 4])
        self.assertListEqual(list(y["y"][-1]), [5, 6])
def test_numpy_to_tensor(self):
    ds = DataSet({
        "x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
        "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)
    })
    ds.set_input("x")
    ds.set_target("y")
    iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for x, y in iter:
        self.assertTrue(isinstance(x["x"], torch.Tensor))
        self.assertEqual(tuple(x["x"].shape), (4, 4))
        self.assertTrue(isinstance(y["y"], torch.Tensor))
        self.assertEqual(tuple(y["y"].shape), (4, 4))
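# The padding behaviour exercised by the tests above, shown directly. This is
# a sketch assuming fastNLP's default auto-padding with 0 up to the longest
# sequence in the batch.
ds = DataSet({"x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]]})
ds.set_input("x")
for batch_x, _ in Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True):
    print(batch_x["x"])
    # expected output (zero-padded to length 4):
    # [[1 0 0 0]
    #  [1 2 0 0]
    #  [1 2 3 0]
    #  [1 2 3 4]]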
def predict(config, model):
    tag_vocab = pickle.load(open(os.path.join(config.data_path, config.tag_vocab_name), 'rb'))
    metrics = SpanFPreRecMetric(tag_vocab, pred='pred', seq_len='seq_len', target='tag')
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), "rb"))
    char_vocab = pickle.load(open(os.path.join(config.data_path, config.char_vocab_name), "rb"))
    data_iterator = Batch(dev_data, config.batch_size, sampler=SequentialSampler(), as_numpy=False)
    model.cuda()
    schema = get_schemas(config.source_path)
    eval_results = {}
    dev_data.set_input('tag')
    dev_data.set_target('seq_len')
    result = {}
    with torch.no_grad():
        for i, (batch_x, _) in enumerate(data_iterator):
            print('batch', i)
            char = batch_x['char'].cuda()
            word = batch_x['word'].cuda()
            pos = batch_x['pos'].cuda()
            spo = batch_x['spo'].cuda()
            seq_len = batch_x['seq_len'].cuda()
            pred = model.predict(char, word, pos, spo, seq_len)
            texts = char2text(char.cpu().data, char_vocab.idx2word)
            labels = idx2label(pred['pred'].cpu().data, tag_vocab.idx2word)
            spos = idx2spo(schema, spo.cpu().data)
            result = label2spo(labels, texts, result, spos)
    return result
def get_answer(model, dataset):
    answer = []
    print("start to generate result")
    model.eval()
    print(model_status(model.training))
    for batch_x, batch_y in Batch(dataset, sampler=SequentialSampler(), batch_size=batch_size):
        x, lengths = pack(batch_x, batch_y, 0)
        score = model(x, lengths)
        y_predict = torch.argmax(score, dim=1).cpu().numpy()
        answer += list(y_predict)
    index = [a + 156061 for a in range(len(answer))]
    name = ("result/CNN_pretrain" + str(use_pretrain) + "_freeze" + str(freeze_pretrain)
            + "_dropout" + str(dropout_rate) + "_batch_size" + str(batch_size)
            + "_lr" + str(learning_rate) + "_epoch" + str(num_epoch) + ".csv")
    dataframe = pd.DataFrame({'PhraseId': index, 'Sentiment': answer})
    dataframe.to_csv(name, index=False, sep=',')
    return answer
best_model_file_name = "{}/model.bin".format(root_dir)
model = models.BertC(name=options.model, dropout=options.dropout, num_class=5)
if options.old_model is not None:
    model.load_state_dict(torch.load(options.old_model))
devices = [int(x) for x in options.devi]
model = nn.DataParallel(model, device_ids=devices)
device = torch.device("cuda:{}".format(devices[0]))
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=options.lr, eps=1e-6)
# train_sampler = BucketSampler(batch_size=options.batch_size, seq_len_field_name='seq_len')
train_sampler = RandomSampler()
dev_sampler = SequentialSampler()
test_batch = Batch(dataset=test_set, batch_size=options.batch_size, sampler=dev_sampler)


def tester(test_batch):
    model.eval()
    total = 0.0
    acc = 0.0
    tar_succ = 0
    ori = 0
    with torch.no_grad():
        for batch_x, batch_y in test_batch:
            if len(batch_x["seq_len"]) < len(devices):
                continue
            seq = batch_x["seq"]
            out = model(seq, batch_x["seq_len"])
def predict(config, models, weight):
    test_data = pickle.load(open(os.path.join(config.data_path, config.test_name), "rb"))
    bert_test_data = pickle.load(open(os.path.join(config.bert_data_path, config.test_name), "rb"))
    data_iterator = Batch(test_data, config.predict_batch, sampler=SequentialSampler(), as_numpy=False)
    bert_data_iterator = Batch(bert_test_data, config.predict_batch, sampler=SequentialSampler(), as_numpy=False)
    for model in models:
        model.cuda()
    schema = get_schemas_list(config.source_path)
    weight = torch.tensor(weight).float()
    weight = weight.cuda()  # keep the returned CUDA tensor; .cuda() is not in-place
    weight_sum = torch.sum(weight)
    read_data = []
    with open(os.path.join(config.source_path, config.test_source), 'rb') as f:
        for line in f:
            read_data.append(json.loads(line))
    spo_list = []
    with torch.no_grad():
        for i, ((batch_x, _), (bert_batch_x, _)) in enumerate(zip(data_iterator, bert_data_iterator)):
            print('batch', i)
            # plain batch
            text = batch_x['text'].cuda()
            # bert batch
            input_ids = bert_batch_x['input_ids'].cuda()
            token_type_ids = bert_batch_x['token_type_ids'].cuda()
            attention_mask = bert_batch_x['attention_mask'].cuda()
            pred = models[-1](input_ids, token_type_ids, attention_mask)
            pred['output'] *= weight[-1]
            for j, model in enumerate(models[:-1]):  # avoid shadowing the batch index i
                pred['output'] += model(text)['output'] * weight[j]
            pred['output'] /= weight_sum
            for prob in pred['output']:
                spo_list.append(prob2spo(prob, schema))
    with open(os.path.join(config.predict_path, config.predict_name), 'w') as f:
        for data, spo in zip(read_data, spo_list):
            data["spo_list"] = spo
            f.write(json.dumps(data, ensure_ascii=False) + '\n')
# Fragment: body of a TransformerCWS decoding method (the enclosing `def` is not part of this excerpt).
x = self.embedding(chars)
batch_size = x.size(0)
length = x.size(1)
if hasattr(self, 'bigram_embedding'):
    bigrams = self.bigram_embedding(bigrams)  # batch_size x seq_lens x per_char x embed_size
    x = torch.cat([x, bigrams.view(batch_size, length, -1)], dim=-1)
x = self.drop(x)  # the original discarded the dropout output; keep it so dropout is actually applied
x = self.fc1(x)
feats = self.transformer(x, masks)
feats = self.fc2(feats)
probs = self.crf.viterbi_decode(feats, masks, get_score=False)
return {'pred': probs, 'seq_lens': seq_lens}


if __name__ == "__main__":
    from fastNLP import Batch
    from fastNLP import RandomSampler
    from train_star import load_ppl2014_fastway

    ds, word_v, tag_v = load_ppl2014_fastway('/home/darktower/nlp_exp/data/ppl2014')
    train, test = ds.split(0.8)
    del ds, test
    data_iterator = Batch(train, sampler=RandomSampler(), batch_size=64, as_numpy=False)
    for batch_x, batch_y in data_iterator:
        break
    cws = TransformerCWS(len(word_v), 350, tag_size=len(tag_v))  # was len(word_vocab), an undefined name
    print('Loss is: ', cws(batch_x['words'], batch_y['target'], batch_x['seq_len']))
def getdata_proto(task, type, batch=4):
    Task, vocab, ind, index, testset, devset = preprocess(task, type)
    Train = []
    global Test
    for i in range(len(Task)):
        vocab.index_dataset(Task[i], field_name='words', new_field_name='words')
        if i in ind:
            # small tasks: resample 4 instances until both classes appear in the support set
            indices = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
            rawsupport0 = []
            rawsupport1 = []
            while len(rawsupport0) == 0 or len(rawsupport1) == 0:
                sampled = random.sample(indices, 4)
                another = [x for x in indices if x not in sampled]
                train = Task[i][another]
                for inn in sampled:
                    if Task[i][inn]['label'] == -1:
                        rawsupport0.append(inn)
                    else:
                        rawsupport1.append(inn)
            support0 = Task[i][rawsupport0]
            support1 = Task[i][rawsupport1]
        else:
            # larger tasks: resample 20 instances until both classes appear in the support set
            length = len(Task[i])
            indices = [x - 1 for x in range(length)]  # index -1 refers to the last instance
            rawsupport0 = []
            rawsupport1 = []
            while len(rawsupport0) == 0 or len(rawsupport1) == 0:
                sampled = random.sample(indices, 20)
                another = [x for x in indices if x not in sampled]
                train = Task[i][another]
                for inn in sampled:
                    if Task[i][inn]['label'] == -1:
                        rawsupport0.append(inn)
                    else:
                        rawsupport1.append(inn)
            support0 = Task[i][rawsupport0]
            support1 = Task[i][rawsupport1]
        if i == index:
            Test = Triple(testset, support0, support1)
            Dev = Triple(devset, support0, support1)
        Train.append(Triple(train, support0, support1))
    for i in range(len(Train)):
        Train[i].batch.set_input('words')
        Train[i].support0.set_input('words')
        Train[i].support1.set_input('words')
        Train[i].batch.set_target('onehot')
        Train[i].support0.set_target('onehot')
        Train[i].support1.set_target('onehot')
        Train[i].batch.apply(lambda x: len(x['words']), new_field_name='seq_len')
        Train[i].support0.apply(lambda x: len(x['words']), new_field_name='seq_len')
        Train[i].support1.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Test.batch.set_input('words')
    Test.support0.set_input('words')
    Test.support1.set_input('words')
    Test.batch.set_target('onehot')
    Test.support0.set_target('onehot')
    Test.support1.set_target('onehot')
    Test.batch.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Test.support0.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Test.support1.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Dev.batch.set_input('words')
    Dev.support0.set_input('words')
    Dev.support1.set_input('words')
    Dev.batch.set_target('onehot')
    Dev.support0.set_target('onehot')
    Dev.support1.set_target('onehot')
    Dev.batch.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Dev.support0.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Dev.support1.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Train_batch = []
    for i in range(len(Train)):
        if i in ind:
            sampler = BucketSampler(num_buckets=1, batch_size=batch, seq_len_field_name='seq_len')
        else:
            sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
        Train_batch.append(Triple(Batch(batch_size=batch, dataset=Train[i].batch, sampler=sampler),
                                  Train[i].support0, Train[i].support1))
    sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
    Test_batch = Triple(Batch(batch_size=batch, dataset=Test.batch, sampler=sampler),
                        Test.support0, Test.support1)
    Dev_batch = Triple(Batch(batch_size=batch, dataset=Dev.batch, sampler=sampler),
                       Dev.support0, Dev.support1)
    return Train_batch, Dev_batch, Test_batch, len(vocab)
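# As with `Pair`, the `Triple` container is not defined in this excerpt; the
# code only requires it to expose `.batch`, `.support0` and `.support1`. A
# hypothetical minimal definition:
from collections import namedtuple

Triple = namedtuple('Triple', ['batch', 'support0', 'support1'])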
def train(args):
    text_data = TextData()
    with open(os.path.join(args.vocab_dir, args.vocab_data), 'rb') as fin:
        text_data = pickle.load(fin)
    vocab_size = text_data.vocab_size
    class_num = text_data.class_num
    # class_num = 1
    seq_len = text_data.max_seq_len
    print("(vocab_size,class_num,seq_len):({0},{1},{2})".format(vocab_size, class_num, seq_len))

    train_data = text_data.train_set
    val_data = text_data.val_set
    test_data = text_data.test_set
    train_data.set_input('words', 'seq_len')
    train_data.set_target('target')
    val_data.set_input('words', 'seq_len')
    val_data.set_target('target')
    test_data.set_input('words', 'seq_len')
    test_data.set_target('target')

    init_embeds = None
    if args.pretrain_model == "None":
        print("No pretrained model will be used.")
        print("vocabsize:{0}".format(vocab_size))
        init_embeds = (vocab_size, args.embed_size)
    elif args.pretrain_model == "word2vec":
        embeds_path = os.path.join(args.prepare_dir, 'w2v_embeds.pkl')
        print("Loading Word2Vec pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    elif args.pretrain_model == 'glove':
        embeds_path = os.path.join(args.prepare_dir, 'glove_embeds.pkl')
        print("Loading Glove pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    elif args.pretrain_model == 'glove2wv':
        embeds_path = os.path.join(args.prepare_dir, 'glove2wv_embeds.pkl')
        print("Loading Glove pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    else:
        init_embeds = (vocab_size, args.embed_size)

    if args.model == "CNNText":
        print("Using CNN Model.")
        model = CNNText(init_embeds, num_classes=class_num, padding=2, dropout=args.dropout)
    elif args.model == "StarTransformer":
        print("Using StarTransformer Model.")
        model = STSeqCls(init_embeds, num_cls=class_num, hidden_size=args.hidden_size)
    elif args.model == "MyCNNText":
        model = MyCNNText(init_embeds=init_embeds, num_classes=class_num, padding=2, dropout=args.dropout)
        print("Using user defined CNNText")
    elif args.model == "LSTMText":
        print("Using LSTM Model.")
        model = LSTMText(init_embeds=init_embeds, output_dim=class_num,
                         hidden_dim=args.hidden_size, num_layers=args.num_layers,
                         dropout=args.dropout)
    elif args.model == "Bert":
        print("Using Bert Model.")
    else:
        print("Using default model: CNNText.")
        model = CNNText((vocab_size, args.embed_size), num_classes=class_num, padding=2, dropout=0.1)
    print(model)

    if args.cuda:
        device = torch.device('cuda')
    else:
        device = None

    print("train_size:{0} ; val_size:{1} ; test_size:{2}".format(
        train_data.get_length(), val_data.get_length(), test_data.get_length()))

    if args.optim == "Adam":
        print("Using Adam as optimizer.")
        optimizer = fastnlp_optim.Adam(lr=0.001, weight_decay=args.weight_decay)
        if args.model_suffix == "default":
            args.model_suffix = args.optim  # was `==`, which compared instead of assigning
    else:
        print("No Optimizer will be used.")
        optimizer = None

    criterion = CrossEntropyLoss()
    metric = AccuracyMetric()
    model_save_path = os.path.join(args.model_dir, args.model, args.model_suffix)
    earlystop = EarlyStopCallback(args.patience)
    fitlog_back = FitlogCallback({"val": val_data, "train": train_data})
    trainer = Trainer(train_data=train_data, model=model, save_path=model_save_path,
                      device=device, n_epochs=args.epochs, optimizer=optimizer,
                      dev_data=val_data, loss=criterion, batch_size=args.batch_size,
                      metrics=metric, callbacks=[fitlog_back, earlystop])
    trainer.train()
    print("Train Done.")

    tester = Tester(data=val_data, model=model,
                    metrics=metric, batch_size=args.batch_size, device=device)
    tester.test()
    print("Test Done.")

    print("Predict the answer with best model...")
    acc = 0.0
    output = []
    data_iterator = Batch(test_data, batch_size=args.batch_size)
    for data_x, batch_y in data_iterator:
        i_data = Variable(data_x['words']).cuda()
        pred = model(i_data)[C.OUTPUT]
        pred = pred.sigmoid()
        output.append(pred.cpu().data)
    output = torch.cat(output, 0).numpy()
    print(output.shape)
    print("Predict Done. {} records".format(len(output)))
    result_save_path = os.path.join(args.result_dir, args.model + "_" + args.model_suffix)
    with open(result_save_path + ".pkl", 'wb') as f:
        pickle.dump(output, f)
    output = output.squeeze()[:, 1].tolist()
    projectid = text_data.test_projectid.values
    answers = []
    count = 0
    for i in range(len(output)):
        if output[i] > 0.5:
            count += 1
    print("true sample count:{}".format(count))
    add_count = 0
    for i in range(len(projectid) - len(output)):
        output.append([0.13])
        add_count += 1
    print("Add {} default result in predict.".format(add_count))
    df = pd.DataFrame()
    df['projectid'] = projectid
    df['y'] = output
    df.to_csv(result_save_path + ".csv", index=False)
    print("Predict Done, results saved to {}".format(result_save_path))
    fitlog.finish()
def predict(args):
    text_data = TextData()
    with open(os.path.join(args.vocab_dir, args.vocab_data), 'rb') as fin:
        text_data = pickle.load(fin)
    vocab_size = text_data.vocab_size
    class_num = text_data.class_num
    # class_num = 1
    seq_len = text_data.max_seq_len
    print("(vocab_size,class_num,seq_len):({0},{1},{2})".format(vocab_size, class_num, seq_len))

    test_data = text_data.test_set
    test_data.set_input('words', 'seq_len')
    test_data.set_target('target')
    test_size = test_data.get_length()
    print("test_size:{}".format(test_size))
    print("Data type:{}".format(type(test_data)))

    init_embeds = None
    model_save_path = os.path.join(args.model_dir, args.model, args.model_suffix, args.reload_model_name)
    print("Loading the model {}".format(model_save_path))
    model = torch.load(model_save_path)
    model.eval()
    print(model)

    if args.cuda:
        device = torch.device('cuda')
    else:
        device = None
    model.to(device)

    acc = 0.0
    output = []
    data_iterator = Batch(test_data, batch_size=args.batch_size)
    for data_x, batch_y in data_iterator:
        i_data = Variable(data_x['words']).cuda()
        pred = model(i_data)[C.OUTPUT]
        pred = pred.sigmoid()
        output.append(pred.cpu().data)
    output = torch.cat(output, 0).numpy()
    print(output.shape)
    print("Predict Done.{} records".format(len(output) * args.batch_size))
    result_save_path = os.path.join(args.result_dir, args.model + args.model_suffix + args.reload_model_name)
    with open(result_save_path + ".pkl", 'wb') as f:
        pickle.dump(output, f)
    output = output.squeeze()[:, 1].tolist()
    projectid = text_data.test_projectid.values
    answers = []
    count = 0
    for i in range(len(output)):
        if output[i] > 0.5:
            count += 1
    print("pc1 < 0.5 count:{}".format(count))
    for i in range(len(projectid) - len(output)):
        output.append([0.87])
    df = pd.DataFrame()
    df['projectid'] = projectid
    df['y'] = output
    df.to_csv(result_save_path + ".csv", index=False)
    print("Predict Done, results saved to {}".format(result_save_path))
    fitlog.finish()
def ensemble(config, models, sum_prob=False, weight=[1, 1, 1, 1, 1]):
    f1 = F1_score(pred='output', target='target')
    f1.tp.cuda()
    f1.fp.cuda()
    f1.fn.cuda()
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), "rb"))
    bert_dev_data = pickle.load(open(os.path.join(config.bert_data_path, config.dev_name), "rb"))
    data_iterator = Batch(dev_data, config.ensemble_batch, sampler=SequentialSampler(), as_numpy=False)
    bert_data_iterator = Batch(bert_dev_data, config.ensemble_batch, sampler=SequentialSampler(), as_numpy=False)
    for model in models:
        model.cuda()
    eval_results = {}
    weight = torch.tensor(weight)
    weight = weight.cuda()  # keep the returned CUDA tensor; .cuda() is not in-place
    weight_sum = torch.sum(weight).float()
    with torch.no_grad():
        for i, ((batch_x, batch_y), (bert_batch_x, bert_batch_y)) in enumerate(
                zip(data_iterator, bert_data_iterator)):
            print('batch', i)
            # plain batch
            text = batch_x['text'].cuda()
            target = batch_y['target'].cuda()
            # bert batch
            input_ids = bert_batch_x['input_ids'].cuda()
            token_type_ids = bert_batch_x['token_type_ids'].cuda()
            attention_mask = bert_batch_x['attention_mask'].cuda()
            label_id = bert_batch_y['label_id'].cuda()
            # assert torch.equal(target, label_id)
            pred = models[-1](input_ids, token_type_ids, attention_mask)
            pred['output'] *= weight[-1]
            # a commented-out branch in the source thresholded each model's output
            # before weighting when sum_prob is False; only weighted averaging is active
            for j, model in enumerate(models[:-1]):  # avoid shadowing the batch index i
                pred['output'] += model(text)['output'] * weight[j]
            pred['output'] /= weight_sum
            f1({'output': pred['output'].cuda()},
               {'label_id': bert_batch_y['label_id'].cuda()})
    eval_result = f1.get_metric()
    metric_name = f1.__class__.__name__
    eval_results[metric_name] = eval_result
    print("[ensemble] \n{}".format(_format_eval_results(eval_results)))