def getdata_matching(task, type, batch=4):
    # Split every task into a query (train) set and a support set, wrap the
    # target task's dev/test sets with that task's support set, and return
    # bucketed Batch iterators plus the vocabulary size.
    Task, vocab, ind, index, Devset, Testset = preprocess(task, type)
    Train = []
    global Test
    j = 0
    for i in range(len(Task)):
        vocab.index_dataset(Task[i], field_name='words', new_field_name='words')
        if i in ind:
            list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
            slice = random.sample(list, 4)
            another = [x for x in list if x not in slice]
            train = Task[i][another]
            support = Task[i][slice]
        else:
            length = len(Task[i])
            list = [x for x in range(length)]  # row indices 0..length-1
            slice = random.sample(list, 20)
            another = [x for x in list if x not in slice]
            train, support = Task[i][another], Task[i][slice]
        if i == index:
            Test = Pair(Testset, support)
            Dev = Pair(Devset, support)
        Train.append(Pair(train, support))

    for i in range(len(Train)):
        Train[i].batch.set_input('words')
        Train[i].support.set_input('words')
        Train[i].batch.set_target('onehot')
        Train[i].support.set_target('onehot')
        Train[i].batch.apply(lambda x: len(x['words']), new_field_name='seq_len')
        Train[i].support.apply(lambda x: len(x['words']), new_field_name='seq_len')

    Test.batch.set_input('words')
    Test.support.set_input('words')
    Test.batch.set_target('onehot')
    Test.support.set_target('onehot')
    Test.batch.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Test.support.apply(lambda x: len(x['words']), new_field_name='seq_len')

    Dev.batch.set_input('words')
    Dev.support.set_input('words')
    Dev.batch.set_target('onehot')
    Dev.support.set_target('onehot')
    Dev.batch.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Dev.support.apply(lambda x: len(x['words']), new_field_name='seq_len')

    Train_batch = []
    for i in range(len(Train)):
        if i in ind:
            sampler = BucketSampler(num_buckets=1, batch_size=batch, seq_len_field_name='seq_len')
            Train_batch.append(Pair(Batch(batch_size=batch, dataset=Train[i].batch, sampler=sampler), Train[i].support))
        else:
            sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
            Train_batch.append(Pair(Batch(batch_size=batch, dataset=Train[i].batch, sampler=sampler), Train[i].support))

    sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
    Test_batch = Pair(Batch(batch_size=batch, dataset=Test.batch, sampler=sampler), Test.support)
    Dev_batch = Pair(Batch(batch_size=batch, dataset=Dev.batch, sampler=sampler), Dev.support)
    return Train_batch, Dev_batch, Test_batch, len(vocab)
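A minimal sketch of how the structure built above might be consumed, assuming Pair is a simple two-field container of (query Batch iterator, support DataSet) — consistent with the .batch / .support attribute access in this example. run_one_epoch, model, loss_fn and optimizer are hypothetical names, not part of the original code.

from collections import namedtuple

# Assumed container: each element of Train_batch pairs a Batch iterator over
# the query split of one task with that task's raw support DataSet.
Pair = namedtuple('Pair', ['batch', 'support'])

def run_one_epoch(train_batch, model, loss_fn, optimizer):
    # Hypothetical consumption loop: the support set stays fixed per task
    # while the query examples arrive in bucketed batches.
    for pair in train_batch:
        for batch_x, batch_y in pair.batch:
            logits = model(batch_x['words'], pair.support)
            loss = loss_fn(logits, batch_y['onehot'])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()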
Example #2
def dump_model_result(config, model):
    tag_vocab = pickle.load(open(os.path.join(config.data_path, config.tag_vocab_name), 'rb'))
    metrics = SpanFPreRecMetric(tag_vocab, pred='pred', seq_len='seq_len', target='tag')
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), "rb"))
    data_iterator = Batch(dev_data, config.batch_size, sampler=SequentialSampler(), as_numpy=False)
    model.cuda()

    eval_results = {}
    dev_data.set_input('tag')
    dev_data.set_target('seq_len')
    with torch.no_grad():
        for i, (batch_x, batch_y) in enumerate(data_iterator):
            print('batch', i)
            #if i > 10:
            #    break
            char = batch_x['char'].cuda()
            word = batch_x['word'].cuda()
            pos = batch_x['pos'].cuda()
            spo = batch_x['spo'].cuda()
            seq_len = batch_x['seq_len'].cuda()

            tag = batch_y['tag'].cuda()
            
            #pred = model(char, word, pos, spo, seq_len, tag)
            pred = model.predict(char, word, pos, spo, seq_len)  # labels?
            #labels = idx2label(pred['pred'], tag_vocab.idx2word)
            #print(pred)
            #print(tag)
            #exit()
            metrics({'pred': pred['pred'].cuda(), 'seq_len':seq_len}, {'tag': batch_y['tag'].cuda()})
        eval_result = metrics.get_metric()
        metric_name = metrics.__class__.__name__
        eval_results[metric_name] = eval_result

    print("[tester] \n{}".format(_format_eval_results(eval_results)))
Example #3
def cnn_train(epoch, data, model, batch_size=20):
    device = torch.device("cuda")
    optim = torch.optim.Adam(model.parameters(), lr=0.002)
    lossfunc = nn.CrossEntropyLoss()

    train_sampler = BucketSampler(batch_size=batch_size,
                                  seq_len_field_name='seq_len')
    train_batch = Batch(batch_size=batch_size,
                        dataset=data,
                        sampler=train_sampler)

    for i in range(epoch):
        loss_list = []
        cnt = 0
        for batch_x, batch_y in train_batch:
            batch_x['words'] = batch_x['words'].long().contiguous().to(device)
            batch_y['target'] = batch_y['target'].long().contiguous().to(
                device)

            optim.zero_grad()
            output = model(batch_x['words'])
            loss = lossfunc(output['pred'], batch_y['target'])
            loss.backward()
            optim.step()
            loss_list.append(loss.item())

            print('[info] Epoch %d Iteration %d Loss : %f' %
                  (i, cnt, loss_list[-1]))
            cnt += 1

        loss_list.clear()
    torch.save(model.state_dict(), './cnn_state.pth')
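cnn_train above assumes a fastNLP DataSet carrying 'words' (input), 'target' (target) and 'seq_len' fields, and a model whose forward returns a dict containing 'pred' logits. A minimal sketch of a dataset and model satisfying that contract; TinyTextClassifier and the toy sentences are illustrative only.

import torch.nn as nn
from fastNLP import DataSet, Vocabulary

class TinyTextClassifier(nn.Module):
    # Hypothetical minimal model matching the contract used by cnn_train:
    # forward takes padded word indices and returns {'pred': logits}.
    def __init__(self, vocab_size, embed_dim=50, num_classes=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, words):
        emb = self.embed(words)                 # (batch, seq_len, embed_dim)
        return {'pred': self.fc(emb.mean(dim=1))}

ds = DataSet({'raw': ['a b c', 'a c d', 'b c d e'] * 10, 'target': [0, 1, 0] * 10})
ds.apply(lambda x: x['raw'].split(), new_field_name='words')
vocab = Vocabulary().from_dataset(ds, field_name='words')
vocab.index_dataset(ds, field_name='words', new_field_name='words')
ds.apply(lambda x: len(x['words']), new_field_name='seq_len')
ds.set_input('words')
ds.set_target('target')
# cnn_train(epoch=1, data=ds, model=TinyTextClassifier(len(vocab)), batch_size=4)  # requires a CUDA device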
Example #4
def train(model, dataset, optimizer, num_epoch=30):
    loss_history = []
    loss_fn = nn.CrossEntropyLoss().to(device)
    for i in range(num_epoch):
        start = time.time()
        print("Epoch: {0} start".format(i))
        model.train()
        print(model_status(model.training))
        losses = 0

        for batch_x, batch_y in Batch(dataset,
                                      sampler=RandomSampler(),
                                      batch_size=batch_size):
            x, lengths, y = pack(batch_x, batch_y)
            score = model(x, lengths)
            loss = loss_fn(score, y)
            losses += loss.item()  # .item() keeps the running total as a float and frees the graph
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        loss_history.append(losses)
        print("Epoch: {0} finish".format(i))
        acc = predict(model, val_dataset)
        end = time.time()
        print("Epoch: {0}, loss: {1}, accu: {2}, time: {3}\n".format(
            i, losses, acc, end - start))
    return loss_history
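This loop (and the predict / get_answer helpers elsewhere in this listing) relies on a pack helper and module-level device / batch_size that are not shown here. A plausible reconstruction, with the field names ('words', 'seq_len', 'target') and the optional third argument treated as guesses from the call sites:

def pack(batch_x, batch_y, test=None):
    # Hypothetical reconstruction of the undefined helper: move the inputs to
    # the module-level device and, when a third argument is given (test mode),
    # skip the labels -- matching the pack(batch_x, batch_y, 0) call in get_answer.
    # Assumes a module-level: device = torch.device("cuda"), as in the surrounding code.
    x = batch_x['words'].long().to(device)
    lengths = batch_x['seq_len'].long().to(device)
    if test is not None:
        return x, lengths
    y = batch_y['target'].long().to(device)
    return x, lengths, y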
Example #5
def cnn_train(epoch, data, model, batch_size=32):
    device = torch.device("cuda")
    optim = torch.optim.Adam(model.parameters(), lr=0.001)
    lossfunc = nn.CrossEntropyLoss()

    train_sampler = BucketSampler(batch_size=batch_size,
                                  seq_len_field_name='seq_len')
    train_batch = Batch(batch_size=batch_size,
                        dataset=data,
                        sampler=train_sampler)

    for i in range(epoch):
        loss_list = []
        cnt = 0
        for batch_x, batch_y in train_batch:
            optim.zero_grad()
            batch_x['words'] = batch_x['words'].long().contiguous().to(device)
            batch_y['target'] = batch_y['target'].long().contiguous().to(
                device)
            output = model(batch_x['words'])
            loss = lossfunc(output['pred'], batch_y['target'])
            loss.backward()
            optim.step()
            loss_list.append(loss.item())
            info_str = '[info] Epoch {:d} Iteration {:d} Loss : {:.2f}'.format(
                i, cnt, loss_list[-1])
            print(info_str)
            with open('./cnn_rec.txt', 'a') as fp:
                fp.write(info_str)
                fp.write('\n')
            cnt += 1
        loss_list.clear()
        torch.save(model.state_dict(), './cnn_state.pth')
Example #6
def preprocess(batch=16):
    raw_data1 = []
    raw_data2 = []

    for i in range(len(traindata.data)):
        raw_data1.append(
            Instance(sentence=traindata.data[i],
                     label=int(traindata.target[i])))
    trainset = DataSet(raw_data1)
    trainset.apply(lambda x: pre(x['sentence']), new_field_name='words')

    for i in range(len(testdata.data)):
        raw_data2.append(
            Instance(sentence=testdata.data[i], label=int(testdata.target[i])))
    testset = DataSet(raw_data2)
    testset.apply(lambda x: pre(x['sentence']), new_field_name='words')

    global vocab
    vocab = Vocabulary(min_freq=1).from_dataset(trainset,
                                                testset,
                                                field_name='words')
    vocab.index_dataset(trainset,
                        testset,
                        field_name='words',
                        new_field_name='words')
    trainset.set_input('words')
    testset.set_input('words')

    trainset.apply(lambda x: int(x['label']),
                   new_field_name='target',
                   is_target=True)
    testset.apply(lambda x: int(x['label']),
                  new_field_name='target',
                  is_target=True)

    trainset.apply(lambda x: len(x['words']), new_field_name='seq_len')
    testset.apply(lambda x: len(x['words']), new_field_name='seq_len')

    global vocabsize
    vocabsize = len(vocab)
    sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
    train_batch = Batch(batch_size=batch, dataset=trainset, sampler=sampler)
    test_batch = Batch(batch_size=batch, dataset=testset, sampler=sampler)

    return train_batch, test_batch, vocabsize
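The 'words' field above comes from a pre helper that this snippet does not include; traindata / testdata are assumed to expose .data (raw sentences) and .target (integer labels). A purely illustrative stand-in for pre:

import re

def pre(sentence):
    # Hypothetical tokenizer: lowercase, strip punctuation, split on whitespace.
    return re.sub(r'[^a-z0-9\s]', ' ', sentence.lower()).split()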
Example #7
    def test_sequential_batch(self):
        batch_size = 32
        num_samples = 1000
        dataset = generate_fake_dataset(num_samples)

        batch = Batch(dataset,
                      batch_size=batch_size,
                      sampler=SequentialSampler())
        for batch_x, batch_y in batch:
            pass
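generate_fake_dataset is not defined in this snippet. A sketch of what such a helper could look like, assuming it only needs variable-length index sequences with one input field and one target field (field names chosen here for illustration):

import random
from fastNLP import DataSet, Instance

def generate_fake_dataset(num_samples, vocab_size=100, max_len=20):
    # Hypothetical helper: random-length sequences of word indices plus a label.
    ds = DataSet()
    for _ in range(num_samples):
        length = random.randint(1, max_len)
        ds.append(Instance(words=[random.randint(1, vocab_size - 1) for _ in range(length)],
                           target=random.randint(0, 1)))
    ds.set_input('words')
    ds.set_target('target')
    return ds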
Example #8
def dump_all_models_prob(config, models):
    dev_data = pickle.load(
        open(os.path.join(config.data_path, config.dev_name), "rb"))
    bert_dev_data = pickle.load(
        open(os.path.join(config.bert_data_path, config.dev_name), "rb"))

    data_iterator = Batch(dev_data,
                          config.ensemble_batch,
                          sampler=SequentialSampler(),
                          as_numpy=False)
    bert_data_iterator = Batch(bert_dev_data,
                               config.ensemble_batch,
                               sampler=SequentialSampler(),
                               as_numpy=False)

    for i, model in enumerate(models[:-1]):
        dump_one_model_prob(config.prob_path, config.ensemble_models[i],
                            dev_data, model, data_iterator)
    dump_bert_model_prob(config.prob_path, config.ensemble_models[-1],
                         bert_dev_data, models[-1], bert_data_iterator)
Example #9
def predict(model, dataset):
    model.eval()
    print(model_status(model.training))
    num_correct = torch.tensor(0.0)
    num_sample = torch.tensor(0.0)
    for batch_x, batch_y in Batch(dataset,
                                  sampler=SequentialSampler(),
                                  batch_size=batch_size):
        x, lengths, y = pack(batch_x, batch_y)
        score = model(x, lengths)
        y_predict = torch.argmax(score, dim=1)
        num_correct += torch.sum(y_predict == y)
        num_sample += x.shape[0]
    return 1.0 * num_correct / num_sample
Example #10
    def test_list_padding(self):
        ds = DataSet({
            "x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10,
            "y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10
        })
        ds.set_input("x")
        ds.set_target("y")
        iter = Batch(ds,
                     batch_size=4,
                     sampler=SequentialSampler(),
                     as_numpy=True)
        for x, y in iter:
            self.assertEqual(x["x"].shape, (4, 4))
            self.assertEqual(y["y"].shape, (4, 4))
Example #11
    def test_simple(self):
        dataset = construct_dataset([[
            "FastNLP", "is", "the", "most", "beautiful", "tool", "in", "the",
            "world"
        ] for _ in range(40)])
        dataset.set_target()
        batch = Batch(dataset,
                      batch_size=4,
                      sampler=SequentialSampler(),
                      as_numpy=True)

        cnt = 0
        for _, _ in batch:
            cnt += 1
        self.assertEqual(cnt, 10)
Example #12
    def test_list_of_numpy_to_tensor(self):
        ds = DataSet([
            Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)
        ] + [
            Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6]))
            for _ in range(2)
        ])
        ds.set_input("x")
        ds.set_target("y")
        iter = Batch(ds,
                     batch_size=4,
                     sampler=SequentialSampler(),
                     as_numpy=False)
        for x, y in iter:
            print(x, y)
Example #13
    def test_list_of_list_to_tensor(self):
        ds = DataSet(
            [Instance(x=[1, 2], y=[3, 4]) for _ in range(2)] +
            [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)])
        ds.set_input("x")
        ds.set_target("y")
        iter = Batch(ds,
                     batch_size=4,
                     sampler=SequentialSampler(),
                     as_numpy=False)
        for x, y in iter:
            self.assertTrue(isinstance(x["x"], torch.Tensor))
            self.assertEqual(tuple(x["x"].shape), (4, 4))
            self.assertTrue(isinstance(y["y"], torch.Tensor))
            self.assertEqual(tuple(y["y"].shape), (4, 4))
Example #14
    def test_dataset_batching(self):
        ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
        ds.set_input("x")
        ds.set_target("y")
        iter = Batch(ds,
                     batch_size=4,
                     sampler=SequentialSampler(),
                     as_numpy=True)
        for x, y in iter:
            self.assertTrue(
                isinstance(x["x"], np.ndarray)
                and isinstance(y["y"], np.ndarray))
            self.assertEqual(len(x["x"]), 4)
            self.assertEqual(len(y["y"]), 4)
            self.assertListEqual(list(x["x"][-1]), [1, 2, 3, 4])
            self.assertListEqual(list(y["y"][-1]), [5, 6])
Example #15
    def test_numpy_to_tensor(self):
        ds = DataSet({
            "x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
            "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)
        })
        ds.set_input("x")
        ds.set_target("y")
        iter = Batch(ds,
                     batch_size=4,
                     sampler=SequentialSampler(),
                     as_numpy=False)
        for x, y in iter:
            self.assertTrue(isinstance(x["x"], torch.Tensor))
            self.assertEqual(tuple(x["x"].shape), (4, 4))
            self.assertTrue(isinstance(y["y"], torch.Tensor))
            self.assertEqual(tuple(y["y"].shape), (4, 4))
Example #16
def predict(config, model):
    tag_vocab = pickle.load(open(os.path.join(config.data_path, config.tag_vocab_name), 'rb'))
    metrics = SpanFPreRecMetric(tag_vocab, pred='pred', seq_len='seq_len', target='tag')
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), "rb"))
    char_vocab = pickle.load(open(os.path.join(config.data_path, config.char_vocab_name), "rb"))

    data_iterator = Batch(dev_data, config.batch_size, sampler=SequentialSampler(), as_numpy=False)
    model.cuda()

    schema = get_schemas(config.source_path)

    eval_results = {}
    dev_data.set_input('tag')
    dev_data.set_target('seq_len')
    result = {}
    with torch.no_grad():
        for i, (batch_x, _) in enumerate(data_iterator):
            print('batch', i)
            #if i > 10:
            #    break
            char = batch_x['char'].cuda()
            word = batch_x['word'].cuda()
            pos = batch_x['pos'].cuda()
            spo = batch_x['spo'].cuda()
            seq_len = batch_x['seq_len'].cuda()
            
            #pred = model(char, word, pos, spo, seq_len, tag)
            pred = model.predict(char, word, pos, spo, seq_len)  # labels?

            texts = char2text(char.cpu().data, char_vocab.idx2word)
            labels = idx2label(pred['pred'].cpu().data, tag_vocab.idx2word)
            spos = idx2spo(schema, spo.cpu().data)
            result = label2spo(labels, texts, result, spos)
            #print(pred)
            #print(tag)
            #exit()
            # metrics({'pred': pred['pred'].cuda(), 'seq_len':seq_len}, {'tag': batch_y['tag'].cuda()})
        # eval_result = metrics.get_metric()
        # metric_name = metrics.__class__.__name__
        # eval_results[metric_name] = eval_result

    return result
Example #17
def get_answer(model, dataset):
    answer = []
    print("start to generate result")
    model.eval()
    print(model_status(model.training))
    for batch_x, batch_y in Batch(dataset,
                                  sampler=SequentialSampler(),
                                  batch_size=batch_size):
        x, lengths = pack(batch_x, batch_y, 0)
        score = model(x, lengths)
        y_predict = torch.argmax(score, dim=1).cpu().numpy()
        answer += list(y_predict)
    index = [a + 156061 for a in range(len(answer))]
    name = "result/CNN_pretrain" + str(use_pretrain) + "_freeze" + str(
        freeze_pretrain) + "dropouot" + str(
            dropout_rate) + "_batch_size" + str(batch_size) + "_lr" + str(
                learning_rate) + "_epoch" + str(num_epoch) + ".csv"
    dataframe = pd.DataFrame({'PhraseId': index, 'Sentiment': answer})
    dataframe.to_csv(name, index=False, sep=',')
    return answer
Example #18
best_model_file_name = "{}/model.bin".format(root_dir)
model = models.BertC(name=options.model, dropout=options.dropout, num_class=5)
if options.old_model is not None:
    model.load_state_dict(torch.load(options.old_model))
devices = [int(x) for x in options.devi]
model = nn.DataParallel(model, device_ids=devices)
device = torch.device("cuda:{}".format(devices[0]))
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=options.lr, eps=1e-6)

# train_sampler = BucketSampler(batch_size=options.batch_size, seq_len_field_name='seq_len')
train_sampler = RandomSampler()
dev_sampler = SequentialSampler()
test_batch = Batch(dataset=test_set,
                   batch_size=options.batch_size,
                   sampler=dev_sampler)


def tester(test_batch):
    model.eval()
    total = 0.0
    acc = 0.0
    tar_succ = 0
    ori = 0
    with torch.no_grad():
        for batch_x, batch_y in test_batch:
            if len(batch_x["seq_len"]) < len(devices):
                continue
            seq = batch_x["seq"]
            out = model(seq, batch_x["seq_len"])
Example #19
def predict(config, models, weight):
    test_data = pickle.load(
        open(os.path.join(config.data_path, config.test_name), "rb"))
    bert_test_data = pickle.load(
        open(os.path.join(config.bert_data_path, config.test_name), "rb"))

    data_iterator = Batch(test_data,
                          config.predict_batch,
                          sampler=SequentialSampler(),
                          as_numpy=False)
    bert_data_iterator = Batch(bert_test_data,
                               config.predict_batch,
                               sampler=SequentialSampler(),
                               as_numpy=False)

    for model in models:
        model.cuda()

    schema = get_schemas_list(config.source_path)
    weight = torch.tensor(weight).float()
    weight = weight.cuda()  # Tensor.cuda() is not in-place; keep the returned tensor
    weight_sum = torch.sum(weight)

    read_data = []
    with open(os.path.join(config.source_path, config.test_source), 'rb') as f:
        for line in f:
            read_data.append(json.loads(line))

    spo_list = []
    with torch.no_grad():
        for i, ((batch_x, _),
                (bert_batch_x,
                 _)) in enumerate(zip(data_iterator, bert_data_iterator)):
            print('batch', i)
            #if i >= 5:
            #    break
            # batch
            text = batch_x['text'].cuda()
            # target = batch_y['target'].cuda()
            # bert batch
            input_ids = bert_batch_x['input_ids'].cuda()
            token_type_ids = bert_batch_x['token_type_ids'].cuda()
            attention_mask = bert_batch_x['attention_mask'].cuda()
            # label_id = bert_batch_y['label_id'].cuda()

            # assert torch.equal(target, label_id)

            pred = models[-1](input_ids, token_type_ids, attention_mask)
            pred['output'] *= weight[-1]
            for i, model in enumerate(models[:-1]):
                pred['output'] += model(text)['output'] * weight[i]
            pred['output'] /= weight_sum

            for prob in pred['output']:
                spo_list.append(prob2spo(prob, schema))

    with open(os.path.join(config.predict_path, config.predict_name),
              'w') as f:
        for data, spo in zip(read_data, spo_list):
            data["spo_list"] = spo
            f.write(json.dumps(data, ensure_ascii=False) + '\n')
Example #20
        x = self.embedding(chars)
        batch_size = x.size(0)
        length = x.size(1)
        if hasattr(self, 'bigram_embedding'):
            bigrams = self.bigram_embedding(bigrams) # batch_size x seq_lens x per_char x embed_size
            x = torch.cat([x, bigrams.view(batch_size, length, -1)], dim=-1)
        x = self.drop(x)  # keep the dropout output (the original discarded it)
        x = self.fc1(x)
        feats = self.transformer(x, masks)
        feats = self.fc2(feats)

        probs = self.crf.viterbi_decode(feats, masks, get_score=False)

        return {'pred': probs, 'seq_lens':seq_lens}


if __name__ == "__main__":
    from fastNLP import Batch 
    from fastNLP import RandomSampler
    from train_star import load_ppl2014_fastway

    ds, word_v, tag_v = load_ppl2014_fastway('/home/darktower/nlp_exp/data/ppl2014')
    train, test = ds.split(0.8)
    del ds, test

    data_iterator = Batch(train, sampler=RandomSampler(), batch_size =64, as_numpy=False)
    for batch_x, batch_y in data_iterator:
        break

    cws = TransformerCWS(len(word_v), 350, tag_size=len(tag_v))  # word_v is the vocabulary loaded above
    print('Loss is: ', cws(batch_x['words'], batch_y['target'], batch_x['seq_len']))
Example #21
def getdata_proto(task, type, batch=4):
    # Prototypical variant of getdata_matching: the support split is divided by
    # label into support0 / support1 so each class gets its own support set.
    Task, vocab, ind, index, testset, devset = preprocess(task, type)
    Train = []
    global Test
    for i in range(len(Task)):
        vocab.index_dataset(Task[i], field_name='words', new_field_name='words')
        if i in ind:
            list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
            rawsupport0 = []
            rawsupport1 = []
            # Resample until both classes appear in the support split; reset the
            # buckets on every attempt so they stay consistent with `slice`.
            while len(rawsupport0) == 0 or len(rawsupport1) == 0:
                rawsupport0, rawsupport1 = [], []
                slice = random.sample(list, 4)
                another = [x for x in list if x not in slice]
                train = Task[i][another]

                for inn in slice:
                    if Task[i][inn]['label'] == -1:
                        rawsupport0.append(inn)
                    else:
                        rawsupport1.append(inn)
            support0 = Task[i][rawsupport0]
            support1 = Task[i][rawsupport1]
        else:
            length = len(Task[i])
            list = [x for x in range(length)]  # row indices 0..length-1
            rawsupport0 = []
            rawsupport1 = []
            # Same resampling logic as above, with a 20-example support split.
            while len(rawsupport0) == 0 or len(rawsupport1) == 0:
                rawsupport0, rawsupport1 = [], []
                slice = random.sample(list, 20)
                another = [x for x in list if x not in slice]
                train = Task[i][another]

                for inn in slice:
                    if Task[i][inn]['label'] == -1:
                        rawsupport0.append(inn)
                    else:
                        rawsupport1.append(inn)

            support0 = Task[i][rawsupport0]
            support1 = Task[i][rawsupport1]
        if i == index:
            Test = Triple(testset, support0, support1)
            Dev = Triple(devset, support0, support1)
        Train.append(Triple(train, support0, support1))

    for i in range(len(Train)):
        Train[i].batch.set_input('words')
        Train[i].support0.set_input('words')
        Train[i].support1.set_input('words')
        Train[i].batch.set_target('onehot')
        Train[i].support0.set_target('onehot')
        Train[i].support1.set_target('onehot')
        Train[i].batch.apply(lambda x: len(x['words']), new_field_name='seq_len')
        Train[i].support0.apply(lambda x: len(x['words']), new_field_name='seq_len')
        Train[i].support1.apply(lambda x: len(x['words']), new_field_name='seq_len')

    Test.batch.set_input('words')
    Test.support0.set_input('words')
    Test.support1.set_input('words')
    Test.batch.set_target('onehot')
    Test.support0.set_target('onehot')
    Test.support1.set_target('onehot')
    Test.batch.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Test.support0.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Test.support1.apply(lambda x: len(x['words']), new_field_name='seq_len')

    Dev.batch.set_input('words')
    Dev.support0.set_input('words')
    Dev.support1.set_input('words')
    Dev.batch.set_target('onehot')
    Dev.support0.set_target('onehot')
    Dev.support1.set_target('onehot')
    Dev.batch.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Dev.support0.apply(lambda x: len(x['words']), new_field_name='seq_len')
    Dev.support1.apply(lambda x: len(x['words']), new_field_name='seq_len')

    Train_batch = []
    for i in range(len(Train)):
        if i in ind:
            sampler = BucketSampler(num_buckets=1, batch_size=batch, seq_len_field_name='seq_len')
            Train_batch.append(Triple(Batch(batch_size=batch, dataset=Train[i].batch, sampler=sampler), Train[i].support0, Train[i].support1))
        else:
            sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
            Train_batch.append(Triple(Batch(batch_size=batch, dataset=Train[i].batch, sampler=sampler), Train[i].support0, Train[i].support1))

    sampler = BucketSampler(batch_size=batch, seq_len_field_name='seq_len')
    Test_batch = Triple(Batch(batch_size=batch, dataset=Test.batch, sampler=sampler), Test.support0, Test.support1)
    Dev_batch = Triple(Batch(batch_size=batch, dataset=Dev.batch, sampler=sampler), Dev.support0, Dev.support1)
    return Train_batch, Dev_batch, Test_batch, len(vocab)
Example #22
def train(args):
    text_data = TextData()
    with open(os.path.join(args.vocab_dir, args.vocab_data), 'rb') as fin:
        text_data = pickle.load(fin)
    vocab_size = text_data.vocab_size
    class_num = text_data.class_num
    # class_num = 1
    seq_len = text_data.max_seq_len
    print("(vocab_size,class_num,seq_len):({0},{1},{2})".format(
        vocab_size, class_num, seq_len))

    train_data = text_data.train_set
    val_data = text_data.val_set
    test_data = text_data.test_set
    train_data.set_input('words', 'seq_len')
    train_data.set_target('target')
    val_data.set_input('words', 'seq_len')
    val_data.set_target('target')

    test_data.set_input('words', 'seq_len')
    test_data.set_target('target')

    init_embeds = None
    if args.pretrain_model == "None":
        print("No pretrained model with be used.")
        print("vocabsize:{0}".format(vocab_size))
        init_embeds = (vocab_size, args.embed_size)
    elif args.pretrain_model == "word2vec":
        embeds_path = os.path.join(args.prepare_dir, 'w2v_embeds.pkl')
        print("Loading Word2Vec pretrained embedding from {0}.".format(
            embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    elif args.pretrain_model == 'glove':
        embeds_path = os.path.join(args.prepare_dir, 'glove_embeds.pkl')
        print(
            "Loading Glove pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    elif args.pretrain_model == 'glove2wv':
        embeds_path = os.path.join(args.prepare_dir, 'glove2wv_embeds.pkl')
        print(
            "Loading Glove pretrained embedding from {0}.".format(embeds_path))
        with open(embeds_path, 'rb') as fin:
            init_embeds = pickle.load(fin)
    else:
        init_embeds = (vocab_size, args.embed_size)

    if args.model == "CNNText":
        print("Using CNN Model.")
        model = CNNText(init_embeds,
                        num_classes=class_num,
                        padding=2,
                        dropout=args.dropout)
    elif args.model == "StarTransformer":
        print("Using StarTransformer Model.")
        model = STSeqCls(init_embeds,
                         num_cls=class_num,
                         hidden_size=args.hidden_size)
    elif args.model == "MyCNNText":
        model = MyCNNText(init_embeds=init_embeds,
                          num_classes=class_num,
                          padding=2,
                          dropout=args.dropout)
        print("Using user defined CNNText")
    elif args.model == "LSTMText":
        print("Using LSTM Model.")
        model = LSTMText(init_embeds=init_embeds,
                         output_dim=class_num,
                         hidden_dim=args.hidden_size,
                         num_layers=args.num_layers,
                         dropout=args.dropout)
    elif args.model == "Bert":
        print("Using Bert Model.")
    else:
        print("Using default model: CNNText.")
        model = CNNText((vocab_size, args.embed_size),
                        num_classes=class_num,
                        padding=2,
                        dropout=0.1)
    print(model)
    if args.cuda:
        device = torch.device('cuda')
    else:
        device = None

    print("train_size:{0} ; val_size:{1} ; test_size:{2}".format(
        train_data.get_length(), val_data.get_length(),
        test_data.get_length()))

    if args.optim == "Adam":
        print("Using Adam as optimizer.")
        optimizer = fastnlp_optim.Adam(lr=0.001,
                                       weight_decay=args.weight_decay)
        if args.model_suffix == "default":
            args.model_suffix = args.optim  # assignment; the original `==` was a no-op comparison
    else:
        print("No Optimizer will be used.")
        optimizer = None

    criterion = CrossEntropyLoss()
    metric = AccuracyMetric()
    model_save_path = os.path.join(args.model_dir, args.model,
                                   args.model_suffix)
    earlystop = EarlyStopCallback(args.patience)
    fitlog_back = FitlogCallback({"val": val_data, "train": train_data})
    trainer = Trainer(train_data=train_data,
                      model=model,
                      save_path=model_save_path,
                      device=device,
                      n_epochs=args.epochs,
                      optimizer=optimizer,
                      dev_data=val_data,
                      loss=criterion,
                      batch_size=args.batch_size,
                      metrics=metric,
                      callbacks=[fitlog_back, earlystop])
    trainer.train()
    print("Train Done.")

    tester = Tester(data=val_data,
                    model=model,
                    metrics=metric,
                    batch_size=args.batch_size,
                    device=device)
    tester.test()
    print("Test Done.")

    print("Predict the answer with best model...")
    acc = 0.0
    output = []
    data_iterator = Batch(test_data, batch_size=args.batch_size)
    for data_x, batch_y in data_iterator:
        i_data = Variable(data_x['words']).cuda()
        pred = model(i_data)[C.OUTPUT]
        pred = pred.sigmoid()
        # print(pred.shape)
        output.append(pred.cpu().data)
    output = torch.cat(output, 0).numpy()
    print(output.shape)
    print("Predict Done. {} records".format(len(output)))
    result_save_path = os.path.join(args.result_dir,
                                    args.model + "_" + args.model_suffix)
    with open(result_save_path + ".pkl", 'wb') as f:
        pickle.dump(output, f)
    output = output.squeeze()[:, 1].tolist()
    projectid = text_data.test_projectid.values
    answers = []
    count = 0
    for i in range(len(output)):
        if output[i] > 0.5:
            count += 1
    print("true sample count:{}".format(count))
    add_count = 0
    for i in range(len(projectid) - len(output)):
        output.append([0.13])
        add_count += 1
    print("Add {} default result in predict.".format(add_count))

    df = pd.DataFrame()
    df['projectid'] = projectid
    df['y'] = output
    df.to_csv(result_save_path + ".csv", index=False)
    print("Predict Done, results saved to {}".format(result_save_path))

    fitlog.finish()
Example #23
def predict(args):
    text_data = TextData()
    with open(os.path.join(args.vocab_dir, args.vocab_data), 'rb') as fin:
        text_data = pickle.load(fin)
    vocab_size = text_data.vocab_size
    class_num = text_data.class_num
    # class_num = 1
    seq_len = text_data.max_seq_len
    print("(vocab_size,class_num,seq_len):({0},{1},{2})".format(
        vocab_size, class_num, seq_len))

    test_data = text_data.test_set
    test_data.set_input('words', 'seq_len')
    test_data.set_target('target')
    test_size = test_data.get_length()
    print("test_size:{}".format(test_size))
    print("Data type:{}".format(type(test_data)))

    init_embeds = None
    model_save_path = os.path.join(args.model_dir, args.model,
                                   args.model_suffix, args.reload_model_name)
    print("Loading the model {}".format(model_save_path))
    model = torch.load(model_save_path)
    model.eval()
    print(model)
    if args.cuda:
        device = torch.device('cuda')
    else:
        device = None
    model.to(device)
    acc = 0.0
    output = []
    data_iterator = Batch(test_data, batch_size=args.batch_size)
    for data_x, batch_y in data_iterator:
        i_data = Variable(data_x['words']).cuda()
        pred = model(i_data)[C.OUTPUT]
        pred = pred.sigmoid()
        # print(pred.shape)
        output.append(pred.cpu().data)
    output = torch.cat(output, 0).numpy()
    print(output.shape)
    print("Predict Done.{} records".format(len(output) * args.batch_size))
    result_save_path = os.path.join(
        args.result_dir,
        args.model + args.model_suffix + args.reload_model_name)
    with open(result_save_path + ".pkl", 'wb') as f:
        pickle.dump(output, f)
    output = output.squeeze()[:, 1].tolist()
    projectid = text_data.test_projectid.values
    answers = []
    count = 0
    for i in range(len(output)):
        if output[i] > 0.5:
            count += 1
    print("pc1 < 0.5 count:{}".format(count))
    for i in range(len(projectid) - len(output)):
        output.append([0.87])

    df = pd.DataFrame()
    df['projectid'] = projectid
    df['y'] = output
    df.to_csv(result_save_path + ".csv", index=False)
    print("Predict Done, results saved to {}".format(result_save_path))
    # with open(result_save_path,'w') as f:

    #     for i in output:
    #         f.write()
    fitlog.finish()
Example #24
def ensemble(config, models, sum_prob=False, weight=[1, 1, 1, 1, 1]):
    f1 = F1_score(pred='output', target='target')
    # Tensor.cuda() is not in-place, so keep the returned tensors.
    f1.tp = f1.tp.cuda()
    f1.fp = f1.fp.cuda()
    f1.fn = f1.fn.cuda()

    dev_data = pickle.load(
        open(os.path.join(config.data_path, config.dev_name), "rb"))
    bert_dev_data = pickle.load(
        open(os.path.join(config.bert_data_path, config.dev_name), "rb"))

    data_iterator = Batch(dev_data,
                          config.ensemble_batch,
                          sampler=SequentialSampler(),
                          as_numpy=False)
    bert_data_iterator = Batch(bert_dev_data,
                               config.ensemble_batch,
                               sampler=SequentialSampler(),
                               as_numpy=False)

    for model in models:
        model.cuda()

    eval_results = {}
    weight = torch.tensor(weight)
    weight = weight.cuda()  # keep the returned CUDA tensor; cuda() is not in-place
    weight_sum = torch.sum(weight).float()
    with torch.no_grad():
        for i, ((batch_x, batch_y), (bert_batch_x, bert_batch_y)) in enumerate(
                zip(data_iterator, bert_data_iterator)):
            print('batch', i)
            #if i > 10:
            #    break
            # batch
            text = batch_x['text'].cuda()
            target = batch_y['target'].cuda()
            # bert batch
            input_ids = bert_batch_x['input_ids'].cuda()
            token_type_ids = bert_batch_x['token_type_ids'].cuda()
            attention_mask = bert_batch_x['attention_mask'].cuda()
            label_id = bert_batch_y['label_id'].cuda()

            #assert torch.equal(target, label_id)

            pred = models[-1](input_ids, token_type_ids, attention_mask)
            pred['output'] *= weight[-1]
            #if not sum_prob:
            #    pred['output'][pred['output'] >= 0.5] = 1.0 * weight[-1]
            #    pred['output'][pred['output'] < 0.5] = 0.0
            #    for i, model in enumerate(models[:-1]):
            #        temp = model(text)['output']
            #        temp[temp >= 0.5] = 1.0 * weight[i]
            #        temp[temp < 0.5] = 0.0
            #        pred['output'] += temp
            #else:
            for i, model in enumerate(models[:-1]):
                pred['output'] += model(text)['output'] * weight[i]
            pred['output'] /= weight_sum

            #bert_batch_y['label_id'].cuda()
            f1({'output': pred['output'].cuda()},
               {'label_id': bert_batch_y['label_id'].cuda()})
        eval_result = f1.get_metric()
        metric_name = f1.__class__.__name__
        eval_results[metric_name] = eval_result

    print("[ensemble] \n{}".format(_format_eval_results(eval_results)))