Example #1
0
def load_data(data_dir, task, cv, training_data_size=None):
    """Load a dataset, derive CV splits, and package label tensors.

    Returns a ``(dataset, data)`` pair where ``dataset`` maps split names
    ("train"/"valid"/"test") to ``(inputs, LongTensor labels)`` plus an
    ``"nclasses"`` entry, and ``data`` is the full token corpus (used
    downstream, e.g. for building an embedding vocabulary).
    """
    loader = load_dataset[task]

    # Read the raw corpus; TREC and SST ship with predefined splits.
    if task == "trec":
        train_x, train_y, test_x, test_y = loader(data_dir)
        data, label = train_x + test_x, None
    elif task == "sst":
        train_x, train_y, valid_x, valid_y, test_x, test_y = loader(data_dir)
        data, label = train_x + valid_x + test_x, None
    else:
        data, label = loader(data_dir)

    # TREC: carve a validation fold out of its training set.
    # SST: already fully split. Anything else: 10-fold CV split.
    if task == "trec":
        train_x, train_y, valid_x, valid_y = cv_split2(
            train_x, train_y, nfold=10, valid_id=cv)
    elif task != "sst":
        train_x, train_y, valid_x, valid_y, test_x, test_y = cv_split(
            data, label, nfold=10, test_id=cv)

    # Class count comes from the full (pre-truncation) training labels.
    nclasses = max(train_y) + 1

    # Optionally cap the training-set size; inputs and labels stay aligned.
    if training_data_size:
        train_x = train_x[:training_data_size]
        train_y = train_y[:training_data_size]

    train_y = torch.LongTensor(train_y)
    valid_y = torch.LongTensor(valid_y)
    test_y = torch.LongTensor(test_y)

    dataset = {
        "train": (train_x, train_y),
        "valid": (valid_x, valid_y),
        "test": (test_x, test_y),
        "nclasses": nclasses,
    }

    return dataset, data
Example #2
0
def main(args):
    """Train a classifier on the dataset named by ``args.dataset``.

    Loads the corpus, builds an embedding layer over it, derives
    train/valid/test splits, batches each split, runs the training loop
    for ``args.max_epoch`` epochs, and prints the best validation error
    together with the matching test error.
    """
    simple_readers = {
        'mr': dataloader.read_MR,
        'subj': dataloader.read_SUBJ,
        'cr': dataloader.read_CR,
        'mpqa': dataloader.read_MPQA,
    }

    # Read the raw corpus; TREC and SST come with predefined splits.
    if args.dataset in simple_readers:
        data, label = simple_readers[args.dataset](args.path)
    elif args.dataset == 'trec':
        train_x, train_y, test_x, test_y = dataloader.read_TREC(args.path)
        data, label = train_x + test_x, None
    elif args.dataset == 'sst':
        train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.read_SST(args.path)
        data, label = train_x + valid_x + test_x, None
    else:
        raise Exception("unknown dataset: {}".format(args.dataset))

    # Embedding vocabulary is built over the full corpus.
    emb_layer = modules.EmbeddingLayer(
        args.d, data,
        embs=dataloader.load_embedding(args.embedding)
    )

    # TREC: carve a validation fold from its training set; SST is already
    # fully split; other datasets get a 10-fold CV split of the corpus.
    if args.dataset == 'trec':
        train_x, train_y, valid_x, valid_y = dataloader.cv_split2(
            train_x, train_y, nfold=10, valid_id=args.cv)
    elif args.dataset != 'sst':
        train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.cv_split(
            data, label, nfold=10, test_id=args.cv)

    nclasses = max(train_y) + 1

    # Batch every split; SST batches are length-sorted.
    sort_batches = args.dataset == 'sst'
    train_x, train_y = dataloader.create_batches(
        train_x, train_y, args.batch_size, emb_layer.word2id, sort=sort_batches)
    valid_x, valid_y = dataloader.create_batches(
        valid_x, valid_y, args.batch_size, emb_layer.word2id, sort=sort_batches)
    test_x, test_y = dataloader.create_batches(
        test_x, test_y, args.batch_size, emb_layer.word2id, sort=sort_batches)

    model = Model(args, emb_layer, nclasses).cuda()
    # Only optimize parameters that require gradients (frozen embeddings
    # are excluded).
    trainable = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adam(trainable, lr=args.lr)

    best_valid = 1e+8
    test_err = 1e+8
    for epoch in range(args.max_epoch):
        best_valid, test_err = train_model(
            epoch, model, optimizer,
            train_x, train_y,
            valid_x, valid_y,
            test_x, test_y,
            best_valid, test_err)
        # Geometric learning-rate decay after each epoch, if enabled.
        if args.lr_decay > 0:
            optimizer.param_groups[0]['lr'] *= args.lr_decay

    sys.stdout.write("best_valid: {:.6f}\n".format(best_valid))
    sys.stdout.write("test_err: {:.6f}\n".format(test_err))
Example #3
0
def main(args):
    """Benchmark SRU/CNN/LSTM architectures across all six datasets.

    For each dataset, trains each of three architectures ``numberOfTest``
    times for 100 epochs, and writes the per-epoch results of every run
    to its own CSV file (``results_<dataset>_<model>_<run>.csv``).
    """
    datasetList = ['mr', 'subj', 'cr', 'mpqa', 'trec', 'sst']
    numberOfTest = 5
    args.max_epoch = 100
    for dset in datasetList:
        # Load the raw corpus; TREC and SST ship with predefined splits.
        if dset == 'mr':
            data, label = dataloader.read_MR(args.path)
        elif dset == 'subj':
            data, label = dataloader.read_SUBJ(args.path)
        elif dset == 'cr':
            data, label = dataloader.read_CR(args.path)
        elif dset == 'mpqa':
            data, label = dataloader.read_MPQA(args.path)
        elif dset == 'trec':
            train_x, train_y, test_x, test_y = dataloader.read_TREC(args.path)
            data = train_x + test_x
            label = None
        elif dset == 'sst':
            train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.read_SST(args.path)
            data = train_x + valid_x + test_x
            label = None
        else:
            raise Exception("unknown dataset: {}".format(dset))

        # Embedding vocabulary is built over the full corpus.
        emb_layer = modules.EmbeddingLayer(
            args.d, data,
            embs = dataloader.load_embedding(args.embedding)
        )

        # TREC: carve a validation fold from its training set; SST is
        # already fully split; other datasets get a 10-fold CV split.
        if dset == 'trec':
            train_x, train_y, valid_x, valid_y = dataloader.cv_split2(
                train_x, train_y,
                nfold = 10,
                valid_id = args.cv
            )
        elif dset != 'sst':
            train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.cv_split(
                data, label,
                nfold = 10,
                test_id = args.cv
            )
        nclasses = max(train_y) + 1

        # Batch every split; SST batches are length-sorted.
        sort = dset == 'sst'
        train_x, train_y = dataloader.create_batches(train_x, train_y, args.batch_size, emb_layer.word2id, sort = sort)
        valid_x, valid_y = dataloader.create_batches(valid_x, valid_y, args.batch_size, emb_layer.word2id, sort = sort)
        test_x, test_y = dataloader.create_batches(test_x, test_y, args.batch_size, emb_layer.word2id, sort = sort)

        # Cycle through the three architectures: 0 -> SRU, 1 -> CNN, 2 -> LSTM.
        for models in range(3):
            if models == 1:
                args.cnn = True
                modelName = 'CNN'
            elif models == 2:
                args.cnn = False
                args.lstm = True
                modelName = 'LSTM'
            else:
                args.lstm = False
                modelName = 'SRU'

            sys.stdout.write("Training {} with {} architecture: \n".format(dset, modelName))
            args.dropout = 0.5

            for testNo in range(numberOfTest):
                model = Model(args, emb_layer, nclasses).cuda()
                need_grad = lambda x: x.requires_grad
                optimizer = optim.Adam(filter(need_grad, model.parameters()), lr = args.lr)

                best_valid = 1e+8
                test_err = 1e+8
                results = []
                # NOTE(review): unlike the single-run trainer, best_valid /
                # test_err are never fed back from train_model here, so every
                # epoch sees the initial 1e+8 sentinels — confirm intended.
                for epoch in range(args.max_epoch):
                    results.append(train_model(epoch, model, optimizer, train_x, train_y, valid_x, valid_y, test_x, test_y, best_valid, test_err))

                # BUG FIX: csv.writer needs a text-mode file in Python 3 —
                # opening in 'wb' raised TypeError on write. newline=''
                # prevents blank rows on Windows (per the csv module docs).
                with open('results_{d}_{m}_{i}.csv'.format(d=dset, m=modelName, i=(testNo+1)), 'w', newline='') as dump:
                    wr = csv.writer(dump, delimiter=',')
                    wr.writerow(['Epoch','Training Loss', 'Validation Error', 'Test Error', 'Duration'])
                    wr.writerows(results)