Exemplo n.º 1
0
def predict(args, data_name):
    """Run multi-label inference on a pickled test set and write a task-2 submission.

    Args:
        args: parsed CLI namespace; uses .cuda, .model, .dir_name,
            .checkpoint and .batch_size.
        data_name: suffix selecting which testData_<name>.pkl to load.
    """
    from tqdm import tqdm
    from torch.utils.data import DataLoader
    from network import BertForMultiLabelSequenceClassification
    from utils import SubmitGenerator

    with open('../dataset/testData_%s.pkl' % data_name, 'rb') as f:
        testData = pickle.load(f)

    device = torch.device('cuda:%d' %
                          args.cuda if torch.cuda.is_available() else 'cpu')
    model = BertForMultiLabelSequenceClassification.from_pretrained(
        args.model, num_labels=3)
    # map_location keeps a GPU-saved checkpoint loadable on CPU-only hosts
    model.load_state_dict(
        torch.load('../model/%s/model.pkl.%d' %
                   (args.dir_name, args.checkpoint),
                   map_location=device))
    model.eval()  # disable dropout / batch-norm updates for inference
    model.to(device)

    dataloader = DataLoader(dataset=testData,
                            batch_size=args.batch_size,
                            shuffle=False,  # keep row order for the submission
                            collate_fn=testData.collate_fn,
                            num_workers=1)
    trange = tqdm(enumerate(dataloader), total=len(dataloader), desc='Predict')
    prediction = []
    for i, (tokens, segments, masks, node_vec, tfidf, labels) in trange:
        with torch.no_grad():
            o_labels = model(tokens.to(device), node_vec.to(device),
                             tfidf.to(device), segments.to(device),
                             masks.to(device))
            # logit > 0  <=>  sigmoid(logit) > 0.5 for each label
            o_labels = o_labels > 0.0
            prediction.append(o_labels.to('cpu'))

    prediction = torch.cat(prediction).detach().numpy().astype(int)

    if not os.path.exists('../score/task 2/'):
        os.makedirs('../score/task 2/')
    SubmitGenerator(prediction, '../data/task2_sample_submission.csv', True,
                    '../score/task 2/task2_submission.csv')
Exemplo n.º 2
0
    return rankn


def with_feedback(query):
    """Rank documents for *query* with one round of Rocchio relevance feedback.

    The first ranking pass selects the top-100 documents, which seed
    rocchio_feedback; the expanded query is then ranked again.
    Returns the file names of the final top-100 documents (best first).
    """
    def _ranked_idx(q):
        # indices of all docs sorted ascending by relation score;
        # slicing [-1:-101:-1] later yields the best 100 in descending order
        scores = [calc_relation(doc['bigram'], q) for doc in doc_datas]
        return sorted(range(len(scores)), key=lambda k: scores[k])

    relations_idx = _ranked_idx(query)
    query = rocchio_feedback(
        query, get_rankn_bigram(relations_idx[-1:-101:-1], doc_datas), 10)
    relations_idx = _ranked_idx(query)
    return idx2file_name(relations_idx[-1:-101:-1])


# Fan the ranking out over 4 worker processes when enough cores exist;
# otherwise run sequentially. (The original sequential branch referenced
# an undefined `pool`, raising NameError on machines with < 4 cores.)
if mp.cpu_count() >= 4:
    with mp.Pool(4) as pool:
        if args.feedback:
            results = pool.map(with_feedback, query_data)
        else:
            results = pool.map(test, query_data)
else:
    if args.feedback:
        results = [with_feedback(q) for q in query_data]
    else:
        results = [test(q) for q in query_data]
SubmitGenerator(results, args.ranked_list)
Exemplo n.º 3
0
def main():
    """Train, predict, and/or plot history for the feature-imputation task.

    Behaviour is selected by the --do_train / --do_predict / --do_plot flags.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('--arch',
                        default="model",
                        help='architecture (model_dir)')
    parser.add_argument('--do_train', action='store_true')
    parser.add_argument('--do_predict', action='store_true')
    parser.add_argument('--do_plot', action='store_true')
    parser.add_argument('--hidden_size', default=256, type=int)
    parser.add_argument('--batch_size', default=256, type=int)
    parser.add_argument('--max_epoch', default=10000, type=int)
    parser.add_argument('--lr', default=1e-3, type=float)
    parser.add_argument('--step_lr', default=0.5, type=float)
    parser.add_argument('--cuda', default=0, type=int)
    parser.add_argument('--ckpt',
                        type=int,
                        help='load pre-trained model epoch')
    args = parser.parse_args()

    if args.do_train:

        dataset = pd.read_csv("../../data/train.csv")
        dataset.drop("Id", axis=1, inplace=True)
        # fixed random_state so the train/valid split is reproducible
        train_set, valid_set = train_test_split(dataset,
                                                test_size=0.1,
                                                random_state=73)
        feature_for_training = ["F2", "F3", "F4", "F5", "F6", "F7", "F8", "F9"]
        feature_for_prediction = ["F1"]

        train = preprocess_samples(train_set, feature_for_training,
                                   feature_for_prediction)
        valid = preprocess_samples(valid_set, feature_for_training,
                                   feature_for_prediction)

        trainData = FeatureDataset(train)
        validData = FeatureDataset(valid)

        device = torch.device(
            'cuda:%d' % args.cuda if torch.cuda.is_available() else 'cpu')
        max_epoch = args.max_epoch
        trainer = Trainer(device, trainData, validData, args)

        for epoch in range(1, max_epoch + 1):
            print('Epoch: {}'.format(epoch))
            trainer.run_epoch(epoch, True)   # training pass
            trainer.run_epoch(epoch, False)  # validation pass

    if args.do_predict:

        dataset = pd.read_csv("../../data/test.csv")
        dataset.drop("Id", axis=1, inplace=True)
        feature_for_testing = ["F2", "F3", "F4", "F5", "F6", "F7", "F8", "F9"]
        test = preprocess_samples(dataset, feature_for_testing)

        testData = FeatureDataset(test)

        # honour --cuda here too, as the train branch already does
        device = torch.device(
            'cuda:%d' % args.cuda if torch.cuda.is_available() else 'cpu')
        model = SimpleNet(input_size=9,
                          output_size=12,
                          hidden_size=args.hidden_size)
        # map_location keeps a GPU-saved checkpoint loadable on CPU-only hosts
        model.load_state_dict(
            torch.load('%s/model.pkl.%d' % (args.arch, args.ckpt),
                       map_location=device))
        model.eval()
        model.to(device)
        dataloader = DataLoader(dataset=testData,
                                batch_size=args.batch_size,
                                shuffle=False,  # keep row order for submission
                                collate_fn=testData.collate_fn,
                                num_workers=4)
        trange = tqdm(enumerate(dataloader),
                      total=len(dataloader),
                      desc='Predict')
        prediction = []
        for i, (ft, _, y) in trange:
            b = ft.shape[0]
            # zero column stands in for the feature the model must impute
            missing_ft = torch.zeros(b, 1)
            all_ft = torch.cat([missing_ft, ft], dim=1)
            with torch.no_grad():  # inference only: no autograd bookkeeping
                o_labels, _ = model(all_ft.to(device))
            o_labels = torch.argmax(o_labels, dim=1)
            prediction.append(o_labels.to('cpu').numpy().tolist())

        prediction = sum(prediction, [])  # flatten the per-batch lists
        SubmitGenerator(prediction, "../../data/sampleSubmission.csv")

    if args.do_plot:
        plot_history("{file}/history.json".format(file=args.arch))
Exemplo n.º 4
0
def main():
    """Train a simpleNet on the feature data and/or predict the test set.

    Behaviour is selected by the --do_train / --do_predict flags.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('--arch',
                        required=True,
                        help='architecture (model_dir)')
    parser.add_argument('--data_dir', default='../../data/', type=str)
    parser.add_argument('--do_train', action='store_true')
    parser.add_argument('--do_predict', action='store_true')
    parser.add_argument('--hidden_size', default=512, type=int)
    parser.add_argument('--batch_size', default=64, type=int)
    parser.add_argument('--max_epoch', default=300, type=int)
    parser.add_argument('--lr', default=1e-3, type=float)
    parser.add_argument('--cuda', default=1, type=int)
    parser.add_argument('--ckpt',
                        default=-1,
                        type=int,
                        help='load pre-trained model epoch')
    args = parser.parse_args()

    if args.do_train:

        dataset = pd.read_csv(args.data_dir + "train.csv")
        dataset.drop("Id", axis=1, inplace=True)

        # fixed random_state so the train/valid split is reproducible
        train_set, valid_set = train_test_split(dataset,
                                                test_size=0.1,
                                                random_state=58)
        train = preprocess_samples(train_set, missing=["F2", "F7", "F12"])
        valid = preprocess_samples(valid_set, missing=["F2", "F7", "F12"])
        trainData = FeatureDataset(train)
        validData = FeatureDataset(valid)

        device = torch.device(
            'cuda:%d' % args.cuda if torch.cuda.is_available() else 'cpu')
        model = simpleNet(args.hidden_size)
        model.to(device)
        trainer = Trainer(device, trainData, validData, model, args.lr,
                          args.batch_size, args.arch)

        for epoch in range(1, args.max_epoch + 1):
            print('Epoch: {}'.format(epoch))
            trainer.run_epoch(epoch, True)   # training pass
            trainer.run_epoch(epoch, False)  # validation pass
            trainer.save(epoch)

    if args.do_predict:

        dataset = pd.read_csv(args.data_dir + "test.csv")
        dataset.drop("Id", axis=1, inplace=True)
        test = preprocess_samples(dataset, missing=["F2", "F7", "F12"])
        testData = FeatureDataset(test)

        device = torch.device(
            'cuda:%d' % args.cuda if torch.cuda.is_available() else 'cpu')
        model = simpleNet(args.hidden_size)
        # map_location keeps a GPU-saved checkpoint loadable on CPU-only hosts
        model.load_state_dict(
            torch.load('%s/model.pkl.%d' % (args.arch, args.ckpt),
                       map_location=device))
        model.eval()
        model.to(device)
        dataloader = DataLoader(dataset=testData,
                                batch_size=args.batch_size,
                                shuffle=False,  # keep row order for submission
                                collate_fn=testData.collate_fn,
                                num_workers=4)
        trange = tqdm(enumerate(dataloader),
                      total=len(dataloader),
                      desc='Predict')
        prediction = []
        for i, (x, missing, y) in trange:
            # model.predict (not forward) imputes missing features internally
            with torch.no_grad():  # inference only: no autograd bookkeeping
                o_labels = model.predict(x.to(device))
            o_labels = torch.argmax(o_labels, dim=1)
            prediction.append(o_labels.to('cpu'))

        prediction = torch.cat(prediction).detach().numpy().astype(int)
        SubmitGenerator(prediction, args.data_dir + 'sampleSubmission.csv')
Exemplo n.º 5
0
def main():
    """Train, or predict using a generator that imputes missing features.

    Behaviour is selected by the --do_train / --do_predict flags. In predict
    mode the checkpoint holds both the classifier and the generator.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('--arch',
                        required=True,
                        help='architecture (model_dir)')
    parser.add_argument('--data_dir', default='../../data/', type=str)
    parser.add_argument('--do_train', action='store_true')
    parser.add_argument('--do_predict', action='store_true')
    parser.add_argument('--hidden_size', default=512, type=int)
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--max_epoch', default=800, type=int)
    parser.add_argument('--lr', default=1e-3, type=float)
    parser.add_argument('--cuda', default=1, type=int)
    parser.add_argument('--ckpt',
                        default=-1,
                        type=int,
                        help='load pre-trained model epoch')
    args = parser.parse_args()

    if args.do_train:

        dataset = pd.read_csv(args.data_dir + "train.csv")
        dataset.drop("Id", axis=1, inplace=True)

        # fixed random_state so the train/valid split is reproducible
        train_set, valid_set = train_test_split(dataset,
                                                test_size=0.2,
                                                random_state=42)
        train = preprocess_samples(train_set, missing=["F2", "F7", "F12"])
        valid = preprocess_samples(valid_set, missing=["F2", "F7", "F12"])
        trainData = FeatureDataset(train)
        validData = FeatureDataset(valid)

        device = torch.device(
            'cuda:%d' % args.cuda if torch.cuda.is_available() else 'cpu')
        trainer = Trainer(device, trainData, validData, args.hidden_size,
                          args.lr, args.batch_size, args.arch)

        for epoch in range(1, args.max_epoch + 1):
            print('Epoch: {}'.format(epoch))
            trainer.run_epoch(epoch, True)   # training pass
            trainer.run_epoch(epoch, False)  # validation pass
            if epoch % 50 == 0:  # checkpoint every 50 epochs
                trainer.save(epoch)

    if args.do_predict:

        dataset = pd.read_csv(args.data_dir + "test.csv")
        dataset.drop("Id", axis=1, inplace=True)
        test = preprocess_samples(dataset, missing=["F2", "F7", "F12"])
        testData = FeatureDataset(test)

        # device must exist before torch.load so map_location can remap a
        # GPU-saved checkpoint onto CPU-only hosts
        device = torch.device(
            'cuda:%d' % args.cuda if torch.cuda.is_available() else 'cpu')
        path = '%s/model.pkl.%d' % (args.arch, args.ckpt)
        checkpoint = torch.load(path, map_location=device)

        model = SimpleNet(args.hidden_size)
        model.load_state_dict(checkpoint['model'])
        model.to(device)
        model.eval()
        generator = Generator(args.hidden_size)
        generator.load_state_dict(checkpoint['generator'])
        generator.to(device)
        generator.eval()

        dataloader = DataLoader(dataset=testData,
                                batch_size=args.batch_size,
                                shuffle=False,  # keep row order for submission
                                collate_fn=testData.collate_fn,
                                num_workers=4)
        trange = tqdm(enumerate(dataloader),
                      total=len(dataloader),
                      desc='Predict')
        prediction = []
        for i, (features, missing, y) in trange:

            with torch.no_grad():  # inference only: no autograd bookkeeping
                # generator fills in the dropped columns, then the classifier
                # scores the completed feature vector
                gen_missing = generator(features.to(device))
                all_features = torch.cat(
                    (features.to(device), gen_missing.to(device)), dim=1)
                o_labels = model(all_features)
            # torch.sigmoid replaces the deprecated F.sigmoid
            o_labels = torch.sigmoid(o_labels) > 0.5
            prediction.append(o_labels.to('cpu'))

        prediction = torch.cat(prediction).detach().numpy().astype(int)
        SubmitGenerator(prediction, args.data_dir + 'sampleSubmission.csv')
Exemplo n.º 6
0
def main():
    """Train or predict with a simpleNet after dropping columns F2/F7/F12.

    Behaviour is selected by the --do_train / --do_predict flags.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('--arch', required=True, help='architecture (model_dir)')
    parser.add_argument('--data_dir', default='../../data/', type=str)
    parser.add_argument('--do_train', action='store_true')
    parser.add_argument('--do_predict', action='store_true')
    parser.add_argument('--hidden_size', default=512, type=int)
    parser.add_argument('--batch_size', default=128, type=int)
    parser.add_argument('--max_epoch', default=300, type=int)
    parser.add_argument('--lr', default=1e-3, type=float)
    parser.add_argument('--cuda', default=0, type=int)
    parser.add_argument('--ckpt', default=-1, type=int, help='load pre-trained model epoch')
    args = parser.parse_args()

    if args.do_train:

        dataset = pd.read_csv(args.data_dir + "train.csv")
        dataset.drop("Id", axis=1, inplace=True)
        # the three "missing" feature columns are removed entirely
        dataset.drop("F2", axis=1, inplace=True)
        dataset.drop("F7", axis=1, inplace=True)
        dataset.drop("F12", axis=1, inplace=True)
        # fixed random_state so the train/valid split is reproducible
        train_set, valid_set = train_test_split(dataset, test_size=0.1, random_state=58)
        train = preprocess_samples(train_set, missing=["F2", "F7", "F12"])
        valid = preprocess_samples(valid_set, missing=["F2", "F7", "F12"])
        trainData = FeatureDataset(train)
        validData = FeatureDataset(valid)

        device = torch.device('cuda:%d' % args.cuda if torch.cuda.is_available() else 'cpu')
        model = simpleNet(args.hidden_size)
        model.to(device)
        opt = torch.optim.Adam(model.parameters(), lr=args.lr)
        # multi-label targets -> BCE-with-logits rather than CrossEntropy
        criteria = torch.nn.BCEWithLogitsLoss()
        max_epoch = args.max_epoch
        batch_size = args.batch_size
        trainer = Trainer(device, trainData, validData, model, criteria, opt, batch_size, args.arch)

        for epoch in range(1, max_epoch + 1):
            print('Epoch: {}'.format(epoch))
            trainer.run_epoch(epoch, True)   # training pass
            trainer.run_epoch(epoch, False)  # validation pass
            trainer.save(epoch)

    if args.do_predict:

        dataset = pd.read_csv(args.data_dir + "test.csv")
        dataset.drop("Id", axis=1, inplace=True)
        dataset.drop("F2", axis=1, inplace=True)
        dataset.drop("F7", axis=1, inplace=True)
        dataset.drop("F12", axis=1, inplace=True)
        test = preprocess_samples(dataset, missing=["F2", "F7", "F12"])
        testData = FeatureDataset(test)

        # honour --cuda here too, as the train branch already does
        device = torch.device('cuda:%d' % args.cuda if torch.cuda.is_available() else 'cpu')
        model = simpleNet(args.hidden_size)
        # map_location keeps a GPU-saved checkpoint loadable on CPU-only hosts
        model.load_state_dict(torch.load('%s/model.pkl.%d' % (args.arch, args.ckpt),
                                         map_location=device))
        model.eval()
        model.to(device)
        dataloader = DataLoader(dataset=testData,
                                batch_size=args.batch_size,
                                shuffle=False,  # keep row order for submission
                                collate_fn=testData.collate_fn,
                                num_workers=4)
        trange = tqdm(enumerate(dataloader), total=len(dataloader), desc='Predict')
        prediction = []
        for i, (x, y) in trange:
            with torch.no_grad():  # inference only: no autograd bookkeeping
                o_labels = model(x.to(device))
            # torch.sigmoid replaces the deprecated F.sigmoid
            o_labels = torch.sigmoid(o_labels) > 0.5
            prediction.append(o_labels.to('cpu'))

        prediction = torch.cat(prediction).detach().numpy().astype(int)
        SubmitGenerator(prediction, args.data_dir + 'sampleSubmission.csv')
Exemplo n.º 7
0
def main():
    """Train or predict a simpleNet classifier; optionally plot history.

    Behaviour is selected by the --do_train / --do_predict / --do_plot flags.
    """

    parser = ArgumentParser()
    parser.add_argument('--arch', required=True)
    parser.add_argument('--do_train', action='store_true')
    parser.add_argument('--do_predict', action='store_true')
    parser.add_argument('--data_dir', default='../../data/', type=str)
    # type=int so an explicit "--cuda 1" is not left as a string
    # (the 'cuda:%d' format below requires an int)
    parser.add_argument('--cuda', default=0, type=int)
    parser.add_argument('--hidden_size', default=256, type=int)
    parser.add_argument('--batch_size', default=256, type=int)
    parser.add_argument('--max_epoch', default=1500, type=int)
    parser.add_argument('--lr', default=1e-3, type=float)
    parser.add_argument('--wd', default=1e-2, type=float)
    parser.add_argument('--do_plot', action='store_true')
    args = parser.parse_args()

    missing_list = ["F1"]

    if args.do_train:

        data = pd.read_csv(args.data_dir + 'train.csv')
        # drop the id column and every feature the model must impute
        data.drop("Id", axis=1, inplace=True)
        for drop in missing_list:
            data.drop(drop, axis=1, inplace=True)

        # fixed random_state for a reproducible train/valid split
        train_set, valid_set = train_test_split(data,
                                                test_size=0.1,
                                                random_state=73)
        train = preprocess_samples(train_set, missing_list)
        valid = preprocess_samples(valid_set, missing_list)
        trainData = FeatureDataset(train)
        validData = FeatureDataset(valid)

        device = torch.device(
            'cuda:%d' % args.cuda if torch.cuda.is_available() else 'cpu')
        model = simpleNet(args.hidden_size)
        model.to(device)
        batch_size = args.batch_size
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     weight_decay=args.wd)
        # CrossEntropyLoss for multi-class classification
        loss_function = torch.nn.CrossEntropyLoss()
        max_epoch = args.max_epoch

        trainer = Trainer(device, trainData, validData, model, loss_function,
                          optimizer, batch_size, args.arch)

        for epoch in range(max_epoch):
            print('Epoch: {}'.format(epoch))
            trainer.run_epoch(epoch, True)   # training pass
            trainer.run_epoch(epoch, False)  # validation pass

    if args.do_predict:

        # fix: the original called pd.read_csv(args.data, 'test.csv'), which
        # used a nonexistent attribute and passed 'test.csv' as the separator
        data = pd.read_csv(args.data_dir + 'test.csv')
        data.drop("Id", axis=1, inplace=True)
        for drop in missing_list:
            data.drop(drop, axis=1, inplace=True)

        test = preprocess_samples(data, missing_list)
        testData = FeatureDataset(test)

        device = torch.device(
            'cuda:%d' % args.cuda if torch.cuda.is_available() else 'cpu')
        # NOTE(review): the extra missing_list argument mirrors the original
        # call here, but the train branch builds simpleNet(args.hidden_size)
        # only — confirm which constructor signature is current.
        model = simpleNet(args.hidden_size, missing_list)
        # map_location keeps a GPU-saved checkpoint loadable on CPU-only hosts
        model.load_state_dict(torch.load(f'{args.arch}/model.pkl',
                                         map_location=device))
        model.eval()
        model.to(device)

        # DataLoader for the test data; no shuffling so submission rows keep
        # their original order
        dataloader = DataLoader(
            testData,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=8,
            collate_fn=testData.collate_fn,
        )

        trange = tqdm(enumerate(dataloader),
                      total=len(dataloader),
                      desc='Predict')
        prediction = []
        for i, (x, y) in trange:
            with torch.no_grad():  # inference only: no autograd bookkeeping
                o_labels = model(x.to(device))
            o_labels = torch.argmax(o_labels, dim=1)
            prediction.append(o_labels.to('cpu'))

        prediction = torch.cat(prediction).detach().numpy().astype(int)
        SubmitGenerator(prediction, args.data_dir + 'sampleSubmission.csv')

    # plotting has its own flag, so it must not be nested under do_predict
    # (the original only plotted when predicting, making --do_plot alone a no-op)
    if args.do_plot:
        plot_history(args.arch, args.max_epoch, plot_acc=True)
Exemplo n.º 8
0
def main():
    """Train or predict a simpleNet multi-class classifier; optionally plot.

    Behaviour is selected by the --do_train / --do_predict / --do_plot flags.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument('--arch',
                        required=True,
                        help='architecture (model_dir)')
    parser.add_argument('--data_dir', default='../../data/', type=str)
    parser.add_argument('--do_train', action='store_true')
    parser.add_argument('--do_predict', action='store_true')
    parser.add_argument('--do_plot', action='store_true')
    parser.add_argument('--hidden_size', default=256, type=int)
    parser.add_argument('--batch_size', default=256, type=int)
    parser.add_argument('--max_epoch', default=1500, type=int)
    parser.add_argument('--lr', default=1e-3, type=float)
    parser.add_argument('--wd', default=1e-2, type=float)
    parser.add_argument('--cuda', default=1, type=int)
    args = parser.parse_args()

    missing_list = ["F1"]

    if args.do_train:

        dataset = pd.read_csv(args.data_dir + "train.csv")
        # drop the id column and every feature the model must impute
        dataset.drop("Id", axis=1, inplace=True)
        for drop in missing_list:
            dataset.drop(drop, axis=1, inplace=True)

        # fixed random_state for a reproducible train/valid split
        train_set, valid_set = train_test_split(dataset,
                                                test_size=0.1,
                                                random_state=73)
        train = preprocess_samples(train_set, missing=missing_list)
        valid = preprocess_samples(valid_set, missing=missing_list)
        trainData = FeatureDataset(train)
        validData = FeatureDataset(valid)

        device = torch.device(
            'cuda:%d' % args.cuda if torch.cuda.is_available() else 'cpu')
        model = simpleNet(args.hidden_size, missing_list)
        model.to(device)
        batch_size = args.batch_size
        opt = torch.optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd)

        # multi-class targets -> CrossEntropyLoss
        criteria = torch.nn.CrossEntropyLoss()

        trainer = Trainer(device, trainData, validData, model, criteria, opt,
                          batch_size, args.arch)

        max_epoch = args.max_epoch

        for epoch in range(max_epoch):
            print('Epoch: {}'.format(epoch))
            trainer.run_epoch(epoch, True)   # training pass
            trainer.run_epoch(epoch, False)  # validation pass

    if args.do_predict:

        dataset = pd.read_csv(args.data_dir + "test.csv")
        dataset.drop("Id", axis=1, inplace=True)
        for drop in missing_list:
            dataset.drop(drop, axis=1, inplace=True)
        test = preprocess_samples(dataset, missing=missing_list)
        testData = FeatureDataset(test)

        # honour --cuda here too, as the train branch already does
        device = torch.device(
            'cuda:%d' % args.cuda if torch.cuda.is_available() else 'cpu')
        model = simpleNet(args.hidden_size, missing_list)

        # map_location keeps a GPU-saved checkpoint loadable on CPU-only
        # hosts; a single eval() replaces the original's redundant
        # eval() / pass / train(False) sequence
        model.load_state_dict(torch.load(f'{args.arch}/model.pkl',
                                         map_location=device))
        model.eval()
        model.to(device)

        # DataLoader for the test data; no shuffling so submission rows keep
        # their original order
        dataloader = DataLoader(
            testData,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=8,
            collate_fn=testData.collate_fn,
        )

        trange = tqdm(enumerate(dataloader),
                      total=len(dataloader),
                      desc='Predict')
        prediction = []
        for i, (x, y) in trange:
            with torch.no_grad():  # inference only: no autograd bookkeeping
                o_labels = model(x.to(device))
            o_labels = torch.argmax(o_labels, dim=1)
            prediction.append(o_labels.to('cpu'))

        prediction = torch.cat(prediction).detach().numpy().astype(int)
        SubmitGenerator(prediction, args.data_dir + 'sampleSubmission.csv')

    if args.do_plot:
        plot_history(args.arch, args.max_epoch, plot_acc=True)