Пример #1
0
    def setup_train_configuration(self, config: Dict[str, Any]) -> None:
        """Configure datasets, dataloaders, loss, optimizer and LR schedule.

        Reads every setting from ``config`` and stores the resulting objects
        on ``self`` for the training loop to use.
        """
        self.config = config
        self.total_epochs = self.config["EPOCHS"]

        # Build train/test datasets with their augmentation pipelines.
        train_ds, test_ds = utils.get_dataset(
            config["DATASET"],
            config["AUG_TRAIN"],
            config["AUG_TEST"],
            config["AUG_TRAIN_PARAMS"],
            config["AUG_TEST_PARAMS"],
        )
        self.input_size = train_ds[0][0].size()
        logger.info("Datasets prepared")

        # Optionally wrap the training set for CutMix augmentation.
        if "CUTMIX" in config:
            train_ds = CutMix(
                train_ds,
                config["MODEL_PARAMS"]["num_classes"],
                **config["CUTMIX"],
            )

        # Wrap the datasets in dataloaders.
        self.trainloader, self.testloader = utils.get_dataloader(
            train_ds,
            test_ds,
            config["BATCH_SIZE"],
            config["N_WORKERS"],
        )
        logger.info("Dataloader prepared")

        # Loss function, placed on the configured device.
        self.criterion = get_criterion(
            criterion_name=config["CRITERION"],
            criterion_params=config["CRITERION_PARAMS"],
            device=self.device,
        )

        # Optional regularizer (None when not configured).
        self.regularizer = (
            get_regularizer(config["REGULARIZER"], config["REGULARIZER_PARAMS"])
            if "REGULARIZER" in config
            else None
        )

        # Plain SGD over the model parameters.
        self.optimizer = optim.SGD(
            self.model.parameters(),
            lr=config["LR"],
            momentum=config["MOMENTUM"],
            weight_decay=config["WEIGHT_DECAY"],
            nesterov=config["NESTEROV"],
        )

        # Learning-rate schedule.
        self.lr_scheduler = get_lr_scheduler(
            config["LR_SCHEDULER"],
            config["LR_SCHEDULER_PARAMS"],
        )
Пример #2
0
def test_get_dataset():
    """Verify utils.get_dataset returns a pathlib.Path to a directory of
    non-empty jpg files."""
    data_dir = utils.get_dataset()
    assert os.path.isdir(data_dir), "Image directory was not created"
    assert isinstance(data_dir, pathlib.Path), "Returned directory is not pathlib.Path"
    files = os.listdir(data_dir)
    assert len(files) > 0, "No files in image directory."
    for entry in files:
        # Don't want any of those dumb mat files
        assert entry.endswith("jpg")
        assert os.path.getsize(data_dir / entry) > 0, f"{entry} is empty."
Пример #3
0
def model_1(args):
    """Train BimodalDAEImage on CNN visual features plus word vectors:
    reconstruct both modalities and classify from the shared hidden layer.

    Expects ``args`` to carry run_name, datadir, batch_size, learning_rate
    and epochs. Writes periodic checkpoints to ``model_states/<run_name>``.
    """
    # Create the per-run output directories if they do not exist yet.
    if os.path.isdir(os.getcwd() + '/results/images/' +
                     args.run_name) is False:
        os.mkdir(os.getcwd() + '/results/images/' + args.run_name)

    if os.path.isdir(os.getcwd() + '/results/history/' +
                     args.run_name) is False:
        os.mkdir(os.getcwd() + '/results/history/' + args.run_name)

    if os.path.isdir(os.getcwd() + '/results/files/' + args.run_name) is False:
        os.mkdir(os.getcwd() + '/results/files/' + args.run_name)

    datapath = args.datadir
    #args.batch_size = 2
    args.img_size = 224  # input resolution expected by the CNN below
    dataset, data_loader = utils.get_dataset(datapath, args.img_size, \
            args.batch_size)
    classes, class_to_idx, idx_to_class = utils.get_classes(dataset)
    word_dim = 300  # dimensionality of the word vectors
    label_criterion = nn.CrossEntropyLoss()
    reconstr_criterion = nn.L1Loss()
    #reconstr_criterion = nn.MSELoss()

    # Bimodal autoencoder (300-d text / 2048-d visual) and a pretrained
    # ResNet-101 used only as a feature extractor (outputs detached below
    # via .data before being wrapped in new tensors).
    model = BimodalDAEImage(300, 2048, n_classes=len(classes))
    cnn = resnet101(pretrained=True)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.learning_rate,
                                 weight_decay=1e-5)
    print('\nNum classes: %r, num images: %r' % (len(classes), len(dataset)))

    #### change temp
    #word_vecs = utils.get_wvecs_json(os.getcwd() + '/data/files/wvecs.json', classes, word_dim)
    word_vecs = utils.get_word_vectors(os.getcwd() + '/data/files/wvecs.json',
                                       classes, word_dim)

    loss_hist, metric_hist = {}, {}
    softmax = nn.Softmax(dim=1)
    for epoch in range(args.epochs):
        print('Epoch %r' % epoch)
        log.info('Epoch %r' % epoch)
        loss_hist[epoch], metric_hist[epoch] = {}, {}
        for batch_idx, (img, target_tensor) in enumerate(data_loader):
            batch_acc, batch_loss = [], {'reconstr': [], 'classification': []}
            target_idxs = target_tensor.data.numpy().tolist()
            target_names = [idx_to_class[idx] for idx in target_idxs]
            # One-hot label matrix; currently not used further below.
            target_labels = torch.tensor([[1 if i == idx else 0 for i in \
                    range(len(classes))] for idx in target_idxs], \
                    dtype=torch.long)

            # previously target dist reps
            # Word vectors for the gold class of each batch member.
            target_textual = torch.tensor([word_vecs[name] for name in target_names], \
                                            dtype=torch.float32)
            #print('Text', target_textual.size())

            #img_rep = img[0].reshape(1, 3, args.img_size, args.img_size)
            #print(img_rep.size())
            #rep = vgg.forward(img_rep)
            #print(rep.size())
            # CNN features extracted one image at a time, then re-stacked.
            target_visual = torch.tensor([
                cnn.forward(img[idx].reshape(1, 3, args.img_size,
                                             args.img_size)).data.numpy()
                for idx in range(len(target_idxs))
            ],
                                         dtype=torch.float32)

            #print('Visual', target_visual.size())

            n_samples = len(target_idxs)
            optimizer.zero_grad()

            # Backprop each reconstruction loss separately; retain_graph=True
            # keeps the shared graph alive for the successive backward calls.
            img_reconstr, text_reconstr, hidden = model.forward(target_visual, \
                                                              target_textual)
            textual_loss = reconstr_criterion(text_reconstr, target_textual)
            textual_loss.backward(retain_graph=True)
            visual_loss = reconstr_criterion(img_reconstr, target_visual)
            visual_loss.backward(retain_graph=True)

            #print('Textual reconstr', text_reconstr.size())

            #print('Visual reconstr', img_reconstr.size())
            #print('Hidden', hidden.size())
            # NOTE(review): nn.CrossEntropyLoss expects raw logits and applies
            # log-softmax internally; feeding it softmax output applies
            # softmax twice — confirm this is intended.
            preds = softmax(hidden)

            pred_loss = label_criterion(preds, target_tensor)
            pred_loss.backward()

            optimizer.step()

            # NOTE(review): this checkpoint save sits inside the batch loop,
            # so it re-saves on every batch of every 10th epoch — it likely
            # belongs one level up, in the epoch loop.
            if epoch % 10 == 0:
                state = {'epoch': epoch + 1, 'state_dict': \
                        model.state_dict(), 'optimizer': optimizer.state_dict()}
                torch.save(state,
                           os.getcwd() + "/model_states/" + args.run_name)

    return
Пример #4
0
def eval(args):
    """Evaluate a saved DistributedWordLabeller checkpoint.

    When ``args.eval == 'True'``, loads the checkpoint for ``args.run_name``,
    runs the test split, accumulates a confusion matrix, prints the metrics
    and writes them to ``results/files/<run_name>/matrix.json``.

    Note: the function name shadows the builtin ``eval``; kept for
    backward compatibility with existing callers.
    """
    args = handle_args(args)
    # Make sure the per-run result directories exist.
    for subdir in ('/results/images/', '/results/history/', '/results/files/'):
        run_dir = os.getcwd() + subdir + args.run_name
        if os.path.isdir(run_dir) is False:
            os.mkdir(run_dir)

    datapath = args.datadir
    dataset, data_loader = utils.get_dataset(datapath, args.img_size,
                                             args.batch_size)
    classes, class_to_idx, idx_to_class = utils.get_classes(dataset)
    word_dim = 300
    label_dim = len(classes)
    # todo! change the number of classes to intersect with quickdraws classes!
    label_criterion = nn.CrossEntropyLoss()
    reconstr_criterion = nn.MSELoss()

    model = DistributedWordLabeller(width=args.img_size, height=args.img_size,
                                    word_dim=word_dim, label_dim=label_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=1e-5)
    print('\nNum classes: %r, num images: %r' % (len(classes), len(dataset)))

    word_vecs = utils.get_word_vectors('/data/nlp/glove/glove_300d.json',
                                       classes, word_dim)

    matrix = np.zeros((len(classes), len(classes)))
    if args.eval == 'True':
        # Re-load the held-out split and the trained weights.
        datapath = args.test_datadir
        dataset, data_loader = utils.get_dataset(datapath, args.img_size,
                                                 args.batch_size)
        checkpoint = torch.load(os.getcwd() + '/model_states/' + args.run_name)
        model.load_state_dict(checkpoint['state_dict'])

        with torch.no_grad():
            for batch_idx, (img, target_tensor) in enumerate(data_loader):
                target_idxs = target_tensor.data.numpy().tolist()
                target_names = [idx_to_class[idx] for idx in target_idxs]
                print(target_names)

                reconstr, word_dist, label_dist = model.forward(img)
                labels = model.pred_labels(label_dist)
                matrix = update_label_matrix(matrix, labels, target_idxs)
        # BUG FIX: the original referenced the undefined name `train_matrix`
        # here, which raised NameError.
        avg_acc, metric_dict = matrix_to_metrics(matrix, idx_to_class)

        print(avg_acc)
        print(metric_dict)
        # BUG FIX: the original `return` preceded this write, leaving it dead
        # code, and the file handle was never closed. Use a context manager.
        temp = {'matrix': matrix.tolist(), 'metrics': metric_dict,
                'avg_acc': avg_acc}
        with open(os.getcwd() + '/results/files/' + args.run_name +
                  '/matrix.json', 'w+') as f:
            f.write(json.dumps(temp))
Пример #5
0
def save_encodings(args):
    """Run the saved 'photo-word-only' DistributedWordOnly model over the
    dataset and dump per-class word-space encodings to
    ``results/files/<run_name>/encoding_dict.json``.

    Side effect: overwrites ``args.run_name`` with ``'photo-word-only'``
    (as the original did) before loading the checkpoint.
    """
    datapath = args.datadir
    dataset, data_loader = utils.get_dataset(datapath, args.img_size,
                                             args.batch_size)
    classes, class_to_idx, idx_to_class = utils.get_temp_classes(dataset)
    word_dim = 300
    label_dim = len(classes)

    model = DistributedWordOnly(width=args.img_size, height=args.img_size,
                                word_dim=word_dim, label_dim=label_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=1e-5)

    # BUG FIX: renamed from `dict`, which shadowed the builtin.
    class_lists = {class_name: [] for class_name in classes}
    print(class_lists)
    args.run_name = 'photo-word-only'
    checkpoint = torch.load(os.getcwd() + '/model_states/' + args.run_name)
    model.load_state_dict(checkpoint['state_dict'])

    print('Model loaded!')

    encoding_dict = {}
    with torch.no_grad():
        for batch_idx, (img, target_tensor) in enumerate(data_loader):
            target_idxs = target_tensor.data.numpy().tolist()
            target_names = [idx_to_class[idx] for idx in target_idxs]
            print(target_names)

            word_dist, label_dist = model.forward(img)
            print(word_dist)
            word_reps = word_dist.data.numpy()
            print(word_reps)
            print(len(word_reps))
            # NOTE(review): each class keeps only the LAST encoding seen
            # (assignment, not append, despite the empty-list init) —
            # behavior preserved; confirm it is intended.
            for idx in range(len(word_reps)):
                target = target_idxs[idx]
                if target not in encoding_dict:
                    encoding_dict[target] = []
                encoding_dict[target] = list(word_reps[idx].tolist())

            print(encoding_dict)
            labels = model.pred_labels(label_dist)
            print(target_idxs)
            print(labels)

            # Per-batch metrics (matrix is rebuilt from zeros each batch).
            train_matrix = update_label_matrix(
                np.zeros((len(classes), len(classes))), labels, target_idxs)
            avg_acc, metric_dict = matrix_to_metrics(train_matrix, idx_to_class)

    print('Done!')

    for item in encoding_dict:
        print(item)
        print(encoding_dict[item])
        print(len(encoding_dict[item]))

    # BUG FIX: the original never closed this file; a context manager
    # guarantees the JSON is flushed to disk.
    with open(os.getcwd() + '/results/files/' + args.run_name +
              '/encoding_dict.json', 'w+') as f:
        f.write(json.dumps(encoding_dict))
Пример #6
0
def extract_weights(args):
    """Load this run's saved confusion matrix and print per-class metrics.

    NOTE(review): the function returns early twice. Everything after the
    first ``return`` below — the classifier-weight cosine-similarity
    analysis and the correlation study against sem-vis-sketchy.tsv — is
    dead code apparently kept for reference, as is the ``args.eval`` block
    after the second ``return``.
    """
    args = handle_args(args)

    datapath = args.datadir

    dataset, data_loader = utils.get_dataset(datapath, args.img_size, \
            args.batch_size)
    classes, class_to_idx, idx_to_class = utils.get_temp_classes(dataset)
    word_dim = 300
    label_dim = len(classes)


    print('Model: ', args.run_name)
    model = DistributedWordLabeller(width=args.img_size, height=args.img_size, \
                                    word_dim=word_dim, label_dim=label_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                             weight_decay=1e-5)

    # get the matrix and classify the matrix

    # NOTE(review): file handles opened in this function are never closed.
    # The loop keeps only the last line's JSON (file presumed single-line).
    f = open(os.getcwd() + '/results/files/' + args.run_name + '/matrix.json', 'r')
    for line in f: temp = json.loads(line)

    matrix = temp['matrix']

    #print(len(matrix))
    #print(matrix)
    #print(matrix[0][0])
    print(idx_to_class)
    avg_acc, metric_dict = matrix_to_metrics(matrix, idx_to_class)
    #print(metric_dict)
    #print(avg_acc)


    # Print the per-class metrics keyed back to class names.
    print(idx_to_class)
    for idx in metric_dict:
        print(idx_to_class[idx])
        print(metric_dict[idx])

    return


    # ---- dead code below: weight-similarity analysis kept for reference ----
    # get the weights from the linear layer, this forms the matrix, classify this matrix,
    matrix = np.zeros((len(classes), len(classes)))
    identity = torch.eye(300)

    # Feeding the identity recovers each word dimension's classifier weights.
    m = model.label_classifier(identity)
    m = torch.transpose(m, 0, 1)    #print(m)
    f = open(os.getcwd() + '/results/files/sketch_sketchy/' + 'idx_to_class.json', 'r')
    for line in f: idx_to_class = json.loads(line)

    rep_dict = {}
    for i in range(len(m)):
        class_name = idx_to_class[str(i)]
        rep_dict[class_name] = m[i].data.numpy()

    f = open(os.getcwd() + '/data/files/wvecs.json', 'r')
    for line in f: wvecs = json.loads(line)

    for name in wvecs:
        wvecs[name] = np.array(wvecs[name])
    classes = list(rep_dict.keys())
    sims = []

    def get_sim(class_1, class_2, rep_dict):
        # Cosine similarity between the two classes' vectors in rep_dict.
        dot_product = rep_dict[class_1].reshape(-1, 1) * rep_dict[class_2].reshape(-1, 1)
        #print(dot_product)
        norm_1 = (np.linalg.norm(rep_dict[class_1].reshape(-1, 1)))
        norm_2 = (np.linalg.norm(rep_dict[class_2].reshape(-1, 1)))
        cos_sim = np.sum(dot_product)/(norm_1*norm_2)
        return cos_sim

    # Pairwise similarity of classifier weights vs. word vectors.
    for (class_1, class_2) in itertools.product(classes, classes):
        pair = class_1 + '#' + class_2
        cos_sim = get_sim(class_1, class_2, rep_dict)
        word_sim = get_sim(class_1, class_2, wvecs)

        #print(cos_sim)
        sims.append((pair, round(cos_sim, 3), round(word_sim, 3)))

    sorted_sims = sorted(sims, key=operator.itemgetter(1), reverse=True)

    for item in sorted_sims:
        if item[1] > 0.9: continue
        print(item)

    print(len(sorted_sims))


    def get_corrs(pair_tuples):
        # Correlate our similarity scores against the human semantic/visual
        # judgements in sem-vis-sketchy.tsv, skipping held-out classes.
        temp = 'sailboat,piano,sheep,pistol,snail,harp,cat,rocket,cannon,rabbit'
        temp = temp.split(',')
        f = open(os.getcwd() + '/data/files/sem-vis-sketchy.tsv', 'r')
        lines = [line.strip().split('\t') for line in f.readlines()]

        pairs = {item[0]:item[1] for item in pair_tuples}
        vals = []
        for line in lines:
            class_1, class_2, sem, vis = line[0], line[1], float(line[2]), float(line[3])
            if class_1 in temp or class_2 in temp: continue
            pair = class_1 + '#' + class_2
            if pair in pairs:
                vals.append((pair, sem, vis, pairs[pair]))


        for val in vals:
            print(val)

        cos_list = [item[-1] for item in vals]
        sem_list = [item[1] for item in vals]
        vis_list = [item[2] for item in vals]

        #print(vals)
        spearman_sem = stats.spearmanr(cos_list, sem_list)
        spearman_vis = stats.spearmanr(cos_list, vis_list)

        pearson_sem = stats.pearsonr(cos_list, sem_list)
        pearson_vis = stats.pearsonr(cos_list, vis_list)

        print('Semantic: Pearson: %f, Spearman: %f' %(round(pearson_sem[0], 3), \
                                                      round(spearman_sem[0], 3)))
        print('Visual: Pearson: %f, Spearman: %f' %(round(pearson_vis[0], 3), \
                                                      round(spearman_vis[0], 3)))

    print('\n\nImage!')
    get_corrs([(item[0], item[1]) for item in sims])
    #print('Word!')

    #get_corrs([(item[0], item[2]) for item in sims])




    return

    # ---- dead code: unreachable evaluation block kept for reference ----
    if args.eval == 'True':
        datapath = args.test_datadir
        dataset, data_loader = utils.get_dataset(datapath, args.img_size, \
            args.batch_size)
        checkpoint = torch.load(os.getcwd() + '/model_states/' + args.run_name)
        model.load_state_dict(checkpoint['state_dict'])

        with torch.no_grad():
            for batch_idx, (img, target_tensor) in enumerate(data_loader):
                target_idxs = target_tensor.data.numpy().tolist()
                target_names = [idx_to_class[idx] for idx in target_idxs]
                print(target_names)
                target_labels = torch.tensor([[1 if i == idx else 0 for i in \
                    range(len(classes))] for idx in target_idxs], \
                    dtype=torch.float32)

                reconstr, word_dist, label_dist = model.forward(img)
                labels = model.pred_labels(label_dist)
                matrix = update_label_matrix(matrix, labels, target_idxs)
        avg_acc, metric_dict = matrix_to_metrics(matrix, idx_to_class)

    print(avg_acc); print(metric_dict)
    return
Пример #7
0
def model_1(args):
    """Run the trained BimodalDAEImage over the dataset without gradients
    and dump each class's hidden-layer encodings to
    ``results/files/<run_name>/encoding_dict.json``.

    Side effect: forces ``args.img_size`` to 224 (ResNet-101 input size).
    """
    # Ensure the per-run output directories exist.
    for subdir in ('/results/images/', '/results/history/', '/results/files/'):
        run_dir = os.getcwd() + subdir + args.run_name
        if os.path.isdir(run_dir) is False:
            os.mkdir(run_dir)

    datapath = args.datadir
    args.img_size = 224  # input resolution expected by the CNN below

    dataset, data_loader = utils.get_dataset(datapath, args.img_size,
                                             args.batch_size)
    classes, class_to_idx, idx_to_class = utils.get_classes(dataset)
    word_dim = 300
    label_dim = len(classes)

    model = BimodalDAEImage(300, 2048, n_classes=len(classes))
    cnn = resnet101(pretrained=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=1e-5)
    print('\nNum classes: %r, num images: %r' % (len(classes), len(dataset)))

    word_vecs = utils.get_wvecs_json(os.getcwd() + '/data/files/wvecs.json',
                                     classes, word_dim)

    encoding_dict = {}
    with torch.no_grad():
        for batch_idx, (img, target_tensor) in enumerate(data_loader):
            target_idxs = target_tensor.data.numpy().tolist()
            target_names = [idx_to_class[idx] for idx in target_idxs]

            # Word vectors for the gold class of each batch member.
            target_textual = torch.tensor(
                [word_vecs[name] for name in target_names],
                dtype=torch.float32)

            # CNN features extracted one image at a time, then re-stacked.
            target_visual = torch.tensor(
                [cnn.forward(img[idx].reshape(1, 3, args.img_size,
                                              args.img_size)).data.numpy()
                 for idx in range(len(target_idxs))],
                dtype=torch.float32)

            img_reconstr, text_reconstr, hidden = model.forward(target_visual,
                                                                target_textual)

            print('Hidden', hidden.size())
            # Collect every hidden representation under its class name.
            reps = hidden.data.numpy()
            for idx in range(len(reps)):
                target = target_names[idx]
                print(target)
                encoding_dict.setdefault(target, []).append(
                    list(reps[idx].tolist()))

    # BUG FIX: the original never closed this file; a context manager
    # guarantees the JSON is flushed to disk.
    with open(os.getcwd() + '/results/files/' + args.run_name +
              '/encoding_dict.json', 'w+') as f:
        f.write(json.dumps(encoding_dict))
    print('Eval done!')
Пример #8
0
def main(args):
    """
    Call the configuration function of the model, build the model and load data, then start training.
    model_config:
        a json file  with the hyperparameters,such as dropout rate ,learning rate,num tasks and so on;
    num_tasks:
        it means the number of task that each dataset contains, it's related to the dataset;
    DownstreamModel:
        It means the PretrainGNNModel for different strategies and it is an supervised GNN model which predicts the tasks.
    """
    compound_encoder_config = load_json_config(args.compound_encoder_config)
    model_config = load_json_config(args.model_config)
    # Command-line dropout overrides both configs when given.
    # (fixed `not x is None` anti-idiom, E714)
    if args.dropout_rate is not None:
        compound_encoder_config['dropout_rate'] = args.dropout_rate
        model_config['dropout_rate'] = args.dropout_rate
    task_names = get_downstream_task_names(args.dataset_name, args.data_path)
    model_config['num_tasks'] = len(task_names)

    ### build model
    compound_encoder = PretrainGNNModel(compound_encoder_config)
    model = DownstreamModel(model_config, compound_encoder)
    criterion = nn.BCELoss(reduction='none')
    # Encoder and head get separate optimizers with their own LRs.
    encoder_params = compound_encoder.parameters()
    head_params = exempt_parameters(model.parameters(), encoder_params)
    encoder_opt = paddle.optimizer.Adam(args.encoder_lr,
                                        parameters=encoder_params)
    head_opt = paddle.optimizer.Adam(args.head_lr, parameters=head_params)
    print('Total param num: %s' % (len(model.parameters())))
    print('Encoder param num: %s' % (len(encoder_params)))
    print('Head param num: %s' % (len(head_params)))

    # Warm-start the encoder from a pretrained checkpoint when provided.
    if args.init_model is not None and args.init_model != "":
        compound_encoder.set_state_dict(paddle.load(args.init_model))
        print('Load state_dict from %s' % args.init_model)

    ### load data
    # featurizer:
    #     Gen features according to the raw data and return the graph data.
    #     Collate features about the graph data and return the feed dictionary.
    # splitter:
    #     split type of the dataset:random,scaffold,random with scaffold. Here is randomsplit.
    #     `ScaffoldSplitter` will firstly order the compounds according to Bemis-Murcko scaffold,
    #     then take the first `frac_train` proportion as the train set, the next `frac_valid` proportion as the valid set
    #     and the rest as the test set. `ScaffoldSplitter` can better evaluate the generalization ability of the model on
    #     out-of-distribution samples. Note that other splitters like `RandomSplitter`, `RandomScaffoldSplitter`
    #     and `IndexSplitter` is also available."

    dataset = get_dataset(args.dataset_name, args.data_path, task_names)
    dataset.transform(DownstreamTransformFn(), num_workers=args.num_workers)
    splitter = create_splitter(args.split_type)
    train_dataset, valid_dataset, test_dataset = splitter.split(dataset,
                                                                frac_train=0.8,
                                                                frac_valid=0.1,
                                                                frac_test=0.1)
    print("Train/Valid/Test num: %s/%s/%s" %
          (len(train_dataset), len(valid_dataset), len(test_dataset)))

    ### start train
    # Load the train function and calculate the train loss in each epoch.
    # Here we set the epoch is in range of max epoch,you can change it if you want.

    # Then we will calculate the train loss ,valid auc,test auc and print them.
    # Finally we save it to the model according to the dataset.
    list_val_auc, list_test_auc = [], []
    collate_fn = DownstreamCollateFn(
        atom_names=compound_encoder_config['atom_names'],
        bond_names=compound_encoder_config['bond_names'])
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, model, train_dataset, collate_fn, criterion,
                           encoder_opt, head_opt)
        val_auc = evaluate(args, model, valid_dataset, collate_fn)
        test_auc = evaluate(args, model, test_dataset, collate_fn)

        list_val_auc.append(val_auc)
        list_test_auc.append(test_auc)
        # Test AUC at the epoch with the best validation AUC so far.
        test_auc_by_eval = list_test_auc[np.argmax(list_val_auc)]
        print("epoch:%s train/loss:%s" % (epoch_id, train_loss))
        print("epoch:%s val/auc:%s" % (epoch_id, val_auc))
        print("epoch:%s test/auc:%s" % (epoch_id, test_auc))
        print("epoch:%s test/auc_by_eval:%s" % (epoch_id, test_auc_by_eval))
        # Save encoder and full model separately for each epoch.
        paddle.save(
            compound_encoder.state_dict(),
            '%s/epoch%d/compound_encoder.pdparams' %
            (args.model_dir, epoch_id))
        paddle.save(model.state_dict(),
                    '%s/epoch%d/model.pdparams' % (args.model_dir, epoch_id))

    # Emit one FINAL summary line per metric for downstream log parsing.
    outs = {
        'model_config': basename(args.model_config).replace('.json', ''),
        'metric': '',
        'dataset': args.dataset_name,
        'split_type': args.split_type,
        'batch_size': args.batch_size,
        'dropout_rate': args.dropout_rate,
        'encoder_lr': args.encoder_lr,
        'head_lr': args.head_lr,
        'exp_id': args.exp_id,
    }
    best_epoch_id = np.argmax(list_val_auc)
    for metric, value in [('test_auc', list_test_auc[best_epoch_id]),
                          ('max_valid_auc', np.max(list_val_auc)),
                          ('max_test_auc', np.max(list_test_auc))]:
        outs['metric'] = metric
        print('\t'.join(['FINAL'] + ["%s:%s" % (k, outs[k])
                                     for k in outs] + [str(value)]))
Пример #9
0
def main():
    """Command-line entry point for the fake-news classifier.

    Parses arguments, then dispatches through up to five modes (init,
    train, test, demo, plot), sharing the GloVe vocabulary, dataset and
    model setup between them.
    """
    # Argparse custom actions
    class SetModes(argparse.Action):
        """Set the modes of operations."""
        def __call__(self, parser, args, values, option_string=None):
            # Each value in `--mode a b ...` flips the matching boolean flag.
            for value in values:
                setattr(args, value, True)

    # yapf: disable
    parser = argparse.ArgumentParser(description='Fake News Classifier')
    # Initialization
    parser.add_argument('--init', action='store_true', default=False,
                        help='perform initialization')
    # Modes
    parser.add_argument('-m', '--mode', action=SetModes, nargs='+', choices=['train', 'test', 'demo', 'plot'],
                        help='specify the mode of operation: train, test, demo, plot')
    parser.add_argument('--train', action='store_true', default=False,
                        help='train the model')
    parser.add_argument('--test', action='store_true', default=False,
                        help='test the model (must either train or load a model)')
    parser.add_argument('--demo', action='store_true', default=False,
                        help='demo the model on linewise samples from a file (must either train or load a model)')
    parser.add_argument('--plot', action='store_true', default=False,
                        help='plot training data (must either train or have existing training data)')
    # Options
    parser.add_argument('-b', '--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('-c', '--config', type=str,
                        help='path to configuration json file (overrides args)')
    parser.add_argument('--data-loader', type=str, default='BatchLoader',
                        help='data loader to use (default: "BatchLoader")')
    parser.add_argument('--dataset', type=str, default='FakeRealNews',
                        help='dataset to use (default: "FakeRealNews")')
    parser.add_argument('-e', '--epochs', type=int, default=10,
                        help='number of epochs to train (default: 10)')
    parser.add_argument('-f', '--file', type=str,
                        help='specify a file for another argument')
    parser.add_argument('--lr', '--learning-rate', dest='learning_rate', type=float, default=1e-4,
                        help='learning rate (default: 1e-4)')
    parser.add_argument('-l', '--load', type=int, metavar='EPOCH',
                        help='load a model and its training data')
    parser.add_argument('--loss', type=str, default='BCEWithLogitsLoss',
                        help='loss function (default: "BCEWithLogitsLoss")')
    parser.add_argument('--model', type=str, default='FakeNewsNet',
                        help='model architecture to use (default: "FakeNewsNet")')
    parser.add_argument('-s', '--sample-size', type=int, metavar='N',
                        help='limit sample size for training')
    parser.add_argument('--seed', type=int, default=0,
                        help='random seed (default: 0)')
    parser.add_argument('--save', action='store_true', default=True,
                        help='save model checkpoints and training data (default: True)')
    parser.add_argument('--no-save', dest='save', action='store_false')
    args = parser.parse_args()
    # yapf: enable

    # Print help if no args
    if len(sys.argv) == 1:
        parser.print_help()
        parser.exit()

    # Configure logger
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger('matplotlib').setLevel(logging.WARNING)

    # Load configuration file if specified
    if args.config is not None:
        utils.load_config(args)

    # Exit if no mode is specified
    if not args.init and not args.train and not args.test and not args.demo and not args.plot:
        logging.error(
            'No mode specified. Please specify with: --mode {init,train,test,demo,plot}'
        )
        exit(1)
    # Exit on `--load` if run directory not found
    if (args.load is not None or
        (args.plot
         and not args.train)) and not os.path.isdir(utils.get_path(args)):
        logging.error(
            'Could not find directory for current configuration {}'.format(
                utils.get_path(args)))
        exit(1)
    # Exit on `test` or `demo` without `train` or `--load EPOCH`
    if (args.test or args.demo) and not (args.train or args.load is not None):
        logging.error(
            'Cannot run `test` or `demo` without a model. Try again with either `train` or `--load EPOCH`.'
        )
        exit(1)
    # Exit on `demo` without a string file
    if args.demo and not args.file:
        logging.error(
            'Cannot run `demo` without a file. Try again with `--file FILE`.')
        exit(1)

    # Setup run directory
    # NOTE(review): given the no-mode exit above, this condition
    # (`save` AND no init AND no train/test/demo/plot) can never be true,
    # so the run directory / file logging is never set up — the `not`
    # around the mode group looks inverted; confirm intent.
    if args.save and not args.init and not (args.train or args.test
                                            or args.demo or args.plot):
        utils.save_config(args)
        path = utils.get_path(args) + '/output.log'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        logging.getLogger().addHandler(logging.FileHandler(path))

    # Set random seeds
    random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Variable declarations
    training_data = None

    # Load GloVe vocabulary
    if args.init or args.train or args.test or args.demo:
        glove = torchtext.vocab.GloVe(name='6B', dim=50)

    # Perform initialization
    if args.init or args.train or args.test:
        # Determine which dataset to use
        dataset = utils.get_dataset(args)
        # Preload the dataset
        dataset.load()
        # Get preprocessed samples
        samples = preprocessing.get_samples(dataset, glove, args.init)
        random.shuffle(samples)

    # DataLoader setup for `train`, `test`
    if args.train or args.test:
        # Select data loader to use
        DataLoader = utils.get_data_loader(args)

        # Split samples
        split_ratio = [.6, .2, .2]
        trainset, validset, testset = list(
            DataLoader.splits(samples, split_ratio))
        if args.sample_size is not None:  # limit samples used in training
            # Scale the validation split to keep the train/valid ratio.
            trainset = trainset[:args.sample_size]
            validset = validset[:int(args.sample_size * split_ratio[1] /
                                     split_ratio[0])]

        # Get data loaders
        train_loader, valid_loader, test_loader = [
            DataLoader(split, batch_size=args.batch_size)
            for split in [trainset, validset, testset]
        ]

    # Load samples for demo
    if args.demo:
        if os.path.isfile(args.file):
            # Read samples from the input file
            with open(args.file, 'r') as f:
                samples = [line for line in f if line.strip()]
            # Dummy 0.5 labels: demo inputs are unlabeled.
            data = pd.DataFrame({
                'text': samples,
                'label': [0.5] * len(samples)
            })
            # Preprocess samples
            preprocessing.clean(data)
            samples = preprocessing.encode(data, glove)
            samples = [(torch.tensor(text).long(), label)
                       for text, label in samples]

            # Select data loader to use
            DataLoader = utils.get_data_loader(args)

            # Get data loader
            data_loader = DataLoader(samples, batch_size=1, shuffle=False)
        else:
            logging.error('Could not find file for demo at {}'.format(
                args.file))
            exit(1)

    # Model setup for `train`, `test`, `demo`
    if args.train or args.test or args.demo:
        # Create the model
        model = utils.get_model(glove, args)

        # Load a model
        if args.load is not None:
            utils.load_model(args.load, model, args)

    # Run `train`
    if args.train:
        training_data = training.train(model, train_loader, valid_loader, args)

    # Run `test`
    if args.test:
        if args.train or args.load is not None:
            criterion = utils.get_criterion(args.loss)
            acc, loss = training.evaluate(model, test_loader, criterion)
            logging.info('Testing accuracy: {:.4%}, loss: {:.6f}'.format(
                acc, loss))
        else:
            logging.error('No model loaded for testing')
            exit(1)

    # Run `demo`
    if args.demo:
        if args.train or args.load is not None:
            model.eval()  # set model to evaluate mode
            logging.info('-- Results --')
            for i, (text, _) in enumerate(data_loader):
                preview = data['text'][i][:32] + '...'
                out = model(text).flatten()
                prob = torch.sigmoid(out)  # apply sigmoid to get probability
                pred = (prob >
                        0.5).long()  # predict `true` if greater than 0.5
                label = ['fake', 'true'][pred.item()]
                # ANSI color: green for `true`, yellow for `fake`.
                label = '{}{}{}'.format(
                    '\033[92m' if pred.item() else '\033[93m', label,
                    '\033[0m')
                confidence = (prob if pred.item() else 1 - prob).item()
                logging.info(
                    'Report {}: {} with {:.2%} confidence - "{}"'.format(
                        i, label, confidence, preview))
        else:
            logging.error('No model loaded for demo')
            exit(1)

    # Run `plot`
    if args.plot:
        if training_data is None:
            training_data = utils.load_training_data(args, allow_missing=False)
        if args.load is not None and not args.train:
            # Trim the curves to the loaded epoch when not continuing training.
            for k, v in training_data.items():
                training_data[k] = v[:args.load + 1]

        logging.info('Plotting training data')
        training.plot(training_data)
Пример #10
0
It may be necessary to gather the resources manually depending on your needs.
"""

# standard library
import os
import zipfile
from pathlib import Path

# local
from src.config import PATHS, GEONAMES, MAPPING
from src.utils import get_dataset, download_from_url

# Work from the project root instead of this script's own directory.
os.chdir('../')

# Fetch the GeoNames place-name files into the resources tree.
get_dataset(GEONAMES, PATHS.resources / 'geonames')

# Fetch (and, for zip archives, unpack) every shapefile listed in MAPPING.
# MAPPING is a namedtuple: each field name doubles as the output folder name.
for name, source_url in zip(MAPPING._fields, MAPPING):
    extension = Path(source_url).suffix
    target_dir = PATHS.resources / 'shapefiles' / name
    archive_name = f"{name}{extension}"
    archive_path = target_dir / archive_name

    # Skip the download when a previous run already fetched this file.
    if not archive_path.exists():
        download_from_url(source_url, filename=archive_name, path_out=target_dir)

    # Zip archives are extracted next to the archive itself.
    if extension == '.zip':
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(target_dir)
Пример #11
0
from src.utils import get_dataset
from src.configuration import Configuration

# Load the song dataset described by the configuration file.
dataset = get_dataset(Configuration.from_file('config.cfg'))

# Count the songs that actually carry lyrics. The dataset maps keys to song
# objects; `song.lyrics` is None when no lyrics are attached.
# (Replaces a manual counter loop with the idiomatic sum-of-generator form;
# `.keys()` plus indexing is kept because `dataset` is only shown to support
# that access pattern, not `.values()` — TODO confirm the dataset type.)
count = sum(1 for key in dataset.keys() if dataset[key].lyrics is not None)
print(count)
Пример #12
0
def main(args):
    """
    Call the configuration function of the model, build the model and load data, then start training.

    Args:
        args: parsed command-line namespace. Fields read here:
            compound_encoder_config, model_config, dropout_rate, dataset_name,
            data_path, task, cached_data_path, num_workers, split_type,
            encoder_lr, head_lr, init_model, max_epoch, model_dir, batch_size.

    model_config:
        a json file  with the hyperparameters,such as dropout rate ,learning rate,num tasks and so on;
    num_tasks:
        it means the number of task that each dataset contains, it's related to the dataset;
    """
    ### config for the body (the GeoGNN compound encoder)
    compound_encoder_config = load_json_config(args.compound_encoder_config)
    # A dropout rate given on the command line overrides the config file.
    if args.dropout_rate is not None:
        compound_encoder_config['dropout_rate'] = args.dropout_rate

    ### config for the downstream task (regression only in this script)
    task_type = 'regr'
    metric = get_metric(args.dataset_name)
    task_names = get_downstream_task_names(args.dataset_name, args.data_path)
    # Per-task label mean/std, reshaped to (1, num_tasks) so they broadcast
    # over a batch of labels during normalization.
    dataset_stat = get_dataset_stat(args.dataset_name, args.data_path,
                                    task_names)
    label_mean = np.reshape(dataset_stat['mean'], [1, -1])
    label_std = np.reshape(dataset_stat['std'], [1, -1])

    model_config = load_json_config(args.model_config)
    if args.dropout_rate is not None:
        model_config['dropout_rate'] = args.dropout_rate
    model_config['task_type'] = task_type
    model_config['num_tasks'] = len(task_names)
    print('model_config:')
    print(model_config)

    ### build model
    compound_encoder = GeoGNNModel(compound_encoder_config)
    model = DownstreamModel(model_config, compound_encoder)
    # 'square' selects MSE; any other metric name falls back to MAE.
    if metric == 'square':
        criterion = nn.MSELoss()
    else:
        criterion = nn.L1Loss()
    # Two optimizers so the pretrained encoder and the fresh task head can
    # train with different learning rates.
    encoder_params = compound_encoder.parameters()
    head_params = exempt_parameters(model.parameters(), encoder_params)
    encoder_opt = paddle.optimizer.Adam(args.encoder_lr,
                                        parameters=encoder_params)
    head_opt = paddle.optimizer.Adam(args.head_lr, parameters=head_params)
    print('Total param num: %s' % (len(model.parameters())))
    print('Encoder param num: %s' % (len(encoder_params)))
    print('Head param num: %s' % (len(head_params)))
    for i, param in enumerate(model.named_parameters()):
        print(i, param[0], param[1].name)

    # Optionally warm-start the encoder from a pretrained checkpoint.
    if args.init_model is not None and args.init_model != "":
        compound_encoder.set_state_dict(paddle.load(args.init_model))
        print('Load state_dict from %s' % args.init_model)

    ### load data
    if args.task == 'data':
        # Preprocess-only mode: transform, cache to disk, and exit early.
        print('Preprocessing data...')
        dataset = get_dataset(args.dataset_name, args.data_path, task_names)
        transform_fn = DownstreamTransformFn()
        dataset.transform(transform_fn, num_workers=args.num_workers)
        dataset.save_data(args.cached_data_path)
        return
    else:
        if args.cached_data_path is None or args.cached_data_path == "":
            print('Processing data...')
            dataset = get_dataset(args.dataset_name, args.data_path,
                                  task_names)
            transform_fn = DownstreamTransformFn()
            dataset.transform(transform_fn, num_workers=args.num_workers)
        else:
            print('Read preprocessing data...')
            dataset = InMemoryDataset(npz_data_path=args.cached_data_path)

    # 80/10/10 split with the configured strategy.
    splitter = create_splitter(args.split_type)
    train_dataset, valid_dataset, test_dataset = splitter.split(dataset,
                                                                frac_train=0.8,
                                                                frac_valid=0.1,
                                                                frac_test=0.1)
    print("Train/Valid/Test num: %s/%s/%s" %
          (len(train_dataset), len(valid_dataset), len(test_dataset)))
    print('Train min/max/mean %s/%s/%s' % get_label_stat(train_dataset))
    print('Valid min/max/mean %s/%s/%s' % get_label_stat(valid_dataset))
    print('Test min/max/mean %s/%s/%s' % get_label_stat(test_dataset))

    ### start train
    list_val_metric, list_test_metric = [], []
    collate_fn = DownstreamCollateFn(
        atom_names=compound_encoder_config['atom_names'],
        bond_names=compound_encoder_config['bond_names'],
        bond_float_names=compound_encoder_config['bond_float_names'],
        bond_angle_float_names=compound_encoder_config[
            'bond_angle_float_names'],
        task_type=task_type)
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, model, label_mean, label_std, train_dataset,
                           collate_fn, criterion, encoder_opt, head_opt)
        val_metric = evaluate(args, model, label_mean, label_std,
                              valid_dataset, collate_fn, metric)
        test_metric = evaluate(args, model, label_mean, label_std,
                               test_dataset, collate_fn, metric)

        list_val_metric.append(val_metric)
        list_test_metric.append(test_metric)
        # Model selection: report the test metric at the epoch with the best
        # (lowest — this is a regression error) validation metric so far.
        test_metric_by_eval = list_test_metric[np.argmin(list_val_metric)]
        print("epoch:%s train/loss:%s" % (epoch_id, train_loss))
        print("epoch:%s val/%s:%s" % (epoch_id, metric, val_metric))
        print("epoch:%s test/%s:%s" % (epoch_id, metric, test_metric))
        print("epoch:%s test/%s_by_eval:%s" %
              (epoch_id, metric, test_metric_by_eval))
        # Checkpoint the encoder and the full model every epoch.
        paddle.save(
            compound_encoder.state_dict(),
            '%s/epoch%d/compound_encoder.pdparams' %
            (args.model_dir, epoch_id))
        paddle.save(model.state_dict(),
                    '%s/epoch%d/model.pdparams' % (args.model_dir, epoch_id))

    # Final grep-friendly summary lines, one per reported metric.
    # NOTE(review): the 'max_valid'/'max_test' labels actually report np.min
    # values (lower is better for regression errors); the label text is kept
    # unchanged so downstream log parsers keep working.
    outs = {
        'model_config': basename(args.model_config).replace('.json', ''),
        'metric': '',
        'dataset': args.dataset_name,
        'split_type': args.split_type,
        'batch_size': args.batch_size,
        'dropout_rate': args.dropout_rate,
        'encoder_lr': args.encoder_lr,
        'head_lr': args.head_lr,
    }
    best_epoch_id = np.argmin(list_val_metric)
    # `metric_name` avoids shadowing the outer `metric` variable.
    for metric_name, value in [('test_%s' % metric,
                                list_test_metric[best_epoch_id]),
                               ('max_valid_%s' % metric,
                                np.min(list_val_metric)),
                               ('max_test_%s' % metric,
                                np.min(list_test_metric))]:
        outs['metric'] = metric_name
        print('\t'.join(['FINAL'] + ["%s:%s" % (k, outs[k])
                                     for k in outs] + [str(value)]))
Пример #13
0
# paths
# Filesystem locations for the pretrained model, the SQuAD v2.0 data splits,
# model checkpoints, and TensorBoard logs.
PRETRAINED_PATH = 'huggingface/'
SQUAD_TRAIN_DATA_PATH = 'data/squad/train-v2.0.json'
SQUAD_VAL_DATA_PATH = 'data/squad/dev-v2.0.json'
MODEL_SAVE_PATH = 'models/with_answers_only/'
LOG_DIR = 'logs/gradient_tape/'

# Create the checkpoint directory on first run.
if not os.path.exists(MODEL_SAVE_PATH):
    os.makedirs(MODEL_SAVE_PATH)

# NOTE(review): `%load_ext` / `%tensorboard` are IPython magics — this block
# only runs inside a Jupyter/IPython session, not as a plain Python script.
# TENSORBOARD is presumably a boolean flag defined in an earlier cell; verify.
if TENSORBOARD:
    %load_ext tensorboard
    %tensorboard --logdir logs/gradient_tape

# Build batched train/validation datasets from the SQuAD JSON files.
# LOAD_DATA, BATCH_SIZE, SHUFFLE, and SHUFFLE_BUF are presumably defined in an
# earlier cell; `get_dataset` returns (dataset, tokens) where the dataset
# supports `.batch()`/`.shuffle()` — looks like tf.data, TODO confirm.
if LOAD_DATA:
    train_dataset, input_tokens_train = get_dataset(SQUAD_TRAIN_DATA_PATH, PRETRAINED_PATH)
    train_dataset = train_dataset.batch(BATCH_SIZE)
    val_dataset, input_tokens_val = get_dataset(SQUAD_VAL_DATA_PATH, PRETRAINED_PATH)
    val_dataset = val_dataset.batch(BATCH_SIZE)
    if SHUFFLE:
        # doesn't work with reshuffle_each_iteration --> other data at every epoch?
        train_dataset = train_dataset.shuffle(buffer_size=SHUFFLE_BUF, reshuffle_each_iteration=True)
        val_dataset = val_dataset.shuffle(buffer_size=SHUFFLE_BUF, reshuffle_each_iteration=True)

# Load the ALBERT tokenizer and its JSON configuration from the local path.
tokenizer = AlbertTokenizer.from_pretrained(PRETRAINED_PATH)
with open(PRETRAINED_PATH + 'config.json') as f:
    config = json.load(f)

# training configurations loop
for desired_batch_size, lr in itertools.product(DESIRED_BATCH_SIZES, LR):
    model = QaAlbertModel(config, PRETRAINED_PATH)