예제 #1
0
def main(argv):
    """Fine-tune a classifier on TPU, checkpointing each epoch and exporting a SavedModel."""
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    # Connect to the TPU cluster and build the distribution strategy.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)

    # Model and optimizer variables must be created inside the strategy scope.
    with strategy.scope():
        model, input_size = build_model(num_classes=FLAGS.num_classes)
        optimizer = tf.keras.optimizers.Adam(learning_rate=FLAGS.learning_rate)
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=optimizer,
                  metrics=["accuracy"])
    model.summary()

    callbacks = [
        tf.keras.callbacks.TensorBoard(log_dir=f"{FLAGS.job_dir}/finetune/logs",
                                       histogram_freq=1),
    ]

    train_ds = get_dataset(FLAGS.dataset, "train", read_tfrecord, FLAGS.global_batch_size, input_size, FLAGS.percentage)
    valid_ds = get_dataset(FLAGS.dataset, "valid", read_tfrecord, FLAGS.global_batch_size, input_size, FLAGS.percentage)

    # One fit() call per epoch so a resumable checkpoint is written after each epoch.
    for epoch in range(FLAGS.epochs):
        model.fit(train_ds, validation_data=valid_ds, callbacks=callbacks,
                  initial_epoch=epoch, epochs=epoch + 1)
        model.save(f"{FLAGS.job_dir}/finetune/checkpoints/{epoch + 1}", include_optimizer=True)

    # Final optimizer-free export for serving.
    model.save(f"{FLAGS.job_dir}/finetune/saved_model", include_optimizer=False)
예제 #2
0
def check_dataset(dataset, dataroot, augment, download):
    """Resolve a dataset name to (input_size, num_classes, train_dataset, test_dataset).

    Args:
        dataset: one of "cifar64", "cifar10", "svhn", "miniimagenet".
        dataroot: directory containing (or receiving) the dataset files.
        augment: whether to use training-time augmentation (cifar10/svhn only).
        download: whether to download missing data (cifar10/svhn only).

    Raises:
        ValueError: if `dataset` is not a supported name (previously an
            unknown name caused an UnboundLocalError at the return).
    """
    if dataset == "cifar64":
        # Bug fix: use the `dataroot` parameter instead of the global `args`,
        # matching the cifar10/svhn branches below.
        train_data = get_dataset('cifar-fs-train-train', dataroot)
        test_data = get_dataset('cifar-fs-train-test', dataroot)
        transform = transforms.Compose([transforms.ToTensor(), preprocess])
        train_dataset = SimpleDataset(train_data['x'], transform)
        test_dataset = SimpleDataset(test_data['x'], transform)
        input_size = (32, 32, 3)
        num_classes = 64
    elif dataset == "cifar10":
        cifar10 = get_CIFAR10(augment, dataroot, download)
        input_size, num_classes, train_dataset, test_dataset = cifar10
    elif dataset == "svhn":
        svhn = get_SVHN(augment, dataroot, download)
        input_size, num_classes, train_dataset, test_dataset = svhn
    elif dataset == "miniimagenet":
        train_data = get_dataset('miniimagenet-train-train', dataroot)
        test_data = get_dataset('miniimagenet-train-test', dataroot)
        transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((32, 32)),
            transforms.ToTensor(), preprocess
        ])
        train_dataset = SimpleDataset(train_data['x'], transform)
        test_dataset = SimpleDataset(test_data['x'], transform)
        input_size = (32, 32, 3)
        num_classes = 64
    else:
        raise ValueError('Unknown dataset: %s' % dataset)

    return input_size, num_classes, train_dataset, test_dataset
예제 #3
0
def main(learning_rate, use_daft, dataset_name, epochs, _seed, _config):
    """Train a (DAFT-)MAC network with an EMA shadow copy, checkpointing each epoch."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    task_name = f"{'daftmac' if use_daft else 'mac'}_{_config['dataset_name']}_step{_config['max_step']}_{_seed}"
    os.makedirs("result/log", exist_ok=True)
    logger.add(f"result/log/{task_name}.txt")

    # Seed every RNG in play so runs are reproducible.
    logger.info(f"Making Code Deterministic with seed {_seed}")
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(_seed)
    np.random.seed(_seed)  # numpy cpu
    random.seed(_seed)  # python cpu
    if torch.cuda.is_available():
        torch.cuda.manual_seed(_seed)
        torch.cuda.manual_seed_all(_seed)

    train_dataset = get_dataset("train")
    val_dataset = get_dataset("val")

    vocab_size = len(train_dataset.qdic["w2i"])
    answer_count = len(train_dataset.adic["w2i"])

    # `net` is trained directly; `net_running` holds an exponential moving
    # average of its weights and is the copy that gets validated and saved.
    net = MAC(vocab_size, classes=answer_count, use_daft=use_daft,
              qdic=train_dataset.qdic)
    net_running = MAC(vocab_size, classes=answer_count, use_daft=use_daft,
                      qdic=train_dataset.qdic)
    accumulate(net_running, net, 0)

    if dataset_name == "clevr":
        criterion = nn.CrossEntropyLoss()
    elif dataset_name == "gqa":
        criterion = TFBCELoss(train_dataset.pos_weight.to(device))
    else:
        raise KeyError(f"Dataset {dataset_name} does not exist")

    writer = SummaryWriter(f"result/summary/{task_name}")
    optimizer = torch.optim.Adam(net.parameters(), learning_rate)
    scheduler = ReduceLROnPlateau(optimizer, factor=0.5, mode="max")

    for epoch in range(epochs):
        train_acc, train_loss = train(train_dataset, net, net_running,
                                      criterion, optimizer, epoch, writer)

        val_acc = valid(val_dataset, net_running, epoch, writer)
        scheduler.step(val_acc)

        os.makedirs(f"result/model/{task_name}", exist_ok=True)
        torch.save(net_running.state_dict(),
                   f"result/model/{task_name}/checkpoint_{epoch:02}.model")

        # Stop once the plateau scheduler has decayed the LR to (near) zero.
        if optimizer.param_groups[0]["lr"] < 1e-7:
            break
예제 #4
0
    def get_data(self, mode, size=None):
        """Return a dataloader for `mode`; `size` defaults to the configured dataset size."""
        if size is None:
            size = self.config.dataset_size

        # When preprocessing runs on CPU it is baked into the dataset pipeline.
        if self.config.preprocess_device == 'cpu':
            dataset = get_dataset(self.config, mode, size,
                                  preprocess=self.preprocess)
        else:
            dataset = get_dataset(self.config, mode, size)

        print(f"Dataset Size: {len(dataset)}")
        return get_dataloader(self.config, dataset)
예제 #5
0
def test_overall(Net, index):
    """Evaluate the distilled student network (checkpoint `index`) on the test set.

    Loads the joint network weights, assembles the student layers into a
    single block, runs the test set through it and prints the mean accuracy.
    """
    Net.load_jointnet("/temp_disk/yyl/save/overall-{}.pkl".format(index))
    test_loader, lentest = get_dataset(
        "/disk1/yyl/multi_model_distill/data/images_finetune_rename/test/",
        batchsize=16)
    print("Test dataset size: {}".format(lentest))

    # Collect the real layer of every student block into one sequential net.
    layerlist = [l.student.reallayer for l in Net.layers]

    Net.student_net = InnerBlock(layerlist)
    get_cuda(Net.student_net)

    num_image = 0
    accuracy = 0.0
    for i, (data, labels) in enumerate(test_loader):
        if use_cuda:
            data = get_cuda(data)
        data = get_variable(data)

        # Manual forward pass; flatten activations before any Linear layer.
        for model in Net.student_net.block:
            # isinstance is the idiomatic (and subclass-safe) type check.
            if isinstance(model, torch.nn.modules.linear.Linear):
                data = pyreshape(data)
            data = model(data)
        scores = data
        # Bug fix: np.int was removed in NumPy >= 1.20; the builtin int is
        # the documented replacement.
        acc = cal_accuracy(scores.data.cpu().numpy(),
                           labels.cpu().numpy().astype(int))
        accuracy += acc * data.size(0)
        num_image += data.size(0)

    print(index)
    print(accuracy / num_image)
def generate(args):
    """Run the trained generator over every photo sample and write JPEG outputs."""
    # Non-shuffled, batch-size-1 dataset so output indices match sample order.
    dataset, n_monet_samples, n_photo_samples = get_dataset(
        args.dataset,
        augment=args.augment,
        repeat=True,
        shuffle=False,
        from_npy=args.from_npy,
        batch_size=1)
    sample_iter = iter(dataset)

    # Restore the model from its checkpoint directory.
    model_name = get_model_name(args)
    model = get_model(args)
    model.load(os.path.join(args.checkpoint_dir, model_name))

    out_dir = os.path.join(args.result_dir, model_name)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    for idx in tqdm(range(n_photo_samples)):
        # Pull the next (style reference, photo) pair from the iterator.
        style_ref, img = next(sample_iter)

        # Generate, rescale from [-1, 1] to [0, 255], and save as JPEG.
        prediction = model.generate(img)
        pixels = (tf.squeeze(prediction).numpy() * 127.5 + 127.5).astype(np.uint8)
        out_img = PIL.Image.fromarray(pixels)
        out_img.save(os.path.join(out_dir, str(idx).zfill(4) + '.jpg'))
예제 #7
0
def create_train_model(model_creator, hps, scope=None, extra_args=None):
    """Build the TF1 training graph and wrap it in a TrainModel tuple.

    Args:
        model_creator: callable building the model from (iterator, hps, mode,
            vocab_table, scope).
        hps: hyper-parameter object providing vocab_file, vocab_size, unk_id,
            data_dir and train_prefix.
        scope: optional container/scope name; defaults to "train".
        extra_args: optional object whose `model_device_fn` (if set) controls
            variable placement, e.g. for distributed training.

    Returns:
        TrainModel(graph, model, iterator, skip_count_placeholder).
    """
    graph = tf.Graph()

    with graph.as_default(), tf.container(scope or "train"):
        vocab_table = data.create_vocab_tables(hps.vocab_file, hps.vocab_size,
                                               hps.unk_id)
        train_dataset = data.get_dataset(hps.data_dir, hps.train_prefix)

        # NOTE(review): this placeholder is returned to callers but is not
        # passed to get_iterator here -- presumably wired up by the caller
        # when resuming; confirm against data.get_iterator's signature.
        skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64)
        iterator = data.get_iterator(train_dataset, vocab_table, hps)

        # Note: One can set model_device_fn to
        # `tf.train.replica_device_setter(ps_tasks)` for distributed training.
        model_device_fn = None
        if extra_args: model_device_fn = extra_args.model_device_fn
        with tf.device(model_device_fn):
            model = model_creator(iterator=iterator,
                                  hps=hps,
                                  mode=tf.contrib.learn.ModeKeys.TRAIN,
                                  vocab_table=vocab_table,
                                  scope=scope)

    return TrainModel(graph=graph,
                      model=model,
                      iterator=iterator,
                      skip_count_placeholder=skip_count_placeholder)
예제 #8
0
def main(argv):
    """Pretrain a SimCLR model on TPU, checkpointing after every epoch."""
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    # Bring up the TPU system and create the distribution strategy.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)

    # Variables (model + optimizer) must be created under the strategy scope.
    with strategy.scope():
        simclr_model, input_size = build_model(model_type=FLAGS.model, n_dim=FLAGS.embedded_dim)
        optimizer = tf.keras.optimizers.Adam(learning_rate=FLAGS.learning_rate)
    simclr_model.compile(loss=simclr_loss_func,
                         optimizer=optimizer,
                         metrics=None)
    simclr_model.summary()

    callbacks = [
        tf.keras.callbacks.TensorBoard(log_dir=f"{FLAGS.job_dir}/pretrain/logs",
                                       histogram_freq=1),
    ]

    train_ds = get_dataset(FLAGS.dataset, "train", read_tfrecord, FLAGS.global_batch_size, input_size)

    # One fit() call per epoch so each epoch ends with a resumable checkpoint.
    for epoch in range(FLAGS.epochs):
        simclr_model.fit(train_ds, callbacks=callbacks,
                         initial_epoch=epoch, epochs=epoch + 1)
        simclr_model.save(f"{FLAGS.job_dir}/pretrain/checkpoints/{epoch + 1}",
                          include_optimizer=True)

    # Final optimizer-free export for inference.
    simclr_model.save(f"{FLAGS.job_dir}/pretrain/saved_model", include_optimizer=False)
예제 #9
0
def load_ae(path, target_dataset, batch, all_aes, return_dataset=False):
    """Rebuild a trained autoencoder from the parameters encoded in its path.

    The parent folders encode the dataset name and a '<ClassName>_<args>'
    string whose key/value pairs are parsed back into constructor kwargs.

    Returns:
        (ae, dataset) if return_dataset else (ae, checkpoint_folder_name).
    """
    # Bug fix: raw string -- '\d' is an invalid escape sequence in a plain
    # string literal (DeprecationWarning, a SyntaxError in future Pythons).
    r_param = re.compile(r'(?P<name>[a-zA-Z][a-z_]*)(?P<value>(True)|(False)|(\d+(\.\d+)?(,\d+)*))')
    folders = [x for x in os.path.abspath(path).split('/') if x]
    dataset = folders[-2]
    if dataset != target_dataset:
        tf.compat.v1.logging.log(tf.compat.v1.logging.WARN,
                       'Mismatched datasets between classfier and AE (%s, %s)',
                       target_dataset, dataset)
    class_name, argpairs = folders[-1].split('_', 1)
    params = {}
    for x in r_param.findall(argpairs):
        name, value = x[:2]
        if ',' in value:
            # Comma-separated lists are kept as the raw string.
            pass
        elif value in ('True', 'False'):
            value = {'True': True, 'False': False}[value]
        elif '.' in value:
            value = float(value)
        else:
            value = int(value)
        params[name] = value
    class_ = all_aes[class_name]
    dataset = data.get_dataset(dataset, dict(batch_size=batch))
    ae = class_(dataset, '/' + os.path.join(*(folders[:-2])), **params)
    if return_dataset:
        return ae, dataset
    else:
        return ae, folders[-1]
예제 #10
0
def train(args):
    """Train the DFVE model with its dual MMD losses, logging every print_freq steps."""
    dataset = data.get_dataset(args.dataset, training=True)
    model = DFVE(args.image_channels, args.image_size, args.n_latent, args.lambda_, args.gamma).to(args.device)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.learning_rate,
                           betas=(args.adam_beta1, args.adam_beta2),
                           weight_decay=args.weight_decay)

    model.train()
    step = 0
    for epoch in range(1, args.n_epochs + 1):
        # Fresh dataloader each epoch (reshuffles the data).
        loader, _ = data.get_dataloader(dataset, args.batch_size)
        for samples, labels in loader:
            step += 1
            batch = samples.to(args.device).float()
            z_mean, z_logvar = model(batch)
            loss, mmd_loss_from_prior, mmd_loss_for_mi = model.loss(z_mean, z_logvar, args.repeats)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step % args.print_freq == 0:
                print('[Epoch {:d}, Step {:d}] loss: {:.4f}, mmd_loss_from_prior: {:.4f}, mmd_loss_for_mi: {:.4f}'.format(
                    epoch, step, loss.item(), mmd_loss_from_prior.item(), mmd_loss_for_mi.item()))

                monitor(z_mean, z_logvar, labels, epoch, step)
        model.monitor()
예제 #11
0
def train(args):
    """Train DFVE with repeat-noise sampling; monitor latents every show_freq steps."""
    # Fix RNG seeds for reproducibility.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    dataset = data.get_dataset(args.dataset, training=True)
    model = DFVE(args.image_channels, args.image_size, args.n_latent).to(args.device)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.learning_rate,
                           betas=(args.adam_beta1, args.adam_beta2),
                           weight_decay=args.weight_decay)

    model.train()
    step = 0
    for epoch in range(1, args.n_epochs + 1):
        # Fresh dataloader each epoch (reshuffles the data).
        loader, _ = data.get_dataloader(dataset, args.batch_size)
        for samples, labels in loader:
            step += 1
            batch = samples.to(args.device).float()
            z = model(batch, args.repeats, args.noise_sigma)
            loss, mmd_loss_all, mmd_loss_avg = model.loss(z, args.gamma, args.kernel_gamma, args.kernel_power,
                                                          args.gnorm_mu, args.gnorm_sigma, args.gnorm_alpha)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if step % args.print_freq == 0:
                print('[Epoch {:d}, Step {:d}] loss: {:.4f}, mmd_loss_all: {:.4f}, mmd_loss_avg: {:.4f}'.format(
                    epoch, step, loss.item(), mmd_loss_all.item(), mmd_loss_avg.item()))

            if step % args.show_freq == 0:
                monitor(z, labels, epoch, step)
                model.monitor()
예제 #12
0
    def _combine_preds(self, X_train, X_cv, y, train=None, predict=None,
                       stack=False, fwls=False):
        """
        Combine preds, returning in order:
            - mean_preds: the simple average of all model predictions
            - stack_preds: the predictions of the stage 1 generalizer
            - fwls_preds: same as stack_preds, but optionally using more
                complex blending schemes (meta-features, different
                generalizers, etc.)
        """
        mean_preds = np.mean(X_cv, axis=1)
        stack_preds = None
        fwls_preds = None

        if stack:
            self.generalizer.fit(X_train, y)
            stack_preds = self.generalizer.predict(X_cv)

        if self.fwls:
            meta, meta_cv = get_dataset('metafeatures', train, predict)
            fwls_train = np.hstack((X_train, meta))
            # Bug fix: the CV matrix must be paired with the CV meta-features
            # (previously the training meta-features were reused, which also
            # breaks when X_train and X_cv have different row counts).
            fwls_cv = np.hstack((X_cv, meta_cv))
            # Bug fix: fit() needs the targets, mirroring the stacking branch.
            self.generalizer.fit(fwls_train, y)
            fwls_preds = self.generalizer.predict(fwls_cv)

        return mean_preds, stack_preds, fwls_preds
예제 #13
0
    def __init__(self,
                 module_path,
                 hyper_params,
                 use_cuda,
                 test_rate=1.0,
                 USE_EXIST_RES=False,
                 mission=1):
        """Load a trained ResNet from `module_path` and set up validation on the test split."""
        print("Test rate:", test_rate)
        # Only the validation split of the dataset is kept for testing.
        _, self.dataset = get_dataset(valid_rate=test_rate,
                                      USE_TRANSFORM=False,
                                      mission=mission)
        print("test number:", len(self.dataset))

        self.hyper_params = hyper_params
        # Deterministic order (shuffle=False) keeps evaluation reproducible.
        self.data_loader = DataLoader(dataset=self.dataset,
                                      num_workers=self.hyper_params["threads"],
                                      batch_size=self.hyper_params["batch_size"],
                                      shuffle=False)

        # Restore the trained weights, optionally moving the model to GPU.
        network = get_network(mission=mission)
        network.load_state_dict(torch.load(module_path))
        self.resnet = network.cuda() if use_cuda else network

        self.v = Validator(resnet=self.resnet,
                           hyper_params=hyper_params,
                           use_cuda=use_cuda,
                           data_loader=self.data_loader)
예제 #14
0
def main(dataset, factor, outputdir):
    """Export `dataset` as C4.5-style .names/.data files, scaling features by `factor`."""
    factor = float(factor)
    dset = get_dataset(dataset)
    X = (factor * dset.instances).astype(int)

    # Remove irrelevant columns (all feature values identical)
    spread = np.max(X, axis=0) - np.min(X, axis=0)
    X = X[:, np.nonzero(spread)[0]]

    namesfile = os.path.join(outputdir, NAMES_FMT % dataset)
    datafile = os.path.join(outputdir, DATA_FMT % dataset)

    # Write the names file: class values, id attributes, then feature schema.
    with open(namesfile, 'w+') as f:
        f.write('0,1.\n')
        f.write('bag_id: %s.\n' % ','.join(dset.bag_ids))
        f.write('instance_id: %s.\n' %
                ','.join([iid[1] for iid in dset.instance_ids]))
        for col in range(X.shape[1]):
            f.write('f%d: continuous.\n' % (col + 1))

    # Write one data row per instance: ids, features, label.
    with open(datafile, 'w+') as f:
        rows = zip(dset.instance_ids, X, dset.instance_labels)
        for (bid, iid), xx, y in rows:
            xs = ','.join(map(str, xx))
            f.write('%s,%s,%s,%d.\n' % (bid, iid, xs, y))
예제 #15
0
File: utils.py  Project: shikharbahl/acai
def load_ae(path, target_dataset, batch, all_aes, return_dataset=False):
    """Rebuild a trained autoencoder from the parameters encoded in its path.

    The parent folders encode the dataset name and a '<ClassName>_<args>'
    string whose key/value pairs are parsed back into constructor kwargs.

    Returns:
        (ae, dataset) if return_dataset else (ae, checkpoint_folder_name).
    """
    # Bug fix: raw string -- '\d' is an invalid escape sequence in a plain
    # string literal (DeprecationWarning, a SyntaxError in future Pythons).
    r_param = re.compile(r'(?P<name>[a-zA-Z][a-z_]*)(?P<value>(True)|(False)|(\d+(\.\d+)?(,\d+)*))')
    folders = [x for x in os.path.abspath(path).split('/') if x]
    dataset = folders[-2]
    if dataset != target_dataset:
        tf.logging.log(tf.logging.WARN,
                       'Mismatched datasets between classfier and AE (%s, %s)',
                       target_dataset, dataset)
    class_name, argpairs = folders[-1].split('_', 1)
    params = {}
    for x in r_param.findall(argpairs):
        name, value = x[:2]
        if ',' in value:
            # Comma-separated lists are kept as the raw string.
            pass
        elif value in ('True', 'False'):
            # Bug fix: dict(True=True, False=False) is a SyntaxError in
            # Python 3 (True/False are keywords); use a dict literal instead.
            value = {'True': True, 'False': False}[value]
        elif '.' in value:
            value = float(value)
        else:
            value = int(value)
        params[name] = value
    class_ = all_aes[class_name]
    dataset = data.get_dataset(dataset, dict(batch_size=batch))
    ae = class_(dataset, '/' + os.path.join(*(folders[:-2])), **params)
    if return_dataset:
        return ae, dataset
    else:
        return ae, folders[-1]
예제 #16
0
def main(arguments):
    """Train the model across all available GPUs and save the final weights."""
    print('===Start of program.===')

    tf.random.set_random_seed(12345)
    input_pattern = arguments.input_pattern
    output_name = arguments.output

    # Scale the global batch size with the number of GPUs in the strategy.
    available_gpus = get_available_gpus()
    base_batch_size = 32
    total_batch_size = base_batch_size * len(available_gpus)

    dataset = parse_dataset(get_dataset(input_pattern),
                            batch_size=total_batch_size)
    strategy = get_train_strategy(available_gpus)
    callback = get_callbacks()

    with strategy.scope():
        optimizer = get_sgd_optimizer()
        model = create_model()
        cce_loss = losses.CategoricalCrossentropy(from_logits=True)
        print(model.summary())
        model.compile(loss=cce_loss, optimizer=optimizer)

        model.fit(dataset, epochs=20, callbacks=callback)
        # Ensure the weights file gets an .h5 extension.
        if not output_name.endswith('.h5'):
            output_name += 'model.h5'
        model.save_weights(output_name, overwrite=True)
    print('===End of program.===')
예제 #17
0
def build_data_iterator(hps, files, current_res_h, current_res_w, batch_size=None, label_list=None,
                        num_shards=None, shard_index=None):
    """Shuffle `files` in place and return a one-shot iterator over the dataset.

    Bug fix: `num_shards` and `shard_index` are now forwarded to get_dataset();
    previously they were accepted but silently overridden with None, so
    sharded input configurations were ignored.
    """
    random.shuffle(files)
    dataset = get_dataset(files, current_res_h, current_res_w, hps.epochs_per_res, batch_size,
                          label_list=label_list, num_shards=num_shards, shard_index=shard_index)
    it = dataset.make_one_shot_iterator()
    return it
예제 #18
0
    def _combine_preds(self,
                       X_train,
                       X_cv,
                       y,
                       train=None,
                       predict=None,
                       stack=False,
                       fwls=False):
        """
        Combine preds, returning in order:
            - mean_preds: the simple average of all model predictions
            - stack_preds: the predictions of the stage 1 generalizer
            - fwls_preds: same as stack_preds, but optionally using more
                complex blending schemes (meta-features, different
                generalizers, etc.)
        """
        mean_preds = np.mean(X_cv, axis=1)
        stack_preds = None
        fwls_preds = None

        if stack:
            self.generalizer.fit(X_train, y)
            stack_preds = self.generalizer.predict(X_cv)

        if self.fwls:
            meta, meta_cv = get_dataset('metafeatures', train, predict)
            fwls_train = np.hstack((X_train, meta))
            # Bug fix: the CV matrix must be paired with the CV meta-features
            # (previously the training meta-features were reused, which also
            # breaks when X_train and X_cv have different row counts).
            fwls_cv = np.hstack((X_cv, meta_cv))
            # Bug fix: fit() needs the targets, mirroring the stacking branch.
            self.generalizer.fit(fwls_train, y)
            fwls_preds = self.generalizer.predict(fwls_cv)

        return mean_preds, stack_preds, fwls_preds
예제 #19
0
def train():
    """Train an LNN (Lagrangian neural network) on spring data with an L1 loss.

    Runs 500 optimization steps; each step accumulates the loss over the
    first 100 samples before a single optimizer update.
    """
    data = get_dataset()
    x = torch.tensor(data['x'], requires_grad=True,
                     dtype=torch.float32)  # x is actually position and velocity
    test_x = torch.tensor(data['test_x'],
                          requires_grad=True,
                          dtype=torch.float32)
    # NOTE(review): the *training* targets are taken from data['test_dx'],
    # the same source as the test targets -- looks like it should be
    # data['dx']; confirm against get_dataset().
    _, acce = torch.Tensor(data['test_dx']).chunk(2, 1)
    _, test_acce = torch.Tensor(data['test_dx']).chunk(2, 1)
    # Each sample is (position, velocity), so degrees of freedom = width / 2.
    N, freedom = x.shape
    freedom /= 2
    input_dim = int(freedom * 2)
    output_dim = int(freedom)
    model_nn = MLP(input_dim, 50, output_dim, 'tanh')
    model = LNN(input_dim, differentiable_model=model_nn)
    optim = torch.optim.Adam(model.parameters(), 5e-3, weight_decay=1e-4)
    # vanilla train loop
    stats = {'train_loss': [], 'test_loss': []}
    torch.autograd.set_detect_anomaly(True)
    for step in range(500):  # 500 training steps
        # train step: accumulate the L1 loss over the first 100 samples.
        loss = 0
        for i in range(100):
            acce_hat = model.forward_new(x[i])
            loss = loss + L1_loss(acce[i], acce_hat)
        loss.backward()
        # NOTE(review): the loss is averaged only *after* backward(), so the
        # gradients come from the summed loss while the printed/logged value
        # is the mean. Presumably intentional (scales the effective LR by
        # 100x), but worth confirming.
        loss /= 100
        optim.step()
        optim.zero_grad()
        print("step {}, train_loss {:.4e}, ".format(step, loss))
        writer.add_scalar('LNN/spring_train_loss', loss, step)
예제 #20
0
    def test_get_dataset_raw(self):
        """get_dataset should yield the raw images in file order."""
        with self.test_session():
            def _write_png(tensor, filename):
                # Encode the tensor as PNG and persist it under test_files/.
                png_bytes = tf.image.encode_png(tensor).eval()
                with open(os.path.join("test_files", filename), "wb") as f:
                    f.write(png_bytes)

            expected1 = tf.constant(np.arange(4 * 4 * 3),
                                    shape=[4, 4, 3],
                                    dtype=tf.uint8)
            print(os.getcwd())
            _write_png(expected1, "test1.png")

            expected2 = tf.constant(np.flip(np.arange(4 * 4 * 3), axis=0),
                                    shape=[4, 4, 3],
                                    dtype=tf.uint8)
            _write_png(expected2, "test2.png")

            files = glob.glob(os.path.join("test_files", "test*.png"))
            dataset = get_dataset(files)

            # Images must come back unmodified and in the glob order.
            it = dataset.make_one_shot_iterator()
            self.assertAllClose(it.get_next(), expected1)
            self.assertAllClose(it.get_next(), expected2)
예제 #21
0
    def test_preprocess_dataset_batch2_float_tfrecord(self):
        """preprocess_dataset should batch, resize, and rescale pixels into [-1, 1]."""
        with self.test_session():
            def _write_png(tensor, filename):
                # Encode the tensor as PNG and persist it under test_files/.
                png_bytes = tf.image.encode_png(tensor).eval()
                with open(os.path.join("test_files", filename), "wb") as f:
                    f.write(png_bytes)

            source1 = tf.constant(np.arange(4 * 4 * 3) * 5,
                                  shape=[4, 4, 3],
                                  dtype=tf.uint8)
            _write_png(source1, "test1.png")

            source2 = tf.constant(np.flip(np.arange(4 * 4 * 3) * 5, axis=0),
                                  shape=[4, 4, 3],
                                  dtype=tf.uint8)
            _write_png(source2, "test2.png")

            files = glob.glob(os.path.join("test_files", "test*.png"))
            dataset = preprocess_dataset(get_dataset(files),
                                         size=[64, 64],
                                         batch_size=2,
                                         float_pixels=True)

            batch = dataset.make_one_shot_iterator().get_next().eval()
            self.assertEqual(batch.shape, (2, 64, 64, 3))
            # Pixel extremes must map through x / 127.5 - 1.
            pixels = source1.eval().flatten()
            self.assertAllClose(max(batch.flatten()),
                                max(pixels) / 127.5 - 1.)
            self.assertAllClose(min(batch.flatten()),
                                min(pixels) / 127.5 - 1.)
예제 #22
0
def main(configfile, folddir, resultsdir, outputfile):
    """Aggregate per-task prediction files into cumulative AUC rows in `outputfile`.

    For every (classifier, dataset, kernel) key, the predictions of all
    finished reps/folds are cumulatively averaged and scored against the
    true bag labels. Keys already present in `outputfile`, or with
    incomplete results, are skipped.

    Fixes: converted from Python 2 to Python 3 (print function; map()
    results materialized before concatenation) and switched to
    yaml.safe_load for the config file.
    """
    with open(configfile, 'r') as f:
        # safe_load: the config is plain data; avoid executing arbitrary tags.
        configuration = yaml.safe_load(f)

    # Generate tasks from experiment list
    tasks = {}
    for experiment in configuration['experiments']:
        classifier = experiment['classifier']
        dataset = experiment['dataset']
        folds = data.get_folds(folddir, dataset)
        for f in range(len(folds)):
            for r in range(experiment['reps']):
                key = (classifier, dataset, experiment['kernel'], f, r)
                task = Task(*key)
                tasks[key] = task

    # Mark finished tasks (a prediction file on disk means the task is done).
    for task in tasks.values():
        predfile = os.path.join(resultsdir, task.filebase('preds'))
        task.predfile = predfile
        if os.path.exists(predfile):
            task.finish()

    # Group tasks by (classifier, dataset, kernel), then by rep.
    reindexed = defaultdict(lambda: defaultdict(list))
    for (c, d, k, f, r), task in tasks.items():
        reindexed[(c, d, k)][r].append(task)

    # Keys already written to the output file are skipped below.
    existing_keys = set()
    if os.path.exists(outputfile):
        with open(outputfile, 'r') as f:
            for line in f:
                c, d, k = line.strip().split(',')[:3]
                existing_keys.add((c, d, k))

    with open(outputfile, 'a+') as f:
        rep_aucs = defaultdict(list)
        for key, reps in sorted(reindexed.items()):
            if key in existing_keys:
                print('Skipping %s (already finished)...' % str(key))
                continue
            data_dict = data.get_dataset(key[1])
            bag_ids = sorted(data_dict.keys())
            y_true = [data_dict[bid][1] for bid in bag_ids]

            predictions = []
            for rep, task_list in sorted(reps.items()):
                if all(task.finished for task in task_list):
                    predictions.append(get_preds(key, task_list, bag_ids))
                else:
                    break
            if len(predictions) != len(reps):
                print('Skipping %s (incomplete)...' % str(key))
                continue
            predictions = np.vstack(predictions)
            # We want a cumulative average, but doesn't matter for AUC
            cumpreds = np.cumsum(predictions, axis=0)
            aucs = [auc_score(y_true, cp) for cp in cumpreds]
            # list(...) is required: Python 3 map objects can't be concatenated.
            line = ','.join(list(map(str, key)) + list(map(str, aucs)))
            print(line)
            f.write(line + '\n')
def train(args):
    """Fit the model on the Monet/photo dataset with per-run callbacks."""
    # Shuffled, repeating dataset for training.
    dataset, n_monet_samples, n_photo_samples = get_dataset(
        args.dataset,
        augment=args.augment,
        repeat=True,
        shuffle=True,
        batch_size=args.batch_size,
        autotune=1,
        from_npy=args.from_npy,
        cache=False)

    model = get_model(args)

    # Model name identifies the checkpoint/weight files for this run.
    model_name = get_model_name(args)

    # An epoch covers the larger of the two sample pools.
    steps = max(n_monet_samples, n_photo_samples) // args.batch_size
    history = model.fit(dataset,
                        epochs=args.epochs,
                        batch_size=args.batch_size,
                        steps_per_epoch=steps,
                        callbacks=get_callbacks(args))
예제 #24
0
def train_encoding(args):
    """Train only the encoder of the model with MMD-based encoding losses.

    Runs until args.max_examples training examples have been consumed.
    Progress is printed, and a checkpoint is written, whenever the running
    example count crosses a BASE_N boundary listed in args.save_points.
    """
    # Fix RNG seeds for reproducibility.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # Checkpoint directory name encodes the latent size and gnorm alpha.
    tag = 'latents_{:d}_alpha_{:d}'.format(args.n_latent,
                                           int(args.gnorm_alpha))
    save_dir = os.path.join(args.save_base, tag)
    U.mkdir(save_dir)

    dataset = data.get_dataset(args.dataset, training=True)
    model = Model(args.image_channels, args.image_size, args.n_latent,
                  args.n_dims).to(args.device)
    # Only the encoder's parameters are optimized in this phase.
    optimizer = optim.Adam(model.encoder.parameters(),
                           lr=args.learning_rate,
                           betas=(args.adam_beta1, args.adam_beta2),
                           weight_decay=args.weight_decay)

    model.train()
    step = 0
    epoch = 0
    examples = 0
    # Budget is expressed in examples consumed, not in epochs.
    while examples < args.max_examples:
        epoch += 1
        loader, _ = data.get_dataloader(dataset, args.batch_size)
        for samples, labels in loader:
            step += 1
            x = samples.to(args.device).float()  # B x C x H x W
            z = model(x, args.repeats, args.noise_sigma,
                      'encoding')  # B x repeats x n_latent
            loss, mmd_loss_all, mmd_loss_avg = model.encoding_loss(
                z, args.gamma, args.kernel_gamma, args.kernel_power,
                args.gnorm_mu, args.gnorm_sigma, args.gnorm_alpha)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            prev_examples = examples
            examples += x.size(0)

            # Log each time the example count crosses a BASE_N boundary.
            if examples // BASE_N > prev_examples // BASE_N:
                print(
                    '[Epoch {:d}, Step {:d}, #Eg. {:d}] loss: {:.4f}, mmd_loss_all: {:.4f}, mmd_loss_avg: {:.4f}'
                    .format(epoch, step, examples, loss.item(),
                            mmd_loss_all.item(), mmd_loss_avg.item()))
                # Checkpoint only at the designated example milestones.
                if examples // BASE_N in args.save_points:
                    path = os.path.join(
                        save_dir, 'training_examples_{:d}_10k.ckpt'.format(
                            examples // BASE_N))
                    print('save {}'.format(path))
                    torch.save(
                        {
                            'examples': examples // BASE_N * BASE_N,
                            'loss': loss.item(),
                            'mmd_loss_all': mmd_loss_all.item(),
                            'mmd_loss_avg': mmd_loss_avg.item(),
                            'model_state_dict': model.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict()
                        }, path)
예제 #25
0
def main(base_dataset, new_dataset):
    """Build a one-bag-per-instance view of `base_dataset` and save it."""
    source = get_dataset(base_dataset)
    view = ViewBuilder(new_dataset, base_dataset)

    # Each instance becomes its own singleton bag ('b<i>' / 'i<i>').
    pairs = zip(source.instance_ids, source.instance_labels)
    for idx, ((bid, iid), label) in enumerate(pairs):
        view.add(bid, iid, 'b%d' % idx, 'i%d' % idx, label)

    view.save(VIEWS_PATH[0])
예제 #26
0
def _true_labels(dataset, level='bags'):
    """Return a {id: label} mapping at bag or instance granularity."""
    dset = get_dataset(dataset)
    if level.startswith('b'):
        ids, labels = dset.bag_ids, dset.bag_labels
    elif level.startswith('i'):
        ids, labels = dset.instance_ids, dset.instance_labels
    else:
        raise ValueError('Bad level type "%s"' % level)
    return dict(zip(ids, labels.flat))
예제 #27
0
def _true_labels(dataset, level='bags'):
    """Map bag or instance ids to their ground-truth labels.

    `level` is matched on its first character: 'b…' selects bags,
    'i…' selects instances; anything else raises ValueError.
    """
    dset = get_dataset(dataset)
    prefix = level[:1]
    if prefix == 'b':
        return dict(zip(dset.bag_ids, dset.bag_labels.flat))
    if prefix == 'i':
        return dict(zip(dset.instance_ids, dset.instance_labels.flat))
    raise ValueError('Bad level type "%s"' % level)
예제 #28
0
파일: main.py 프로젝트: yxc775/eecs440
def main(**options):
    """Run k-fold cross-validation and print aggregate statistics.

    Expected keys in **options: 'dataset', 'k', optionally
    'dataset_directory', 'fs_algorithm'/'fs_features' and
    'meta_algorithm'/'meta_iters'; all remaining keys (plus the loaded
    schema) are forwarded to get_classifier().
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # Bagging/boosting requires an explicit iteration count.
        raise ValueError("Please indicate number of iterations for %s" %
                         options["meta_algorithm"])

    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" %
                             fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    options['schema'] = schema
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    for train_X, train_y, test_X, test_y in folds:

        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)

        # Train classifier (optionally after feature selection) and time it.
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y)
        # BUG FIX: was (train_start - time.time()), which reported a
        # negative duration; elapsed time is end minus start.
        train_time = time.time() - train_start

        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the probability column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    print('      Accuracy: %.03f %.03f' %
          stats_manager.get_statistic('accuracy', pooled=False))

    print('     Precision: %.03f %.03f' %
          stats_manager.get_statistic('precision', pooled=False))

    print('        Recall: %.03f %.03f' %
          stats_manager.get_statistic('recall', pooled=False))

    print('Area under ROC: %.03f' %
          stats_manager.get_statistic('auc', pooled=True))
예제 #29
0
def create_dataloader(name, transform, val_ratio, batch_size, workers):
    """Build train/val/test DataLoaders with a per-class stratified val split.

    The training split is sorted by label, then `val_ratio` of each class's
    samples is drawn at random for validation; the rest form the train set.
    Assumes every class has the same number of training samples.
    """
    ds_train = get_dataset(name, 'train', transform['train'])
    ds_val = get_dataset(name, 'train', transform['eval'])
    ds_test = get_dataset(name, 'test', transform['eval'])

    order = np.argsort(ds_train.train_labels)
    n_classes = ds_train.train_labels[order[-1]] + 1
    per_class = len(ds_train) // n_classes
    n_val = int(val_ratio * per_class)

    val_parts = [np.array([], dtype=np.int32)]
    train_parts = [np.array([], dtype=np.int32)]
    for cls in range(n_classes):
        perm = np.random.permutation(range(per_class))
        base = per_class * cls
        # Map the within-class permutation back to dataset indices.
        val_parts.append(order[base + perm[0:n_val]])
        train_parts.append(order[base + perm[n_val:]])

    val_idx = np.concatenate(val_parts)
    train_idx = np.concatenate(train_parts)

    sampler_train = torch.utils.data.sampler.SubsetRandomSampler(train_idx)
    sampler_val = torch.utils.data.sampler.SubsetRandomSampler(val_idx)

    # shuffle=False because the samplers already randomize order.
    train_loader = torch.utils.data.DataLoader(ds_train,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               sampler=sampler_train,
                                               num_workers=workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(ds_val,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             sampler=sampler_val,
                                             num_workers=workers,
                                             pin_memory=True)
    test_loader = torch.utils.data.DataLoader(ds_test,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              num_workers=workers,
                                              pin_memory=True)

    return train_loader, val_loader, test_loader
예제 #30
0
def main(**options):
    """Run k-fold cross-validation and print aggregate statistics.

    Variant that passes the schema positionally to the classifier's
    fit/predict. Expected keys in **options: 'dataset', 'k', optionally
    'dataset_directory', 'fs_algorithm'/'fs_features' and
    'meta_algorithm'/'meta_iters'; remaining keys go to get_classifier().
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        # --meta-iters is required when bagging/boosting is requested.
        raise ValueError("Please indicate number of iterations for %s" % options["meta_algorithm"])

    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    for train_X, train_y, test_X, test_y in folds:

        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)

        # Train classifier (optionally after feature selection) and time it.
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y, schema)
        # BUG FIX: was (train_start - time.time()), which is always negative.
        train_time = time.time() - train_start
        print(train_time)

        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X, schema)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]    # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    print('      Accuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))

    print('     Precision: %.03f %.03f'
          % stats_manager.get_statistic('precision', pooled=False))

    print('        Recall: %.03f %.03f'
          % stats_manager.get_statistic('recall', pooled=False))

    print('Area under ROC: %.03f'
          % stats_manager.get_statistic('auc', pooled=True))
예제 #31
0
    def __init__(self, args):
        """Set up one DDP rank: model, train losses + optimizer + loader
        (when args.split == 'train') or eval metrics + loader (otherwise),
        optional checkpoint resume, and a TensorBoard writer on rank 0.
        """
        self.args = args

        args.logger.info('Initializing trainer')
        self.model = get_model(args)
        params_cnt = count_parameters(self.model)
        args.logger.info("params "+str(params_cnt))
        # One process per GPU: pin this rank's device before wrapping in DDP.
        torch.cuda.set_device(args.rank)
        self.model.cuda(args.rank)
        self.model = torch.nn.parallel.DistributedDataParallel(self.model,
                device_ids=[args.rank])
        train_dataset, val_dataset = get_dataset(args)

        if args.split == 'train':
            # train loss
            self.RGBLoss = RGBLoss(args, sharp=False)
            self.SegLoss = nn.CrossEntropyLoss()
            self.RGBLoss.cuda(args.rank)
            self.SegLoss.cuda(args.rank)

            # NOTE(review): no fallback branch — an unrecognized
            # args.optimizer leaves self.optimizer unset and fails later.
            if args.optimizer == "adamax":
                self.optimizer = torch.optim.Adamax(list(self.model.parameters()), lr=args.learning_rate)
            elif args.optimizer == "adam":
                self.optimizer = torch.optim.Adam(self.model.parameters(), lr=args.learning_rate)
            elif args.optimizer == "sgd":
                self.optimizer = torch.optim.SGD(self.model.parameters(), lr=args.learning_rate, momentum=0.9)

            # DistributedSampler shards + shuffles, hence shuffle=False here;
            # batch size is the global batch divided across GPUs.
            train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
            self.train_loader = torch.utils.data.DataLoader(
                train_dataset, batch_size=args.batch_size//args.gpus, shuffle=False,
                num_workers=args.num_workers, pin_memory=True, sampler=train_sampler)

        else:
            # val criteria
            self.L1Loss  = nn.L1Loss().cuda(args.rank)
            self.PSNRLoss = PSNR().cuda(args.rank)
            self.SSIMLoss = SSIM().cuda(args.rank)
            self.IoULoss = IoU().cuda(args.rank)
            self.VGGCosLoss = VGGCosineLoss().cuda(args.rank)

            val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
            self.val_loader = torch.utils.data.DataLoader(
                val_dataset, batch_size=args.batch_size//args.gpus, shuffle=False,
                num_workers=args.num_workers, pin_memory=True, sampler=val_sampler)

        torch.backends.cudnn.benchmark = True
        self.global_step = 0
        self.epoch=1
        # Resume when asked, or when evaluating a single checkpoint.
        if args.resume or ( args.split != 'train' and not args.checkepoch_range):
            self.load_checkpoint()

        if args.rank == 0:
            # Only rank 0 writes TensorBoard logs.
            writer_name = args.path+'/{}_int_{}_len_{}_{}_logs'.format(self.args.split, int(self.args.interval), self.args.vid_length, self.args.dataset)
            self.writer = SummaryWriter(writer_name)

        self.stand_heat_map = self.create_stand_heatmap()
예제 #32
0
    def __init__(self, config):
        """Build a WAE-GAN experiment from `config`: hyperparameters, a
        timestamped run directory with image/model subfolders, the
        encoder/decoder/discriminator models, decayed Adam optimizers,
        train/test datasets, and per-epoch metric trackers.
        """
        self.h_dim = config["h_dim"]
        self.z_dim = config["z_dim"]
        self.epochs = config["num_epochs"]
        self.batch_size = config["batch_size"]
        self.sigma_z = config["sigma_z"]
        self.lmbda = config["lambda"]

        # Experiment directory
        self.logdir = os.path.join("runs", \
            datetime.now().strftime("wae_gan_%d_%m_%Y-%H:%M:%S"))
        self.writer = tf.summary.create_file_writer(self.logdir)
        # Record the full config in TensorBoard and as a JSON file so the
        # run is reproducible from the log directory alone.
        with self.writer.as_default():
            tf.summary.text("Hyperparams", json.dumps(config), step=0)
        self.writer.flush()
        os.mkdir(os.path.join(self.logdir, "img"))
        os.mkdir(os.path.join(self.logdir, "img", "random"))
        os.mkdir(os.path.join(self.logdir, "img", "recons"))
        os.mkdir(os.path.join(self.logdir, "models"))
        os.mkdir(os.path.join(self.logdir, "models", "encoder"))
        os.mkdir(os.path.join(self.logdir, "models", "decoder"))
        os.mkdir(os.path.join(self.logdir, "models", "discriminator"))
        with open(os.path.join(self.logdir, "config.json"), "w") as f:
            json.dump(config, f)

        # Models ================================================================
        self.encoder, self.decoder, self.discriminator = get_ae_disc(config)

        # Optimizers ============================================================
        # Separate exponential-decay schedules for the autoencoder pair and
        # the discriminator; encoder and decoder share one schedule.
        ae_scheduler = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=config["ae_lr"],
            decay_steps=config["ae_dec_steps"],
            decay_rate=config["ae_dec_rate"])
        disc_scheduler = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=config["d_lr"],
            decay_steps=config["d_dec_steps"],
            decay_rate=config["d_dec_rate"])

        self.enc_optim = tf.keras.optimizers.Adam(ae_scheduler)
        self.dec_optim = tf.keras.optimizers.Adam(ae_scheduler)
        self.disc_optim = tf.keras.optimizers.Adam(disc_scheduler)

        # Data ==================================================================
        tf.print("Loading data...")
        self.train_dataset, self.test_dataset = \
            get_dataset(batch_size=self.batch_size)
        tf.print("Done.")

        # Metric trackers =======================================================
        self.avg_d_train_loss = tf.keras.metrics.Mean(dtype=tf.float32)
        self.avg_d_test_loss = tf.keras.metrics.Mean(dtype=tf.float32)
        self.avg_d_z_loss = tf.keras.metrics.Mean(dtype=tf.float32)
        self.avg_mse_test_loss = tf.keras.metrics.Mean(dtype=tf.float32)
        self.avg_enc_dec_train_loss = tf.keras.metrics.Mean(dtype=tf.float32)
        self.avg_enc_dec_test_loss = tf.keras.metrics.Mean(dtype=tf.float32)
예제 #33
0
def train(args):
  """Train an HNN (or baseline MLP) to predict time derivatives of the
  system state, returning the trained model and per-step loss statistics.
  """
  # set random seed
  torch.manual_seed(args.seed)
  np.random.seed(args.seed)

  # init model and optimizer
  if args.verbose:
    print("Training baseline model:" if args.baseline else "Training HNN model:")

  output_dim = args.input_dim if args.baseline else 2
  nn_model = MLP(args.input_dim, args.hidden_dim, output_dim, args.nonlinearity)
  model = HNN(args.input_dim, differentiable_model=nn_model,
            field_type=args.field_type, baseline=args.baseline)
  optim = torch.optim.Adam(model.parameters(), args.learn_rate, weight_decay=0)

  # arrange data
  # requires_grad on the coordinates: time_derivative differentiates
  # through the inputs.
  data = get_dataset(args.name, args.save_dir, verbose=True)
  x = torch.tensor( data['coords'], requires_grad=True, dtype=torch.float32)
  test_x = torch.tensor( data['test_coords'], requires_grad=True, dtype=torch.float32)
  dxdt = torch.Tensor(data['dcoords'])
  test_dxdt = torch.Tensor(data['test_dcoords'])

  # vanilla train loop
  stats = {'train_loss': [], 'test_loss': []}
  for step in range(args.total_steps+1):

    # train step: random minibatch without replacement within the step
    ixs = torch.randperm(x.shape[0])[:args.batch_size]
    dxdt_hat = model.time_derivative(x[ixs])
    dxdt_hat += args.input_noise * torch.randn(*x[ixs].shape) # add noise, maybe
    loss = L2_loss(dxdt[ixs], dxdt_hat)
    loss.backward()
    # snapshot the flattened gradient before it is cleared, for logging
    grad = torch.cat([p.grad.flatten() for p in model.parameters()]).clone()
    optim.step() ; optim.zero_grad()

    # run test data
    test_ixs = torch.randperm(test_x.shape[0])[:args.batch_size]
    test_dxdt_hat = model.time_derivative(test_x[test_ixs])
    test_dxdt_hat += args.input_noise * torch.randn(*test_x[test_ixs].shape) # add noise, maybe
    test_loss = L2_loss(test_dxdt[test_ixs], test_dxdt_hat)

    # logging
    stats['train_loss'].append(loss.item())
    stats['test_loss'].append(test_loss.item())
    if args.verbose and step % args.print_every == 0:
      print("step {}, train_loss {:.4e}, test_loss {:.4e}, grad norm {:.4e}, grad std {:.4e}"
          .format(step, loss.item(), test_loss.item(), grad@grad, grad.std()))

  # final evaluation on the full train/test sets; report mean +/- SEM
  train_dxdt_hat = model.time_derivative(x)
  train_dist = (dxdt - train_dxdt_hat)**2
  test_dxdt_hat = model.time_derivative(test_x)
  test_dist = (test_dxdt - test_dxdt_hat)**2
  print('Final train loss {:.4e} +/- {:.4e}\nFinal test loss {:.4e} +/- {:.4e}'
    .format(train_dist.mean().item(), train_dist.std().item()/np.sqrt(train_dist.shape[0]),
            test_dist.mean().item(), test_dist.std().item()/np.sqrt(test_dist.shape[0])))
  return model, stats
예제 #34
0
    def GET(self):
        """Render the event-type breakdown page for the top companies."""
        top_n = 3  # number of top companies to break down

        dataset = data.get_dataset()
        breakdown = data.EventBreakdown(dataset)
        by_type, type_names = breakdown.break_down_events(top_n)

        return render.event2comp(by_type, type_names, top_n)
예제 #35
0
    def build_validation_data_loader(self) -> DataLoader:
        """Return a DataLoader over the validation split, downloading the
        dataset first if this trial has not fetched it yet."""
        if not self.data_downloaded:
            self.download_directory = data.download_dataset(
                download_directory=self.download_directory,
                data_config=self.context.get_data_config(),
            )
            self.data_downloaded = True

        val_set = data.get_dataset(self.download_directory, train=False)
        batch = self.context.get_per_slot_batch_size()
        return DataLoader(val_set, batch_size=batch)
예제 #36
0
def main(outputfile):
    """Write one CSV row of statistics per dataset to `outputfile`.

    The first line is a '#'-prefixed header of statistic names.
    """
    progress = ProgressMonitor(total=len(DATASETS), msg='Extracting statistics')
    with open(outputfile, 'w+') as out:
        header = ','.join(name for name, _ in STATISTICS)
        out.write('#%s\n' % header)
        for dataset_name in DATASETS:
            dset = get_dataset(dataset_name)
            dset.name = dataset_name
            row = ','.join(str(fn(dset)) for _, fn in STATISTICS)
            out.write('%s\n' % row)
            progress.increment()
예제 #37
0
def get_var(dataset):
    """Mean intra-bag pairwise Euclidean distance, averaged over positive bags."""
    dset = data.get_dataset(dataset)
    total = 0.0
    positive_bags = 0
    for _, bag, label, inst_labels in dset.bag_dict.values():
        # Deliberately the same explicit `!= True` test as the original
        # contract; only bags labeled exactly True count as positive.
        if label != True:
            continue
        positive_bags += 1
        pos_insts = bag[np.array(inst_labels), :]
        total += np.average(cdist(pos_insts, pos_insts, 'euclidean'))
    return total / positive_bags
예제 #38
0
def main(dataset):
    """Print summary statistics (features, instances, bags, class balance)
    for a multiple-instance dataset.

    FIX: converted Python 2 print statements to print() calls so the block
    is valid Python 3; output is byte-identical.
    """
    dset = get_dataset(dataset)
    i, f = dset.instances.shape
    b = len(dset.bags)
    p = sum(dset.bag_labels)
    n = b - p
    print('Dataset.............%s' % dataset)
    print('Features............%d' % f)
    print('Instances...........%d' % i)
    print('Bags................%d' % b)
    print('    Positive........%d' % p)
    print('    Negative........%d' % n)
    print('Avg. Instances/Bag..%.1f' % (float(i)/b))
예제 #39
0
def format_data(dataset_file):
    """Load `dataset_file` and return (rows, schema).

    Each returned row is the example's feature vector as a list with its
    label appended as the final element.

    BUG FIX: the original referenced the undefined name `train_file`
    (NameError at runtime) and called .append on elements of a bare
    `map` object, which fails under Python 3; both are fixed here.
    """
    schema, X, y = get_dataset(dataset_file, ".")
    rows = [list(features) for features in X]
    for row, label in zip(rows, y):
        row.append(label)
    return rows, schema
예제 #40
0
def find_params(model, feature_set, y, subsample=None, grid_search=False):
    """
    Return parameter set for the model, either predefined
    or found through grid search.
    """
    model_name = model.__class__.__name__
    params = INITIAL_PARAMS.get(model_name, {})
    if subsample is not None:
        y = y[subsample]

    try:
        with open('saved_params.json') as fh:
            saved_params = json.load(fh)
    except IOError:
        saved_params = {}

    key = stringify(model, feature_set)
    needs_search = (grid_search
                    and model_name in PARAM_GRID
                    and key not in saved_params)
    if needs_search:
        X, _ = get_dataset(feature_set, subsample, [0])
        searcher = GridSearchCV(model, PARAM_GRID[model_name], cv=10,
                                n_jobs=4, scoring="roc_auc")
        # grid search for the best parameter for the learning model
        searcher.fit(X, y)
        logger.info("found params (%s > %.4f): %s",
                    key, searcher.best_score_, searcher.best_params_)
        params.update(searcher.best_params_)
        saved_params[key] = params
        with open('saved_params.json', 'w') as fh:
            json.dump(saved_params, fh, indent=4, separators=(',', ': '),
                      ensure_ascii=True, sort_keys=True)
    else:
        params.update(saved_params.get(key, {}))
        if grid_search:
            logger.info("using params %s: %s", key, params)

    return params
예제 #41
0
def main(dataset, factor, outputdir):
    """Export a dataset as C4.5-style .names/.data files.

    Feature values are scaled by `factor`, truncated to integers, and
    constant columns are dropped before writing.
    """
    scale = float(factor)
    dset = get_dataset(dataset)
    X = (scale * dset.instances).astype(int)

    # Remove irrelevant columns (all feature values identical)
    keep = np.nonzero(np.max(X, axis=0) - np.min(X, axis=0))[0]
    X = X[:, keep]

    namesfile = os.path.join(outputdir, NAMES_FMT % dataset)
    datafile = os.path.join(outputdir, DATA_FMT % dataset)

    with open(namesfile, 'w+') as f:
        f.write('0,1.\n')
        f.write('bag_id: %s.\n' % ','.join(dset.bag_ids))
        instance_ids = [iid[1] for iid in dset.instance_ids]
        f.write('instance_id: %s.\n' % ','.join(instance_ids))
        for col in range(X.shape[1]):
            f.write('f%d: continuous.\n' % (col + 1))

    with open(datafile, 'w+') as f:
        for (bid, iid), features, label in zip(dset.instance_ids, X,
                                               dset.instance_labels):
            feature_str = ','.join(map(str, features))
            f.write('%s,%s,%s,%d.\n' % (bid, iid, feature_str, label))
예제 #42
0
def main():
    """Run the vocabulary tests against the musk1 dataset."""
    dataset = get_dataset('musk1')
    for vocab in ('miles', 'yards'):
        test_vocab(dataset, vocab)
예제 #43
0
def main(**options):
    """Run k-fold cross-validation for a decision-tree-style classifier.

    Pops bookkeeping keys ('dataset', 'k', feature-selection and
    meta-algorithm settings) from **options and forwards the rest to
    get_classifier(). Prints per-fold tree size/depth and pooled accuracy.
    (Python 2 code: uses print statements.)
    """

    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    #MAX_DEPTH = options.pop('depth')

    if "meta_algorithm" in options and "meta_iters" not in options:
        """ Make sure they use --meta-iters if they want to do bagging/boosting """
        raise ValueError("Please indicate number of iterations for %s" % options["meta_algorithm"])

    fs_alg = None

    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)

    # Per-feature flags: True where the feature is nominal (categorical).
    attr_set=[]
    for i in range(len(schema.feature_names)):
        attr_set.append(schema.is_nominal(i))

    folds = get_folds(X, y, k)

    stats_manager = StatisticsManager()

    #import pdb;pdb.set_trace()

    for train_X, train_y, test_X, test_y in folds:

        # Construct classifier instance
        print options

        classifier = get_classifier(**options)

        # Train classifier
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y, attr_set)

        print 'ff'
        # NOTE(review): start - end yields a NEGATIVE duration; should be
        # time.time() - train_start.
        train_time = (train_start - time.time())

        if fs_alg:
            test_X = selector.transform(test_X)

        # Predict each test example individually.
        predictions=[]
        for t in test_X:
            predictions.append(classifier.predict(t))

        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:,1]    # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

        print classifier.size()
        print classifier.depth()
    print ('      Accuracy: %.03f %.03f'
        % stats_manager.get_statistic('accuracy', pooled=False))
    '''
예제 #44
0
def client_target(task, callback):
    """Run one multi-label experiment task: train five per-label classifiers,
    predict on the test set, and return a submission dict with predictions,
    timings, and the mean per-label AUC. (Python 2 code; mixes tab- and
    space-indented lines, which Python 2 tolerates.)
    """
    (experiment_name, experiment_id,
     train_dataset, test_dataset, _, _) = task['key']
    parameters = task['parameters']

    print 'Starting task %s...' % str(experiment_id)
    print 'Training Set: %s' % train_dataset
    print 'Test Set:     %s' % test_dataset
    print 'Parameters:'
    for k, v in parameters.items():
        print '\t%s: %s' % (k, str(v))
    #import pdb;pdb.set_trace()

    train = get_dataset(train_dataset)
    test = get_dataset(test_dataset)
    #import pdb;pdb.set_trace()
    """
    data_raw = np.genfromtxt('natural_scene.data',delimiter = ",")
    class data_class(object):
	def __init__(self):
		pass
    train=data_class()
    test=data_class()
    feature_matrix = data_raw[:, 2:-5]
    label_matrix = data_raw[:, -5:]
    num_instances = data_raw.shape[0]
    train.instances = feature_matrix[:int(math.floor(num_instances/2)),: ]
    test.instances = feature_matrix[int(math.floor(num_instances/2)):,: ]
    train.instance_labels = label_matrix[:int(math.floor(num_instances/2)),: ]
    test.instance_labels = label_matrix[int(math.floor(num_instances/2)):,:  ]
    """
    # Result skeleton returned to the experiment server.
    submission = {
        'instance_predictions' : {
            'train' : {},
            'test'  : {},
        },
        'bag_predictions' : {
            'train' : {},
            'test'  : {},
        },
        'statistics' : {}
    }
    timer = Timer()

    # Empirical kernel: point the classifier at precomputed kernel files.
    if parameters['kernel'] == 'emp':
        dataset = get_base_dataset(train_dataset)
        idxfile = os.path.join(IDX_DIR, IDX_FMT % dataset)
        kernelfile = os.path.join(PRECOMPUTED_DIR,
            PRECOMPUTED_FMT % (dataset, parameters['ktype']))
        parameters['dataset'] = dataset
        parameters['idxfile'] = idxfile
        parameters['kernelfile'] = kernelfile
        empirical_labels = list(map(str, train.bag_ids))
        if parameters.pop('transductive', False):
            empirical_labels += list(map(str, test.bag_ids))
        parameters['empirical_labels'] = empirical_labels
        train.bags = train.bag_ids
        test.bags = test.bag_ids

    # One independent classifier per label column (5 labels).
    classifier_name = parameters.pop('classifier')
    if classifier_name in CLASSIFIERS:
        classifier0 = CLASSIFIERS[classifier_name](**parameters)
	classifier1 = CLASSIFIERS[classifier_name](**parameters)
	classifier2 = CLASSIFIERS[classifier_name](**parameters)
 	classifier3 = CLASSIFIERS[classifier_name](**parameters)
 	classifier4 = CLASSIFIERS[classifier_name](**parameters)
    else:
        print 'Technique "%s" not supported' % classifier_name
        callback.quit = True
        return

    print 'Training...'
    timer.start('training')
    if train.regression:
        classifier1.fit(train.bags, train.bag_labels)
    else:
        #import pdb;pdb.set_trace()
	classifier0.fit(train.instances, train.instance_labels[:,0].reshape((-1,)))
	classifier1.fit(train.instances, train.instance_labels[:,1].reshape((-1,)))
	classifier2.fit(train.instances, train.instance_labels[:,2].reshape((-1,)))
	classifier3.fit(train.instances, train.instance_labels[:,3].reshape((-1,)))
	classifier4.fit(train.instances, train.instance_labels[:,4].reshape((-1,)))
    timer.stop('training')

    print 'Computing test bag predictions...'
    timer.start('test_bag_predict')
    bag_predictions0 = classifier0.predict(test.instances)
    bag_predictions1 = classifier1.predict(test.instances)
    bag_predictions2 = classifier2.predict(test.instances)
    bag_predictions3 = classifier3.predict(test.instances)
    bag_predictions4 = classifier4.predict(test.instances)

    timer.stop('test_bag_predict')

    if INSTANCE_PREDICTIONS:
        print 'Computing test instance predictions...'
        timer.start('test_instance_predict')
        # NOTE(review): `classifier` is never defined in this function (only
        # classifier0..classifier4 are) — this branch would raise NameError.
        instance_predictions = classifier.predict(test.instances_as_bags)
        timer.stop('test_instance_predict')

    print 'Computing train bag predictions...'
    timer.start('train_bag_predict')
    train_bag_labels = classifier0.predict() # Saves results from training set
    timer.stop('train_bag_predict')

    if INSTANCE_PREDICTIONS:
        print 'Computing train instance predictions...'
        timer.start('train_instance_predict')
        # NOTE(review): same undefined `classifier` name as above.
        train_instance_labels = classifier.predict(train.instances_as_bags)
        timer.stop('train_instance_predict')

    print 'Constructing submission...'
    # Add statistics
    for attribute in ('linear_obj', 'quadratic_obj'):
        if hasattr(classifier0, attribute):
            # NOTE(review): checks classifier0 but reads from the undefined
            # name `classifier` — would raise NameError if the attr exists.
            submission['statistics'][attribute] = getattr(classifier,
                                                          attribute)
    submission['statistics'].update(timer.get_all('_time'))
    # Stack the five per-label prediction vectors into an (n, 5) matrix.
    bag_predictions = np.hstack((bag_predictions0[:,np.newaxis], bag_predictions1[:,np.newaxis],bag_predictions2[:,np.newaxis],bag_predictions3[:,np.newaxis],bag_predictions4[:,np.newaxis]  ))
    for ( _,i), y in zip(test.instance_ids, map(tuple,bag_predictions)):
        submission['bag_predictions']['test'][i] = map(float,y)
    for (_, i), y in zip(train.instance_ids, train_bag_labels.flat):
        submission['bag_predictions']['train'][i] = float(y)
    if INSTANCE_PREDICTIONS:
        for i, y in zip(test.instance_ids, instance_predictions.flat):
            submission['instance_predictions']['test'][i] =float(y)
        for i, y in zip(train.instance_ids, train_instance_labels.flat):
            submission['instance_predictions']['train'][i] = float(y)

    # For backwards compatibility with older versions of scikit-learn
    if train.regression:
        from sklearn.metrics import r2_score as score
        scorename = 'R^2'
    else:
        try:
            from sklearn.metrics import roc_auc_score as score
        except:
            from sklearn.metrics import auc_score as score
        scorename = 'AUC'

    try:
        """
        if train.bag_labels.size > 1:
            print ('Training Bag %s Score: %f'
                   % (scorename, score(train.instance_labels, train_bag_labels)))
        if INSTANCE_PREDICTIONS and train.instance_labels.size > 1:
            print ('Training Inst. %s Score: %f'
                   % (scorename, score(train.instance_labels, train_instance_labels)))
        """
        if test.bag_labels.size > 1:
            AUC_list=[]
	    for ii in range(5):
		AUC_list.append(score(test.instance_labels[:,ii], bag_predictions[:,ii]))
	    AUC_mean=np.mean(AUC_list)
	    submission['statistics'][scorename]=AUC_mean
	    print ('Test Bag Average %s Score: %f'
                   % (scorename,AUC_mean ))
	    print( 'Test Bag Individual %s Score: ' %scorename   +','.join(map(str, AUC_list))   )
        """
        if INSTANCE_PREDICTIONS and test.instance_labels.size > 1:
            print ('Test Inst. %s Score: %f'
                   % (scorename, score(test.instance_labels, instance_predictions)))
        """
    except Exception as e:
        print "Couldn't compute scores."
        print e

    print 'Finished task %s.' % str(experiment_id)
    return submission
예제 #45
0
    def fit_predict(self, y, train=None, predict=None, show_steps=True):
        """
        Fit each model on the appropriate dataset, then return the average
        of their individual predictions. If train is specified, use a subset
        of the training set to train the models, then predict the outcome of
        either the remaining samples or (if given) those specified in predict.
        If train is omitted, train the models on the full training set, then
        predict the outcome of the full test set.

        Options:
        ------------------------------
        - y: numpy array. The full vector of the ground truths.
        - train: list. The indices of the elements to be used for training.
            If None, take the entire training set.
        - predict: list. The indices of the elements to be predicted.
        - show_steps: boolean. Whether to compute metrics after each stage
            of the computation.
        """
        y_train = y[train] if train is not None else y
        if train is not None and predict is None:
            # Default prediction set: everything not used for training.
            predict = [i for i in range(len(y)) if i not in train]

        # Stage 0: collect each base model's predictions on the predict set
        # and (when stacking) its cross-validated train-set predictions.
        stage0_train = []
        stage0_predict = []
        for model, feature_set in self.models:
            X_train, X_predict = get_dataset(feature_set, train, predict)

            # Cache key: model+features plus the first training index
            # (identifies the split); -1 marks "full training set".
            identifier = train[0] if train is not None else -1
            cache_file = stringify(model, feature_set) + str(identifier)

            model_preds = self._get_model_preds(
                model, X_train, X_predict, y_train, cache_file)
            stage0_predict.append(model_preds)

            # if stacking, compute cross-validated predictions on the train set
            if self.stack:
                model_cv_preds = self._get_model_cv_preds(
                    model, X_train, y_train, cache_file)
                stage0_train.append(model_cv_preds)

            # verbose mode: compute metrics after every model computation
            if show_steps:
                if train is not None:
                    mean_preds, stack_preds, fwls_preds = self._combine_preds(
                        np.array(stage0_train).T, np.array(stage0_predict).T,
                        y_train, train, predict,
                        stack=self.stack, fwls=self.fwls)

                    model_auc = compute_auc(y[predict], stage0_predict[-1])
                    mean_auc = compute_auc(y[predict], mean_preds)
                    stack_auc = compute_auc(y[predict], stack_preds) \
                        if self.stack else 0
                    fwls_auc = compute_auc(y[predict], fwls_preds) \
                        if self.fwls else 0

                    logger.info(
                        "> AUC: %.4f (%.4f, %.4f, %.4f) [%s]", model_auc,
                        mean_auc, stack_auc, fwls_auc,
                        stringify(model, feature_set))
                else:
                    logger.info("> used model %s:\n%s", stringify(
                        model, feature_set), model.get_params())

        # Optionally keep only the subset of models that maximizes AUC.
        if self.model_selection and predict is not None:
            best_subset = self._find_best_subset(y[predict], stage0_predict)
            stage0_train = [pred for i, pred in enumerate(stage0_train)
                            if i in best_subset]
            stage0_predict = [pred for i, pred in enumerate(stage0_predict)
                              if i in best_subset]

        # Final blend over the (possibly pruned) stage-0 predictions.
        mean_preds, stack_preds, fwls_preds = self._combine_preds(
            np.array(stage0_train).T, np.array(stage0_predict).T,
            y_train, stack=self.stack, fwls=self.fwls)

        if self.stack:
            selected_preds = stack_preds if not self.fwls else fwls_preds
        else:
            selected_preds = mean_preds

        return selected_preds
예제 #46
0
def client_target(task, callback):
    (experiment_name, experiment_id,
     train_dataset, test_dataset, _, _) = task['key']
    parameters = task['parameters']

    print 'Starting task %s...' % str(experiment_id)
    print 'Training Set: %s' % train_dataset
    print 'Test Set:     %s' % test_dataset
    print 'Parameters:'
    for k, v in parameters.items():
        print '\t%s: %s' % (k, str(v))

    train = get_dataset(train_dataset)
    test = get_dataset(test_dataset)

    submission = {
        'instance_predictions' : {
            'train' : {},
            'test'  : {},
        },
        'bag_predictions' : {
            'train' : {},
            'test'  : {},
        },
        'statistics' : {}
    }
    timer = Timer()

    if parameters['kernel'] == 'emp':
        dataset = get_base_dataset(train_dataset)
        idxfile = os.path.join(IDX_DIR, IDX_FMT % dataset)
        kernelfile = os.path.join(PRECOMPUTED_DIR,
            PRECOMPUTED_FMT % (dataset, parameters['ktype']))
        parameters['dataset'] = dataset
        parameters['idxfile'] = idxfile
        parameters['kernelfile'] = kernelfile
        empirical_labels = list(map(str, train.bag_ids))
        if parameters.pop('transductive', False):
            empirical_labels += list(map(str, test.bag_ids))
        parameters['empirical_labels'] = empirical_labels
        train.bags = train.bag_ids
        test.bags = test.bag_ids

    classifier_name = parameters.pop('classifier')
    if classifier_name in CLASSIFIERS:
        classifier = CLASSIFIERS[classifier_name](**parameters)
    else:
        print 'Technique "%s" not supported' % classifier_name
        callback.quit = True
        return

    print 'Training...'
    timer.start('training')
    if train.regression:
        classifier.fit(train.bags, train.bag_labels)
    else:
        classifier.fit(train.bags, train.pm1_bag_labels)
    timer.stop('training')

    print 'Computing test bag predictions...'
    timer.start('test_bag_predict')
    bag_predictions = classifier.predict(test.bags)
    timer.stop('test_bag_predict')

    if INSTANCE_PREDICTIONS:
        print 'Computing test instance predictions...'
        timer.start('test_instance_predict')
        instance_predictions = classifier.predict(test.instances_as_bags)
        timer.stop('test_instance_predict')

    print 'Computing train bag predictions...'
    timer.start('train_bag_predict')
    train_bag_labels = classifier.predict() # Saves results from training set
    timer.stop('train_bag_predict')

    if INSTANCE_PREDICTIONS:
        print 'Computing train instance predictions...'
        timer.start('train_instance_predict')
        train_instance_labels = classifier.predict(train.instances_as_bags)
        timer.stop('train_instance_predict')

    print 'Constructing submission...'
    # Add statistics
    for attribute in ('linear_obj', 'quadratic_obj'):
        if hasattr(classifier, attribute):
            submission['statistics'][attribute] = getattr(classifier,
                                                          attribute)
    submission['statistics'].update(timer.get_all('_time'))

    for i, y in zip(test.bag_ids, bag_predictions.flat):
        submission['bag_predictions']['test'][i] = float(y)
    for i, y in zip(train.bag_ids, train_bag_labels.flat):
        submission['bag_predictions']['train'][i] = float(y)
    if INSTANCE_PREDICTIONS:
        for i, y in zip(test.instance_ids, instance_predictions.flat):
            submission['instance_predictions']['test'][i] = float(y)
        for i, y in zip(train.instance_ids, train_instance_labels.flat):
            submission['instance_predictions']['train'][i] = float(y)

    # For backwards compatibility with older versions of scikit-learn
    if train.regression:
        from sklearn.metrics import r2_score as score
        scorename = 'R^2'
    else:
        try:
            from sklearn.metrics import roc_auc_score as score
        except:
            from sklearn.metrics import auc_score as score
        scorename = 'AUC'

    try:
        if train.bag_labels.size > 1:
            print ('Training Bag %s Score: %f'
                   % (scorename, score(train.bag_labels, train_bag_labels)))
        if INSTANCE_PREDICTIONS and train.instance_labels.size > 1:
            print ('Training Inst. %s Score: %f'
                   % (scorename, score(train.instance_labels, train_instance_labels)))
        if test.bag_labels.size > 1:
            print ('Test Bag %s Score: %f'
                   % (scorename, score(test.bag_labels, bag_predictions)))
        if INSTANCE_PREDICTIONS and test.instance_labels.size > 1:
            print ('Test Inst. %s Score: %f'
                   % (scorename, score(test.instance_labels, instance_predictions)))
    except Exception as e:
        print "Couldn't compute scores."
        print e

    print 'Finished task %s.' % str(experiment_id)
    return submission
예제 #47
0
def main(**options):
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')

    if "meta_algorithm" in options and "meta_iters" not in options:
        """ Make sure they use --meta-iters if they want to do bagging/boosting """
        raise ValueError("Please indicate number of iterations for %s" % options["meta_algorithm"])

    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" % fs_alg)
        fs_n = options.pop("fs_features")

    schema, X, y = get_dataset(dataset, dataset_directory)
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    #import pdb;pdb.set_trace()
    #I am keeping track of the maxSize and maxDepth of each of the k tests, to print out at the end
    maxSize = -1
    maxDepth = -1
    for train_X, train_y, test_X, test_y in folds:

        # Construct classifier instance
        print options
        classifier = get_classifier(**options)
        classifier.schema = schema

        # Train classifier
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        #Note that I changed fit to take in the schema
        classifier.fit(train_X, train_y, schema)
        train_time = (train_start - time.time())

        #To see the values and confidences of the root node
        #for attrVal, child in classifier.treeHead.children.iteritems():
        #    print "%d with confidence %f" % (attrVal, child.classLabelConfidence)

        #Maintennce to keep track of the maxSize and maxDepth
        if classifier.size > maxSize:
            maxSize = classifier.size
        if classifier.depth > maxDepth:
            maxDepth = classifier.depth

        #For my testing purposes, I had printed out the train_time
        #print "train time: %f" % train_time
        
        #For spam and voting tests, I printed out the root attribute
        #print "Root Attribute: [%d] %s" % (classifier.treeHead.attribute, schema.feature_names[classifier.treeHead.attribute])

        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:,1]    # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)

    #The printouts specified by the assignments
    print ('\tAccuracy: %.03f %.03f'
        % stats_manager.get_statistic('accuracy', pooled=False))
    print "\tMaximum Size: %d" % maxSize
    print "\tMaximum Depth: %d" % maxDepth
예제 #48
0
def main():
    """Load the 'trx' dataset and run test_emd over its bags.

    The original bound the result to an unused local (K1); the binding
    is dropped since nothing read it.
    """
    data = get_dataset('trx')
    test_emd(data.bags)
예제 #49
0
def main():
    # Load the musk1 dataset and run test_nsk over its bags
    # (presumably the normalized set kernel test — confirm against the
    # test_nsk definition).
    data = get_dataset('musk1')
    test_nsk(data.bags)