def main(argv):
    """Fine-tune a classifier on a TPU cluster, checkpointing every epoch."""
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    # Connect to and initialise the TPU system.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)
    # Model creation and compilation must happen inside the strategy scope.
    with strategy.scope():
        model, input_size = build_model(num_classes=FLAGS.num_classes)
        model.compile(
            loss="sparse_categorical_crossentropy",
            optimizer=tf.keras.optimizers.Adam(learning_rate=FLAGS.learning_rate),
            metrics=["accuracy"])
    model.summary()
    callbacks = [
        tf.keras.callbacks.TensorBoard(
            log_dir=f"{FLAGS.job_dir}/finetune/logs", histogram_freq=1)
    ]
    train_ds = get_dataset(FLAGS.dataset, "train", read_tfrecord,
                           FLAGS.global_batch_size, input_size, FLAGS.percentage)
    valid_ds = get_dataset(FLAGS.dataset, "valid", read_tfrecord,
                           FLAGS.global_batch_size, input_size, FLAGS.percentage)
    # One fit() call per epoch so a full checkpoint can be saved after each.
    for epoch in range(FLAGS.epochs):
        model.fit(train_ds, validation_data=valid_ds, callbacks=callbacks,
                  initial_epoch=epoch, epochs=epoch+1)
        model.save(f"{FLAGS.job_dir}/finetune/checkpoints/{epoch+1}",
                   include_optimizer=True)
    # Final export without optimizer state for serving.
    model.save(f"{FLAGS.job_dir}/finetune/saved_model", include_optimizer=False)
def check_dataset(dataset, dataroot, augment, download):
    """Resolve a dataset name to (input_size, num_classes, train_dataset, test_dataset).

    Bug fixed: the 'cifar64' and 'miniimagenet' branches read the undefined
    global ``args.dataroot`` instead of the ``dataroot`` parameter; an unknown
    dataset name now raises ValueError instead of an UnboundLocalError at the
    return statement.
    """
    if dataset == "cifar64":
        train_data = get_dataset('cifar-fs-train-train', dataroot)
        test_data = get_dataset('cifar-fs-train-test', dataroot)
        transform = transforms.Compose([transforms.ToTensor(), preprocess])
        train_dataset = SimpleDataset(train_data['x'], transform)
        test_dataset = SimpleDataset(test_data['x'], transform)
        input_size = (32, 32, 3)
        num_classes = 64
    elif dataset == "cifar10":
        cifar10 = get_CIFAR10(augment, dataroot, download)
        input_size, num_classes, train_dataset, test_dataset = cifar10
    elif dataset == "svhn":
        svhn = get_SVHN(augment, dataroot, download)
        input_size, num_classes, train_dataset, test_dataset = svhn
    elif dataset == "miniimagenet":
        train_data = get_dataset('miniimagenet-train-train', dataroot)
        test_data = get_dataset('miniimagenet-train-test', dataroot)
        # Mini-ImageNet images are resized down to 32x32 to match CIFAR.
        transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((32, 32)),
            transforms.ToTensor(),
            preprocess
        ])
        train_dataset = SimpleDataset(train_data['x'], transform)
        test_dataset = SimpleDataset(test_data['x'], transform)
        input_size = (32, 32, 3)
        num_classes = 64
    else:
        raise ValueError('Unknown dataset: %s' % dataset)
    return input_size, num_classes, train_dataset, test_dataset
def main(learning_rate, use_daft, dataset_name, epochs, _seed, _config):
    """Train a (DAFT-)MAC model with full RNG determinism; writes logs,
    TensorBoard summaries and per-epoch checkpoints under result/."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    task_name = f"{'daftmac' if use_daft else 'mac'}_{_config['dataset_name']}_step{_config['max_step']}_{_seed}"
    os.makedirs("result/log", exist_ok=True)
    logger.add(f"result/log/{task_name}.txt")
    logger.info(f"Making Code Deterministic with seed {_seed}")
    # Determinism: disable cudnn autotuning and seed every RNG in play.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(_seed)
    np.random.seed(_seed)  # numpy cpu
    random.seed(_seed)  # python cpu
    if torch.cuda.is_available():
        torch.cuda.manual_seed(_seed)
        torch.cuda.manual_seed_all(_seed)
    train_dataset = get_dataset("train")
    val_dataset = get_dataset("val")
    # Vocabulary sizes come from the training split's question/answer dicts.
    n_words = len(train_dataset.qdic["w2i"])
    n_answers = len(train_dataset.adic["w2i"])
    net = MAC(n_words, classes=n_answers, use_daft=use_daft, qdic=train_dataset.qdic)
    # net_running shadows net — presumably a running (EMA) copy of its
    # weights, initialized to an exact copy via accumulate(..., 0); confirm
    # against accumulate()'s definition.
    net_running = MAC(n_words, classes=n_answers, use_daft=use_daft, qdic=train_dataset.qdic)
    accumulate(net_running, net, 0)
    if dataset_name == "clevr":
        criterion = nn.CrossEntropyLoss()
    elif dataset_name == "gqa":
        criterion = TFBCELoss(train_dataset.pos_weight.to(device))
    else:
        raise KeyError(f"Dataset {dataset_name} does not exist")
    writer = SummaryWriter(f"result/summary/{task_name}")
    optimizer = torch.optim.Adam(net.parameters(), learning_rate)
    # Halve the LR when validation accuracy (mode="max") stops improving.
    scheduler = ReduceLROnPlateau(optimizer, factor=0.5, mode="max")
    for epoch in range(epochs):
        train_acc, train_loss = train(train_dataset, net, net_running, criterion, optimizer, epoch, writer)
        # Validation uses the running copy, not the raw net.
        val_acc = valid(val_dataset, net_running, epoch, writer)
        scheduler.step(val_acc)
        os.makedirs(f"result/model/{task_name}", exist_ok=True)
        torch.save(
            net_running.state_dict(),
            f"result/model/{task_name}/checkpoint_{epoch:02}.model",
        )
        # Stop once the plateau scheduler has decayed the LR to (near) zero.
        if optimizer.param_groups[0]["lr"] < 1e-7:
            break
def get_data(self, mode, size=None):
    """Build a dataloader for *mode*; *size* defaults to the configured dataset size."""
    if size is None:
        size = self.config.dataset_size
    # Preprocessing is attached to the dataset only when it runs on CPU.
    if self.config.preprocess_device == 'cpu':
        dataset = get_dataset(self.config, mode, size, preprocess=self.preprocess)
    else:
        dataset = get_dataset(self.config, mode, size)
    print(f"Dataset Size: {len(dataset)}")
    return get_dataloader(self.config, dataset)
def test_overall(Net, index):
    """Assemble the distilled student network and report its test accuracy.

    Fixes: ``np.int`` (removed in NumPy 1.24) replaced with builtin ``int``;
    layer type check uses ``isinstance`` instead of ``type(...) ==``.
    """
    Net.load_jointnet("/temp_disk/yyl/save/overall-{}.pkl".format(index))
    test_loader, lentest = get_dataset(
        "/disk1/yyl/multi_model_distill/data/images_finetune_rename/test/",
        batchsize=16)
    print("Test dataset size: {}".format(lentest))
    # Collect each distilled "real" layer into one sequential student net.
    layerlist = [l.student.reallayer for l in Net.layers]
    Net.student_net = InnerBlock(layerlist)
    get_cuda(Net.student_net)
    num_image = 0
    accuracy = 0.0
    for i, (data, labels) in enumerate(test_loader):
        if use_cuda:
            data = get_cuda(data)
        data = get_variable(data)
        # Manually forward through the blocks, flattening before FC layers.
        for model in Net.student_net.block:
            if isinstance(model, torch.nn.modules.linear.Linear):
                data = pyreshape(data)
            data = model(data)
        scores = data
        acc = cal_accuracy(scores.data.cpu().numpy(),
                           labels.cpu().numpy().astype(int))
        # Weight the batch accuracy by batch size for a correct overall mean.
        accuracy += acc * data.size(0)
        num_image += data.size(0)
    print(index)
    print(accuracy / num_image)
def generate(args):
    """Run the generator over every photo sample and save outputs as JPEGs."""
    # Un-shuffled, endlessly repeating stream with batch size 1.
    dataset, n_monet_samples, n_photo_samples = get_dataset(
        args.dataset, augment=args.augment, repeat=True, shuffle=False,
        from_npy=args.from_npy, batch_size=1)
    dataset_iter = iter(dataset)
    # Restore the trained model from its checkpoint directory.
    model_name = get_model_name(args)
    model = get_model(args)
    model.load(os.path.join(args.checkpoint_dir, model_name))
    out_dir = os.path.join(args.result_dir, model_name)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    for i in tqdm(range(n_photo_samples)):
        style_ref, img = next(dataset_iter)
        # Translate the photo, rescale from [-1, 1] to [0, 255], and save.
        prediction = model.generate(img)
        pixels = (tf.squeeze(prediction).numpy() * 127.5 + 127.5).astype(np.uint8)
        out_img = PIL.Image.fromarray(pixels)
        out_img.save(os.path.join(out_dir, str(i).zfill(4) + '.jpg'))
def create_train_model(model_creator, hps, scope=None, extra_args=None):
    """Build the training graph, model and data iterator (TF1 graph mode).

    Returns a TrainModel tuple holding the fresh graph, the constructed
    model, the iterator and a scalar int64 placeholder for skipping records.
    """
    graph = tf.Graph()
    with graph.as_default(), tf.container(scope or "train"):
        vocab_table = data.create_vocab_tables(hps.vocab_file, hps.vocab_size, hps.unk_id)
        train_dataset = data.get_dataset(hps.data_dir, hps.train_prefix)
        skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64)
        iterator = data.get_iterator(train_dataset, vocab_table, hps)
        # Note: One can set model_device_fn to
        # `tf.train.replica_device_setter(ps_tasks)` for distributed training.
        model_device_fn = None
        if extra_args:
            model_device_fn = extra_args.model_device_fn
        # tf.device(None) is a no-op, so single-machine training is unaffected.
        with tf.device(model_device_fn):
            model = model_creator(iterator=iterator, hps=hps,
                                  mode=tf.contrib.learn.ModeKeys.TRAIN,
                                  vocab_table=vocab_table, scope=scope)
    return TrainModel(graph=graph, model=model, iterator=iterator,
                      skip_count_placeholder=skip_count_placeholder)
def main(argv):
    """Pre-train a SimCLR model on a TPU cluster, checkpointing every epoch."""
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    # Bring up the TPU system and build a distribution strategy over it.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)
    with strategy.scope():
        simclr_model, input_size = build_model(model_type=FLAGS.model,
                                               n_dim=FLAGS.embedded_dim)
        simclr_model.compile(
            loss=simclr_loss_func,
            optimizer=tf.keras.optimizers.Adam(learning_rate=FLAGS.learning_rate),
            metrics=None)
    simclr_model.summary()
    callbacks = [
        tf.keras.callbacks.TensorBoard(
            log_dir=f"{FLAGS.job_dir}/pretrain/logs", histogram_freq=1)
    ]
    train_ds = get_dataset(FLAGS.dataset, "train", read_tfrecord,
                           FLAGS.global_batch_size, input_size)
    # One fit() call per epoch so each epoch gets its own full checkpoint.
    for epoch in range(FLAGS.epochs):
        simclr_model.fit(train_ds, callbacks=callbacks,
                         initial_epoch=epoch, epochs=epoch+1)
        simclr_model.save(f"{FLAGS.job_dir}/pretrain/checkpoints/{epoch+1}",
                          include_optimizer=True)
    # Final export without optimizer state.
    simclr_model.save(f"{FLAGS.job_dir}/pretrain/saved_model", include_optimizer=False)
def load_ae(path, target_dataset, batch, all_aes, return_dataset=False):
    """Reconstruct an autoencoder from a checkpoint directory path.

    The path layout encodes the dataset (parent folder) and the AE class plus
    hyper-parameters (leaf folder, 'ClassName_name1value1_name2value2...').

    Fix: the parameter regex is now a raw string so '\\d' is a regex digit
    class instead of an invalid Python string escape (SyntaxWarning and,
    eventually, SyntaxError on newer Pythons).
    """
    r_param = re.compile(r'(?P<name>[a-zA-Z][a-z_]*)(?P<value>(True)|(False)|(\d+(\.\d+)?(,\d+)*))')
    folders = [x for x in os.path.abspath(path).split('/') if x]
    dataset = folders[-2]
    if dataset != target_dataset:
        tf.compat.v1.logging.log(tf.compat.v1.logging.WARN,
                                 'Mismatched datasets between classfier and AE (%s, %s)',
                                 target_dataset, dataset)
    class_name, argpairs = folders[-1].split('_', 1)
    params = {}
    for x in r_param.findall(argpairs):
        name, value = x[:2]
        if ',' in value:
            pass  # comma-separated lists stay as raw strings
        elif value in ('True', 'False'):
            value = {'True': True, 'False': False}[value]
        elif '.' in value:
            value = float(value)
        else:
            value = int(value)
        params[name] = value
    class_ = all_aes[class_name]
    dataset = data.get_dataset(dataset, dict(batch_size=batch))
    ae = class_(dataset, '/' + os.path.join(*(folders[:-2])), **params)
    if return_dataset:
        return ae, dataset
    else:
        return ae, folders[-1]
def train(args):
    """Train the DFVE model, logging and monitoring every args.print_freq steps."""
    dataset = data.get_dataset(args.dataset, training=True)
    model = DFVE(args.image_channels, args.image_size, args.n_latent,
                 args.lambda_, args.gamma).to(args.device)
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate,
                           betas=(args.adam_beta1, args.adam_beta2),
                           weight_decay=args.weight_decay)
    model.train()
    step = 0
    for epoch in range(1, args.n_epochs + 1):
        # A fresh dataloader every epoch reshuffles the data.
        loader, _ = data.get_dataloader(dataset, args.batch_size)
        for samples, labels in loader:
            step += 1
            x = samples.to(args.device).float()
            z_mean, z_logvar = model(x)
            loss, mmd_loss_from_prior, mmd_loss_for_mi = model.loss(z_mean, z_logvar, args.repeats)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step % args.print_freq == 0:
                print('[Epoch {:d}, Step {:d}] loss: {:.4f}, mmd_loss_from_prior: {:.4f}, mmd_loss_for_mi: {:.4f}'.format(
                    epoch, step, loss.item(), mmd_loss_from_prior.item(), mmd_loss_for_mi.item()))
                monitor(z_mean, z_logvar, labels, epoch, step)
    model.monitor()
def train(args):
    """Train the DFVE model with the generalized-normal MMD objective."""
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    dataset = data.get_dataset(args.dataset, training=True)
    model = DFVE(args.image_channels, args.image_size, args.n_latent).to(args.device)
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate,
                           betas=(args.adam_beta1, args.adam_beta2),
                           weight_decay=args.weight_decay)
    model.train()
    step = 0
    for epoch in range(1, args.n_epochs + 1):
        # A fresh dataloader every epoch reshuffles the data.
        loader, _ = data.get_dataloader(dataset, args.batch_size)
        for samples, labels in loader:
            step += 1
            x = samples.to(args.device).float()
            z = model(x, args.repeats, args.noise_sigma)
            loss, mmd_loss_all, mmd_loss_avg = model.loss(
                z, args.gamma, args.kernel_gamma, args.kernel_power,
                args.gnorm_mu, args.gnorm_sigma, args.gnorm_alpha)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step % args.print_freq == 0:
                print('[Epoch {:d}, Step {:d}] loss: {:.4f}, mmd_loss_all: {:.4f}, mmd_loss_avg: {:.4f}'.format(
                    epoch, step, loss.item(), mmd_loss_all.item(), mmd_loss_avg.item()))
            if step % args.show_freq == 0:
                monitor(z, labels, epoch, step)
    model.monitor()
def _combine_preds(self, X_train, X_cv, y, train=None, predict=None,
                   stack=False, fwls=False):
    """
    Combine preds, returning in order:
        - mean_preds: the simple average of all model predictions
        - stack_preds: the predictions of the stage 1 generalizer
        - fwls_preds: same as stack_preds, but optionally using more
          complex blending schemes (meta-features, different
          generalizers, etc.)

    Bugs fixed in the FWLS branch:
        - the CV matrix was stacked with the *training* meta-features
          (`meta`) instead of the CV meta-features (`meta_cv`);
        - the generalizer was re-fit without the target vector `y`.
    """
    mean_preds = np.mean(X_cv, axis=1)
    stack_preds = None
    fwls_preds = None
    if stack:
        self.generalizer.fit(X_train, y)
        stack_preds = self.generalizer.predict(X_cv)
    # NOTE(review): the `fwls` parameter is shadowed by the instance flag
    # self.fwls; kept reading the attribute to preserve caller behavior —
    # confirm whether the parameter was meant to take precedence.
    if self.fwls:
        meta, meta_cv = get_dataset('metafeatures', train, predict)
        fwls_train = np.hstack((X_train, meta))
        fwls_cv = np.hstack((X_cv, meta_cv))
        self.generalizer.fit(fwls_train, y)
        fwls_preds = self.generalizer.predict(fwls_cv)
    return mean_preds, stack_preds, fwls_preds
def __init__(self, module_path, hyper_params, use_cuda, test_rate=1.0, USE_EXIST_RES=False, mission=1):
    """Load a trained ResNet checkpoint and wire up a validation pipeline.

    module_path: path to the saved state_dict file.
    test_rate: fraction of the data used for testing (forwarded as valid_rate).
    USE_EXIST_RES: accepted but not read here — presumably consumed by a
        subclass or caller; confirm before removing.
    mission: task selector forwarded to get_dataset/get_network.
    """
    print("Test rate:", test_rate)
    # get_dataset returns (train split, valid split); only the valid split is kept.
    _, self.dataset = get_dataset(valid_rate=test_rate, USE_TRANSFORM=False, mission=mission)
    print("test number:", len(self.dataset))
    self.hyper_params = hyper_params
    # Deterministic (unshuffled) loader so results are reproducible run to run.
    self.data_loader = DataLoader(
        dataset=self.dataset,
        num_workers=self.hyper_params["threads"],
        batch_size=self.hyper_params["batch_size"],
        shuffle=False)
    self.resnet = get_network(mission=mission)
    self.resnet.load_state_dict(torch.load(module_path))
    if use_cuda:
        self.resnet = self.resnet.cuda()
    self.v = Validator(resnet=self.resnet, hyper_params=hyper_params,
                       use_cuda=use_cuda, data_loader=self.data_loader)
def main(dataset, factor, outputdir):
    """Export a dataset as C5-style .names/.data files, scaling features by *factor*."""
    scale = float(factor)
    dset = get_dataset(dataset)
    feats = (scale * dset.instances).astype(int)
    # Drop constant columns (identical value for every instance).
    keep = np.nonzero(np.ptp(feats, axis=0))[0]
    feats = feats[:, keep]
    namesfile = os.path.join(outputdir, NAMES_FMT % dataset)
    datafile = os.path.join(outputdir, DATA_FMT % dataset)
    with open(namesfile, 'w+') as f:
        f.write('0,1.\n')
        f.write('bag_id: %s.\n' % ','.join(dset.bag_ids))
        f.write('instance_id: %s.\n' % ','.join([iid[1] for iid in dset.instance_ids]))
        # One continuous feature declaration per remaining column.
        for col in range(feats.shape[1]):
            f.write('f%d: continuous.\n' % (col + 1))
    with open(datafile, 'w+') as f:
        for (bid, iid), row, label in zip(dset.instance_ids, feats, dset.instance_labels):
            f.write('%s,%s,%s,%d.\n' % (bid, iid, ','.join(map(str, row)), label))
def load_ae(path, target_dataset, batch, all_aes, return_dataset=False):
    """Reconstruct an autoencoder from a checkpoint directory path.

    Bugs fixed:
        - ``dict(True=True, False=False)`` is a SyntaxError on Python 3
          (keywords cannot be keyword-argument names); replaced with a dict
          literal.
        - The parameter regex is now a raw string so '\\d' is a regex digit
          class rather than an invalid Python string escape.
    """
    r_param = re.compile(r'(?P<name>[a-zA-Z][a-z_]*)(?P<value>(True)|(False)|(\d+(\.\d+)?(,\d+)*))')
    folders = [x for x in os.path.abspath(path).split('/') if x]
    dataset = folders[-2]
    if dataset != target_dataset:
        tf.logging.log(tf.logging.WARN,
                       'Mismatched datasets between classfier and AE (%s, %s)',
                       target_dataset, dataset)
    class_name, argpairs = folders[-1].split('_', 1)
    params = {}
    for x in r_param.findall(argpairs):
        name, value = x[:2]
        if ',' in value:
            pass  # comma-separated lists stay as raw strings
        elif value in ('True', 'False'):
            value = {'True': True, 'False': False}[value]
        elif '.' in value:
            value = float(value)
        else:
            value = int(value)
        params[name] = value
    class_ = all_aes[class_name]
    dataset = data.get_dataset(dataset, dict(batch_size=batch))
    ae = class_(dataset, '/' + os.path.join(*(folders[:-2])), **params)
    if return_dataset:
        return ae, dataset
    else:
        return ae, folders[-1]
def main(arguments):
    """Train a model across all available GPUs and save its weights."""
    print('===Start of program.===')
    tf.random.set_random_seed(12345)
    input_pattern = arguments.input_pattern
    output_name = arguments.output
    # Scale the base per-replica batch size by the number of GPUs.
    available_gpus = get_available_gpus()
    total_batch_size = 32 * len(available_gpus)
    dataset = parse_dataset(get_dataset(input_pattern), batch_size=total_batch_size)
    strategy = get_train_strategy(available_gpus)
    callback = get_callbacks()
    with strategy.scope():
        model = create_model()
        print(model.summary())
        model.compile(loss=losses.CategoricalCrossentropy(from_logits=True),
                      optimizer=get_sgd_optimizer())
        model.fit(dataset, epochs=20, callbacks=callback)
    # Ensure the output filename carries the .h5 extension.
    if not output_name.endswith('.h5'):
        output_name += 'model.h5'
    model.save_weights(output_name, overwrite=True)
    print('===End of program.===')
def build_data_iterator(hps, files, current_res_h, current_res_w, batch_size=None,
                        label_list=None, num_shards=None, shard_index=None):
    """Shuffle *files* in place and return a one-shot iterator over the dataset.

    Bug fixed: ``num_shards`` and ``shard_index`` were accepted as parameters
    but hard-coded to ``None`` in the ``get_dataset`` call, so sharding
    requests were silently ignored; they are now forwarded.
    """
    random.shuffle(files)
    dataset = get_dataset(files, current_res_h, current_res_w, hps.epochs_per_res,
                          batch_size, label_list=label_list,
                          num_shards=num_shards, shard_index=shard_index)
    it = dataset.make_one_shot_iterator()
    return it
def train():
    """Train a Lagrangian Neural Network (LNN) to predict accelerations."""
    data = get_dataset()
    x = torch.tensor(data['x'], requires_grad=True, dtype=torch.float32)  # x actually holds positions and velocities
    test_x = torch.tensor(data['test_x'], requires_grad=True, dtype=torch.float32)
    # Split [velocity, acceleration] targets; keep only the acceleration half.
    # NOTE(review): `acce` is taken from data['test_dx'], same as `test_acce` —
    # looks like it should come from data['dx']; confirm against the dataset.
    _, acce = torch.Tensor(data['test_dx']).chunk(2, 1)
    _, test_acce = torch.Tensor(data['test_dx']).chunk(2, 1)
    N, freedom = x.shape
    freedom /= 2  # state stacks position and velocity, so DOF is half the width
    input_dim = int(freedom * 2)
    output_dim = int(freedom)
    model_nn = MLP(input_dim, 50, output_dim, 'tanh')
    model = LNN(input_dim, differentiable_model=model_nn)
    optim = torch.optim.Adam(model.parameters(), 5e-3, weight_decay=1e-4)
    # vanilla train loop
    stats = {'train_loss': [], 'test_loss': []}  # NOTE(review): never appended to — dead? confirm
    torch.autograd.set_detect_anomaly(True)
    for step in range(500):  # 500 epochs
        # train step: accumulate L1 loss over the first 100 samples
        loss = 0
        for i in range(100):
            acce_hat = model.forward_new(x[i])
            loss = loss + L1_loss(acce[i], acce_hat)
        # Gradients come from the *summed* loss; the division below only
        # rescales the logged value, not the gradients — presumably intended.
        loss.backward()
        loss /= 100
        optim.step()
        optim.zero_grad()
        print("step {}, train_loss {:.4e}, ".format(step, loss))
        writer.add_scalar('LNN/spring_train_loss', loss, step)
def test_get_dataset_raw(self):
    """get_dataset should yield the raw PNG images back, in file order."""
    with self.test_session():
        # Write two distinct 4x4 RGB PNGs into test_files/.
        img_a = tf.constant(np.arange(4 * 4 * 3), shape=[4, 4, 3], dtype=tf.uint8)
        print(os.getcwd())
        with open(os.path.join("test_files", "test1.png"), "wb") as f:
            f.write(tf.image.encode_png(img_a).eval())
        img_b = tf.constant(np.flip(np.arange(4 * 4 * 3), axis=0),
                            shape=[4, 4, 3], dtype=tf.uint8)
        with open(os.path.join("test_files", "test2.png"), "wb") as f:
            f.write(tf.image.encode_png(img_b).eval())
        # Read them back through the dataset pipeline and compare.
        files = glob.glob(os.path.join("test_files", "test*.png"))
        it = get_dataset(files).make_one_shot_iterator()
        self.assertAllClose(it.get_next(), img_a)
        self.assertAllClose(it.get_next(), img_b)
def test_preprocess_dataset_batch2_float_tfrecord(self):
    """preprocess_dataset should batch to 2, resize to 64x64 and scale pixels to [-1, 1]."""
    with self.test_session():
        # Write two distinct 4x4 RGB PNGs into test_files/.
        img_a = tf.constant(np.arange(4 * 4 * 3) * 5, shape=[4, 4, 3], dtype=tf.uint8)
        with open(os.path.join("test_files", "test1.png"), "wb") as f:
            f.write(tf.image.encode_png(img_a).eval())
        img_b = tf.constant(np.flip(np.arange(4 * 4 * 3) * 5, axis=0),
                            shape=[4, 4, 3], dtype=tf.uint8)
        with open(os.path.join("test_files", "test2.png"), "wb") as f:
            f.write(tf.image.encode_png(img_b).eval())
        files = glob.glob(os.path.join("test_files", "test*.png"))
        pipeline = preprocess_dataset(get_dataset(files), size=[64, 64],
                                      batch_size=2, float_pixels=True)
        batch = pipeline.make_one_shot_iterator().get_next().eval()
        self.assertEqual(batch.shape, (2, 64, 64, 3))
        # float_pixels maps uint8 [0, 255] into [-1, 1] via x / 127.5 - 1.
        self.assertAllClose(max(batch.flatten()),
                            max(img_a.eval().flatten()) / 127.5 - 1.)
        self.assertAllClose(min(batch.flatten()),
                            min(img_a.eval().flatten()) / 127.5 - 1.)
def main(configfile, folddir, resultsdir, outputfile):
    """Aggregate per-fold prediction files into cumulative-AUC CSV rows (Python 2).

    Builds the task grid from the YAML config, marks tasks whose prediction
    files already exist as finished, then for each completed
    (classifier, dataset, kernel) key appends one CSV line of cumulative AUCs
    to outputfile, skipping keys already present there.
    """
    with open(configfile, 'r') as f:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input; the config is presumably trusted here.
        configuration = yaml.load(f)
    # Generate tasks from experiment list
    tasks = {}
    for experiment in configuration['experiments']:
        classifier = experiment['classifier']
        dataset = experiment['dataset']
        folds = data.get_folds(folddir, dataset)
        for f in range(len(folds)):
            for r in range(experiment['reps']):
                key = (classifier, dataset, experiment['kernel'], f, r)
                task = Task(*key)
                tasks[key] = task
    # Mark finished tasks
    for task in tasks.values():
        predfile = os.path.join(resultsdir, task.filebase('preds'))
        task.predfile = predfile
        if os.path.exists(predfile):
            task.finish()
    # Group tasks by (classifier, dataset, kernel), then by repetition.
    reindexed = defaultdict(lambda: defaultdict(list))
    for (c, d, k, f, r), task in tasks.items():
        reindexed[(c, d, k)][r].append(task)
    # Keys already written to the output file will be skipped below.
    existing_keys = set()
    if os.path.exists(outputfile):
        with open(outputfile, 'r') as f:
            for line in f:
                c, d, k = line.strip().split(',')[:3]
                existing_keys.add((c, d, k))
    with open(outputfile, 'a+') as f:
        rep_aucs = defaultdict(list)  # NOTE(review): never written — dead? confirm
        for key, reps in sorted(reindexed.items()):
            if key in existing_keys:
                print 'Skipping %s (already finished)...' % str(key)
                continue
            data_dict = data.get_dataset(key[1])
            bag_ids = sorted(data_dict.keys())
            y_true = [data_dict[bid][1] for bid in bag_ids]
            # Collect predictions rep by rep; stop at the first incomplete rep.
            predictions = []
            for rep, task_list in sorted(reps.items()):
                if all(task.finished for task in task_list):
                    predictions.append(get_preds(key, task_list, bag_ids))
                else:
                    break
            if len(predictions) != len(reps):
                print 'Skipping %s (incomplete)...' % str(key)
                continue
            predictions = np.vstack(predictions)
            # We want a cumulative average, but doesn't matter for AUC
            cumpreds = np.cumsum(predictions, axis=0)
            aucs = [auc_score(y_true, cp) for cp in cumpreds]
            line = ','.join(map(str, key) + map(str, aucs))
            print line
            f.write(line + '\n')
def train(args):
    """Fit the style-transfer model on the Monet/photo dataset."""
    # Shuffled, repeating dataset; steps_per_epoch below bounds each epoch.
    dataset, n_monet_samples, n_photo_samples = get_dataset(
        args.dataset, augment=args.augment, repeat=True, shuffle=True,
        batch_size=args.batch_size, autotune=1, from_npy=args.from_npy,
        cache=False)
    model = get_model(args)
    model_name = get_model_name(args)
    # An epoch covers the larger of the two sample pools once.
    steps = max(n_monet_samples, n_photo_samples) // args.batch_size
    history = model.fit(
        dataset,
        epochs=args.epochs,
        batch_size=args.batch_size,
        steps_per_epoch=steps,
        callbacks=get_callbacks(args))
def train_encoding(args):
    """Train only the encoder with an MMD-style encoding loss, checkpointing
    whenever the seen-example count (in units of BASE_N) hits args.save_points."""
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    tag = 'latents_{:d}_alpha_{:d}'.format(args.n_latent, int(args.gnorm_alpha))
    save_dir = os.path.join(args.save_base, tag)
    U.mkdir(save_dir)
    dataset = data.get_dataset(args.dataset, training=True)
    model = Model(args.image_channels, args.image_size, args.n_latent, args.n_dims).to(args.device)
    # Only encoder parameters are optimized; the rest of the model is frozen
    # with respect to this optimizer.
    optimizer = optim.Adam(model.encoder.parameters(), lr=args.learning_rate,
                           betas=(args.adam_beta1, args.adam_beta2),
                           weight_decay=args.weight_decay)
    model.train()
    step = 0
    epoch = 0
    examples = 0
    # Budget is measured in examples seen, not epochs.
    while examples < args.max_examples:
        epoch += 1
        loader, _ = data.get_dataloader(dataset, args.batch_size)
        for samples, labels in loader:
            step += 1
            x = samples.to(args.device).float()  # B x C x H x W
            z = model(x, args.repeats, args.noise_sigma, 'encoding')  # B x repeats x n_latent
            loss, mmd_loss_all, mmd_loss_avg = model.encoding_loss(
                z, args.gamma, args.kernel_gamma, args.kernel_power,
                args.gnorm_mu, args.gnorm_sigma, args.gnorm_alpha)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            prev_examples = examples
            examples += x.size(0)
            # Log (and possibly checkpoint) each time the example counter
            # crosses a BASE_N boundary.
            if examples // BASE_N > prev_examples // BASE_N:
                print(
                    '[Epoch {:d}, Step {:d}, #Eg. {:d}] loss: {:.4f}, mmd_loss_all: {:.4f}, mmd_loss_avg: {:.4f}'
                    .format(epoch, step, examples, loss.item(),
                            mmd_loss_all.item(), mmd_loss_avg.item()))
                if examples // BASE_N in args.save_points:
                    path = os.path.join(
                        save_dir,
                        'training_examples_{:d}_10k.ckpt'.format(
                            examples // BASE_N))
                    print('save {}'.format(path))
                    torch.save(
                        {
                            # Round down to the boundary actually crossed.
                            'examples': examples // BASE_N * BASE_N,
                            'loss': loss.item(),
                            'mmd_loss_all': mmd_loss_all.item(),
                            'mmd_loss_avg': mmd_loss_avg.item(),
                            'model_state_dict': model.state_dict(),
                            'optimizer_state_dict': optimizer.state_dict()
                        }, path)
def main(base_dataset, new_dataset):
    """Re-index *base_dataset* into a new anonymized view named *new_dataset*."""
    source = get_dataset(base_dataset)
    builder = ViewBuilder(new_dataset, base_dataset)
    # Map each (bag, instance) pair to sequential synthetic ids b<i>/i<i>.
    pairs = zip(source.instance_ids, source.instance_labels)
    for idx, ((bag_id, inst_id), label) in enumerate(pairs):
        builder.add(bag_id, inst_id, 'b%d' % idx, 'i%d' % idx, label)
    builder.save(VIEWS_PATH[0])
def _true_labels(dataset, level='bags'):
    """Map bag or instance ids of *dataset* to their ground-truth labels.

    level: 'bags'/'b...' for bag-level labels, 'instances'/'i...' for
    instance-level labels; anything else raises ValueError.
    """
    dset = get_dataset(dataset)
    if level.startswith('b'):
        ids, labels = dset.bag_ids, dset.bag_labels.flat
    elif level.startswith('i'):
        ids, labels = dset.instance_ids, dset.instance_labels.flat
    else:
        raise ValueError('Bad level type "%s"' % level)
    return dict(zip(ids, labels))
def main(**options):
    """Run k-fold cross-validation for the configured classifier and print stats.

    Bug fixed: training time was computed as start - end (always negative);
    it is now end - start.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')
    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting
        raise ValueError("Please indicate number of iterations for %s" % options["meta_algorithm"])
    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" % fs_alg)
        fs_n = options.pop("fs_features")
    schema, X, y = get_dataset(dataset, dataset_directory)
    options['schema'] = schema
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    for train_X, train_y, test_X, test_y in folds:
        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)
        # Train classifier; feature selection (if any) is fit on train only.
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        classifier.fit(train_X, train_y)
        train_time = time.time() - train_start  # fixed: was train_start - time.time()
        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)
    print(' Accuracy: %.03f %.03f' % stats_manager.get_statistic('accuracy', pooled=False))
    print(' Precision: %.03f %.03f' % stats_manager.get_statistic('precision', pooled=False))
    print(' Recall: %.03f %.03f' % stats_manager.get_statistic('recall', pooled=False))
    print('Area under ROC: %.03f' % stats_manager.get_statistic('auc', pooled=True))
def create_dataloader(name, transform, val_ratio, batch_size, workers):
    """Split the train set into stratified train/val subsets and build three loaders."""
    dataset_train = get_dataset(name, 'train', transform['train'])
    dataset_val = get_dataset(name, 'train', transform['eval'])
    dataset_test = get_dataset(name, 'test', transform['eval'])
    # Sort by label so each class occupies one contiguous index range.
    idx_sorted = np.argsort(dataset_train.train_labels)
    num_classes = dataset_train.train_labels[idx_sorted[-1]] + 1
    samples_per_class = len(dataset_train) // num_classes
    val_len = int(val_ratio * samples_per_class)
    val_parts, train_parts = [], []
    # Per class, randomly route val_len samples to val and the rest to train.
    for cls in range(num_classes):
        perm = np.random.permutation(range(samples_per_class))
        base = samples_per_class * cls
        val_parts.append(idx_sorted[base + perm[0:val_len]])
        train_parts.append(idx_sorted[base + perm[val_len:]])
    val_idx = np.concatenate(val_parts)
    train_idx = np.concatenate(train_parts)
    sampler_train = torch.utils.data.sampler.SubsetRandomSampler(train_idx)
    sampler_val = torch.utils.data.sampler.SubsetRandomSampler(val_idx)
    train_loader = torch.utils.data.DataLoader(
        dataset_train, batch_size=batch_size, shuffle=False,
        sampler=sampler_train, num_workers=workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        dataset_val, batch_size=batch_size, shuffle=False,
        sampler=sampler_val, num_workers=workers, pin_memory=True)
    test_loader = torch.utils.data.DataLoader(
        dataset_test, batch_size=batch_size, shuffle=False,
        num_workers=workers, pin_memory=True)
    return train_loader, val_loader, test_loader
def main(**options): dataset_directory = options.pop('dataset_directory', '.') dataset = options.pop('dataset') k = options.pop('k') if "meta_algorithm" in options and "meta_iters" not in options: """ Make sure they use --meta-iters if they want to do bagging/boosting """ raise ValueError("Please indicate number of iterations for %s" % options["meta_algorithm"]) fs_alg = None if "fs_algorithm" in options: fs_alg = options.pop("fs_algorithm") if "fs_features" not in options: raise ValueError("Please indicate number of features for %s" % fs_alg) fs_n = options.pop("fs_features") schema, X, y = get_dataset(dataset, dataset_directory) folds = get_folds(X, y, k) stats_manager = StatisticsManager() #import pdb;pdb.set_trace() for train_X, train_y, test_X, test_y in folds: # Construct classifier instance print options classifier = get_classifier(**options) # Train classifier train_start = time.time() if fs_alg: selector = FS_ALGORITHMS[fs_alg](n=fs_n) selector.fit(train_X) train_X = selector.transform(train_X) classifier.fit(train_X, train_y, schema) train_time = (train_start - time.time()) print train_time if fs_alg: test_X = selector.transform(test_X) predictions = classifier.predict(test_X,schema) scores = classifier.predict_proba(test_X) if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1: scores = scores[:,1] # Get the column for label 1 stats_manager.add_fold(test_y, predictions, scores, train_time) print (' Accuracy: %.03f %.03f' % stats_manager.get_statistic('accuracy', pooled=False)) print (' Precision: %.03f %.03f' % stats_manager.get_statistic('precision', pooled=False)) print (' Recall: %.03f %.03f' % stats_manager.get_statistic('recall', pooled=False)) print ('Area under ROC: %.03f' % stats_manager.get_statistic('auc', pooled=True))
def __init__(self, args):
    """Set up a distributed (DDP) trainer: model, losses/criteria, loaders,
    optional checkpoint resume, and (on rank 0) the TensorBoard writer."""
    self.args = args
    args.logger.info('Initializing trainer')
    self.model = get_model(args)
    params_cnt = count_parameters(self.model)
    args.logger.info("params "+str(params_cnt))
    # Pin this process to its GPU before wrapping the model in DDP.
    torch.cuda.set_device(args.rank)
    self.model.cuda(args.rank)
    self.model = torch.nn.parallel.DistributedDataParallel(self.model, device_ids=[args.rank])
    train_dataset, val_dataset = get_dataset(args)
    if args.split == 'train':
        # train loss
        self.RGBLoss = RGBLoss(args, sharp=False)
        self.SegLoss = nn.CrossEntropyLoss()
        self.RGBLoss.cuda(args.rank)
        self.SegLoss.cuda(args.rank)
        if args.optimizer == "adamax":
            self.optimizer = torch.optim.Adamax(list(self.model.parameters()), lr=args.learning_rate)
        elif args.optimizer == "adam":
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=args.learning_rate)
        elif args.optimizer == "sgd":
            self.optimizer = torch.optim.SGD(self.model.parameters(), lr=args.learning_rate, momentum=0.9)
        # DistributedSampler handles shuffling/partitioning, so shuffle=False.
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        self.train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=args.batch_size//args.gpus, shuffle=False,
            num_workers=args.num_workers, pin_memory=True, sampler=train_sampler)
    else:
        # val criteria
        self.L1Loss = nn.L1Loss().cuda(args.rank)
        self.PSNRLoss = PSNR().cuda(args.rank)
        self.SSIMLoss = SSIM().cuda(args.rank)
        self.IoULoss = IoU().cuda(args.rank)
        self.VGGCosLoss = VGGCosineLoss().cuda(args.rank)
        val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset)
        self.val_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=args.batch_size//args.gpus, shuffle=False,
            num_workers=args.num_workers, pin_memory=True, sampler=val_sampler)
    torch.backends.cudnn.benchmark = True
    self.global_step = 0
    self.epoch = 1
    # Resume from checkpoint when requested, or whenever evaluating a single
    # checkpoint (i.e. not sweeping a checkepoch range).
    if args.resume or (args.split != 'train' and not args.checkepoch_range):
        self.load_checkpoint()
    # Only rank 0 writes summaries to avoid duplicate logs.
    if args.rank == 0:
        writer_name = args.path+'/{}_int_{}_len_{}_{}_logs'.format(
            self.args.split, int(self.args.interval), self.args.vid_length,
            self.args.dataset)
        self.writer = SummaryWriter(writer_name)
        self.stand_heat_map = self.create_stand_heatmap()
def __init__(self, config):
    """Set up a WAE-GAN experiment: run directory, models, optimizers, data, metrics.

    Args:
        config: dict of hyperparameters; must contain h_dim, z_dim,
            num_epochs, batch_size, sigma_z, lambda, ae_lr, ae_dec_steps,
            ae_dec_rate, d_lr, d_dec_steps and d_dec_rate.

    Side effects: creates a timestamped directory tree under ./runs and
    writes the config both to TensorBoard and to config.json.
    """
    self.h_dim = config["h_dim"]
    self.z_dim = config["z_dim"]
    self.epochs = config["num_epochs"]
    self.batch_size = config["batch_size"]
    self.sigma_z = config["sigma_z"]
    self.lmbda = config["lambda"]
    # Experiment directory (one timestamped run dir per launch).
    self.logdir = os.path.join(
        "runs", datetime.now().strftime("wae_gan_%d_%m_%Y-%H:%M:%S"))
    self.writer = tf.summary.create_file_writer(self.logdir)
    with self.writer.as_default():
        tf.summary.text("Hyperparams", json.dumps(config), step=0)
        self.writer.flush()
    # Fix: os.makedirs builds each leaf directory together with its parents in
    # one call, replacing the original chain of seven individual os.mkdir
    # calls (which would fail with FileExistsError/FileNotFoundError if run
    # out of order).
    for leaf in (("img", "random"), ("img", "recons"),
                 ("models", "encoder"), ("models", "decoder"),
                 ("models", "discriminator")):
        os.makedirs(os.path.join(self.logdir, *leaf))
    with open(os.path.join(self.logdir, "config.json"), "w") as f:
        json.dump(config, f)
    # Models ================================================================
    self.encoder, self.decoder, self.discriminator = get_ae_disc(config)
    # Optimizers ============================================================
    # Encoder and decoder share one exponentially-decayed schedule; the
    # discriminator has its own.
    ae_scheduler = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=config["ae_lr"],
        decay_steps=config["ae_dec_steps"],
        decay_rate=config["ae_dec_rate"])
    disc_scheduler = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=config["d_lr"],
        decay_steps=config["d_dec_steps"],
        decay_rate=config["d_dec_rate"])
    self.enc_optim = tf.keras.optimizers.Adam(ae_scheduler)
    self.dec_optim = tf.keras.optimizers.Adam(ae_scheduler)
    self.disc_optim = tf.keras.optimizers.Adam(disc_scheduler)
    # Data ==================================================================
    tf.print("Loading data...")
    self.train_dataset, self.test_dataset = \
        get_dataset(batch_size=self.batch_size)
    tf.print("Done.")
    # Metric trackers =======================================================
    self.avg_d_train_loss = tf.keras.metrics.Mean(dtype=tf.float32)
    self.avg_d_test_loss = tf.keras.metrics.Mean(dtype=tf.float32)
    self.avg_d_z_loss = tf.keras.metrics.Mean(dtype=tf.float32)
    self.avg_mse_test_loss = tf.keras.metrics.Mean(dtype=tf.float32)
    self.avg_enc_dec_train_loss = tf.keras.metrics.Mean(dtype=tf.float32)
    self.avg_enc_dec_test_loss = tf.keras.metrics.Mean(dtype=tf.float32)
def train(args):
    """Train a Hamiltonian Neural Network (or baseline MLP) on vector-field data.

    Returns:
        (model, stats): the trained model and a dict with per-step
        'train_loss' and 'test_loss' lists.
    """
    # set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # init model and optimizer
    if args.verbose:
        print("Training baseline model:" if args.baseline else "Training HNN model:")
    output_dim = args.input_dim if args.baseline else 2
    nn_model = MLP(args.input_dim, args.hidden_dim, output_dim, args.nonlinearity)
    model = HNN(args.input_dim, differentiable_model=nn_model,
                field_type=args.field_type, baseline=args.baseline)
    optim = torch.optim.Adam(model.parameters(), args.learn_rate, weight_decay=0)
    # arrange data; requires_grad so time_derivative can differentiate w.r.t. inputs
    data = get_dataset(args.name, args.save_dir, verbose=True)
    x = torch.tensor( data['coords'], requires_grad=True, dtype=torch.float32)
    test_x = torch.tensor( data['test_coords'], requires_grad=True, dtype=torch.float32)
    dxdt = torch.Tensor(data['dcoords'])
    test_dxdt = torch.Tensor(data['test_dcoords'])
    # vanilla train loop
    stats = {'train_loss': [], 'test_loss': []}
    for step in range(args.total_steps+1):
        # train step: random minibatch without replacement
        ixs = torch.randperm(x.shape[0])[:args.batch_size]
        dxdt_hat = model.time_derivative(x[ixs])
        dxdt_hat += args.input_noise * torch.randn(*x[ixs].shape)  # add noise, maybe
        loss = L2_loss(dxdt[ixs], dxdt_hat)
        loss.backward()
        # snapshot of the full gradient vector for logging
        grad = torch.cat([p.grad.flatten() for p in model.parameters()]).clone()
        optim.step() ; optim.zero_grad()
        # run test data (evaluated on a random test minibatch each step)
        test_ixs = torch.randperm(test_x.shape[0])[:args.batch_size]
        test_dxdt_hat = model.time_derivative(test_x[test_ixs])
        test_dxdt_hat += args.input_noise * torch.randn(*test_x[test_ixs].shape)  # add noise, maybe
        test_loss = L2_loss(test_dxdt[test_ixs], test_dxdt_hat)
        # logging
        stats['train_loss'].append(loss.item())
        stats['test_loss'].append(test_loss.item())
        if args.verbose and step % args.print_every == 0:
            # NOTE(review): grad@grad is the *squared* L2 norm, although the
            # message labels it "grad norm" — confirm which was intended.
            print("step {}, train_loss {:.4e}, test_loss {:.4e}, grad norm {:.4e}, grad std {:.4e}"
                  .format(step, loss.item(), test_loss.item(), grad@grad, grad.std()))
    # Final full-dataset squared-error statistics (mean +/- standard error).
    train_dxdt_hat = model.time_derivative(x)
    train_dist = (dxdt - train_dxdt_hat)**2
    test_dxdt_hat = model.time_derivative(test_x)
    test_dist = (test_dxdt - test_dxdt_hat)**2
    print('Final train loss {:.4e} +/- {:.4e}\nFinal test loss {:.4e} +/- {:.4e}'
          .format(train_dist.mean().item(), train_dist.std().item()/np.sqrt(train_dist.shape[0]),
                  test_dist.mean().item(), test_dist.std().item()/np.sqrt(test_dist.shape[0])))
    return model, stats
def GET(self): # top_n count_of_top_companies = 3 dataset = data.get_dataset() eventBreakdown = data.EventBreakdown(dataset) event_type_breakdown, event_type_names = eventBreakdown.break_down_events(count_of_top_companies) # pdb.set_trace() return render.event2comp(event_type_breakdown, event_type_names, count_of_top_companies)
def build_validation_data_loader(self) -> DataLoader: if not self.data_downloaded: self.download_directory = data.download_dataset( download_directory=self.download_directory, data_config=self.context.get_data_config(), ) self.data_downloaded = True validation_data = data.get_dataset(self.download_directory, train=False) return DataLoader(validation_data, batch_size=self.context.get_per_slot_batch_size())
def main(outputfile): progress = ProgressMonitor(total=len(DATASETS), msg='Extracting statistics') with open(outputfile, 'w+') as f: stats = ','.join(stat for stat, _ in STATISTICS) f.write('#%s\n' % stats) for dataset in DATASETS: dset = get_dataset(dataset) dset.name = dataset stats = ','.join(map(str, (f(dset) for _, f in STATISTICS))) f.write('%s\n' % stats) progress.increment()
def get_var(dataset): variation = 0.0 p = 0 dset = data.get_dataset(dataset) for _, bag, y, inst_labels in dset.bag_dict.values(): if y != True: continue p += 1 pinsts = bag[np.array(inst_labels), :] variation += np.average(cdist(pinsts, pinsts, 'euclidean')) variation /= p return variation
def main(dataset): dset = get_dataset(dataset) i, f = dset.instances.shape b = len(dset.bags) p = sum(dset.bag_labels) n = b - p print 'Dataset.............%s' % dataset print 'Features............%d' % f print 'Instances...........%d' % i print 'Bags................%d' % b print ' Positive........%d' % p print ' Negative........%d' % n print 'Avg. Instances/Bag..%.1f' % (float(i)/b)
def format_data(dataset_file): """ 返回dataset(列表集合)和features(列表) """ """ dataset = [] for index,line in enumerate(open(dataset_file,'rU').readlines()): line = line.strip() fea_and_label = line.split(',') dataset.append([float(fea_and_label[i]) for i in range(len(fea_and_label)-1)]+[fea_and_label[len(fea_and_label)-1]]) #features = [dataset[0][i] for i in range(len(dataset[0])-1)] #sepal length(花萼长度)、sepal width(花萼宽度)、petal length(花瓣长度)、petal width(花瓣宽度) features=[] for j in range(19): features.append(str(j)) #features = ['sepal_length','sepal_width','petal_length','petal_width'] """ schema, X, y = get_dataset(train_file, ".") # labels = get_labels(train_file) labels = y train_dataset = map(list, X) for i in range(len(train_dataset)): train_dataset[i].append(y[i]) train_features = [] """ for feature_name in schema.feature_names: train_features.append(feature_name) print schema.feature_names print type(schema.feature_names) print schema.nominal_values print type(schema.nominal_values) is_nominal=[] for i in range(len(schema.feature_names)): is_nominal.append(schema.is_nominal(i)) print is_nominal print type(is_nominal) """ return train_dataset, schema
def find_params(model, feature_set, y, subsample=None, grid_search=False): """ Return parameter set for the model, either predefined or found through grid search. """ model_name = model.__class__.__name__ params = INITIAL_PARAMS.get(model_name, {}) y = y if subsample is None else y[subsample] try: with open('saved_params.json') as f: saved_params = json.load(f) except IOError: saved_params = {} if (grid_search and model_name in PARAM_GRID and stringify( model, feature_set) not in saved_params): X, _ = get_dataset(feature_set, subsample, [0]) clf = GridSearchCV(model, PARAM_GRID[model_name], cv=10, n_jobs=4, scoring="roc_auc") #grid search for the best parameter for the learning model clf.fit(X, y) logger.info("found params (%s > %.4f): %s", stringify(model, feature_set), clf.best_score_, clf.best_params_) params.update(clf.best_params_) saved_params[stringify(model, feature_set)] = params with open('saved_params.json', 'w') as f: json.dump(saved_params, f, indent=4, separators=(',', ': '), ensure_ascii=True, sort_keys=True) else: params.update(saved_params.get(stringify(model, feature_set), {})) if grid_search: logger.info("using params %s: %s", stringify(model, feature_set), params) return params
def main(dataset, factor, outputdir): factor = float(factor) dset = get_dataset(dataset) X = (factor*dset.instances).astype(int) # Remove irrelevant columns (all feature values identical) relevant = np.nonzero(np.max(X, axis=0) - np.min(X, axis=0))[0] X = X[:, relevant] namesfile = os.path.join(outputdir, NAMES_FMT % dataset) datafile = os.path.join(outputdir, DATA_FMT % dataset) with open(namesfile, 'w+') as f: f.write('0,1.\n') f.write('bag_id: %s.\n' % ','.join(dset.bag_ids)) f.write('instance_id: %s.\n' % ','.join([iid[1] for iid in dset.instance_ids])) for i in range(X.shape[1]): f.write('f%d: continuous.\n' % (i+1)) with open(datafile, 'w+') as f: for (bid, iid), xx, y in zip(dset.instance_ids, X, dset.instance_labels): xs = ','.join(map(str, xx)) f.write('%s,%s,%s,%d.\n' % (bid, iid, xs, y))
def main(): data = get_dataset('musk1') test_vocab(data, 'miles') test_vocab(data, 'yards')
def main(**options): dataset_directory = options.pop('dataset_directory', '.') dataset = options.pop('dataset') k = options.pop('k') #MAX_DEPTH = options.pop('depth') if "meta_algorithm" in options and "meta_iters" not in options: """ Make sure they use --meta-iters if they want to do bagging/boosting """ raise ValueError("Please indicate number of iterations for %s" % options["meta_algorithm"]) fs_alg = None if "fs_algorithm" in options: fs_alg = options.pop("fs_algorithm") if "fs_features" not in options: raise ValueError("Please indicate number of features for %s" % fs_alg) fs_n = options.pop("fs_features") schema, X, y = get_dataset(dataset, dataset_directory) attr_set=[] for i in range(len(schema.feature_names)): attr_set.append(schema.is_nominal(i)) folds = get_folds(X, y, k) stats_manager = StatisticsManager() #import pdb;pdb.set_trace() for train_X, train_y, test_X, test_y in folds: # Construct classifier instance print options classifier = get_classifier(**options) # Train classifier train_start = time.time() if fs_alg: selector = FS_ALGORITHMS[fs_alg](n=fs_n) selector.fit(train_X) train_X = selector.transform(train_X) classifier.fit(train_X, train_y, attr_set) print 'ff' train_time = (train_start - time.time()) if fs_alg: test_X = selector.transform(test_X) predictions=[] for t in test_X: predictions.append(classifier.predict(t)) scores = classifier.predict_proba(test_X) if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1: scores = scores[:,1] # Get the column for label 1 stats_manager.add_fold(test_y, predictions, scores, train_time) print classifier.size() print classifier.depth() print (' Accuracy: %.03f %.03f' % stats_manager.get_statistic('accuracy', pooled=False)) '''
def client_target(task, callback):
    """Train one classifier per label column (5 labels) and build a submission.

    Args:
        task: dict with 'key' (experiment/dataset identifiers) and
            'parameters' (classifier configuration).
        callback: controller object; callback.quit is set on fatal errors.

    Returns:
        submission dict with bag/instance predictions and statistics, or
        None when the requested classifier is unknown.

    Fixes relative to the original:
    - `hasattr(classifier0, attr)` was paired with `getattr(classifier, attr)`
      where `classifier` is undefined (NameError); both now use the same object.
    - The INSTANCE_PREDICTIONS branches also referenced the undefined
      `classifier`; they now use the first per-label classifier
      (TODO confirm which label's instance predictions were intended).
    - The bare `except:` around the sklearn import fallback is narrowed to
      ImportError.
    - The 5-way copy-pasted classifier0..classifier4 code is collapsed into a
      list, and prints are written in the py2/py3-compatible single-argument
      form.
    """
    (experiment_name, experiment_id,
     train_dataset, test_dataset, _, _) = task['key']
    parameters = task['parameters']
    print('Starting task %s...' % str(experiment_id))
    print('Training Set: %s' % train_dataset)
    print('Test Set: %s' % test_dataset)
    print('Parameters:')
    for k, v in parameters.items():
        print('\t%s: %s' % (k, str(v)))
    train = get_dataset(train_dataset)
    test = get_dataset(test_dataset)
    submission = {
        'instance_predictions': {'train': {}, 'test': {}},
        'bag_predictions': {'train': {}, 'test': {}},
        'statistics': {},
    }
    timer = Timer()
    if parameters['kernel'] == 'emp':
        # Empirical kernel: point the classifier at the precomputed kernel.
        dataset = get_base_dataset(train_dataset)
        idxfile = os.path.join(IDX_DIR, IDX_FMT % dataset)
        kernelfile = os.path.join(
            PRECOMPUTED_DIR, PRECOMPUTED_FMT % (dataset, parameters['ktype']))
        parameters['dataset'] = dataset
        parameters['idxfile'] = idxfile
        parameters['kernelfile'] = kernelfile
        empirical_labels = list(map(str, train.bag_ids))
        if parameters.pop('transductive', False):
            empirical_labels += list(map(str, test.bag_ids))
        parameters['empirical_labels'] = empirical_labels
        train.bags = train.bag_ids
        test.bags = test.bag_ids
    classifier_name = parameters.pop('classifier')
    if classifier_name not in CLASSIFIERS:
        print('Technique "%s" not supported' % classifier_name)
        callback.quit = True
        return
    n_labels = 5  # one independent classifier per label column
    classifiers = [CLASSIFIERS[classifier_name](**parameters)
                   for _ in range(n_labels)]
    print('Training...')
    timer.start('training')
    if train.regression:
        # NOTE(review): the original trained only the *second* classifier in
        # the regression branch; behavior preserved — confirm intent.
        classifiers[1].fit(train.bags, train.bag_labels)
    else:
        for i, clf in enumerate(classifiers):
            clf.fit(train.instances, train.instance_labels[:, i].reshape((-1,)))
    timer.stop('training')
    print('Computing test bag predictions...')
    timer.start('test_bag_predict')
    per_label_preds = [clf.predict(test.instances) for clf in classifiers]
    timer.stop('test_bag_predict')
    if INSTANCE_PREDICTIONS:
        print('Computing test instance predictions...')
        timer.start('test_instance_predict')
        instance_predictions = classifiers[0].predict(test.instances_as_bags)
        timer.stop('test_instance_predict')
    print('Computing train bag predictions...')
    timer.start('train_bag_predict')
    train_bag_labels = classifiers[0].predict()  # Saves results from training set
    timer.stop('train_bag_predict')
    if INSTANCE_PREDICTIONS:
        print('Computing train instance predictions...')
        timer.start('train_instance_predict')
        train_instance_labels = classifiers[0].predict(train.instances_as_bags)
        timer.stop('train_instance_predict')
    print('Constructing submission...')
    # Add statistics
    for attribute in ('linear_obj', 'quadratic_obj'):
        if hasattr(classifiers[0], attribute):
            submission['statistics'][attribute] = getattr(classifiers[0], attribute)
    submission['statistics'].update(timer.get_all('_time'))
    # One column per label.
    bag_predictions = np.hstack([p[:, np.newaxis] for p in per_label_preds])
    for (_, i), y in zip(test.instance_ids, map(tuple, bag_predictions)):
        submission['bag_predictions']['test'][i] = list(map(float, y))
    for (_, i), y in zip(train.instance_ids, train_bag_labels.flat):
        submission['bag_predictions']['train'][i] = float(y)
    if INSTANCE_PREDICTIONS:
        for i, y in zip(test.instance_ids, instance_predictions.flat):
            submission['instance_predictions']['test'][i] = float(y)
        for i, y in zip(train.instance_ids, train_instance_labels.flat):
            submission['instance_predictions']['train'][i] = float(y)
    # For backwards compatibility with older versions of scikit-learn
    if train.regression:
        from sklearn.metrics import r2_score as score
        scorename = 'R^2'
    else:
        try:
            from sklearn.metrics import roc_auc_score as score
        except ImportError:
            from sklearn.metrics import auc_score as score
        scorename = 'AUC'
    try:
        if test.bag_labels.size > 1:
            auc_per_label = [score(test.instance_labels[:, i], bag_predictions[:, i])
                             for i in range(n_labels)]
            mean_auc = np.mean(auc_per_label)
            submission['statistics'][scorename] = mean_auc
            print('Test Bag Average %s Score: %f' % (scorename, mean_auc))
            print('Test Bag Individual %s Score: ' % scorename
                  + ','.join(map(str, auc_per_label)))
    except Exception as e:
        print("Couldn't compute scores.")
        print(e)
    print('Finished task %s.' % str(experiment_id))
    return submission
def fit_predict(self, y, train=None, predict=None, show_steps=True):
    """ Fit each model on the appropriate dataset, then return the average
        of their individual predictions. If train is specified, use a subset
        of the training set to train the models, then predict the outcome of
        either the remaining samples or (if given) those specified in predict.
        If train is omitted, train the models on the full training set, then
        predict the outcome of the full test set.

        Options:
        ------------------------------

        - y: numpy array. The full vector of the ground truths.

        - train: list. The indices of the elements to be used for training.
            If None, take the entire training set.

        - predict: list. The indices of the elements to be predicted.

        - show_steps: boolean. Whether to compute metrics after each stage
            of the computation.
    """
    y_train = y[train] if train is not None else y
    # Default prediction set: everything not used for training.
    if train is not None and predict is None:
        predict = [i for i in range(len(y)) if i not in train]
    # stage0_train holds cross-validated train-set predictions (stacking);
    # stage0_predict holds each model's predictions on the predict set.
    stage0_train = []
    stage0_predict = []
    for model, feature_set in self.models:
        X_train, X_predict = get_dataset(feature_set, train, predict)
        # Cache key is model+features plus the first train index (or -1 for
        # the full training set) to distinguish CV splits.
        identifier = train[0] if train is not None else -1
        cache_file = stringify(model, feature_set) + str(identifier)
        model_preds = self._get_model_preds(
            model, X_train, X_predict, y_train, cache_file)
        stage0_predict.append(model_preds)
        # if stacking, compute cross-validated predictions on the train set
        if self.stack:
            model_cv_preds = self._get_model_cv_preds(
                model, X_train, y_train, cache_file)
            stage0_train.append(model_cv_preds)
        # verbose mode: compute metrics after every model computation
        if show_steps:
            if train is not None:
                mean_preds, stack_preds, fwls_preds = self._combine_preds(
                    np.array(stage0_train).T, np.array(stage0_predict).T,
                    y_train, train, predict,
                    stack=self.stack, fwls=self.fwls)
                model_auc = compute_auc(y[predict], stage0_predict[-1])
                mean_auc = compute_auc(y[predict], mean_preds)
                stack_auc = compute_auc(y[predict], stack_preds) \
                    if self.stack else 0
                fwls_auc = compute_auc(y[predict], fwls_preds) \
                    if self.fwls else 0
                logger.info(
                    "> AUC: %.4f (%.4f, %.4f, %.4f) [%s]", model_auc,
                    mean_auc, stack_auc, fwls_auc,
                    stringify(model, feature_set))
            else:
                logger.info("> used model %s:\n%s", stringify(
                    model, feature_set), model.get_params())
    # Optionally keep only the best-performing subset of models.
    if self.model_selection and predict is not None:
        best_subset = self._find_best_subset(y[predict], stage0_predict)
        stage0_train = [pred for i, pred in enumerate(stage0_train)
                        if i in best_subset]
        stage0_predict = [pred for i, pred in enumerate(stage0_predict)
                          if i in best_subset]
    # Final blend: mean, stacked, or feature-weighted (FWLS) predictions.
    mean_preds, stack_preds, fwls_preds = self._combine_preds(
        np.array(stage0_train).T, np.array(stage0_predict).T,
        y_train, stack=self.stack, fwls=self.fwls)
    if self.stack:
        selected_preds = stack_preds if not self.fwls else fwls_preds
    else:
        selected_preds = mean_preds
    return selected_preds
def client_target(task, callback):
    """Run a single bag-level MIL experiment task and build its submission.

    Args:
        task: dict with 'key' (experiment/dataset identifiers) and
            'parameters' (classifier configuration).
        callback: controller object; callback.quit is set on fatal errors.

    Returns:
        submission dict with bag (and optionally instance) predictions plus
        timing/objective statistics, or None for an unknown classifier.

    Fixes relative to the original: the bare `except:` guarding the
    roc_auc_score import fallback is narrowed to ImportError, and prints use
    the py2/py3-compatible single-argument parenthesized form.
    """
    (experiment_name, experiment_id,
     train_dataset, test_dataset, _, _) = task['key']
    parameters = task['parameters']
    print('Starting task %s...' % str(experiment_id))
    print('Training Set: %s' % train_dataset)
    print('Test Set: %s' % test_dataset)
    print('Parameters:')
    for k, v in parameters.items():
        print('\t%s: %s' % (k, str(v)))
    train = get_dataset(train_dataset)
    test = get_dataset(test_dataset)
    submission = {
        'instance_predictions': {'train': {}, 'test': {}},
        'bag_predictions': {'train': {}, 'test': {}},
        'statistics': {},
    }
    timer = Timer()
    if parameters['kernel'] == 'emp':
        # Empirical kernel: point the classifier at the precomputed kernel.
        dataset = get_base_dataset(train_dataset)
        idxfile = os.path.join(IDX_DIR, IDX_FMT % dataset)
        kernelfile = os.path.join(
            PRECOMPUTED_DIR, PRECOMPUTED_FMT % (dataset, parameters['ktype']))
        parameters['dataset'] = dataset
        parameters['idxfile'] = idxfile
        parameters['kernelfile'] = kernelfile
        empirical_labels = list(map(str, train.bag_ids))
        if parameters.pop('transductive', False):
            empirical_labels += list(map(str, test.bag_ids))
        parameters['empirical_labels'] = empirical_labels
        train.bags = train.bag_ids
        test.bags = test.bag_ids
    classifier_name = parameters.pop('classifier')
    if classifier_name in CLASSIFIERS:
        classifier = CLASSIFIERS[classifier_name](**parameters)
    else:
        print('Technique "%s" not supported' % classifier_name)
        callback.quit = True
        return
    print('Training...')
    timer.start('training')
    if train.regression:
        classifier.fit(train.bags, train.bag_labels)
    else:
        # Classification uses +/-1 encoded bag labels.
        classifier.fit(train.bags, train.pm1_bag_labels)
    timer.stop('training')
    print('Computing test bag predictions...')
    timer.start('test_bag_predict')
    bag_predictions = classifier.predict(test.bags)
    timer.stop('test_bag_predict')
    if INSTANCE_PREDICTIONS:
        print('Computing test instance predictions...')
        timer.start('test_instance_predict')
        instance_predictions = classifier.predict(test.instances_as_bags)
        timer.stop('test_instance_predict')
    print('Computing train bag predictions...')
    timer.start('train_bag_predict')
    train_bag_labels = classifier.predict()  # Saves results from training set
    timer.stop('train_bag_predict')
    if INSTANCE_PREDICTIONS:
        print('Computing train instance predictions...')
        timer.start('train_instance_predict')
        train_instance_labels = classifier.predict(train.instances_as_bags)
        timer.stop('train_instance_predict')
    print('Constructing submission...')
    # Add statistics
    for attribute in ('linear_obj', 'quadratic_obj'):
        if hasattr(classifier, attribute):
            submission['statistics'][attribute] = getattr(classifier, attribute)
    submission['statistics'].update(timer.get_all('_time'))
    for i, y in zip(test.bag_ids, bag_predictions.flat):
        submission['bag_predictions']['test'][i] = float(y)
    for i, y in zip(train.bag_ids, train_bag_labels.flat):
        submission['bag_predictions']['train'][i] = float(y)
    if INSTANCE_PREDICTIONS:
        for i, y in zip(test.instance_ids, instance_predictions.flat):
            submission['instance_predictions']['test'][i] = float(y)
        for i, y in zip(train.instance_ids, train_instance_labels.flat):
            submission['instance_predictions']['train'][i] = float(y)
    # For backwards compatibility with older versions of scikit-learn
    if train.regression:
        from sklearn.metrics import r2_score as score
        scorename = 'R^2'
    else:
        try:
            from sklearn.metrics import roc_auc_score as score
        except ImportError:
            # Fix: narrowed from a bare `except:` to the import failure it guards.
            from sklearn.metrics import auc_score as score
        scorename = 'AUC'
    try:
        if train.bag_labels.size > 1:
            print('Training Bag %s Score: %f'
                  % (scorename, score(train.bag_labels, train_bag_labels)))
        if INSTANCE_PREDICTIONS and train.instance_labels.size > 1:
            print('Training Inst. %s Score: %f'
                  % (scorename, score(train.instance_labels, train_instance_labels)))
        if test.bag_labels.size > 1:
            print('Test Bag %s Score: %f'
                  % (scorename, score(test.bag_labels, bag_predictions)))
        if INSTANCE_PREDICTIONS and test.instance_labels.size > 1:
            print('Test Inst. %s Score: %f'
                  % (scorename, score(test.instance_labels, instance_predictions)))
    except Exception as e:
        print("Couldn't compute scores.")
        print(e)
    print('Finished task %s.' % str(experiment_id))
    return submission
def main(**options):
    """Run k-fold cross-validation of a decision-tree classifier.

    Pops dataset/fold/feature-selection settings out of options, forwards the
    remainder to get_classifier, and prints pooled accuracy plus the maximum
    tree size/depth observed across folds.

    Fixes relative to the original:
    - train_time was computed as `train_start - time.time()`, a *negative*
      elapsed time; it is now `time.time() - train_start`.
    - Prints use the py2/py3-compatible single-argument parenthesized form.
    - Removed commented-out debugging code.
    """
    dataset_directory = options.pop('dataset_directory', '.')
    dataset = options.pop('dataset')
    k = options.pop('k')
    if "meta_algorithm" in options and "meta_iters" not in options:
        # Make sure they use --meta-iters if they want to do bagging/boosting.
        raise ValueError("Please indicate number of iterations for %s"
                         % options["meta_algorithm"])
    fs_alg = None
    if "fs_algorithm" in options:
        fs_alg = options.pop("fs_algorithm")
        if "fs_features" not in options:
            raise ValueError("Please indicate number of features for %s" % fs_alg)
        fs_n = options.pop("fs_features")
    schema, X, y = get_dataset(dataset, dataset_directory)
    folds = get_folds(X, y, k)
    stats_manager = StatisticsManager()
    # Track the largest tree size/depth over the k folds for the final report.
    max_size = -1
    max_depth = -1
    for train_X, train_y, test_X, test_y in folds:
        # Construct classifier instance
        print(options)
        classifier = get_classifier(**options)
        classifier.schema = schema
        # Train classifier
        train_start = time.time()
        if fs_alg:
            selector = FS_ALGORITHMS[fs_alg](n=fs_n)
            selector.fit(train_X)
            train_X = selector.transform(train_X)
        # Note that fit takes the schema (nominal/continuous feature info).
        classifier.fit(train_X, train_y, schema)
        train_time = time.time() - train_start  # fix: elapsed = now - start
        if classifier.size > max_size:
            max_size = classifier.size
        if classifier.depth > max_depth:
            max_depth = classifier.depth
        if fs_alg:
            test_X = selector.transform(test_X)
        predictions = classifier.predict(test_X)
        scores = classifier.predict_proba(test_X)
        if len(np.shape(scores)) > 1 and np.shape(scores)[1] > 1:
            scores = scores[:, 1]  # Get the column for label 1
        stats_manager.add_fold(test_y, predictions, scores, train_time)
    # The printouts specified by the assignment.
    print('\tAccuracy: %.03f %.03f'
          % stats_manager.get_statistic('accuracy', pooled=False))
    print("\tMaximum Size: %d" % max_size)
    print("\tMaximum Depth: %d" % max_depth)
def main(): data = get_dataset('trx') #test_nsk(data.bags) K1 = test_emd(data.bags)
def main(): data = get_dataset('musk1') test_nsk(data.bags)