Example No. 1
    def transform(self):
        """Transform data by mapping it into the latent space."""
        # Note: this maps each input to the mean of its latent distribution;
        # alternatively we could sample from the Gaussian posterior.
        # The returned r has shape (num_batches * batch_size, z_dim), e.g.
        # (69952, 62): slightly fewer rows than the 70000 original samples,
        # because the last partial batch is dropped.
        # Reload the data without shuffling before mapping it to z space.
        if self.label != -1:
            X, y = utils_parent.load_mnist(self.dataset_name, shuffle=False)
            d = split_data_according_to_label(X, y, self.num_labels)
            noshuffle_data_X = X[d[str(self.label)]]
            # y represent the index with label i
            noshuffle_data_y = d[str(self.label)]
        else:
            noshuffle_data_X, noshuffle_data_y = utils_parent.load_mnist(self.dataset_name,
                                                               shuffle=False)

        batch_images = noshuffle_data_X[0:self.batch_size]
        r = self.sess.run(self.mu, feed_dict={self.inputs: batch_images})
        for idx in range(1, self.num_batches):  # walk the dataset batch by batch so each feed matches the network's fixed input tensor shape
            batch_images = noshuffle_data_X[idx * self.batch_size:(idx + 1) * self.batch_size]
            z = self.sess.run(self.mu, feed_dict={self.inputs: batch_images})
            r = tf.concat([r, z], 0)
        return r, noshuffle_data_y
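Because tf.concat is called inside the Python loop, a new graph op is added on every iteration and the result is a tensor rather than a NumPy array. Below is a minimal, self-contained sketch of the same batch-and-stack pattern using NumPy only; the encode callable is a hypothetical stand-in for sess.run(self.mu, feed_dict={self.inputs: batch}).

import numpy as np

def encode_in_batches(X, encode, batch_size):
    """Map X to the latent space batch by batch and stack the per-batch codes."""
    num_batches = len(X) // batch_size          # the trailing partial batch is dropped
    chunks = []
    for idx in range(num_batches):
        batch = X[idx * batch_size:(idx + 1) * batch_size]
        chunks.append(encode(batch))            # e.g. sess.run(mu, feed_dict={inputs: batch})
    return np.concatenate(chunks, axis=0)

# toy check with an identity "encoder"
z = encode_in_batches(np.random.rand(10, 4), lambda b: b, batch_size=3)
print(z.shape)                                  # (9, 4): three full batches of three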
Example No. 2
    def __init__(self, sess, epoch, batch_size, z_dim, dataset_name, checkpoint_dir, result_dir, log_dir, label=-1, num_labels=10, config_manager=None):
        self.sess = sess
        self.dataset_name = dataset_name
        self.checkpoint_dir = checkpoint_dir
        self.result_dir = result_dir
        self.log_dir = log_dir
        self.epoch = epoch
        self.batch_size = batch_size
        self.label = label
        self.config_manager = config_manager
        self.num_labels = num_labels

        if dataset_name == 'mnist' or dataset_name == 'fashion-mnist':
            # parameters
            self.input_height = 28
            self.input_width = 28
            self.output_height = 28
            self.output_width = 28

            self.z_dim = z_dim         # dimension of noise-vector
            self.c_dim = 1

            # train
            self.learning_rate = 0.0002
            self.beta1 = 0.5

            # test
            self.sample_num = 64  # number of generated images to be saved

            # load mnist
            # if a label is given (label != -1), restrict the training data to the samples with that label
            if label != -1:
                X, y = utils_parent.load_mnist(self.dataset_name)
                # label_index[str(i)] holds the indices of samples with label i
                label_index = split_data_according_to_label(X, y, num_labels)
                # extract the samples with the requested label from the full training data
                self.data_X = X[label_index[str(label)]]
                # data_y stores the original indices of these samples, not their labels
                self.data_y = label_index[str(label)]
                # self.data_y = y[label_index[str(label)]]
            else:
                self.data_X, self.data_y = utils_parent.load_mnist(self.dataset_name)

            # get number of batches for a single epoch
            self.num_batches = len(self.data_X) // self.batch_size
        elif dataset_name == 'imagenet':
            raise NotImplementedError
        else:
            raise NotImplementedError
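split_data_according_to_label is defined elsewhere in the project and not shown on this page. A minimal sketch of what such a helper could look like, assuming y is one-hot encoded as load_mnist returns it (the name and X argument are kept only to mirror the original call):

import numpy as np

def split_data_according_to_label_sketch(X, y, num_labels):
    """Return a dict mapping each label (as a string key) to the indices of its samples."""
    # X is unused here; it is kept to mirror the original signature
    labels = y.argmax(axis=1)   # collapse one-hot rows to integer class ids
    return {str(i): np.where(labels == i)[0] for i in range(num_labels)}

# toy usage
y = np.eye(3)[[0, 2, 2, 1]]     # one-hot labels 0, 2, 2, 1
print(split_data_according_to_label_sketch(None, y, 3))
# {'0': array([0]), '1': array([3]), '2': array([1, 2])}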
Example No. 3
    def __init__(self,
                 pattern="/global_index_cluster_data.npy",
                 root_dir='../results/VAE_fashion-mnist_64_62',
                 transform=None,
                 list_idx=[0],
                 dsname="fashion-mnist",
                 num_labels=10,
                 num_cluster=5):
     """
     Args:
         pattern (string): Path to the npy file.
         root_dir (string): Directory with all the images.
         transform (callable, optional): Optional transform to be applied
             on a sample.
         list_idx (list): the list of indexes of the cluster to choose as trainset or testset
         for example
         trainset = VGMMDataset(list_idx = [0,1, 2, 3])
         testset = VGMMDataset(list_idx = [4])
         dsname: currently dsname is fashion-mnist, but not used at all
     """
        X, y = utils_parent.load_mnist(dsname)
        y = y.argmax(axis=1)  # convert one-hot labels to integer class ids
        self.root_dir = root_dir
        self.pattern = pattern
        self.transform = transform
        # if cluster == True:
        if not tf.gfile.Exists(self.root_dir + self.pattern):
            _, self.global_index = concatenate_data_from_dir(
                self.root_dir, num_labels=num_labels, num_clusters=num_cluster)
        else:
            self.global_index = np.load(self.root_dir + pattern,
                                        allow_pickle=True)
        self.list_idx = list_idx
        all_inds = []
        print('cluster index list:' + str(list_idx))
        for index in self.list_idx:
            # self.global_index is a dictionary such as
            # {'0': [15352, 2152, 21, 25, ...], '1': [1121, 1252, 3195, ...]}
            to_append = self.global_index.item().get(str(index))
            print('\n size of cluster:' + str(np.shape(to_append)) + '\n')
            all_inds = np.append(all_inds, to_append)
            print(all_inds.shape)
        self.all_inds = all_inds.tolist()
        # self.all_inds = map(round, self.all_inds)
        if self.all_inds is not None:
            # np.append returns floats, so round back to integer indices
            self.all_inds = [round(a) for a in self.all_inds]
            self.samples = {
                "x": X.take(self.all_inds, axis=0),
                "y": y.take(self.all_inds, axis=0)
            }
            print('\n size of dataset:' + str(np.shape(self.all_inds)) + '\n')
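The constructor above only builds self.samples; for the class to work with torch.utils.data.DataLoader (as in the snippets further down) it also needs __len__ and __getitem__. A minimal sketch of those two methods, assuming samples["x"] and samples["y"] are NumPy arrays:

    def __len__(self):
        return len(self.all_inds)

    def __getitem__(self, idx):
        x = self.samples["x"][idx]
        y = self.samples["y"][idx]
        if self.transform is not None:
            x = self.transform(x)
        return x, y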
Example No. 4
def counting_label(num_labels, num_clusters):
    # load data; y is one-hot encoded
    _, y = utils_parent.load_mnist(config.dataset_name)
    global_index = np.load(config.data_path + config.global_index_name, allow_pickle=True)
    results = {}
    for i in range(num_clusters):
        index = global_index.item().get(str(i))
        # sum the one-hot labels of the cluster, then normalize to a label distribution
        temp_y = np.sum(y[index], axis=0)
        total = np.sum(temp_y)
        temp_y = temp_y / total
        results[str(i)] = temp_y

    with open("distribution_y.txt", 'a') as lf:
        lf.write(str(results))
    return results
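The normalization step in counting_label turns the per-cluster sum of one-hot labels into a label distribution. A tiny self-contained illustration with toy data (not the project's):

import numpy as np

y = np.array([[1, 0, 0],          # three one-hot labels: classes 0, 0, 2
              [1, 0, 0],
              [0, 0, 1]])
counts = y.sum(axis=0)            # [2, 0, 1]
distribution = counts / counts.sum()
print(distribution)               # roughly [0.667 0.    0.333]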
def main(unused_argv):
    # parse arguments
    args = parse_args()
    if args is None:
        exit()

    # load training and eval data
    X, y = utils_parent.load_mnist(args.dataset)
    results_random_resample = cross_validation(X, y, config.num_clusters,
                                               args)
    results_shifted = cross_validation_for_clustered_data(
        X, y, config.data_path, config.num_labels, config.num_clusters, args)
    print("***********random************")
    print(results_random_resample)
    print("***********shifted************")
    print(results_shifted)
    utils_parent.write_results_convnet_to_csv("results_random.csv",
                                              results_random_resample)
    utils_parent.write_results_convnet_to_csv("results_cluster.csv",
                                              results_shifted)
    def __init__(self, sess, epoch, batch_size, z_dim, dataset_name,
                 checkpoint_dir, result_dir, log_dir):
        self.sess = sess
        self.dataset_name = dataset_name
        self.checkpoint_dir = checkpoint_dir
        self.result_dir = result_dir
        self.log_dir = log_dir
        self.epoch = epoch
        self.batch_size = batch_size

        if dataset_name == 'mnist' or dataset_name == 'fashion-mnist':
            # parameters
            self.input_height = 28
            self.input_width = 28
            self.output_height = 28
            self.output_width = 28

            self.z_dim = z_dim  # dimension of noise-vector
            self.y_dim = 10  # dimension of code-vector (label)
            self.c_dim = 1

            # train
            self.learning_rate = 0.0002
            self.beta1 = 0.5

            # test
            self.sample_num = 64  # number of generated images to be saved

            # code
            self.len_discrete_code = 10  # categorical distribution (i.e. label)
            self.len_continuous_code = 2  # gaussian distribution (e.g. rotation, thickness)

            # load mnist
            self.data_X, self.data_y = utils_parent.load_mnist(
                self.dataset_name)

            # get number of batches for a single epoch
            self.num_batches = len(self.data_X) // self.batch_size
        else:
            raise NotImplementedError
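The code dimensions above (z_dim noise dimensions, a 10-way categorical code, and two continuous codes) describe the latent input of an InfoGAN-style model. A toy sketch of sampling one batch of such latent variables with NumPy; the concatenated layout is an assumption for illustration, not necessarily the model's exact input format:

import numpy as np

batch_size, z_dim = 64, 62
z = np.random.uniform(-1, 1, size=(batch_size, z_dim))         # noise vector
cat = np.eye(10)[np.random.randint(0, 10, size=batch_size)]    # one-hot categorical code
cont = np.random.uniform(-1, 1, size=(batch_size, 2))          # continuous codes
latent = np.concatenate([z, cat, cont], axis=1)
print(latent.shape)                                            # (64, 74)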
def cross_validation(num_labels, num_cluster, args):
    print("cross validation for random resampling")
    best_acc = 0
    resize = cf.resize
    start_epoch, num_epochs, batch_size, optim_type = cf.start_epoch, cf.num_epochs, cf.batch_size, cf.optim_type
    results = {}
    X, y = utils_parent.load_mnist('fashion-mnist')
    kf = KFold(n_splits=num_cluster, shuffle=True)
    i = 0
    for train_eval_idx, test_idx in kf.split(X, y):  # kf.split yields one (train, test) pair of index arrays per fold
        cv_idx = i
        i = i + 1
        trainset, evalset, testset, inputs, outputs = prepare_data_for_normal_cv(
            args, train_eval_idx, test_idx, resize)
        # Hyper Parameter settings
        use_cuda = torch.cuda.is_available()
        use_cuda = cf.use_cuda()  # the config setting overrides the availability check
        if use_cuda:
            torch.cuda.set_device(0)
        best_acc = 0
        resize = cf.resize
        start_epoch, num_epochs, batch_size, optim_type = cf.start_epoch, cf.num_epochs, cf.batch_size, cf.optim_type

        trainloader = torch.utils.data.DataLoader(trainset,
                                                  batch_size=batch_size,
                                                  shuffle=True,
                                                  num_workers=4)
        evalloader = torch.utils.data.DataLoader(evalset,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 num_workers=4)
        testloader = torch.utils.data.DataLoader(testset,
                                                 batch_size=batch_size,
                                                 shuffle=False,
                                                 num_workers=4)

        # num_workers: how many subprocesses to use for data loading. 0 means that
        # the data will be loaded in the main process (default: 0).
        # getNetwork returns the network and its file name.

        # Model
        print('\n[Phase 2] : Model setup')
        if args.resume:
            # Load checkpoint
            print('| Resuming from checkpoint...')
            assert os.path.isdir(
                'checkpoint'), 'Error: No checkpoint directory found!'
            _, file_name = getNetwork(args, inputs, outputs)

            checkpoint = torch.load('./checkpoint/' + args.dataset + os.sep +
                                    file_name + args.cv_type + str(cv_idx) +
                                    '.t7')
            net = checkpoint['net']
            best_acc = checkpoint['acc']
            start_epoch = checkpoint['epoch']
        else:
            print('| Building net type [' + args.net_type + ']...')
            net, file_name = getNetwork(args, inputs, outputs)

        if use_cuda:
            net.cuda()

        vi = GaussianVariationalInference(torch.nn.CrossEntropyLoss())

        #logfile = os.path.join('diagnostics_Bayes{}_{}.txt'.format(args.net_type, args.dataset))
        logfile_train = os.path.join(
            'diagnostics_Bayes{}_{}_cv{}_train_rand.txt'.format(
                args.net_type, args.dataset, i))
        logfile_test = os.path.join(
            'diagnostics_Bayes{}_{}_cv{}_test_rand.txt'.format(
                args.net_type, args.dataset, i))
        logfile_eval = os.path.join(
            'diagnostics_Bayes{}_{}_cv{}_val_rand.txt'.format(
                args.net_type, args.dataset, i))

        print('\n[Phase 3] : Training model')
        print('| Training Epochs = ' + str(num_epochs))
        print('| Initial Learning Rate = ' + str(args.lr))
        print('| Optimizer = ' + str(optim_type))

        elapsed_time = 0

        train_return = []
        test_return = []
        eval_return = []

        for epoch in range(start_epoch, start_epoch + num_epochs):
            start_time = time.time()

            temp_train_return = train(epoch, trainset, inputs, net, batch_size,
                                      trainloader, resize, num_epochs,
                                      use_cuda, vi, logfile_train)
            temp_eval_return = test(epoch, evalset, inputs, batch_size,
                                    evalloader, net, use_cuda, num_epochs,
                                    resize, vi, logfile_eval, file_name)
            temp_test_return = test(epoch, testset, inputs, batch_size,
                                    testloader, net, use_cuda, num_epochs,
                                    resize, vi, logfile_test, "test")

            train_return = np.append(train_return, temp_train_return)
            eval_return = np.append(eval_return, temp_eval_return)
            test_return = np.append(test_return, temp_test_return)

            print(temp_train_return)
            print(temp_eval_return)
            print(temp_test_return)

            epoch_time = time.time() - start_time
            elapsed_time += epoch_time
            print('| Elapsed time : %d:%02d:%02d' % (cf.get_hms(elapsed_time)))

        print('\n[Phase 4] : Testing model')
        print('* Test results : Acc@1 = %.2f%%' % (best_acc))
        results[str(i)] = {
            "train": train_return,
            "test": test_return,
            "eval": eval_return
        }
        print(results)
    return results
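A minimal self-contained illustration of the KFold splitting used above, showing the per-fold index arrays on toy data:

import numpy as np
from sklearn.model_selection import KFold

X = np.arange(20).reshape(10, 2)
kf = KFold(n_splits=5, shuffle=True, random_state=0)
for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
    print(fold, train_idx.shape, test_idx.shape)   # 8 train and 2 test indices per fold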
                        type=bool,
                        help="debug mode has smaller data")
    parser.add_argument('--cv_idx', default=0, type=int, help='index of cv')
    args = parser.parse_args()
    global cv_idx
    cv_idx = 0
    if args.cv_type == "vgmm":
        global result
        # result ={}
        with MyPool(multiprocessing.cpu_count()) as p:
            result = p.map(cross_validation_for_clustered_data_parallel,
                           list(range(config_parent.num_clusters)))

        # result = cross_validation_for_clustered_data(num_labels=config_parent.num_labels,num_cluster=config_parent.num_clusters,args=args)
    else:
        X, y = utils_parent.load_mnist('fashion-mnist')
        kf = KFold(n_splits=config_parent.num_clusters)
        global global_rand_idx
        global_rand_idx = {}
        i = 0
        for train_eval_idx, test_idx in kf.split(X, y):
            global_rand_idx[str(i)] = {
                "train_eval_idx": train_eval_idx,
                "test_idx": test_idx
            }
            with MyPool(multiprocessing.cpu_count()) as p:
                result = p.map(cross_validation_parallel,
                               list(range(config_parent.num_clusters)))
        # result = cross_validation(config_parent.num_labels,config_parent.num_clusters,args)

    final_file_prefix = "Bayes_" + args.cv_type + '_' + args.net_type + '_cross_validation_result'
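In the vgmm branch above, the cross-validation folds are fanned out over cluster indices with a process pool (MyPool is the project's own wrapper and is not shown here). A minimal sketch of that map-over-indices pattern with the standard multiprocessing.Pool, using a hypothetical run_fold stand-in for cross_validation_for_clustered_data_parallel:

import multiprocessing

def run_fold(cluster_idx):
    # stand-in for cross_validation_for_clustered_data_parallel(cluster_idx)
    return {"cluster": cluster_idx, "acc": 0.0}

if __name__ == "__main__":
    with multiprocessing.Pool(multiprocessing.cpu_count()) as p:
        result = p.map(run_fold, range(5))
    print(result)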
                                          transform=transform_train)
trainset_refactor = refactor_dataset_class.VGMMDataset(
    transform=transform_train)

trainloader_org = torch.utils.data.DataLoader(trainset_org,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=4)
trainloader_refactor = torch.utils.data.DataLoader(trainset_refactor,
                                                   batch_size=batch_size,
                                                   shuffle=False,
                                                   num_workers=4)
# num_workers: how many subprocesses to use for data loading. 0 means that
# the data will be loaded in the main process (default: 0).
import utils_parent

# quick inspection of the label format: load_mnist returns one-hot labels,
# and argmax converts them to integer class ids
X, y = utils_parent.load_mnist("fashion-mnist")
print(type(y), y.shape)
ynew = y.argmax(axis=1)
print(type(ynew), ynew.shape)

for batch_idx, (inputs_value, targets) in enumerate(trainloader_org):
    print(inputs_value)
    print(targets)
    print("......")
    print(targets.type)
    print(targets.shape)
    break

for batch_idx, (inputs_value, targets) in enumerate(trainloader_refactor):