Example No. 1
def load_and_shuffle(data, im_size, shuffle=True):
    """Read csv into numpy and reshape to match image size.
    Works only for mnist, change image size for each use case
    """
    y, X = np.split(data.values, (1, ), axis=1)
    Y = np.eye(num_classes)[y[:, 0]]
    X = X.reshape(-1, *im_size, 1)
    if shuffle:
        return shuffle_(X, Y)
    return X, Y
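
A minimal usage sketch (assumptions: the csv stores one image per row with the label in the first column, `num_classes` is defined at module scope, and `shuffle_` is sklearn's `shuffle` imported under that name):

import numpy as np
import pandas as pd
from sklearn.utils import shuffle as shuffle_

num_classes = 10
data = pd.read_csv('mnist_train.csv')  # hypothetical path; label in the first column
X, Y = load_and_shuffle(data, im_size=(28, 28))
print(X.shape, Y.shape)  # e.g. (60000, 28, 28, 1) (60000, 10)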
Example No. 2
def hacer_blobs(n_muestras=100, n_caracteristica=2, centros=2, cluster_std=1.0,
                centro_caja=(-10.0, 10.0), shuffle=True, random_state=None):
    """Generate isotropic Gaussian blobs for clustering.

    Spanish-named port of scikit-learn's `make_blobs`; see Example No. 3
    for the fully documented English version.
    """
    generador = check_random_state(random_state)

    if isinstance(centros, numbers.Integral):
        centros = generador.uniform(centro_caja[0], centro_caja[1],
                                    size=(centros, n_caracteristica))
    else:
        centros = check_array(centros)
        n_caracteristica = centros.shape[1]

    if isinstance(cluster_std, numbers.Real):
        cluster_std = np.ones(len(centros)) * cluster_std

    X = []
    y = []

    n_centros = centros.shape[0]
    if isinstance(n_muestras, numbers.Integral):
        n_muestras_por_centro = [int(n_muestras // n_centros)] * n_centros
        for i in range(n_muestras % n_centros):
            n_muestras_por_centro[i] += 1
    else:
        n_muestras_por_centro = n_muestras

    for i, (n, std) in enumerate(zip(n_muestras_por_centro, cluster_std)):
        X.append(centros[i] + generador.normal(scale=std,
                                               size=(n, n_caracteristica)))
        y += [i] * n

    X = np.concatenate(X)
    y = np.array(y)

    if shuffle:
        X, y = shuffle_(X, y, random_state=generador)

    return X, y
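
A usage sketch, assuming the imports the scikit-learn original relies on (`numbers`, NumPy, and sklearn's `check_random_state`, `check_array`, and `shuffle` imported as `shuffle_`):

import numbers
import numpy as np
from sklearn.utils import check_array, check_random_state
from sklearn.utils import shuffle as shuffle_

X, y = hacer_blobs(n_muestras=10, centros=3, random_state=0)
print(X.shape)  # (10, 2)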
Example No. 3
def make_blobs(n_samples=100,
               n_features=2,
               centers=2,
               cluster_std=1.0,
               center_box=(-10.0, 10.0),
               shuffle=True,
               random_state=None):
    """Generate isotropic Gaussian blobs for clustering.

    Read more in the :ref:`User Guide <sample_generators>`.

    Parameters
    ----------
    n_samples : int or sequence of ints, optional (default=100)
        If int, the total number of points equally divided among clusters;
        if a sequence, the number of points per cluster.

    n_features : int, optional (default=2)
        The number of features for each sample.

    centers : int or array of shape [n_centers, n_features], optional
        (default=2)
        The number of centers to generate, or the fixed center locations.

    cluster_std : float or sequence of floats, optional (default=1.0)
        The standard deviation of the clusters.

    center_box : pair of floats (min, max), optional (default=(-10.0, 10.0))
        The bounding box for each cluster center when centers are
        generated at random.

    shuffle : boolean, optional (default=True)
        Shuffle the samples.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    X : array of shape [n_samples, n_features]
        The generated samples.

    y : array of shape [n_samples]
        The integer labels for cluster membership of each sample.

    Examples
    --------
    >>> from sklearn.datasets.samples_generator import make_blobs
    >>> X, y = make_blobs(n_samples=10, centers=3, n_features=2,
    ...                   random_state=0)
    >>> print(X.shape)
    (10, 2)
    >>> y
    array([0, 0, 1, 0, 2, 2, 2, 1, 1, 0])

    See also
    --------
    make_classification: a more intricate variant
    """
    generator = check_random_state(random_state)

    if isinstance(centers, numbers.Integral):
        centers = generator.uniform(center_box[0],
                                    center_box[1],
                                    size=(centers, n_features))
    else:
        centers = check_array(centers)
        n_features = centers.shape[1]

    if isinstance(cluster_std, numbers.Real):
        cluster_std = np.ones(len(centers)) * cluster_std

    X = []
    y = []

    n_centers = centers.shape[0]
    if isinstance(n_samples, numbers.Integral):
        n_samples_per_center = [int(n_samples // n_centers)] * n_centers
        for i in range(n_samples % n_centers):
            n_samples_per_center[i] += 1
    else:
        n_samples_per_center = n_samples

    for i, (n, std) in enumerate(zip(n_samples_per_center, cluster_std)):
        X.append(centers[i] +
                 generator.normal(scale=std, size=(n, n_features)))
        y += [i] * n

    X = np.concatenate(X)
    y = np.array(y)

    if shuffle:
        X, y = shuffle_(X, y, random_state=generator)

    return X, y
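
Beyond the doctest above, the `n_samples` branch also accepts a per-center sequence; a short sketch of that path (assumes NumPy is imported as `np`):

# A sequence assigns an explicit sample count to each center.
X, y = make_blobs(n_samples=[2, 3, 5], centers=3, random_state=0, shuffle=False)
print(X.shape)         # (10, 2)
print(np.bincount(y))  # [2 3 5]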
Example No. 5
def to_nparray(dir: str,
               size,
               channels_first=False,
               labels=None,
               shuffle=True,
               verbose=False):
    """Read an image dir and output all images into a numpy array with labels (optional)

    Parameters
    ----------
    dir : str
        Image dir path. There must be no subfolder within, only images
    size : int
        2-element tuple (height, width)
    channels_first : bool
        Decide whether the output array if of dims batch x channels x height x width (like pytorch)
        or batch x height x width x channels (like np)
    labels : None or dict
        if None, then no labels array is output, otherwise must be a dict of form
        {class_name: index, ...}
    shuffle : bool
        Decide whether shuffle data or not. Default True
    verbose : bool
        Decide whether logging or not

    Returns
    -------
    np.array or tuple
        if labels not None, return a tuple of images array and labels list,
        otherwise return only the images array
    """
    imgs = []
    if labels is not None:
        assert isinstance(labels, dict), 'if not None, labels must be a dict'
        y = []

    for fn in glob.glob(os.path.join(dir, '*')):
        if verbose:
            print('loading... {}'.format(os.path.basename(fn)),
                  end='\r',
                  flush=True)
        if labels is not None:
            for name in labels:
                if os.path.basename(fn).startswith(name):
                    y.append(labels[name])
                    break
        im = Image.open(fn)
        im = im.resize(size)
        im = np.array(im)
        if channels_first:
            im = np.transpose(im, (2, 0, 1))
        imgs.append(im)
    if verbose:
        print('\nDone.')
    X = np.stack(imgs, axis=0)
    if labels is not None:
        if shuffle:
            X, y = shuffle_(X, y)
        return X, y
    if shuffle:
        X = shuffle_(X)
    return X
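
A hypothetical usage sketch (the directory path and label prefixes are assumptions; the imports mirror what the function body uses, with `shuffle_` as sklearn's `shuffle`):

import glob
import os
import numpy as np
from PIL import Image
from sklearn.utils import shuffle as shuffle_

labels = {'cat': 0, 'dog': 1}  # files such as 'cat_001.jpg' are matched by prefix
X, y = to_nparray('data/pets', size=(224, 224), labels=labels, verbose=True)
print(X.shape, len(y))  # e.g. (n, 224, 224, 3) n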
Example No. 6
    def train(self,
              training_data,
              epochs,
              mini_batch_size,
              eta,
              validation_data,
              test_data=None,
              store_accuracies=False,
              shuffle=False,
              SGDR=False,
              save_dir=None,
              calibration=False,
              lmbda=0.0):
        '''Trains the neural network using Kingma and Ba's Adam algorithm (by default).
        Other optimization algorithms such as stochastic gradient descent can also be
        used; to do so, edit the optimizer definition under the 'optimizer' name scope
        below (a GradientDescentOptimizer line is left there, commented out).

        (REQUIRED) training_data: training data in numpy array format
        (REQUIRED) epochs: number of training epochs
        (REQUIRED) mini_batch_size: size of mini batch for stochastic gradient descent
        (REQUIRED) eta: learning rate (Note: the learning rate for GradientDescentOptimizer
                   is ~100x larger than for AdamOptimizer)
        (REQUIRED) validation_data: validation data in numpy array format
        test_data: test data in numpy array format
        store_accuracies: If True, stores the train, validation and test accuracies
                          (useful for plotting against epoch during model calibration
                          and hyperparameter tuning). NOTE: train and test accuracies
                          are stored only if calibration=True.
        shuffle: Shuffle the training dataset before each epoch.
        SGDR: Stochastic Gradient Descent with Restarts. See https://arxiv.org/abs/1608.03983
        save_dir: Directory to restore the model from and save it to. If None, a new
                  model is initialized and nothing is saved.
        calibration: If True, calculates train and test accuracy every epoch.
                     NOTE: Should be disabled except when calibrating, as it greatly
                     slows down training.
        lmbda: Regularization parameter for l2 regularization

        :return: None
        '''

        #Initialize accuracies list
        if store_accuracies:
            self.validation_accuracies = []
            if calibration:
                self.train_accuracies = []
                self.test_accuracies = []


        #Compute number of minibatches for training, validation and testing
        num_training_batches = int(size(training_data)/mini_batch_size)
        num_validation_batches = int(size(validation_data)/mini_batch_size)
        if test_data:
            num_test_batches = int(size(test_data)/mini_batch_size)


        # define the (regularized) cost function
        l2_norm_squared = sum(tf.nn.l2_loss(layer.w) for layer in self.layers)
        cost = self.layers[-1].cost(self)+\
               lmbda*l2_norm_squared/tf.cast(num_training_batches, dtype=tf.float32)

        #Create Global step
        global_step = tf.Variable(0, trainable=False, name='global_step')
        #global_step = tf.train.global_step(sess, global_step_tensor)
    
        #SGDR
        if SGDR:
            first_decay_steps = num_training_batches
            eta = tf.train.cosine_decay_restarts(eta, global_step, first_decay_steps)
        
        # Define optimizer
        with tf.name_scope('optimizer'):
            #train_step = tf.train.GradientDescentOptimizer(lr_decayed).minimize(cost)
            train_step = tf.train.AdamOptimizer(eta).minimize(cost)

        # Define minibatch accuracy operation
        ## Used to get train, validate and test accuracies in session
        mb_accuracy = self.layers[-1].accuracy(self.y)


        # Add ops to save and restore all the variables.
        saver = tf.train.Saver()

        #Loading of data
        training_x, training_y = training_data
        validation_x, validation_y = validation_data
        if test_data:
            test_x, test_y = test_data

        #~~~~~Do the actual training
        with tf.Session() as sess:

            #Initialize best accuracies
            best_validation_accuracy = 0
            best_iteration = 0
            test_accuracy = 0

            if save_dir:
                try:
                    print("\n\nSearching for stored model")
                    saver.restore(sess, save_dir)
                    print("Model restored.\n\n")
                except Exception:
                    print("No model of specified name found")
                    print("Initializing new model...\n\n")
                    sess.run(tf.global_variables_initializer())
            else:
                print("\n\nInitializing new model...\n\n")
                sess.run(tf.global_variables_initializer())

            start_time = time.time() #Track time taken for model

            try:
                for epoch in range(epochs):
                    if shuffle:
                        training_x, training_y = shuffle_(training_x, training_y)
                    for minibatch_index in range(num_training_batches):
                        iteration = num_training_batches*epoch+minibatch_index
                        if iteration % 1000 == 0:
                            print("Training mini-batch number {0}".format(iteration))

                        #Training of the model
                        train_step.run(feed_dict={self.x:
                                                      training_x[minibatch_index*self.mini_batch_size: (minibatch_index+1)*self.mini_batch_size],
                                                  self.y:
                                                      training_y[minibatch_index*self.mini_batch_size: (minibatch_index+1)*self.mini_batch_size]})

                        # Calculate and storing of Accuracies
                        if (iteration+1) % num_training_batches == 0:
                            if calibration:
                                # Evaluate over all training mini-batches (index j), not just the current one.
                                train_accuracy = np.mean(
                                    [mb_accuracy.eval(feed_dict={self.x:
                                                          training_x[j*self.mini_batch_size: (j+1)*self.mini_batch_size],
                                                      self.y:
                                                          training_y[j*self.mini_batch_size: (j+1)*self.mini_batch_size]}
                                                          ) for j in range(num_training_batches)])
                                print("Epoch {0}: train accuracy {1:.2%}".format(
                                    epoch, train_accuracy))
                                if store_accuracies:
                                    self.train_accuracies.append(train_accuracy)


                            validation_accuracy = np.mean(
                                [mb_accuracy.eval(feed_dict = {self.x:
                                                                   validation_x[j*self.mini_batch_size: (j+1)*self.mini_batch_size],
                                                               self.y:
                                                                   validation_y[j*self.mini_batch_size: (j+1)*self.mini_batch_size]
                                                               }) for j in range(num_validation_batches)])
                            print("Epoch {0}: validation accuracy {1:.2%}".format(
                                epoch, validation_accuracy))
                            if store_accuracies:
                                self.validation_accuracies.append(validation_accuracy)

                            if calibration:
                                if test_data:
                                    test_accuracy = np.mean(
                                        [mb_accuracy.eval(feed_dict = {self.x:
                                                                                test_x[j*self.mini_batch_size: (j+1)*self.mini_batch_size],
                                                                            self.y:
                                                                                test_y[j*self.mini_batch_size: (j+1)*self.mini_batch_size]
                                                                            }) for j in range(num_test_batches)])
                                    print('The corresponding test accuracy is {0:.2%}'.format(
                                        test_accuracy))
                                    if store_accuracies:
                                        self.test_accuracies.append(test_accuracy)



                            if validation_accuracy >= best_validation_accuracy:
                                print("This is the best validation accuracy to date.")
                                best_validation_accuracy = validation_accuracy
                                best_iteration = iteration
                                if test_data:
                                    test_accuracy = np.mean(
                                        [mb_accuracy.eval(feed_dict = {self.x:
                                                                                test_x[j*self.mini_batch_size: (j+1)*self.mini_batch_size],
                                                                            self.y:
                                                                                test_y[j*self.mini_batch_size: (j+1)*self.mini_batch_size]
                                                                            }) for j in range(num_test_batches)])
                                    print('The corresponding test accuracy is {0:.2%}'.format(
                                        test_accuracy))

                                #Saving best weights and biases
                                #save_path = saver.save(sess, "/tmp/best.ckpt")
                                #print("Best variables saved in specified file dir: %s" % save_path)
                                if best_validation_accuracy == 1:
                                    raise GetOutOfLoop

            except GetOutOfLoop:
                print("100% accuracy achieved. Stopping training...\n\n")

            end_time = time.time()
            total_time = end_time - start_time


            print("Finished training network.")
            print("Time to train network: {}s".format(total_time))
            print("Number of examples trained per sec: {}".format(size(training_data)*epochs/total_time))

            print("Best validation accuracy of {0:.2%} obtained at iteration {1}".format(
                best_validation_accuracy, str(best_iteration)))
            if test_data:
                print("Corresponding test accuracy of {0:.2%}".format(test_accuracy))

            if save_dir:
                save_path = saver.save(sess, save_dir)
                print("Model saved in specified file dir: %s" % save_path)

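
A hypothetical invocation sketch (assumptions: `net` is an instance of the enclosing network class, constructed elsewhere, and each data argument is a (features, labels) pair of numpy arrays):

net.train(training_data=(train_x, train_y),
          epochs=30,
          mini_batch_size=10,
          eta=0.001,
          validation_data=(val_x, val_y),
          test_data=(test_x, test_y),
          shuffle=True,
          SGDR=True,
          save_dir='/tmp/model.ckpt')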