Exemplo n.º 1
0
def train_classifier(X, y):
    """
    Builds a DBN, pretrains it on the given data and fine-tunes
    the MLP derived from it, using the best known parameters.

    :param X: Samples, a numpy array of
        (N, n_vis) shape where N is the number of
        samples and n_vis the number of visible
        variables (sample dimensionality).

    :param y: Labels, a numpy array of
        (N, 1) shape. Each label should be
        a label index.

    :return: The fine-tuned MLP obtained from the pretrained DBN.
    """

    #   network layout and hyper-parameters
    layer_sizes = [32 * 24, 600, 600]
    #   two parameter lists are passed to pretraining; presumably
    #   one per RBM layer -- verify against DBN.pretrain
    layer_pretrain_params = [
        [80, 0.05, True, 1, 0.085, 0.1],
        [80, 0.05, True, 1, 0.000, 0.0],
    ]

    #   minibatch size is 20 times the class count
    batch_size = __CLASS_COUNT * 20
    samples_mnb, labels_mnb = util.create_minibatches(X, y, batch_size)

    #   unsupervised stage: create the DBN and pretrain it
    net = DBN(layer_sizes, __CLASS_COUNT)
    net.pretrain(samples_mnb, labels_mnb, layer_pretrain_params)

    #   supervised stage: convert to an MLP and fine-tune it
    classifier = net.to_mlp()
    classifier.train(samples_mnb, labels_mnb, 1000, 0.1)

    return classifier
Exemplo n.º 2
0
def main():
    """
    Smoke-tests the LogisticRegression class on synthetic data.

    Generates normally distributed samples around one center per
    class, splits them randomly into train / test sets (about 85% of
    the samples go to train), then trains the estimator for 10 single
    epoch rounds, logging the test set accuracy after each round.
    """
    logging.basicConfig(level=logging.INFO)
    log.info("Testing logistic regression class")

    #   generate some data
    #   centers, per class, per dimension
    centers = [[1, 1, 1], [1, -1, 1], [-1, 1, -1]]
    cls_count = len(centers)
    n_dim = len(centers[0])

    #   spread, per class, per dimension. np.random.normal takes
    #   the standard deviation (scale) as its second argument
    scales = [[1, 1, 1], [1, 1, 1], [1, 1, 1]]
    assert (len(scales) == cls_count)

    N_per_class = 2500
    N = N_per_class * cls_count
    log.info("Generating data, %d classes, %d samples per class",
             cls_count, N_per_class)
    X = np.zeros((N, n_dim))
    y = np.zeros(N, dtype=np.int32)
    for i in range(N):
        #   floor division: the class index must stay an int so it
        #   can index the per-class lists (true division gives a float)
        cls = i // N_per_class
        y[i] = cls
        #   draw all dimensions of the sample at once
        X[i] = np.random.normal(centers[cls], scales[cls])

    log.info("Splitting into train and test sets")
    train_mask = np.random.rand(N) < 0.85
    test_mask = np.logical_not(train_mask)
    X_train = X[train_mask]
    y_train = y[train_mask]
    #   the test set does not change, so slice it only once
    X_test = X[test_mask]
    y_test = y[test_mask]
    log.info("%d samples in train set", len(X_train))

    log.info("Creating minibatches")
    X_mnb, y_mnb = util.create_minibatches(X_train, y_train, cls_count * 10)

    log.info("Fitting")
    estimator = LogisticRegression(T.matrix("input"), n_dim, cls_count)
    log.info("Init acc: %.2f", util.acc(y_test, estimator.predict(X_test)))
    for _ in range(10):
        estimator.train(X_mnb, y_mnb, 1, 0.1)
        #   validate
        acc = util.acc(y_test, estimator.predict(X_test))
        log.info("Current acc: %.2f", acc)
Exemplo n.º 3
0
    def evaluate(self, x, mnb_size):
        """
        Computes the mean model cost over the given samples.

        The compiled cost function is created on first use and
        kept on the instance as '_evaluate' for subsequent calls.

        :param x: Samples, a numpy array of shape (N, model_input).
        :param mnb_size: Minibatch size. Samples are evaluated in
            minibatches because evaluating all the samples in 'x'
            at once might be too memory demanding.
        :return: Mean cost of samples in 'x'.
        """

        #   compile the cost function lazily, only once per instance
        if getattr(self, "_evaluate", None) is None:
            self._evaluate = theano.function([self.input], self.cost)
        cost_fn = self._evaluate

        #   weight each minibatch cost with its sample count, to
        #   take into account possibly unbalanced minibatch sizes
        weighted_costs = []
        for batch in util.create_minibatches(x, None, mnb_size, False):
            weighted_costs.append(cost_fn(batch) * batch.shape[0])

        return np.sum(weighted_costs) / x.shape[0]
Exemplo n.º 4
0
def test_dbn():
    """
    Builds a DBN and trains it on the minibatched data returned
    by get_data().
    """

    log.info('Testing DBN')

    #   trainset loading
    cls_count = 9
    #   NOTE(review): data is requested with cls_count=None while the
    #   model and the minibatch size use 9 classes -- confirm that None
    #   yields data for those same 9 classes
    X, y, _classes = get_data(cls_count=None)

    batch_size = 20 * cls_count
    X_mnb, y_mnb = util.create_minibatches(X, y, batch_size)

    # lin_eps = util.lin_reducer(0.05, 0.002, 20)

    #   two training parameter sets; presumably one per RBM in the
    #   DBN -- verify against DBN.train
    train_params = [
        dict(epochs=50, eps=0.05, spars=0.05, spars_cost=0.3),
        dict(epochs=1, eps=0.05),
    ]
    layer_sizes = [32 * 24, 588, 588]

    dbn = DBN(layer_sizes, cls_count)
    dbn.train(X_mnb, y_mnb, train_params)
Exemplo n.º 5
0
def test_rbm():
    """
    Creates an RBM, trains it for a few epochs on unlabeled
    minibatches and displays it both before and after training.
    """

    log.info('Testing RBM')

    #   32 * 24 visible units, 100 hidden units
    img_width, img_height = 32, 24
    rbm = RBM(img_width * img_height, 100)

    #   display the RBM before any training
    analysis.display_RBM(rbm, img_width, img_height)

    #   trainset loading, labels are not needed here
    cls_count = 9
    X, _y, _classes = get_data(cls_count=cls_count)
    X_mnb = util.create_minibatches(X, None, 20 * cls_count)

    #   train the RBM for a while! the results are not used, but
    #   train() is still expected to return three values
    _cost, _time, _hid_act = rbm.train(
        X_mnb, epochs=5, eps=0.05, spars=0.05, spars_cost=6.0)

    #   display the RBM after training
    analysis.display_RBM(rbm, img_width, img_height)
Exemplo n.º 6
0
def get_data():
    """
    Returns the data for the workflow: a tuple of two
    dicts (data_train, data_test). data_train maps class
    counts (integers indicating how many classes are used)
    to a tuple of form (X_mnb, y_mnb) where X_mnb are data samples
    split into minibatches and y_mnb are corresponding labels.
    data_test maps class counts to a tuple of form (X, y) where
    X are data samples and y are corresponding labels, NOT
    split into minibatches.

    The data is lazily initialized into the global __data variable,
    so the random train / test split is drawn only once.

    :return: The (data_train, data_test) tuple described above.
    """

    global __data

    #   already initialized, reuse the cached split
    if __data is not None:
        return __data

    X, y, classes = raw_data
    log.info('Read %d samples', len(y))

    def data_subset(cls_count):
        """
        Returns (X_subs, y_subs): the samples and labels whose class
        is among the first 'cls_count' classes of the fixed class order.
        """

        cls = ['A', 'B', 'C', 'D', 'E', 'F', 'X', '_BLANK', '_UNKNOWN']
        cls_subs = cls[:cls_count]
        log.info('Taking a subset of data containing classes %r', cls_subs)

        #   labels in y are indices into 'classes'
        bool_mask = np.array([(classes[ind] in cls_subs) for ind in y])
        X_subs = X[bool_mask]
        y_subs = y[bool_mask]
        #   report the size of the subset, not of the whole dataset
        log.info('Subset has %d elements', len(X_subs))

        return X_subs, y_subs

    #   splitting the trainset into train / test
    #   the builtin bool is used because the np.bool alias
    #   was removed from NumPy
    test_size = 0.1
    test_indices = np.random.binomial(1, test_size, len(X)).astype(bool)
    train_indices = np.logical_not(test_indices)

    #   create dicts of data subsets. each dict has form:
    #   {class_count: (X, y)}
    #   note that X and y are for 'train' data split into
    #   minibatches, but for 'test' they are not
    data_train = {}
    data_test = {}
    for cls_cnt in [1, 3, 7, 9]:

        #   get data subset
        X_subs, y_subs = data_subset(cls_cnt)
        N = len(X_subs)

        #   split data subset into train and test. the first N
        #   entries of the full-length random masks are reused as
        #   the masks for this (shorter) subset
        subs_train_mask = train_indices[:N]
        subs_test_mask = test_indices[:N]

        data_train[cls_cnt] = util.create_minibatches(
            X_subs[subs_train_mask], y_subs[subs_train_mask], 20 * cls_cnt)
        data_test[cls_cnt] = (X_subs[subs_test_mask], y_subs[subs_test_mask])

    __data = (data_train, data_test)

    return __data