Example #1
    def on_dataset_change(args):
        with dispatcher.get_lock('bootenvs'):
            if args['operation'] == 'create':
                bootenvs.propagate(args, convert_bootenv)

            if args['operation'] == 'delete':
                logger.warn(args)
                for i in args['ids']:
                    pool, dataset = split_dataset(i)
                    if pool != boot_pool_name:
                        continue

                    realname = dataset.split('/')[-1]
                    ds = bootenvs.query(('realname', '=', realname), single=True)
                    if ds:
                        bootenvs.remove(ds['id'])

            if args['operation'] == 'update':
                for i in args['entities']:
                    pool, dataset = split_dataset(i['id'])
                    if pool != boot_pool_name:
                        continue

                    realname = dataset.split('/')[-1]
                    ds = bootenvs.query(('realname', '=', realname), single=True)
                    if not ds:
                        continue

                    nickname = i.get('properties.beadm:nickname.value', realname)
                    if nickname and nickname != ds['id']:
                        bootenvs.rename(ds['id'], nickname)

                    bootenvs.put(nickname, convert_bootenv(i))
Example #2
def make_dataset(name):
    inputs, psi, reference = load('{}.data'.format(name))
    # train/test split
    trainset, testset = utils.split_dataset(
        utils.shuffle((inputs, psi)),  # shuffle dataset
        ratio=0.7)
    # test/valid split
    testset, validset = utils.split_dataset(testset, ratio=0.5)

    return (trainset, testset, validset)
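
Example #2 relies on `utils.shuffle` and a ratio-based `utils.split_dataset` whose implementations are not part of this listing. A minimal sketch of what such helpers could look like, assuming they operate on tuples of parallel NumPy arrays (names and signatures are assumptions, not the original `utils` module):

import numpy as np

def shuffle(arrays, seed=None):
    # Apply one shared random permutation to a tuple of parallel arrays (assumed helper).
    rng = np.random.default_rng(seed)
    idx = rng.permutation(len(arrays[0]))
    return tuple(np.asarray(a)[idx] for a in arrays)

def split_dataset(arrays, ratio=0.7):
    # Cut each parallel array into a head part of size `ratio` and the remaining tail (assumed helper).
    n = int(len(arrays[0]) * ratio)
    return tuple(a[:n] for a in arrays), tuple(a[n:] for a in arrays)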
Example #3
def transfer(source_model, target_path, frozen_layer):
    # load data of branch
    df = pd.read_pickle(target_path)
    # create dataframe with netto sales, month, weekday, year
    df = pd.DataFrame(data=df.values, index=df.index, columns=['netto'])
    df = df.assign(month=df.index.month)
    df = df.assign(weekday=df.index.weekday)
    df = df.assign(year=df.index.year)
    # split into train and test
    train, test = split_dataset(df.values, 365)
    # prepare input data for branch
    n_input = 365
    train_x, train_y = to_supervised(train, n_input, 365)

    # load pre-trained model of source branch as base model
    base_model = load_model(source_model)
    # freeze specific layers of base model
    for layer in base_model.layers[:frozen_layer]:
        layer.trainable = False
    print("frozen layers: " + str(frozen_layer))

    # compile the model
    base_model.compile(loss='mse', optimizer='adam')

    # fit base_model with new data from branch
    n_timesteps, n_features, n_outputs = train_x.shape[1], train_x.shape[2], train_y.shape[1]
    input_data = [train_x[:, :, i].reshape((train_x.shape[0], n_timesteps, 1)) for i in range(n_features)]
    base_model.fit(input_data, train_y, epochs=20, batch_size=16, verbose=0)
    # evaluate fitted model
    mape = evaluate_model(train, test, base_model)
    return mape
Example #4
def get_feats_from_csv_in_partitions():
    """
    Extract the original features that are distributed in the dataset. Features
    are split according to the config.yaml file.
    """
    conf = utils.get_config()
    rows = [
        row for row in utils.load_csv()
        if utils.check_filter(row, conf['filters'])
    ]
    train_rows, valid_rows, test_rows = utils.split_dataset(
        rows,
        conf['valid_percent'],
        conf['test_percent'],
        rng=conf['rng_seed'])
    X_train, y_train, X_valid, y_valid, X_test, y_test = [], [], [], [], [], []
    prefixes = ['t_', 'i_', 's_']  # Feature names' prefixes
    datasets = [(X_train, y_train, train_rows), (X_test, y_test, test_rows),
                (X_valid, y_valid, valid_rows)]
    out = []
    for X, y, rows in datasets:
        for row in rows:
            X.append([
                float(v) for k, v in row.iteritems()
                if len(filter(k.startswith, prefixes)) > 0
            ])
            y.append(int(row['classification'] == 'Malign'))
        out.extend((np.asarray(X), np.asarray(y)))
    return out
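
The three-way `utils.split_dataset(rows, valid_percent, test_percent, rng=...)` used above is not shown in this listing. A minimal sketch of one plausible implementation, assuming the percentages are fractions in [0, 1] and `rng` is a seed:

import random

def split_dataset(rows, valid_percent, test_percent, rng=None):
    # Shuffle the rows with a seeded RNG, then carve off validation and test slices (assumed behaviour).
    rows = list(rows)
    random.Random(rng).shuffle(rows)
    n_valid = int(len(rows) * valid_percent)
    n_test = int(len(rows) * test_percent)
    valid_rows = rows[:n_valid]
    test_rows = rows[n_valid:n_valid + n_test]
    train_rows = rows[n_valid + n_test:]
    return train_rows, valid_rows, test_rows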
Example #5
def get_feats_from_imagenet_in_partitions():
    conf = utils.get_config()
    imagenet_data = os.path.join(conf['models_path'], 'decafnet',
                                 'imagenet.decafnet.epoch90')
    imagenet_meta = os.path.join(conf['models_path'], 'decafnet',
                                 'imagenet.decafnet.meta')
    net = DecafNet(imagenet_data, imagenet_meta)
    rows = utils.get_filtered_rows()
    sets = utils.split_dataset(rows,
                               conf['valid_percent'],
                               conf['test_percent'],
                               rng=conf['rng_seed'])
    feats = []
    ys = []
    for s in sets:
        X = np.zeros((len(s), 4096))
        y = np.zeros(len(s))
        for i, row in enumerate(s):
            try:
                log.info('processing %i-th of %i' % (i, len(s)))
                origin, im = utils.extract_roi(row, 30, True)
                scores = net.classify(np.asarray(im), center_only=True)
                X[i] = net.feature('fc7_cudanet_out')
                y[i] = utils.is_positive(row)
            except:
                continue
        feats.append(X)
        ys.append(y)

    return feats[0], ys[0], feats[1], ys[1], feats[2], ys[2]
Example #6
def get_feats_in_partitions():
    """
    Extracts features from the whole dataset and splits them into train,
    validation and test sets.
    """
    conf = utils.get_config()
    paths = utils.get_paths()
    rows = utils.load_csv()
    filters = conf['filters']
    region_size = conf['region_size']
    region_stride = conf['region_stride']

    filtered_rows = [
        row for row in rows if utils.check_filter(row, conf['filters'])
    ]
    train_rows, valid_rows, test_rows = utils.split_dataset(
        filtered_rows,
        conf['valid_percent'],
        conf['test_percent'],
        rng=conf['rng_seed'])

    conv = get_fprop_fn(False)
    print 'Getting features from train...'
    X_train = get_feats_from_rows(train_rows, conv, conf['stride'])
    print 'Getting features from valid...'
    X_valid = get_feats_from_rows(valid_rows, conv, conf['stride'])
    print 'Getting features from test...'
    X_test = get_feats_from_rows(test_rows, conv, conf['stride'])
    y_train = [row['classification'] == 'Malign' for row in train_rows]
    y_valid = [row['classification'] == 'Malign' for row in valid_rows]
    y_test = [row['classification'] == 'Malign' for row in test_rows]
    return X_train, y_train, X_valid, y_valid, X_test, y_test
Example #7
 def initialize_train_test(self, train_ratio=0.75, sample=True, full=False):
     '''
     This function initializes the training and the test sets.
     The training set is initialized to a fraction of the original dataset given by
     the input parameter train_ratio, while the test set will be initialized to the
     remaining part of the original dataset. Note that the training and the test
     sets are copies of the original dataset, so any changes made to those datasets
     will not affect the original one. The default value for the parameter train_ratio
     is 0.75.
     If the input parameter sample is True, then the training set is built by sampling
     rows from the original dataset. Otherwise, it is built from the first part of
     the dataset. The default value for sample is True.
     If the input parameter full is True, then the values of the other parameters are
     ignored and both the training and the test sets are initialized to be exact
     copies of the full original one.
     '''
     if full:
         self.train = self.dataset.copy()
         self.test = self.dataset.copy()
     else:
         if sample:
             self.train, self.test = utils.split_dataset_sample(
                 self.dataset, train_ratio)
         else:
             self.train, self.test = utils.split_dataset(
                 self.dataset, train_ratio)
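
Neither helper referenced in Example #7 appears in this listing. A minimal pandas sketch of the assumed behaviours: `split_dataset` takes the first part of the frame, while `split_dataset_sample` samples rows at random:

import pandas as pd

def split_dataset(dataset, train_ratio=0.75):
    # Deterministic split: the first train_ratio of the rows become train, the rest test (assumed behaviour).
    n_train = int(len(dataset) * train_ratio)
    return dataset.iloc[:n_train].copy(), dataset.iloc[n_train:].copy()

def split_dataset_sample(dataset, train_ratio=0.75, random_state=None):
    # Random split: sample train_ratio of the rows for train and keep the remainder as test (assumed behaviour).
    train = dataset.sample(frac=train_ratio, random_state=random_state)
    test = dataset.drop(train.index).copy()
    return train, test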
Example #8
def get_feats_from_imagenet_in_partitions():
    conf = utils.get_config()
    imagenet_data = os.path.join(
        conf['models_path'], 'decafnet', 'imagenet.decafnet.epoch90')
    imagenet_meta = os.path.join(
        conf['models_path'], 'decafnet', 'imagenet.decafnet.meta')
    net = DecafNet(imagenet_data, imagenet_meta)
    rows = utils.get_filtered_rows()
    sets = utils.split_dataset(
        rows, conf['valid_percent'], conf['test_percent'], rng=conf['rng_seed'])
    feats = []
    ys = []
    for s in sets:
        X = np.zeros((len(s), 4096))
        y = np.zeros(len(s))
        for i, row in enumerate(s):
            try:
                log.info('processing %i-th of %i' % (i, len(s)))
                origin, im = utils.extract_roi(row, 30, True)
                scores = net.classify(np.asarray(im), center_only=True)
                X[i] = net.feature('fc7_cudanet_out')
                y[i] = utils.is_positive(row)
            except:
                continue
        feats.append(X)
        ys.append(y)

    return feats[0], ys[0], feats[1], ys[1], feats[2], ys[2]
Example #9
def get_feats_in_partitions():
    """
    Extracts features from the whole dataset and splits them into train,
    validation and test sets.
    """
    conf = utils.get_config()
    paths = utils.get_paths()
    rows = utils.load_csv()
    filters = conf['filters']
    region_size = conf['region_size']
    region_stride = conf['region_stride']

    filtered_rows = [
        row for row in rows if utils.check_filter(row, conf['filters'])]
    train_rows, valid_rows, test_rows = utils.split_dataset(
        filtered_rows, conf['valid_percent'], conf['test_percent'], rng=conf['rng_seed'])

    conv = get_fprop_fn(False)
    print 'Getting features from train...'
    X_train = get_feats_from_rows(
        train_rows, conv, conf['stride'])
    print 'Getting features from valid...'
    X_valid = get_feats_from_rows(
        valid_rows, conv, conf['stride'])
    print 'Getting features from test...'
    X_test = get_feats_from_rows(
        test_rows, conv, conf['stride'])
    y_train = [row['classification'] == 'Malign' for row in train_rows]
    y_valid = [row['classification'] == 'Malign' for row in valid_rows]
    y_test = [row['classification'] == 'Malign' for row in test_rows]
    return X_train, y_train, X_valid, y_valid, X_test, y_test
Example #10
def xgb_boost_model():
    df_all = pickle.load(open("../output/features/basic_features.pkl", 'r'))
    test_ind = df_all.relevance == -1
    test_data = df_all[test_ind]
    train_data = df_all[~test_ind]
    test_data = test_data.drop(['relevance'], axis=1)
    le = preprocessing.LabelEncoder()
    le.fit(train_data['relevance'])

    ids = test_data['id']

    train, test, hold_out = utils.split_dataset(train_data)

    relevant_columns = ['title_similarity', 'product_desc_similarity', 'title_similarity_common', 'product_desc_similarity_common', 'description_length', 'search_length']
    dTrain = xgb.DMatrix(train['X'][relevant_columns], label=train['Y'])
    dTest = xgb.DMatrix(test['X'][relevant_columns], label=test['Y'])
    dHold_out = xgb.DMatrix(hold_out['X'][relevant_columns], label=hold_out['Y'])
    dSubmit = xgb.DMatrix(test_data[relevant_columns])

    param = {'bst:max_depth':5  , 'bst:eta':0.05, 'silent':1, 'objective':'reg:linear', 'eval_metric':'rmse'}

    evallist = [(dTest, 'eval'), (dTrain, 'train')]
    numRound = 200
    bst = xgb.train(param, dTrain, numRound, evallist)

    predHoldout = bst.predict(dHold_out)
    print "Mean square hold out error ", utils.rmse(hold_out['Y'], predHoldout)

    predY = bst.predict(dSubmit)
    utils.debug_model(hold_out['X'], hold_out['Y'], predY)
Example #11
def run_network(window, model=None, save_model=False, show_plot=False):
    start_time = time.time()

    print('loading and preparing data set...')
    data = read_dataset('../datasets/internet-traffic-data-5minutes.csv')
    X_train, y_train, X_test, y_test, mean, std = split_dataset(
        data, window, ratio=0.90, standardize=True)

    print('number of training samples ', len(y_train))
    print('number of test samples     ', len(y_test))

    if not model:
        print('initialize model...')
        model = compile_model(
            hidden_neurons=25, loss_fn='mse',
            input_dim=sum(1 for x in window if x), activation_fn='tanh')

        print('model ', model.summary())

        print('train model...')
        early_stopping = EarlyStopping(monitor='val_loss', patience=2)
        model.fit(X_train, y_train, nb_epoch=500, validation_split=0.1,
                  callbacks=[early_stopping])

    print('make predictions...')
    prediction = model.predict(X_test).flatten()

    if show_plot:
        plot_result(prediction, y_test, mean, std)
        print('mase = ', mase(y_train, y_test, prediction))

    if save_model:
        store_model(model)

    print('total duration: {:.2f} seconds'.format(time.time() - start_time))
Example #12
def main():
    model = create_model()
    model.summary()

    # Building Phase
    data = import_data("./dataset/crx_clean.data.txt")
    X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)

    # Expand data dimension for kernel to convolve over
    X_train = np.expand_dims(X_train, axis=2)  # (None, 46, 1)
    X_test = np.expand_dims(X_test, axis=2)  # (None, 46, 1)

    # create model
    model = KerasClassifier(build_fn=create_model, verbose=0)

    # Operational Phase
    scorer = make_scorer(f1_score, pos_label='+')
    print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
    Y_pred_grid_search = grid_search_cv_CNN(model, X_train, Y_train, X_test,
                                            Y_test, scorer)
    Y_pred_grid_search = np.squeeze(Y_pred_grid_search)
    print()
    print()
    print(Y_pred_grid_search)
    print()
    print(Y_test)
    print()
    print_scores(Y_test, Y_pred_grid_search)
Example #13
    def fit(self, X, y):
        """fit tree in X, y"""
        try:
            # only for numpy arrays for now
            if not isinstance(X, np.ndarray):
                X = np.array(X)
            if not isinstance(y, np.ndarray):
                y = np.array(y)

            if self.max_depth is None:
                # The absolute maximum depth would be N−1, where N is the number of training samples.
                # https://stats.stackexchange.com/questions/65893/maximal-depth-of-a-decision-tree
                self.max_depth = X.shape[0] - 1
            if self.n_clasess is None:
                self.n_clasess = len(set(y))

            assert (X.shape[0] > self.min_samples_split)
            if not (self.max_depth is None):
                assert (self.max_depth > 0)

            gain, column_idx, threshold = self._find_best_split(X, y)
            X_left, X_right, y_left, y_right = split_dataset(X,
                                                             y,
                                                             column=column_idx,
                                                             t=threshold)

            self.node = Node(feature_idx=column_idx,
                             threshold=threshold,
                             labels=y,
                             gain=gain)

            # build left and right child for max
            self.node.left = DecisionTree(criterion=self.criterion,
                                          debug=self.debug,
                                          max_depth=self.max_depth - 1)
            # if base class for random forest -> remove that attribute
            self.node.left.x_columns = self.x_columns
            self.node.left.n_clasess = self.n_clasess
            self.node.left.fit(X_left, y_left)

            self.node.right = DecisionTree(criterion=self.criterion,
                                           debug=self.debug,
                                           max_depth=self.max_depth - 1)
            # if base class for random forest -> remove that attribute
            self.node.right.x_columns = self.x_columns
            self.node.right.n_clasess = self.n_clasess
            self.node.right.fit(X_right, y_right)
        # not the best idea, it is impossible to check for other conditions with assert :(
        except AssertionError:
            self.node = Node()
            self._predict_from_leaf(y)
            # test info about predictions
            if self.debug:
                print("Is Last Node: ", self.node.is_last)
                print("Data shapes: ", X.shape, y.shape)
                print("Y: ", y)
                print("Prediction: ", self.node.node_prediction)
                print("Predict proba: ", self.node.node_prob_prediction)
            return self
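
The `split_dataset(X, y, column=..., t=...)` call in Example #13 partitions the samples on a single feature threshold. A minimal NumPy sketch of such a helper, assuming the keyword names used in the call and a `<=` comparison for the left branch:

import numpy as np

def split_dataset(X, y, column, t):
    # Send samples whose feature `column` is <= t to the left child, the rest to the right (assumed helper).
    mask = X[:, column] <= t
    return X[mask], X[~mask], y[mask], y[~mask]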
Example #14
    def on_dataset_change(args):
        if args['operation'] == 'create':
            with dispatcher.get_lock('bootenvs'):
                boot_pool = dispatcher.call_sync('zfs.pool.get_boot_pool')
                bootenvs.propagate(args,
                                   lambda x: convert_bootenv(boot_pool, x))

        if args['operation'] == 'delete':
            for i in args['ids']:
                pool, dataset = split_dataset(i)
                if pool != boot_pool_name:
                    continue

                with dispatcher.get_lock('bootenvs'):
                    realname = dataset.split('/')[-1]
                    ds = bootenvs.query(('realname', '=', realname),
                                        single=True)
                    if ds:
                        bootenvs.remove(ds['id'])

        if args['operation'] == 'update':
            boot_pool = None
            for i in args['entities']:
                pool, dataset = split_dataset(i['id'])
                if pool != boot_pool_name:
                    continue

                with dispatcher.get_lock('bootenvs'):
                    realname = dataset.split('/')[-1]
                    ds = bootenvs.query(('realname', '=', realname),
                                        single=True)
                    if not ds:
                        continue

                    nickname = q.get(i, 'properties.beadm:nickname.value',
                                     realname)
                    if nickname and nickname != ds['id']:
                        bootenvs.rename(ds['id'], nickname)

                    if not boot_pool:
                        boot_pool = dispatcher.call_sync(
                            'zfs.pool.get_boot_pool')

                    bootenvs.put(nickname, convert_bootenv(boot_pool, i))
Example #15
 def test_split_dataset(self):
     filename = '/path/to/dataset.tsv'
     train_filename = '/path/to/dataset_train.tsv'
     dev_filename = '/path/to/dataset_dev.tsv'
     read_data = 'Rock n Roll is a risk. You rick being ridiculed.\tDo you like rock music?\n' \
         'Rock n Roll is a risk.\tDo you like rock?'
     open_ = patch('utils.open', mock_open(read_data=read_data)).start()
     open_.return_value.__iter__.return_value = read_data.split('\n')
     writer = patch('csv.writer').start()
     split_dataset(filename)
     open_.assert_has_calls([
         call(filename),
         call(train_filename, 'w'),
         call(dev_filename, 'w')
     ],
                            any_order=True)
     writer.assert_called_with(open_.return_value, delimiter='\t')
     writer.return_value.writerow.assert_called()
     patch.stopall()
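
The test in Example #15 only pins down the calls a TSV-splitting `split_dataset(filename)` has to make: open the source file, open `<name>_train.tsv` and `<name>_dev.tsv` for writing, and write rows through `csv.writer(..., delimiter='\t')`. A minimal sketch consistent with those expectations (the split ratio and row handling are assumptions):

import csv
import os
import random

def split_dataset(filename, train_fraction=0.9):
    # Read a TSV file and distribute its rows into <name>_train.tsv and <name>_dev.tsv (sketch).
    base, ext = os.path.splitext(filename)
    with open(filename) as source, \
            open(base + '_train' + ext, 'w') as train_file, \
            open(base + '_dev' + ext, 'w') as dev_file:
        train_writer = csv.writer(train_file, delimiter='\t')
        dev_writer = csv.writer(dev_file, delimiter='\t')
        for line in source:
            row = line.rstrip('\n').split('\t')
            writer = train_writer if random.random() < train_fraction else dev_writer
            writer.writerow(row)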
Example #16
    def on_dataset_change(args):
        if args['operation'] == 'create':
            with dispatcher.get_lock('bootenvs'):
                boot_pool = dispatcher.call_sync('zfs.pool.get_boot_pool')
                bootenvs.propagate(args, lambda x: convert_bootenv(boot_pool, x))

        if args['operation'] == 'delete':
            for i in args['ids']:
                pool, dataset = split_dataset(i)
                if pool != boot_pool_name:
                    continue

                with dispatcher.get_lock('bootenvs'):
                    realname = dataset.split('/')[-1]
                    ds = bootenvs.query(('realname', '=', realname), single=True)
                    if ds:
                        bootenvs.remove(ds['id'])

        if args['operation'] == 'update':
            boot_pool = None
            for i in args['entities']:
                pool, dataset = split_dataset(i['id'])
                if pool != boot_pool_name:
                    continue

                with dispatcher.get_lock('bootenvs'):
                    realname = dataset.split('/')[-1]
                    ds = bootenvs.query(('realname', '=', realname), single=True)
                    if not ds:
                        continue

                    nickname = q.get(i, 'properties.beadm:nickname.value', realname)
                    if nickname and nickname != ds['id']:
                        bootenvs.rename(ds['id'], nickname)

                    if not boot_pool:
                        boot_pool = dispatcher.call_sync('zfs.pool.get_boot_pool')

                    bootenvs.put(nickname, convert_bootenv(boot_pool, i))
Example #17
def do_experiment_for_one_year(run_path, year, config):
    """Performs the specified experiments for one year."""
    X, Y = load_dataset(year, shuffle=config['experiment']['shuffle_data'])
    if config['experiment']['type'] == 'single':
        X_train, Y_train, X_test, Y_test = split_dataset(X, Y, config['experiment']['test_share'])
        results = perform_one_experiment(X_train, Y_train, X_test, Y_test, config)
    elif config['experiment']['type'] == 'cv':
        results = perform_cv_runs(X, Y, config)

    results_path = os.path.join(run_path, 'results_year{}.pkl'.format(year))
    with open(results_path, 'wb') as f:
        pickle.dump(results, f)
    show_results(results, year, **config['analysis'])
Example #18
    def fit(self, samples, labels):
        """Train the model with the samples and labels provided, according to
        the parameters of the model."""

        # Split into train and dev
        x_train, y_train, x_dev, y_dev = split_dataset(samples, labels,
                                                       self.dev_share)

        # Create batch iterator
        if self.batch_iterator_type == 'normal':
            batch_iter = _batch_iter
        elif self.batch_iterator_type == 'oversample':
            batch_iter = _oversampling_batch_iter
        else:
            raise ValueError('{} is not a valid batch_iterator_type'.format(
                self.batch_iterator_type))

        # Train model
        train_batch_nr = []
        train_loss_val = []
        dev_batch_nr = []
        dev_loss_val = []
        for i, (x, y) in enumerate(
                batch_iter(x_train, y_train, self.num_epochs,
                           self.batch_size)):
            # Train
            feed_dict = {
                self.graph_nodes['x_input']: x,
                self.graph_nodes['y_input']: y,
                self.graph_nodes['dropout_keep_prob']: self.dropout_keep_prob
            }
            _, loss_val = self.sess.run(
                [self.graph_nodes['optimize'], self.graph_nodes['loss']],
                feed_dict=feed_dict)
            train_batch_nr.append(i)
            train_loss_val.append(loss_val)
            if i % self.evaluate_every_n_steps == 0:
                feed_dict = {
                    self.graph_nodes['x_input']: x_dev,
                    self.graph_nodes['y_input']: y_dev,
                    self.graph_nodes['dropout_keep_prob']: 1.
                }
                loss_val = self.sess.run(self.graph_nodes['loss'],
                                         feed_dict=feed_dict)
                dev_batch_nr.append(i)
                dev_loss_val.append(loss_val)

        if self.plot_training:
            plt.plot(train_batch_nr, train_loss_val)
            plt.plot(dev_batch_nr, dev_loss_val)
            plt.show()
Example #19
def pretrain_model(path):
    df = pd.read_pickle(path)
    # create dataframe with netto sales, month, weekday, year
    df = pd.DataFrame(data=df.values, index=df.index, columns=['netto'])
    df = df.assign(month=df.index.month)
    df = df.assign(weekday=df.index.weekday)
    df = df.assign(year=df.index.year)
    # split into train and test
    train, test = split_dataset(df.values, 365)
    # evaluate model and get scores
    model = evaluate_model(train, test)
    # save model
    model.save("models/" + path + ".h5")
    del model
Example #20
    def run(self, id, updated_fields):
        share = self.datastore.get_by_id('shares', id)
        remove_unchanged(updated_fields, share)

        path = self.dispatcher.call_sync('share.get_directory_path', share['id'])
        try:
            delete_config(
                path,
                '{0}-{1}'.format(share['type'], share['name'])
            )
        except OSError:
            pass

        if 'type' in updated_fields:
            old_share_type = share['type']
            new_share_type = self.dispatcher.call_sync('share.supported_types').get(updated_fields['type'])
            if share['target_type'] == 'DATASET':
                pool, dataset = split_dataset(share['target_path'])
                self.join_subtasks(
                    self.run_subtask('volume.dataset.update', dataset, {
                        'permissions_type': new_share_type['perm_type']
                    })
                )

            share.update(updated_fields)
            self.join_subtasks(self.run_subtask('share.{0}.delete'.format(old_share_type), id))
            self.join_subtasks(self.run_subtask('share.{0}.create'.format(updated_fields['type']), share))
        else:
            self.join_subtasks(self.run_subtask('share.{0}.update'.format(share['type']), id, updated_fields))

        if 'permissions' in updated_fields:
            path = self.dispatcher.call_sync('share.translate_path', id)
            self.join_subtasks(self.run_subtask('file.set_permissions', path, updated_fields['permissions']))

        self.dispatcher.dispatch_event('share.changed', {
            'operation': 'update',
            'ids': [share['id']]
        })

        updated_share = self.datastore.get_by_id('shares', id)
        path = self.dispatcher.call_sync('share.get_directory_path', updated_share['id'])
        try:
            save_config(
                path,
                '{0}-{1}'.format(updated_share['type'], updated_share['name']),
                updated_share
            )
        except OSError as err:
            self.add_warning(TaskWarning(errno.ENXIO, 'Cannot save backup config file: {0}'.format(str(err))))
Example #21
def train_and_test(df, preds, seed):
    '''
    Run a single trial:
        Shuffle df and split it into training and testing subsets
        Train a new model based on the training sets
        Test the model with testing set
        Add prediction data into preds array

    :param df: dataframe with full set of all available samples
        columns: id, cat1 (primary class), cat2 (secondary),
        title, titlen (cleaned title)
    :param preds: an array of predictions, each prediction is a dictionary
        cat: true category, pred: predicted category,
        conf: model confidence in its prediction (< 1.0),
        title: actual title of the chapter/sample
    :return: average testing accuracy
    '''
    ret = {}

    # PREPS
    # randomly split the dataset
    df = utils.split_dataset(
        df,
        settings.CAT_DEPTH,
        settings.TRAIN_PER_CLASS_MIN,
        settings.TEST_PER_CLASS,
        settings.VALID_PER_CLASS,
    )

    # TRAIN
    classifier = Classifier.from_name(settings.CLASSIFIER, seed)
    classifier.set_datasets(df, titles_out_path)
    classifier.train()

    df_test = classifier.df_test

    if settings.EVALUATE_TRAINING_SET:
        evaluate_model(classifier,
                       classifier.df_train,
                       display_prefix='TRAIN = ')
    accuracy = evaluate_model(classifier,
                              df_test,
                              preds,
                              display_prefix='TEST  = ')
    classifier_key = utils.get_exp_key(classifier)

    classifier.release_resources()

    return classifier_key, accuracy, classifier.df_train
Example #22
 def train(self, X_train, y_train):
     """
     Train a Regression Forest using the given training data by training a number of Regression Trees
     each with a random sample of the training data.
     """
     train_dataset = np.c_[(X_train, y_train)]
     for i in range(self.n_estimators):
         bootstrap_sample = train_dataset[np.random.choice(
             train_dataset.shape[0],
             size=int(round(train_dataset.shape[0] * self.split)),
             replace=True)]
         X_train, y_train, _, _ = split_dataset(bootstrap_sample,
                                                self.split,
                                                is_print=False)
         tree = RegressionTree(self.n_features, self.max_depth)
         tree.train(X_train, y_train)
         self.random_forest.append(tree)
Example #23
    def __init__(self, hparams=None):
        super().__init__()

        # Metrics
        self.train_acc = pl.metrics.Accuracy()
        self.val_acc = pl.metrics.Accuracy(compute_on_step=False)
        self.test_acc = pl.metrics.Accuracy(compute_on_step=False)

        # Hyperparameters
        self.hparams = hparams

        # Data
        self.train_data, self.test_data, self.val_data = split_dataset()

        # Model initialization
        multiplier = 2 if hparams["bidirectional"] else 1
        self.word_vec_size = 300 * multiplier
        self.amount_classes = 7
        self.rnn = nn.LSTM(input_size=self.word_vec_size,
                           hidden_size=hparams["lstm_hidden_dim"],
                           bidirectional=hparams["bidirectional"],
                           num_layers=hparams["lstm_num_layers"])

        # First FC layer
        modules = [
            nn.Linear(self.word_vec_size, hparams["FC_layer_dims"][0]),
            nn.ReLU(),
            nn.Dropout(hparams["FC_dropouts"][0])
        ]

        # Middle FC layers
        for i, (dim, d_rate) in enumerate(
                zip(hparams["FC_layer_dims"], hparams["FC_dropouts"])):
            if i == len(hparams["FC_layer_dims"]) - 1:
                continue  # we reached the end
            modules.append(nn.Linear(dim, hparams["FC_layer_dims"][i + 1]))
            modules.append(nn.ReLU())
            modules.append(nn.Dropout(d_rate))

        # Last FC layer
        modules.append(
            nn.Linear(hparams["FC_layer_dims"][-1], self.amount_classes))
        modules.append(nn.ReLU())
        modules.append(nn.Dropout(hparams["FC_dropouts"][-1]))

        self.classifier = nn.Sequential(*modules)
Example #24
def main():
    # Building Phase
    data = import_data("./dataset/crx_clean.data.txt")
    X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)

    # Operational Phase
    scorer = make_scorer(f1_score, pos_label='+')
    print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
    Y_pred_grid_search = grid_search_cv_mlp(X_train, Y_train, X_test, Y_test,
                                            scorer)

    print()
    print()
    print(Y_pred_grid_search)
    print()
    print(Y_test)
    print()
    print_scores(Y_test, Y_pred_grid_search)
Example #25
def main(args: Namespace):
    results_path = args.log_dir / str(datetime.now())

    results_path.mkdir(exist_ok=True, parents=True)
    write_args(results_path, vars(args))

    fix_seed(args.seed)

    height_model = 1000
    width_model = 24

    filenames_train, filenames_valid, filenames_test = split_dataset(args.data_root, args.fracs_dataset)

    train_set = SeisDataset(filenames_train,
                            height_model=height_model,
                            width_model=width_model,
                            prob_aug=args.prob_aug)

    valid_set = SeisDataset(filenames_valid,
                            height_model=height_model,
                            width_model=width_model,
                            prob_aug=args.prob_aug)

    test_set = SeisDataset(filenames_test,
                           height_model=height_model,
                           width_model=width_model,
                           prob_aug=args.prob_aug)

    net = UNetFB()
    picker = Picker(net)

    stopper = Stopper(args.n_wrongs, args.delta_wrongs)

    trainer = Trainer(picker=picker, results_path=results_path,
                      train_set=train_set, valid_set=valid_set,
                      test_set=test_set, device=args.device,
                      batch_size=args.batch_size, lr=args.lr,
                      freq_valid=args.freq_valid, num_workers=args.num_workers,
                      dt_ms=args.dt_ms, height_model=height_model,
                      width_model=width_model, visual=args.visual,
                      stopper=stopper, weights=torch.tensor(args.weights))

    trainer.train(num_epoch=args.num_epoch)
Example #26
def run_custom_classifier(weights=None):

    training_images = utils.load_images_from_dir(TRAINING_DIR)
    testing_images = utils.load_images_from_dir(TESTING_DIR)

    X_train, X_test, y_train, y_test = utils.split_dataset(training_images, testing_images, TRAINING_FILE, TESTING_FILE)

    #preprocess data
    X_train, y_train = preprocess(X_train, y_train)
    X_test, y_test = preprocess(X_test, y_test)

    #compile model
    model = custom_classifier()

    #train if no weights are passed in
    if weights is None:

        history = model.fit(X_train, y_train, epochs=50, verbose=1, validation_data=(X_test, y_test))

        model.save_weights(os.path.join(WEIGHTS_DIR, 'custom_model.h5'))

    else:
        model.load_weights(weights)


    scores = model.evaluate(X_train, y_train, verbose=1)
    print('Digit 1 loss:', scores[1])
    print('Digit 2 loss:', scores[2])
    print('Digit 3 loss:', scores[3])
    print('Digit 4 loss:', scores[4])
    print('Digit 5 loss:', scores[5])
    average_loss = sum([scores[i] for i in range(1, 6)]) / 5
    print('Average loss:', average_loss)

    print('Digit 1 accuracy:', scores[6])
    print('Digit 2 accuracy:', scores[7])
    print('Digit 3 accuracy:', scores[8])
    print('Digit 4 accuracy:', scores[9])
    print('Digit 5 accuracy:', scores[10])

    average_accuracy = sum([scores[i] for i in range(6, 11)]) / 5
    print('Average accuracy:', average_accuracy)
Example #27
def obtain_train_test(path, ifilename, ftrain_name, ftest_name, frac_test=0.2):

    df = pd.read_csv(os.path.join(path, ifilename))
    df['date'] = pd.to_datetime(df['date'])

    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', axis=1, inplace=True)

    groups = list(df['inlet'].unique())
    dftrain, dftest = split_dataset(df, frac_test=frac_test,
                                    groups=groups)  #80/20% for all inlets

    if ((dftrain.shape[0] + dftest.shape[0]) != df.shape[0]):
        raise ValueError(
            'The shapes of the resulting files are inconsistent with the shape of the input table!'
        )
    else:
        dftrain.to_csv(os.path.join(path, ftrain_name), index=False)
        dftest.to_csv(os.path.join(path, ftest_name), index=False)
        return dftrain, dftest
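
Example #27 uses a group-aware `split_dataset(df, frac_test=..., groups=...)` that is not shown here. A minimal pandas sketch that holds out `frac_test` of every group's rows, assuming `groups` contains the values of the `inlet` column:

import pandas as pd

def split_dataset(df, frac_test=0.2, groups=None, random_state=None):
    # Sample frac_test of the rows of each group for the test set; everything else is train (assumed behaviour).
    test_parts = [
        df[df['inlet'] == group].sample(frac=frac_test, random_state=random_state)
        for group in groups
    ]
    dftest = pd.concat(test_parts)
    dftrain = df.drop(dftest.index)
    return dftrain, dftest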
Example #28
def create_loaders(dataset_name,
                   dataset_train,
                   dataset_val,
                   dataset_test,
                   train_size,
                   val_size,
                   batch_size,
                   test_batch_size,
                   cuda,
                   num_workers,
                   topk=None,
                   noise=False):

    kwargs = {'num_workers': num_workers, 'pin_memory': True} if cuda else {}

    dataset_train, dataset_val = split_dataset(dataset_train, dataset_val,
                                               train_size, val_size)

    print('Dataset sizes: \t train: {} \t val: {} \t test: {}'.format(
        len(dataset_train), len(dataset_val), len(dataset_test)))

    train_loader = data.DataLoader(dataset_train,
                                   batch_size=batch_size,
                                   shuffle=True,
                                   **kwargs)

    val_loader = data.DataLoader(dataset_val,
                                 batch_size=test_batch_size,
                                 shuffle=False,
                                 **kwargs)

    test_loader = data.DataLoader(dataset_test,
                                  batch_size=test_batch_size,
                                  shuffle=False,
                                  **kwargs)

    train_loader.tag = 'train'
    val_loader.tag = 'val'
    test_loader.tag = 'test'

    return train_loader, val_loader, test_loader
Example #29
def get_feats_from_csv_in_partitions():
    """
    Extract the original features that are distributed in the dataset. Features
    are split according to the config.yaml file.
    """
    conf = utils.get_config()
    rows = [row for row in utils.load_csv() if utils.check_filter(row, conf['filters'])]
    train_rows, valid_rows, test_rows = utils.split_dataset(
        rows, conf['valid_percent'], conf['test_percent'], rng=conf['rng_seed'])
    X_train, y_train, X_valid, y_valid, X_test, y_test = [], [], [], [], [], []
    prefixes = ['t_', 'i_', 's_']  # Feature names' prefixes
    datasets = [(X_train, y_train, train_rows),
                (X_test, y_test, test_rows), (X_valid, y_valid, valid_rows)]
    out = []
    for X, y, rows in datasets:
        for row in rows:
            X.append(
                [float(v) for k, v in row.iteritems() if len(filter(k.startswith, prefixes)) > 0])
            y.append(int(row['classification'] == 'Malign'))
        out.extend((np.asarray(X), np.asarray(y)))
    return out
Example #30
def exercicio2():
    utils.print_header(2)

    data, classes = load_balance_scale(os.path.join(constants.DATA_DIR, constants.FILENAME_BALANCE_DATABASE))
    print('Nb samples: {}'.format(data.shape[0]))

    gaussian_accuracy, discrete_accuracy, laplace_accuracy = [], [], []
    np.random.seed(constants.SEED)
    for i in range(10):
        x_train, y_train, x_test, y_test = utils.split_dataset(data)
        params = {'mean': {}, 'std': {}, 'classes': classes, 'prior': {}, 'discrete_prob': {}}
        for c in classes:
            params['prior'][c] = sum(y_train == c) / float(x_train.shape[0])
            x_c = x_train[y_train == c]
            params['mean'][c] = np.mean(x_c, axis=0)
            params['std'][c] = np.std(x_c, axis=0)
            params['discrete_prob'][c] = {}
            for j in range(x_c.shape[1]):
                params['discrete_prob'][c][j] = {}
                for k in [1, 2, 3, 4, 5]:
                    params['discrete_prob'][c][j][k] = {
                        'sum': sum(x_c[:, j] == k),
                        'n': x_c.shape[0],
                    }
        gaussian_pred = gaussian_predict(x_test, params)
        gaussian_accuracy.append(utils.accuracy(y_test, gaussian_pred))
        discrete_pred = discrete_predict(x_test, params, laplace=False)
        discrete_accuracy.append(utils.accuracy(y_test, discrete_pred))
        laplace_pred = discrete_predict(x_test, params, laplace=True)
        laplace_accuracy.append(utils.accuracy(y_test, laplace_pred))

    print('a)')
    print('\tGaussian - Accuracy: {:.2f} +- {:.2f}'.format(np.mean(gaussian_accuracy), np.std(gaussian_accuracy)))

    print('b)')
    print('\tDiscrete - Accuracy: {:.2f} +- {:.2f}'.format(np.mean(discrete_accuracy), np.std(discrete_accuracy)))

    print('c)')
    print('\tDiscrete (with Laplace) - Accuracy: {:.2f} +- {:.2f}'.format(np.mean(laplace_accuracy), np.std(laplace_accuracy)))
    exit()
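
`gaussian_predict` and `discrete_predict` in Example #30 are not part of this listing. A minimal sketch of the Gaussian case, reading the per-class means, standard deviations and priors from the `params` dictionary built above (a hypothetical reconstruction, not the original helpers):

import numpy as np

def gaussian_predict(x_test, params):
    # Naive Bayes: pick the class maximizing log prior + sum of per-feature Gaussian log-likelihoods (sketch).
    preds = []
    for x in x_test:
        best_class, best_score = None, -np.inf
        for c in params['classes']:
            mean, std = params['mean'][c], params['std'][c]
            log_lik = -0.5 * np.sum(((x - mean) / std) ** 2 + np.log(2 * np.pi * std ** 2))
            score = np.log(params['prior'][c]) + log_lik
            if score > best_score:
                best_class, best_score = c, score
        preds.append(best_class)
    return np.array(preds)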
Example #31
def main():
	# Building Phase
	data = import_data(
		"./dataset/crx_clean.data.txt"
		)
	X, Y, X_train, X_test, Y_train, Y_test = split_dataset(data)
	clf_entropy = train_using_entropy(X_train, Y_train)

	# Operational Phase
	print("\n### SINGLE TRAIN-TEST SPLIT ###\n")
	Y_pred_entropy = prediction(X_test, clf_entropy)
	print_scores(Y_test, Y_pred_entropy)

	print("\n### CROSS VAL USING STRATIFIED K FOLD ###\n")
	fold_scores = cv_with_entropy(X, Y)
	print("Cross Validate: ", fold_scores)
	print("Best F1_score: ", max(fold_scores)*100)

	scorer = make_scorer(f1_score, pos_label='+')
	print("\n### GRID SEARCH CROSS VAL USING STRATIFIED K FOLD###\n")
	Y_pred_grid_search = grid_search_cv_DT(X_train, Y_train, X_test, Y_test, scorer)
	print_scores(Y_test, Y_pred_grid_search)
Example #32
    def objective(params):
        nneurons = params['nneurons']

        if params['season'] == 'full_day':
            window = create_window_array(params['window'], season_lag=288)
        elif params['season'] == 'half_day':
            window = create_window_array(params['window'], season_lag=168)
        else:
            window = create_window_array(params['window'])

        if not any(window) or nneurons < 2:
            return {'status': STATUS_FAIL}

        X_train, y_train, *_ = split_dataset(
            data, window, ratio=0.90, standardize=True)
        model = compile_model(
            nneurons, input_dim=sum(1 for x in window if x), loss_fn='mse',
            activation_fn=params['activation_function'])
        hist = model.fit(
            X_train, y_train, nb_epoch=50, validation_split=0.1,
            callbacks=[EarlyStopping(monitor='val_loss', patience=2)],
            verbose=0)

        return {'loss': hist.history['val_loss'][-1], 'status': STATUS_OK}
Example #33
def main(args):

    ####################
    # Arguments
    gpu = args.gpu
    model_name = args.model
    initial_tree_sampling = args.initial_tree_sampling
    path_config = args.config
    data_augmentation = args.data_augmentation
    trial_name = args.name
    actiontype = args.actiontype
    max_epoch = args.max_epoch
    dev_size = args.dev_size

    # Check
    assert actiontype in ["train", "evaluate"]
    if actiontype == "train":
        assert max_epoch > 0
    assert len(initial_tree_sampling.split("_")) == 3
    for type_ in initial_tree_sampling.split("_"):
        assert type_ in ["X", "BU", "TD", "RB", "LB", "RB2"]
    assert initial_tree_sampling.split("_")[2] != "X"
    assert initial_tree_sampling.split("_")[1] != "RB2"
    assert initial_tree_sampling.split("_")[2] != "RB2"

    if trial_name is None or trial_name == "None":
        trial_name = utils.get_current_time()

    ####################
    # Path setting
    config = utils.Config(path_config)

    basename = "%s.%s.%s.aug_%s.%s" \
            % (model_name,
               initial_tree_sampling,
               utils.get_basename_without_ext(path_config),
               data_augmentation,
               trial_name)

    if actiontype == "train":
        path_log = os.path.join(config.getpath("results"),
                                basename + ".training.log")
    elif actiontype == "evaluate":
        path_log = os.path.join(config.getpath("results"),
                                basename + ".evaluation.log")
    path_train = os.path.join(config.getpath("results"),
                              basename + ".training.jsonl")
    path_valid = os.path.join(config.getpath("results"),
                              basename + ".validation.jsonl")
    path_snapshot = os.path.join(config.getpath("results"),
                                 basename + ".model")
    path_pred = os.path.join(config.getpath("results"),
                             basename + ".evaluation.ctrees")
    path_eval = os.path.join(config.getpath("results"),
                             basename + ".evaluation.json")

    utils.set_logger(path_log)

    ####################
    # Random seed
    random_seed = trial_name
    random_seed = utils.hash_string(random_seed)
    random.seed(random_seed)
    np.random.seed(random_seed)
    cuda.cupy.random.seed(random_seed)

    ####################
    # Log so far
    utils.writelog("gpu=%d" % gpu)
    utils.writelog("model_name=%s" % model_name)
    utils.writelog("initial_tree_sampling=%s" % initial_tree_sampling)
    utils.writelog("path_config=%s" % path_config)
    utils.writelog("data_augmentation=%s" % data_augmentation)
    utils.writelog("trial_name=%s" % trial_name)
    utils.writelog("actiontype=%s" % actiontype)
    utils.writelog("max_epoch=%s" % max_epoch)
    utils.writelog("dev_size=%s" % dev_size)

    utils.writelog("path_log=%s" % path_log)
    utils.writelog("path_train=%s" % path_train)
    utils.writelog("path_valid=%s" % path_valid)
    utils.writelog("path_snapshot=%s" % path_snapshot)
    utils.writelog("path_pred=%s" % path_pred)
    utils.writelog("path_eval=%s" % path_eval)

    utils.writelog("random_seed=%d" % random_seed)

    ####################
    # Data preparation
    begin_time = time.time()

    train_dataset = dataloader.read_rstdt("train",
                                          relation_level="coarse-grained",
                                          with_root=False)
    test_dataset = dataloader.read_rstdt("test",
                                         relation_level="coarse-grained",
                                         with_root=False)
    vocab_word = utils.read_vocab(
        os.path.join(config.getpath("data"), "rstdt-vocab", "words.vocab.txt"))
    vocab_postag = utils.read_vocab(
        os.path.join(config.getpath("data"), "rstdt-vocab",
                     "postags.vocab.txt"))
    vocab_deprel = utils.read_vocab(
        os.path.join(config.getpath("data"), "rstdt-vocab",
                     "deprels.vocab.txt"))

    if data_augmentation:
        external_train_dataset = dataloader.read_ptbwsj_wo_rstdt(
            with_root=False)
        # Remove documents with only one leaf node
        external_train_dataset = utils.filter_dataset(
            external_train_dataset,
            condition=lambda data: len(data.edu_ids) > 1)

    end_time = time.time()
    utils.writelog("Loaded the corpus. %f [sec.]" % (end_time - begin_time))

    ####################
    # Hyper parameters
    word_dim = config.getint("word_dim")
    postag_dim = config.getint("postag_dim")
    deprel_dim = config.getint("deprel_dim")
    lstm_dim = config.getint("lstm_dim")
    mlp_dim = config.getint("mlp_dim")
    n_init_epochs = config.getint("n_init_epochs")
    negative_size = config.getint("negative_size")
    batch_size = config.getint("batch_size")
    weight_decay = config.getfloat("weight_decay")
    gradient_clipping = config.getfloat("gradient_clipping")
    optimizer_name = config.getstr("optimizer_name")

    utils.writelog("word_dim=%d" % word_dim)
    utils.writelog("postag_dim=%d" % postag_dim)
    utils.writelog("deprel_dim=%d" % deprel_dim)
    utils.writelog("lstm_dim=%d" % lstm_dim)
    utils.writelog("mlp_dim=%d" % mlp_dim)
    utils.writelog("n_init_epochs=%d" % n_init_epochs)
    utils.writelog("negative_size=%d" % negative_size)
    utils.writelog("batch_size=%d" % batch_size)
    utils.writelog("weight_decay=%f" % weight_decay)
    utils.writelog("gradient_clipping=%f" % gradient_clipping)
    utils.writelog("optimizer_name=%s" % optimizer_name)

    ####################
    # Model preparation
    cuda.get_device(gpu).use()

    # Initialize a model
    utils.mkdir(os.path.join(config.getpath("data"), "caches"))
    path_embed = config.getpath("pretrained_word_embeddings")
    path_caches = os.path.join(
        config.getpath("data"), "caches",
        "cached." + os.path.basename(path_embed) + ".npy")
    if os.path.exists(path_caches):
        utils.writelog("Loading cached word embeddings ...")
        initialW = np.load(path_caches)
    else:
        initialW = utils.read_word_embedding_matrix(path=path_embed,
                                                    dim=word_dim,
                                                    vocab=vocab_word,
                                                    scale=0.0)
        np.save(path_caches, initialW)

    if model_name == "spanbasedmodel":
        # Span-based model w/ template features
        template_feature_extractor = models.TemplateFeatureExtractor(
            dataset=train_dataset)
        utils.writelog("Template feature size=%d" %
                       template_feature_extractor.feature_size)
        if actiontype == "train":
            for template in template_feature_extractor.templates:
                dim = template_feature_extractor.template2dim[template]
                utils.writelog("Template feature #%s %s" % (dim, template))
        model = models.SpanBasedModel(
            vocab_word=vocab_word,
            vocab_postag=vocab_postag,
            vocab_deprel=vocab_deprel,
            word_dim=word_dim,
            postag_dim=postag_dim,
            deprel_dim=deprel_dim,
            lstm_dim=lstm_dim,
            mlp_dim=mlp_dim,
            initialW=initialW,
            template_feature_extractor=template_feature_extractor)
    elif model_name == "spanbasedmodel2":
        # Span-based model w/o template features
        model = models.SpanBasedModel2(vocab_word=vocab_word,
                                       vocab_postag=vocab_postag,
                                       vocab_deprel=vocab_deprel,
                                       word_dim=word_dim,
                                       postag_dim=postag_dim,
                                       deprel_dim=deprel_dim,
                                       lstm_dim=lstm_dim,
                                       mlp_dim=mlp_dim,
                                       initialW=initialW)
    else:
        raise ValueError("Invalid model_name=%s" % model_name)
    utils.writelog("Initialized the model ``%s''" % model_name)

    # Load pre-trained parameters
    if actiontype != "train":
        serializers.load_npz(path_snapshot, model)
        utils.writelog("Loaded trained parameters from %s" % path_snapshot)

    model.to_gpu(gpu)

    ####################
    # Decoder preparation
    decoder = decoders.IncrementalCKYDecoder()

    ####################
    # Initializer preparation
    sampler = treesamplers.TreeSampler(initial_tree_sampling.split("_"))

    ####################
    # Training / evaluation
    if actiontype == "train":
        with chainer.using_config("train", True):
            if dev_size > 0:
                # Training with a held-out development set
                train_dataset, dev_dataset = utils.split_dataset(
                    dataset=train_dataset, n_dev=dev_size, seed=None)
                with open(
                        os.path.join(config.getpath("results"),
                                     basename + ".valid_gold.ctrees"),
                        "w") as f:
                    for data in dev_dataset:
                        f.write("%s\n" % " ".join(data.nary_sexp))
            else:
                # Training with the full training set
                dev_dataset = None

            if data_augmentation:
                train_dataset = np.concatenate(
                    [train_dataset, external_train_dataset], axis=0)

            train(model=model,
                  decoder=decoder,
                  sampler=sampler,
                  max_epoch=max_epoch,
                  n_init_epochs=n_init_epochs,
                  negative_size=negative_size,
                  batch_size=batch_size,
                  weight_decay=weight_decay,
                  gradient_clipping=gradient_clipping,
                  optimizer_name=optimizer_name,
                  train_dataset=train_dataset,
                  dev_dataset=dev_dataset,
                  path_train=path_train,
                  path_valid=path_valid,
                  path_snapshot=path_snapshot,
                  path_pred=os.path.join(config.getpath("results"),
                                         basename + ".valid_pred.ctrees"),
                  path_gold=os.path.join(config.getpath("results"),
                                         basename + ".valid_gold.ctrees"))

    elif actiontype == "evaluate":
        with chainer.using_config("train", False), chainer.no_backprop_mode():
            # Test
            parse(model=model,
                  decoder=decoder,
                  dataset=test_dataset,
                  path_pred=path_pred)
            scores = metrics.rst_parseval(
                pred_path=path_pred,
                gold_path=os.path.join(config.getpath("data"), "rstdt", "wsj",
                                       "test", "gold.labeled.nary.ctrees"))
            old_scores = metrics.old_rst_parseval(
                pred_path=path_pred,
                gold_path=os.path.join(config.getpath("data"), "rstdt", "wsj",
                                       "test", "gold.labeled.nary.ctrees"))
            out = {
                "Morey2018": {
                    "Unlabeled Precision": scores["S"]["Precision"] * 100.0,
                    "Precision_info": scores["S"]["Precision_info"],
                    "Unlabeled Recall": scores["S"]["Recall"] * 100.0,
                    "Recall_info": scores["S"]["Recall_info"],
                    "Micro F1": scores["S"]["Micro F1"] * 100.0
                },
                "Marcu2000": {
                    "Unlabeled Precision":
                    old_scores["S"]["Precision"] * 100.0,
                    "Precision_info": old_scores["S"]["Precision_info"],
                    "Unlabeled Recall": old_scores["S"]["Recall"] * 100.0,
                    "Recall_info": old_scores["S"]["Recall_info"],
                    "Micro F1": old_scores["S"]["Micro F1"] * 100.0
                }
            }
            utils.write_json(path_eval, out)
            utils.writelog(utils.pretty_format_dict(out))

    utils.writelog("Done: %s" % basename)
Example #34
def boston_housing():
    """
    Trains algorithms for the boston housing dataset
    """

    # boston housing dataset is available in the sklearn library
    boston = load_boston()
    dimension = boston.data.shape[1]
    boston_data = np.column_stack((boston.data, boston.target))
    split = 2 / 3
    iterations = 20
    gammas = np.linspace(math.pow(2, -40), math.pow(2, -26), 15)
    sigmas = np.linspace(math.pow(2, 7), math.pow(2, 13), 14)
    k_fold = 5

    naive_results = np.zeros((iterations, 2))
    single_results = np.zeros((iterations, dimension, 2))
    all_results = np.zeros((iterations, 2))
    k_results = np.zeros((iterations, 2))

    for i in range(iterations):
        print("iteration", i)

        training_set, testing_set = utils.split_dataset(boston_data, split)

        # naive
        _, train_mse, test_mse = lr.naive(training_set, testing_set)
        naive_results[i, 0] = train_mse
        naive_results[i, 1] = test_mse

        # single attribute
        for attr in range(dimension):
            _, train_mse, test_mse = lr.single_attribute(training_set,
                                                         testing_set, attr)
            single_results[i, attr, 0] = train_mse
            single_results[i, attr, 1] = test_mse

        # all attributes
        _, train_mse, test_mse = lr.all_attributes(training_set, testing_set)
        all_results[i, 0] = train_mse
        all_results[i, 1] = test_mse

        # kernel
        _, train_mse, test_mse = rr.gaussian_kernel_cross_validation(
            training_set, testing_set, gammas, sigmas, k_fold)
        k_results[i, 0] = train_mse
        k_results[i, 1] = test_mse

    # display naive results
    print("Naive MSE train: %s +- %s" % (
        np.mean(naive_results[:, 0]), np.std(naive_results[:, 0])))
    print("Naive MSE test: %s +- %s" % (
        np.mean(naive_results[:, 1]), np.std(naive_results[:, 1])))

    # display single attribute results
    for attr in range(dimension):
        id = attr + 1
        print("Linear (attribute %d) MSE train: %s +- %s" % (
            id,
            np.mean(single_results[:, attr, 0]),
            np.std(single_results[:, attr, 0])))
        print("Linear (attribute %d) MSE test: %s +- %s" % (
            id,
            np.mean(single_results[:, attr, 1]),
            np.std(single_results[:, attr, 1])))

    # display all attributes results
    print("Linear (all) MSE train: %s +- %s" % (
        np.mean(all_results[:, 0]), np.std(all_results[:, 0])))
    print("Linear (all) MSE test: %s +- %s" % (
        np.mean(all_results[:, 1]), np.std(all_results[:, 1])))

    # display kernel results
    print("Kernel MSE train: %s +- %s" % (
        np.mean(k_results[:, 0]), np.std(k_results[:, 0])))
    print("Kernel MSE test: %s +- %s" % (
        np.mean(k_results[:, 1]), np.std(k_results[:, 1])))
Example #35
            self.__get_labels_neighborhood(row, distance)
            for row in X_classifier
        ]


if __name__ == "__main__":
    PATH_FILE = (
        "/home/nobrega/Dados/Documentos/Estudos/notes/dataset/knn_classification.csv"
    )
    df = pd.read_csv(PATH_FILE)
    df.pop("id")
    y = df["class"].values
    y = y.reshape(len(y), 1)
    df.pop("class")

    X = df.values

    X_train, y_train, X_test, y_test = split_dataset(X, y, 0.7)

    n_neighbors = 15

    knn = KNearestNeighbors(X_train, y_train, n_neighbors)
    for type_distance in DistanceTypes:
        try:
            y_hat = np.array(knn.predict(X_test, type_distance.value))
            y_test = np.ndarray.flatten(np.array(y_test))

            acc = sum(((y_hat == y_test) * 1.0)) / len(y_test)
            print(f"Type of Distance {type_distance.value} Acurácia: {acc}")
        except:
            print(f"This distance type {type_distance.value} can't calculate")
Example #36
0
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    keep_topK = 200
    USE_HOLD_OUT = True  # visualization of HOLD-OUT set

    # dir_prenms = "../results/preNMS"

    # =========================== Dataset ==============================
    # file path and make a list
    imgs_path = '../data/hw3_mycocodata_img_comp_zlib.h5'
    masks_path = '../data/hw3_mycocodata_mask_comp_zlib.h5'
    labels_path = "../data/hw3_mycocodata_labels_comp_zlib.npy"
    bboxes_path = "../data/hw3_mycocodata_bboxes_comp_zlib.npy"
    paths = [imgs_path, masks_path, labels_path, bboxes_path]

    dataset = BuildDataset(paths, augmentation=False)
    train_dataset, test_dataset = utils.split_dataset(dataset)

    # dataset
    # train_build_loader = BuildDataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=8)
    # train_loader = train_build_loader.loader()
    test_build_loader = BuildDataLoader(test_dataset,
                                        batch_size=batch_size,
                                        shuffle=False,
                                        num_workers=0)
    test_loader = test_build_loader.loader()

    # we will need the ImageList from torchvision
    from torchvision.models.detection.image_list import ImageList

    do_eval(test_loader,
            checkpoint_file,
Example #37
0
def main():
    #torch.manual_seed(42)

    # ------------
    # args
    # ------------
    parser = ArgumentParser()
    parser.add_argument('--auto_lr',
                        type=U.str2bool,
                        default=False,
                        help="Auto lr finder")
    parser.add_argument('--learning_rate', type=float, default=10e-4)
    parser.add_argument('--scheduler', type=U.str2bool, default=False)
    parser.add_argument('--wd', type=float, default=2e-4)
    parser.add_argument('--moment', type=float, default=0.9)
    parser.add_argument('--batch_size', default=5, type=int)
    parser.add_argument('--n_epochs', default=10, type=int)
    parser.add_argument('--model',
                        default='FCN',
                        type=str,
                        help="FCN or DLV3 model")
    parser.add_argument('--pretrained',
                        default=False,
                        type=U.str2bool,
                        help="Use pretrained pytorch model")
    parser.add_argument('--eval_angle', default=True, type=U.str2bool,help=\
        "If true, evaluate the model on inputs rotated at different angles")
    parser.add_argument('--rotate',
                        default=False,
                        type=U.str2bool,
                        help="Use random rotation as data augmentation")
    parser.add_argument('--scale',
                        default=True,
                        type=U.str2bool,
                        help="Use scale as data augmentation")
    parser.add_argument('--size_img',
                        default=520,
                        type=int,
                        help="Size of input images")
    parser.add_argument('--size_crop',
                        default=480,
                        type=int,
                        help="Size of crop image during training")
    parser.add_argument('--nw',
                        default=0,
                        type=int,
                        help="Num workers for the data loader")
    parser.add_argument('--pm',
                        default=True,
                        type=U.str2bool,
                        help="Pin memory for the dataloader")
    parser.add_argument('--gpu',
                        default=0,
                        type=int,
                        help="Wich gpu to select for training")
    parser.add_argument('--benchmark',
                        default=False,
                        type=U.str2bool,
                        help="enable or disable backends.cudnn")
    parser.add_argument('--split',
                        default=False,
                        type=U.str2bool,
                        help="Split the dataset")
    parser.add_argument('--split_ratio',
                        default=0.3,
                        type=float,
                        help="Amount of data we used for training")
    parser.add_argument('--dataroot_voc',
                        default='/share/DEEPLEARNING/datasets/voc2012/',
                        type=str)
    parser.add_argument('--dataroot_sbd',
                        default='/share/DEEPLEARNING/datasets/sbd/',
                        type=str)
    parser.add_argument('--model_name',
                        type=str,
                        help="what name to use for saving")
    parser.add_argument('--save_dir', default='/data/save_model', type=str)
    parser.add_argument('--save_all_ep', default=False, type=U.str2bool,help=\
        "If true, save the model after every epoch in save_dir")
    parser.add_argument('--save_best',
                        default=False,
                        type=U.str2bool,
                        help="If true will only save the best epoch model")
    args = parser.parse_args()
    # ------------
    # save
    # ------------
    save_dir = U.create_save_directory(args.save_dir)
    print('model will be saved in', save_dir)
    U.save_hparams(args, save_dir)
    # ------------
    # device
    # ------------
    device = torch.device(
        "cuda:" + str(args.gpu) if torch.cuda.is_available() else "cpu")
    print("device used:", device)
    # ------------
    # model
    # ------------

    if args.model.upper() == 'FCN':
        model = models.segmentation.fcn_resnet101(pretrained=args.pretrained)
    elif args.model.upper() == 'DLV3':
        model = models.segmentation.deeplabv3_resnet101(
            pretrained=args.pretrained)
    else:
        raise Exception('model must be "FCN" or "DLV3"')
    model.to(device)
    # ------------
    # data
    # ------------
    if args.size_img < args.size_crop:
        raise Exception(
            'Cannot have size of input images less than size of crop')
    size_img = (args.size_img, args.size_img)
    size_crop = (args.size_crop, args.size_crop)
    train_dataset_VOC = mdset.VOCSegmentation(args.dataroot_voc,year='2012', image_set='train', \
        download=True,rotate=args.rotate,scale=args.scale,size_img=size_img,size_crop=size_crop)
    val_dataset_VOC = mdset.VOCSegmentation(args.dataroot_voc,
                                            year='2012',
                                            image_set='val',
                                            download=True)
    train_dataset_SBD = mdset.SBDataset(args.dataroot_sbd, image_set='train_noval',mode='segmentation',\
        rotate=args.rotate,scale=args.scale,size_img=size_img,size_crop=size_crop)
    # Concatenate the VOC and SBD training datasets
    train_dataset = tud.ConcatDataset([train_dataset_VOC, train_dataset_SBD])
    split = args.split
    if split:
        train_dataset = U.split_dataset(train_dataset, args.split_ratio)
    # Print dataset sizes
    print("There are", len(train_dataset), "images for training and",
          len(val_dataset_VOC), "for validation")
    dataloader_train = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size,num_workers=args.nw,\
        pin_memory=args.pm,shuffle=True,drop_last=True)#,collate_fn=U.my_collate)
    dataloader_val = torch.utils.data.DataLoader(val_dataset_VOC,num_workers=args.nw,pin_memory=args.pm,\
        batch_size=args.batch_size)
    # Decide which device we want to run on

    # ------------
    # training
    # ------------
    # Auto lr finding
    #if args.auto_lr==True:

    criterion = nn.CrossEntropyLoss(
        ignore_index=21)  # Ignore the border class.
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=args.moment,
                                weight_decay=args.wd)
    ev.train_fully_supervised(model=model,n_epochs=args.n_epochs,train_loader=dataloader_train,val_loader=dataloader_val,\
        criterion=criterion,optimizer=optimizer,save_folder=save_dir,scheduler=args.scheduler,model_name=args.model_name,\
            benchmark=args.benchmark, save_best=args.save_best,save_all_ep=args.save_all_ep,device=device,num_classes=21)

    # Final evaluation
    if args.eval_angle:
        d_iou = ev.eval_model_all_angle(model,
                                        args.size_img,
                                        args.dataroot_voc,
                                        train=True,
                                        device=device)
        U.save_eval_angle(d_iou, save_dir)
        d_iou = ev.eval_model_all_angle(model,
                                        args.size_img,
                                        args.dataroot_voc,
                                        train=False,
                                        device=device)
        U.save_eval_angle(d_iou, save_dir)
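
Example #37 calls U.split_dataset(train_dataset, args.split_ratio) to train on only a fraction of the concatenated VOC+SBD set. A minimal sketch of such a helper built on torch.utils.data.random_split; the real U.split_dataset may behave differently:

import torch
import torch.utils.data as tud

def split_dataset(dataset, ratio, seed=42):
    # Hypothetical sketch: keep a random subset holding `ratio` of the samples
    # and drop the remainder, mimicking a "train on X% of the data" option.
    n_keep = int(len(dataset) * ratio)
    kept, _ = tud.random_split(
        dataset, [n_keep, len(dataset) - n_keep],
        generator=torch.Generator().manual_seed(seed))
    return kept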
Example #38
0
def train_dvna(A, P, verbose=False):
    n_epochs = 4000

    train_ones_indices, train, val, test = u.split_dataset(A, seed=seed)

    A_train = u.prepare_train_matrix_dvne(A, train_ones_indices)

    # sample triplets
    # nbrs is a dict nbrs[i] -> {j1,j2,...,}
    nbrs = {}
    not_nbrs = {}
    all_nodes = {i for i in range(A.shape[0])}

    for ij in zip(train_ones_indices[0], train_ones_indices[1]):
        i, j = int(ij[0]), int(ij[1])
        if i in nbrs.keys():
            nbrs[i] = nbrs[i].union({j})
        else:
            nbrs[i] = {j}

        if j in nbrs.keys():
            nbrs[j] = nbrs[j].union({i})
        else:
            nbrs[j] = {i}

    for i in nbrs.keys():
        nbrs_set = nbrs[i]
        not_nbrs[i] = all_nodes - nbrs_set

    model = DVNE(n_features=A.shape[0])
    model.to(device)

    opt = torch.optim.Adam(lr=0.001, params=model.parameters())

    nonzero_ratio = A.sum() / (A.shape[0]**2)
    zero_ratio = 1 - nonzero_ratio

    A = A.to(device)
    P = P.to(device)
    A_train = A_train.to(device)

    model.n_samples = len(train)
    criterion = l.dvne_loss

    for e in range(n_epochs):
        t0 = time.time()

        triplets = u.sample_triplets(nbrs, not_nbrs, 300)
        i, j, k = triplets

        model.train()
        opt.zero_grad()

        out_i, mi, stdi = model.forward(A_train[i, :])
        out_j, mj, stdj = model.forward(A_train[j, :])
        out_k, mk, stdk = model.forward(A_train[k, :])

        gt_i = P[i, :]
        gt_j = P[j, :]
        gt_k = P[k, :]

        out_reconstruction = torch.cat([out_i, out_j, out_k], dim=0).view(-1)
        gt = torch.cat([gt_i, gt_j, gt_k], dim=0).view(-1).to(device)
        a_gt = torch.cat([A[i, :], A[j, :], A[j, :]],
                         dim=0).view(-1).to(device)

        x = torch.ones(gt.shape[0]).to(device)
        weights = torch.where(gt == 0.0, x * float(nonzero_ratio),
                              x * float(zero_ratio)).to(device)
        loss_norm = weights.sum() / len(train)

        loss_weight = 0.6
        # loss = criterion(out_reconstruction, gt_reconsturction, weight=weights) * loss_weight/ loss_norm
        l2 = criterion(gt, out_reconstruction) * loss_weight

        # loss = 0.0
        w_ij = model.wasserstein((mi, stdi), (mj, stdj))
        w_ik = model.wasserstein((mi, stdi), (mk, stdk))
        l1 = l.energy_loss(w_ij, w_ik)

        loss = l1 + loss_weight * l2

        loss.backward()
        opt.step()

        t1 = time.time()
        if verbose:
            if (e + 1) % 100 == 0:
                if len(val) > 0:
                    with torch.no_grad():
                        val_loss = float(
                            criterion(
                                (model.forward(A_train)[0]).reshape(-1)[val],
                                A.reshape(-1)[val].data))
                    val_auc = u.test_auc_dvna(model, A_train, A, val)
                else:
                    val_auc = np.nan
                    val_loss = np.nan
                print(
                    "Iteration: {0}; train loss: {1:.4f}; val loss: {2:.4f}; val auc: {3:.4f}; time: {4:.4f}"
                    .format(e + 1, loss, val_loss, val_auc, t1 - t0))

    test_auc = u.test_auc_dvna(model, A_train, A, idx=test, test=True)
    # print("Test auc: ", test_auc)

    if dataset_name == 'cora' and visualize:
        with torch.no_grad():
            encodings, mean, std = model.encode(P.to(device))
            embeddings = torch.cat([mean, std], dim=1)

            dv.reduct_and_visualize(embeddings.cpu().numpy(), Y.argmax(axis=1))

            train, val_test = next(
                Split(train_size=140, random_state=seed).split(embeddings, Y))

            embeddings = embeddings.cpu()
            x_train, y_train = embeddings[train], Y[train]
            x_test, y_test = embeddings[val_test], Y[val_test]

            svm = SVC(C=10.0)

            svm.fit(x_train, y_train.argmax(axis=1))
            y_predicted = svm.predict(x_test)
            print("SVM Accuracy: ",
                  accuracy_score(y_predicted, y_test.argmax(axis=1)))
    return test_auc
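
train_dvna draws energy-loss triplets each epoch through u.sample_triplets(nbrs, not_nbrs, 300), where j must be a neighbour of the anchor i and k must not be. A plausible sketch of such a sampler over the dictionaries built above; the project's actual implementation is not shown here:

import random

def sample_triplets(nbrs, not_nbrs, n_samples):
    # Hypothetical sketch: pick an anchor i with at least one neighbour,
    # then a positive j from nbrs[i] and a negative k from not_nbrs[i].
    anchors = [i for i in nbrs if nbrs[i]]
    i_idx, j_idx, k_idx = [], [], []
    for _ in range(n_samples):
        i = random.choice(anchors)
        i_idx.append(i)
        j_idx.append(random.choice(tuple(nbrs[i])))
        k_idx.append(random.choice(tuple(not_nbrs[i])))
    return i_idx, j_idx, k_idx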
Example #39
0
    def run(self, id, updated_fields, enable_service=False):
        share = self.datastore.get_by_id('shares', id)
        if not share:
            raise TaskException(errno.ENOENT, 'Share not found')

        if share['immutable']:
            raise TaskException(errno.EACCES, 'Cannot modify immutable share {0}.'.format(id))

        if 'name' in updated_fields or 'type' in updated_fields:
            share.update(updated_fields)
            if self.datastore.exists(
                'shares',
                ('id', '!=', id),
                ('type', '=', share['type']),
                ('name', '=', share['name'])
            ):
                raise TaskException(errno.EEXIST, 'Share {0} of type {1} already exists'.format(
                    share['name'],
                    share['type']
                ))

        path_after_update = updated_fields.get('target_path', share['target_path'])
        type_after_update = updated_fields.get('target_type', share['target_type'])
        permissions = updated_fields.pop('permissions', None)
        share_path = self.dispatcher.call_sync('share.expand_path', path_after_update, type_after_update)

        if not os.path.exists(share_path):
            raise TaskException(
                errno.ENOENT,
                'Selected share target {0} does not exist'.format(path_after_update)
            )

        share = self.datastore.get_by_id('shares', id)
        remove_unchanged(updated_fields, share)

        path = self.dispatcher.call_sync('share.get_directory_path', share['id'])
        try:
            delete_config(
                path,
                '{0}-{1}'.format(share['type'], share['name'])
            )
        except (OSError, ValueError):
            pass

        if 'type' in updated_fields:
            old_share_type = share['type']
            new_share_type = self.dispatcher.call_sync('share.supported_types').get(updated_fields['type'])
            if share['target_type'] == 'DATASET':
                pool, dataset = split_dataset(share['target_path'])
                self.join_subtasks(
                    self.run_subtask('volume.dataset.update', dataset, {
                        'permissions_type': new_share_type['perm_type']
                    })
                )

            share.update(updated_fields)
            self.run_subtask_sync('share.{0}.delete'.format(old_share_type), id)
            self.run_subtask_sync('share.{0}.create'.format(updated_fields['type']), share)
        else:
            self.run_subtask_sync('share.{0}.update'.format(share['type']), id, updated_fields)

        if permissions:
            path = self.dispatcher.call_sync('share.translate_path', id)
            self.run_subtask_sync('file.set_permissions', path, permissions)

        self.dispatcher.dispatch_event('share.changed', {
            'operation': 'update',
            'ids': [share['id']]
        })

        updated_share = self.datastore.get_by_id('shares', id)
        path = self.dispatcher.call_sync('share.get_directory_path', updated_share['id'])
        try:
            save_config(
                path,
                '{0}-{1}'.format(updated_share['type'], updated_share['name']),
                updated_share
            )
        except OSError as err:
            self.add_warning(TaskWarning(errno.ENXIO, 'Cannot save backup config file: {0}'.format(str(err))))

        service_state = self.dispatcher.call_sync('service.query', [('name', '=', share['type'])], {'single': True})
        if service_state['state'] != 'RUNNING':
            if enable_service:
                config = service_state['config']
                config['enable'] = True
                self.run_subtask_sync('service.update', service_state['id'], {'config': config})
            else:
                self.add_warning(TaskWarning(
                    errno.ENXIO, "Share has been updated but the service {0} is not currently running "
                                 "Please enable the {0} service.".format(share['type'])
                ))
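
In this example and the next, split_dataset is not a train/test splitter but a ZFS path helper that separates the pool name from the dataset path. A plausible sketch, assuming plain 'pool/child/grandchild' strings (the real helper may handle more edge cases):

def split_dataset(path):
    # Hypothetical sketch: 'tank/shares/media' -> ('tank', 'shares/media').
    pool, _, dataset = path.partition('/')
    return pool, dataset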
Example #40
0
    def run(self, id, updated_fields, enable_service=False):
        share = self.datastore.get_by_id('shares', id)
        if not share:
            raise TaskException(errno.ENOENT, 'Share not found')

        if share['immutable']:
            raise TaskException(errno.EACCES, 'Cannot modify immutable share {0}.'.format(id))

        if 'name' in updated_fields or 'type' in updated_fields:
            share.update(updated_fields)
            if self.datastore.exists(
                'shares',
                ('id', '!=', id),
                ('type', '=', share['type']),
                ('name', '=', share['name'])
            ):
                raise TaskException(errno.EEXIST, 'Share {0} of type {1} already exists'.format(
                    share['name'],
                    share['type']
                ))

        path_after_update = updated_fields.get('target_path', share['target_path'])
        type_after_update = updated_fields.get('target_type', share['target_type'])
        permissions = updated_fields.pop('permissions', None)
        share_path = self.dispatcher.call_sync('share.expand_path', path_after_update, type_after_update)

        if type_after_update in ('DIRECTORY', 'FILE'):
            pool_mountpoints = tuple(self.dispatcher.call_sync('volume.query', [], {'select': 'mountpoint'}))
            if not path_after_update.startswith(pool_mountpoints):
                raise TaskException(errno.EINVAL, "Provided directory or file has to reside within a user-defined ZFS pool")

        if not os.path.exists(share_path):
            raise TaskException(
                errno.ENOENT,
                'Selected share target {0} does not exist'.format(path_after_update)
            )

        share = self.datastore.get_by_id('shares', id)
        remove_unchanged(updated_fields, share)

        path = self.dispatcher.call_sync('share.get_directory_path', share['id'])
        try:
            delete_config(
                path,
                '{0}-{1}'.format(share['type'], share['name'])
            )
        except (OSError, ValueError):
            pass

        if 'type' in updated_fields:
            old_share_type = share['type']
            new_share_type = self.dispatcher.call_sync('share.supported_types').get(updated_fields['type'])
            if share['target_type'] == 'DATASET':
                pool, dataset = split_dataset(share['target_path'])
                self.join_subtasks(
                    self.run_subtask('volume.dataset.update', dataset, {
                        'permissions_type': new_share_type['perm_type']
                    })
                )

            share.update(updated_fields)
            self.run_subtask_sync('share.{0}.delete'.format(old_share_type), id)
            self.run_subtask_sync('share.{0}.create'.format(updated_fields['type']), share)
        else:
            self.run_subtask_sync('share.{0}.update'.format(share['type']), id, updated_fields)

        if permissions:
            path = self.dispatcher.call_sync('share.translate_path', id)
            self.run_subtask_sync('file.set_permissions', path, permissions)

        self.dispatcher.dispatch_event('share.changed', {
            'operation': 'update',
            'ids': [share['id']]
        })

        updated_share = self.datastore.get_by_id('shares', id)
        path = self.dispatcher.call_sync('share.get_directory_path', updated_share['id'])
        try:
            save_config(
                path,
                '{0}-{1}'.format(updated_share['type'], updated_share['name']),
                updated_share,
                file_perms=0o600
            )
        except OSError as err:
            self.add_warning(TaskWarning(errno.ENXIO, 'Cannot save backup config file: {0}'.format(str(err))))

        service_state = self.dispatcher.call_sync('service.query', [('name', '=', share['type'])], {'single': True})
        if service_state['state'] != 'RUNNING':
            if enable_service:
                config = service_state['config']
                config['enable'] = True
                self.run_subtask_sync('service.update', service_state['id'], {'config': config})
            else:
                self.add_warning(TaskWarning(
                    errno.ENXIO, "Share has been updated but the service {0} is not currently running "
                                 "Please enable the {0} service.".format(share['type'])
                ))
Example #41
0
import utils
from pylearn2.utils import serial
import h5py
import numpy as np
import sys

if __name__ == "__main__":
    conf_file = sys.argv[1] if len(sys.argv) > 1 else None
    conf = utils.get_config(conf_file)
    paths = utils.get_paths()
    region_size = conf['region_size']
    region_stride = conf['region_stride']

    train_rows, valid_rows, test_rows = utils.split_dataset(
        utils.get_filtered_rows(), conf['valid_percent'],
        conf['test_percent'], rng=conf['rng_seed'])

    rowsdict = {'train': train_rows, 'valid': valid_rows, 'test': test_rows}
    nsamples = {}

    prefixes = ['s_', 'i_', 't_']  # Feature names' prefixes
    for subset, subrows in rowsdict.iteritems():
        X = None
        y = []
        feats = []
        for row in subrows:
            samples = utils.get_samples_from_image(
                row, oversampling=(subset == 'train' and conf['oversampling']))
            print "%i samples to %s taken from %s" % (
                len(samples), subset, row['image_filename'])
            if len(samples) == 0:
Example #42
0
import torch
from torchvision import transforms
import os
import argparse
from utils import split_dataset, train, validate
from grayscale import Grayscale
from colorize import Colorize

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True, help="path to input image")
ap.add_argument("-e", "--epochs", type=int, default=100, help="# of epochs")
args = vars(ap.parse_args())

# split the landscape dataset to train and validation folders
split_dataset(args["image"])

# Training
train_transforms = transforms.Compose(
    [transforms.RandomResizedCrop(224),
     transforms.RandomHorizontalFlip()])
train_folder = Grayscale('images/train', train_transforms)
train_loader = torch.utils.data.DataLoader(train_folder,
                                           batch_size=64,
                                           shuffle=True)

# Validation
val_transforms = transforms.Compose(
    [transforms.Resize(256),
     transforms.CenterCrop(224)])
val_folder = Grayscale('images/val', val_transforms)
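
split_dataset(args["image"]) above is expected to materialise the images/train and images/val folders that the Grayscale datasets read from. A rough sketch of such a folder split; the directory layout, copy-vs-move choice and 90/10 ratio are assumptions:

import os
import random
import shutil

def split_dataset(src_dir, val_fraction=0.1, seed=0):
    # Hypothetical sketch: copy files from src_dir into images/train and images/val.
    files = sorted(os.listdir(src_dir))
    random.Random(seed).shuffle(files)
    n_val = int(len(files) * val_fraction)
    for split, names in (("val", files[:n_val]), ("train", files[n_val:])):
        out_dir = os.path.join("images", split)
        os.makedirs(out_dir, exist_ok=True)
        for name in names:
            shutil.copy(os.path.join(src_dir, name), os.path.join(out_dir, name))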
Example #43
0
    for i in range(0, len(rows), chunkSize):
        offset = min(i + chunkSize, len(rows))
        f_chunk, y_chunk = fe_extraction.get_feats_from_cnn(
            rows[i:offset], model)
        if feats is None:
            feats = f_chunk
            y = y_chunk
        else:
            feats = np.vstack((feats, f_chunk))
            y = np.hstack((y, y_chunk))

    segm_ids = np.asarray([int(row['segmentation_id']) for row in rows])
    features = np.hstack((features, feats))

train_rows, valid_rows, test_rows = utils.split_dataset(
    utils.get_filtered_rows(), conf['valid_percent'],
    conf['test_percent'], rng=conf['rng_seed'])

rows = train_rows + valid_rows
patients = utils.rows_to_patients(rows)
for i in range(n_runs):
    train_rows, empty_rows, valid_rows = utils.split_dataset(
        rows, valid_percent=0, test_percent=0.2, rng=rng, patients=patients)
    X_train, y_train = get_features(train_rows, features, segm_ids)
    X_valid, y_valid = get_features(valid_rows, features, segm_ids)
    print 'train: %i, valid: %i' % (X_train.shape[0], X_valid.shape[0])

    if scale_feats:
        min_max_scaler = preprocessing.MinMaxScaler()
        X_train = min_max_scaler.fit_transform(X_train)
        X_valid = min_max_scaler.transform(X_valid)