Example #1
def main(c_runtime, c_transformer, c_model, c_trainer, c_log):

    with blocktimer('Preprocess'):
        train, test = Transformer.run(**c_transformer.__dict__)
        X_train, y_train, X_test = split_X_y(train, test)

    with blocktimer('Tune & Train'):
        modelfactory = ModelFactory()

        # tune the model params
        model = modelfactory.create(c_model)
        optimal_c_model = tune_gbdt_params(model, X_train, y_train,
                                           c_trainer.n_splits)

        # train with best params, full data
        model = modelfactory.create(optimal_c_model)
        model = model.train(X_train, y_train)

    with blocktimer('Predict'):
        sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        sub['TransactionID'] = test['TransactionID']

        y_test = model.predict(X_test)

        sub['isFraud'] = y_test
        sub.to_csv(c_runtime.out_sub_path, index=False)
        logger.info(f'Saved {c_runtime.out_sub_path}')
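The pipelines in this example (and in Examples 8, 12, 17 and 19 below) only show the factory's call sites: create(), then train()/predict()/save() on the returned wrapper. Here is a minimal sketch of an interface that would satisfy those call sites; every name below is an assumption for illustration, not the source's implementation:

import pickle


class _SketchModel:
    """Stand-in for the per-library wrappers the factory would return."""

    def __init__(self, c_model):
        self.c_model = c_model

    def train(self, X_train, y_train):
        # fit the underlying estimator; returning self matches the
        # `model = model.train(...)` pattern used in the examples
        return self

    def predict(self, X):
        raise NotImplementedError

    def save(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self, f)


class ModelFactory:
    registry = {'lightgbm': _SketchModel}  # config model type -> wrapper class

    def create(self, c_model):
        # the examples read the type as c_model.TYPE or c.model.type;
        # TYPE is used here arbitrarily
        try:
            return self.registry[c_model.TYPE](c_model)
        except KeyError:
            raise ValueError(f'unknown model type: {c_model.TYPE}')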
Example #2
def load_model(self, create_raw_model=False):
    if not self.config["LOAD_MODEL"]:
        raise ValueError('LOAD_MODEL config must be set to true for inference')
    if create_raw_model:
        # disable weight loading in the shared config so the factory
        # returns an untrained ("raw") model
        self.config["LOAD_MODEL"] = False
    model_factory = ModelFactory(self.config)
    model = model_factory.create_model(self.model_name)
    return model
Example #3
def test_invalid_id_of_device_mapping(self):
    train_strategy = ipu.ipu_strategy.IPUStrategy()
    with train_strategy.scope():
        model = ModelFactory.create_model(model_name='toy_model',
                                          weights=None,
                                          input_shape=(28, 28, 1),
                                          classes=10)
    with self.assertRaises(DimensionError):
        model = ModelFactory.configure_model(
            model=model,
            gradient_accumulation_count=1,
            pipeline_splits=['conv2d_1', 'flatten'],
            device_mapping=[1, 2, 3],
            pipeline_schedule='Grouped',
            available_memory_proportion=[])
Example #4
    def execute(self, model_simi: BaseModel):
        model = ModelFactory.get_model(self.model_type)
        if os.path.isfile(self.model_path):
            model.load_model(self.model_path)
            return model

        sal_data = None

        for i, path in enumerate(self.img_path):
            im_data = self.parse_data(self.img_data[i],
                                      self.img_data_level0[i], model_simi)
            for j, rlist in enumerate(im_data.rlists):
                data = Region2Csv.generate_seg_csv(rlist,
                                                   im_data.feature93s[j],
                                                   self.seg_path[i])
                if data is None:
                    continue
                if sal_data is None:
                    sal_data = data
                else:
                    sal_data = np.vstack((sal_data, data))
        y_train, x_train = self.prepare_data(sal_data)

        model.train(x_train, y_train)
        model.save_model(self.model_path)
        return model
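A note on the accumulation pattern above: np.vstack inside the loop re-copies the growing array on every iteration. A minimal alternative (the helper name is mine, not the source's) collects the chunks in a list and stacks once:

import numpy as np


def stack_chunks(chunks):
    # stack equal-width 2-D arrays in a single pass; skipping None
    # entries mirrors the `if data is None: continue` guard above
    chunks = [c for c in chunks if c is not None]
    return np.vstack(chunks) if chunks else None

The same pattern applies to the similar loops in Examples 6 and 11.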
Example #5
def create_training_parts(batch_size, imshape, anchors_per_scale, ckpt_path, learning_rate,
                          loss_weights, valid_category, weight_suffix='latest'):
    model = ModelFactory(batch_size, imshape, anchors_per_scale).get_model()
    model = try_load_weights(ckpt_path, model, weight_suffix)
    loss_object = IntegratedLoss(loss_weights, valid_category)
    optimizer = tf.optimizers.Adam(learning_rate=learning_rate)  # the 'lr' alias is deprecated
    return model, loss_object, optimizer
Example #6
    def execute(self):
        model = ModelFactory.get_model(model_name=self.model_type)
        if os.path.isfile(self.model_path):
            model.load_model(self.model_path)
            return model

        simi_data = None
        for i, path in enumerate(self.img_path):
            im_data = self.check_exist(
                img_path=self.img_path[i], img_path_level0=self.img_data_level0[i]
            )

            data = Region2Csv.generate_similar_csv(
                im_data.rlist, im_data.comb_features, self.seg_path[i]
            )
            if simi_data is None:
                simi_data = data
            else:
                simi_data = np.vstack((simi_data, data))
            logging.info("Finished simi {}".format(i))

        y_train, x_train = self.prepare_data(simi_data)

        model.train(x_train, y_train)
        model.save_model(self.model_path)
        return model
Example #7
    def test_pipeline_split(self):
        def initial_model_1():
            model_input = keras.Input(shape=(32, 32, 3))
            model_output = keras.layers.MaxPooling2D(
                name='test_pipeline_split_layer1')(model_input)
            model_output_1 = keras.layers.Conv2D(
                filters=32, kernel_size=3,
                name='test_pipeline_split_layer2')(model_output)
            model_output_2 = keras.layers.Conv2D(
                filters=32, kernel_size=3,
                name='test_pipeline_split_layer3')(model_output)
            model_output = keras.layers.Add(name='test_pipeline_split_layer4')(
                [model_output_1, model_output_2])
            model_output = keras.layers.Flatten(
                name='test_pipeline_split_layer5')(model_output)
            return keras.Model(model_input, model_output)

        def expected_model_1():
            model_input = keras.Input(shape=(32, 32, 3))
            with ipu.keras.PipelineStage(0):
                model_output = keras.layers.MaxPooling2D()(model_input)
                model_output_1 = keras.layers.Conv2D(
                    filters=32, kernel_size=3)(model_output)
            with ipu.keras.PipelineStage(1):
                model_output_2 = keras.layers.Conv2D(
                    filters=32, kernel_size=3)(model_output)
                model_output = keras.layers.Add()(
                    [model_output_1, model_output_2])
            with ipu.keras.PipelineStage(2):
                model_output = keras.layers.Flatten()(model_output)
            return keras.Model(model_input, model_output)

        train_strategy = ipu.ipu_strategy.IPUStrategy()
        with train_strategy.scope():
            model = initial_model_1()
            pipelined_model = ModelFactory.configure_model(
                model=model,
                gradient_accumulation_count=1,
                pipeline_splits=[
                    'test_pipeline_split_layer3', 'test_pipeline_split_layer5'
                ],
                device_mapping=[],
                pipeline_schedule='Grouped',
                available_memory_proportion=[])

            expected_assignments = expected_model_1(
            ).get_pipeline_stage_assignment()
            pipelined_assignments = pipelined_model.get_pipeline_stage_assignment(
            )

            for expected_assignment, pipelined_assignment in zip(
                    expected_assignments, pipelined_assignments):
                assert (expected_assignment.layer.__class__.__name__ ==
                        pipelined_assignment.layer.__class__.__name__)
                assert (expected_assignment.pipeline_stage ==
                        pipelined_assignment.pipeline_stage)
Example #8
def main(c):

    with blocktimer('Preprocess'):
        train, test = Transformer.run(**c.transformer.__dict__)
        X_train, y_train, X_test = split_X_y(train, test)
        test = test.sort_values('TransactionDT')

    with blocktimer('Tune & Train'):
        modelfactory = ModelFactory()

        # tune the model params
        model = modelfactory.create(c.model)
        optimal_c_model = tune_gbdt_params(model, X_train, y_train,
                                           c.trainer.n_splits)

        # train with best params, full data
        model = modelfactory.create(optimal_c_model)
        model = model.train(X_train, y_train)

        # save results
        model.save(c.model.dir /
                   f'model_{c.runtime.VERSION}_{c.model.TYPE}.pkl')

        importance = pd.DataFrame(model.feature_importance,
                                  index=X_train.columns,
                                  columns=['importance'])

        importance_path = c.runtime.ROOTDIR / 'feature/importance' / f'importance_{c.runtime.VERSION}.csv'
        importance.to_csv(importance_path)
        logger.info(f'Saved {str(importance_path)}')

    with blocktimer('Predict'):
        sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        sub['TransactionID'] = test['TransactionID']

        y_test = model.predict(X_test)

        sub['isFraud'] = y_test
        sub.to_csv(c.runtime.out_sub_path, index=False)
        logger.debug(f'Saved {c.runtime.out_sub_path}')
Example #9
def test_toy_model_factory_prediction(self):
    tf.random.set_seed(1)
    model = ModelFactory.create_model(model_name='toy_model',
                                      weights=None,
                                      input_shape=(32, 32, 3),
                                      classes=10)
    image_1 = np.ones((1, 32, 32, 3)) * 10
    assert np.allclose(
        model.predict(image_1)[0], [
            0.08292384, 0.05735856, 0.27028584, 0.2666999, 0.02177826,
            0.01853362, 0.06498592, 0.04272136, 0.15957771, 0.015135
        ])
    tf.random.set_seed(None)
Example #10
    def get_predictions_for_model(self, model_name: str):
        tf.random.set_seed(1)
        np.random.seed(0)
        image0 = np.zeros((1, 32, 32, 3))
        image1 = np.ones((1, 32, 32, 3)) * 10

        model = ModelFactory.create_model(model_name=model_name,
                                          input_shape=(32, 32, 3),
                                          classes=2)
        image0_preds = model.predict(image0)[0]
        image1_preds = model.predict(image1)[0]

        tf.random.set_seed(None)
        np.random.seed(None)

        return (image0_preds, image1_preds)
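Examples 9 and 10 share a set-seeds-then-reset pattern around prediction. Here is a small helper that packages it (my wrapper, not part of the source):

import contextlib

import numpy as np
import tensorflow as tf


@contextlib.contextmanager
def fixed_seeds(tf_seed=1, np_seed=0):
    # pin both RNGs for the duration of the block, then release them,
    # exactly as the tests do inline
    tf.random.set_seed(tf_seed)
    np.random.seed(np_seed)
    try:
        yield
    finally:
        tf.random.set_seed(None)
        np.random.seed(None)

# usage:
# with fixed_seeds():
#     preds = model.predict(image0)[0]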
Example #11
    def execute(self, model_sal):
        model = ModelFactory.get_model(self.model_type)
        if os.path.isfile(self.model_path):
            model.load_model(self.model_path)
            return model
        ground_truths = None
        salience_maps = None

        for i, path in enumerate(self.img_data):
            with open(path, 'rb') as f:
                im_data = pickle.load(f)
            seg_num = len(im_data.rlists)
            if seg_num < len(self.C_LIST) + 1:
                continue

            height = im_data.rmat.shape[0]
            width = im_data.rmat.shape[1]
            salience_map = np.zeros([seg_num, height, width])
            for j, rlist in enumerate(im_data.rlists):
                Y = model_sal.predict(im_data.feature93s[j])[:, 1]
                for k, r in enumerate(rlist):
                    salience_map[j][r] = Y[k]
            ground_truth = cv2.imread(self.seg_path[i])[:, :, 0]
            ground_truth[ground_truth == 255] = 1
            if salience_maps is None:
                salience_maps = salience_map.reshape([-1, height * width]).T
            else:
                salience_maps = np.append(salience_maps,
                                          salience_map.reshape(
                                              [-1, height * width]).T,
                                          axis=0)
            if ground_truths is None:
                ground_truths = ground_truth.reshape(-1)
            else:
                ground_truths = np.append(ground_truths,
                                          ground_truth.reshape(-1),
                                          axis=0)
        x_train = salience_maps
        y_train = ground_truths
        model.train(x_train, y_train)
        model.save_model(self.model_path)
        # return the trained model, matching the early-exit path above
        return model
Example #12
def objective(trial, X_train, y_train, X_test, cols, c):
    '''
    Define objectives for optuna
    '''
    modelfactory = ModelFactory()
    if c.model.type == 'lightgbm':
        max_depth = trial.suggest_int('max_depth', 3, 12)
        params_to_tune = {
            # num_leaves should be smaller than about 2^max_depth * 0.75
            'num_leaves': 2 ** max_depth * 3 // 4,
            'max_depth': max_depth,
            'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-3, 1e0),
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-2, 1e0),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-2, 1e0),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 200),
            'feature_fraction': trial.suggest_uniform('feature_fraction', 0, 1),
            'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0, 1),
        }
    elif c.model.type == 'xgboost':
        params_to_tune = {
            'min_split_loss': trial.suggest_loguniform('min_split_loss', 1e-3, 1e0),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-3, 1e0),
            'subsample': trial.suggest_uniform('subsample', 0, 1),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.0, 1.0),
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 1e0),
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 1e0),
        }
    elif c.model.type == 'catboost':
        max_depth = trial.suggest_int('max_depth', 3, 12)
        params_to_tune = {
            # num_leaves should be smaller than approximately 2^max_depth*0.75
            # 'num_leaves': 2 ** max_depth * 3 // 4,
            'max_depth': max_depth,
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-2, 1e0)
        }

    # apply suggested params
    params = c.model.params.copy()
    params.update(params_to_tune)

    # Train by 6-fold CV
    oof = np.zeros(len(X_train))
    preds = np.zeros(len(X_test))
    skf = GroupKFold(n_splits=6)
    for i, (idxT, idxV) in enumerate(
            skf.split(X_train, y_train, groups=X_train['DT_M'])):
        fold = i + 1
        month = X_train.iloc[idxV]['DT_M'].iloc[0]
        model_fold_path = f'data/model/model_{c.runtime.version}_opt_fold{fold}{c.runtime.dsize}.pkl'
        model = modelfactory.create(c.model)
        logger.info(f'Fold {fold} withholding month {month}')
        logger.info(
            f'rows of train= {len(idxT)}, rows of holdout= {len(idxV)}')

        model = model.train(
            X_train[cols].iloc[idxT],
            y_train.iloc[idxT],
            X_train[cols].iloc[idxV],
            y_train.iloc[idxV],
            params=params,
            num_boost_round=c.train.num_boost_round,
            early_stopping_rounds=c.train.early_stopping_rounds,
            fold=i + 1)

        oof[idxV] = model.predict(X_train[cols].iloc[idxV])
        preds += model.predict(X_test[cols]) / skf.n_splits

        # `r` appears to be a result container defined outside this
        # function; it is not shown in this snippet
        r.paths.update({f'model_fold_{fold}_path': model_fold_path})
        model.save(r.paths[f'model_fold_{fold}_path'])
        del model

    score = roc_auc_score(y_train, oof)
    logger.info(f'Fold {fold} OOF cv= {score}')
    mlflow.log_metric('oof_cv_score', score, step=trial.number)
    return score
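Because this objective takes data arguments besides trial, it has to be bound before Optuna can call it. A minimal wiring sketch follows; direction and n_trials mirror the study settings in Example 19, the rest is assumed:

import functools

import optuna

# bind everything except `trial`, then hand the one-argument callable to Optuna
bound = functools.partial(objective, X_train=X_train, y_train=y_train,
                          X_test=X_test, cols=cols, c=c)
study = optuna.create_study(direction='maximize')  # maximizing OOF AUC
study.optimize(bound, n_trials=20)
print(study.best_trial.params)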
Example #13
# -*- coding: utf-8 -*-
import sys

from configuration import Configuration
from dataset import Dataset
from model.model_factory import ModelFactory

if __name__ == '__main__':
    if len(sys.argv) != 4:
        raise Exception(
            "Invalid number of arguments. Got {} arguments, expected 4.".format(
                len(sys.argv)))

    train_path = sys.argv[1]
    test_path = sys.argv[2]
    config_path = sys.argv[3]

    config = Configuration.from_path(config_path)
    train = Dataset.from_path(train_path)
    test = Dataset.from_path(test_path)

    model = ModelFactory.get(config.model)(config)
    model.fit(train)
    model.prediction(test)
Example #14
def get_model(self):
    model = ModelFactory.get_model(self.model_type)
    model.load_model(self.model_path)
    return model
Example #15
def test_unsupported_model(self):
    with self.assertRaises(NameError):
        ModelFactory.create_model(model_name='foo',
                                  input_shape=(32, 32, 3),
                                  classes=2)
Example #16
            f'steps_per_execution {steps_per_execution} should divide micro_batches_per_epoch = {micro_batches_per_epoch}'
        )

    time_to_train_timer = time_to_train.TimeToTrain()

    # Create an IPU distribution strategy
    train_strategy = (PopDistStrategy() if distributed_training
                      else ipu.ipu_strategy.IPUStrategy())

    with train_strategy.scope():

        # Create an instance of the model
        model = ModelFactory.create_model(
            model_name=model_name,
            input_shape=img_shape,
            classes=num_classes,
            accelerator_side_preprocessing_fn=accelerator_side_preprocess_train_fn,
            eight_bit_transfer=eight_bit_transfer)

        model = ModelFactory.configure_model(
            model=model,
            gradient_accumulation_count=batch_config.gradient_accumulation_count,
            pipeline_splits=pipeline_splits,
            device_mapping=device_mapping,
            pipeline_schedule=pipeline_schedule,
            available_memory_proportion=available_memory_proportion,
            optimizer_state_offloading=optimizer_state_offloading)

        if training:
Example #17
def main(c):
    dsize = '.small' if c.runtime.use_small_data else ''
    paths = EasyDict()
    scores = EasyDict()
    modelfactory = ModelFactory()

    with blocktimer('Preprocess', level=INFO):
        paths.in_train_path = f'data/feature/{c.features[0]}_train.pkl'
        paths.in_test_path = f'data/feature/{c.features[0]}_test.pkl'
        train = pd.read_pickle(paths.in_train_path)
        test = pd.read_pickle(paths.in_test_path)
        logger.debug(f'Loaded feature {c.features[0]}')

        if c.runtime.use_small_data:
            frac = 0.001
            train = train.sample(frac=frac, random_state=42)
            test = test.sample(frac=frac, random_state=42)
        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')

        # Split into X, y
        X_train = train.drop('isFraud', axis=1)
        X_test = test
        y_train = train['isFraud'].copy(deep=True)
        del train, test

    with blocktimer('Optimize', level=INFO):
        if c.train.optimize_num_boost_round:
            # tune the model params
            model = modelfactory.create(c.model)
            best_iteration = optimize_num_boost_round(model, X_train[c.cols],
                                                      y_train,
                                                      c.train.n_splits, dsize,
                                                      paths, scores)
        else:
            logger.debug('Skip optimization')
            best_iteration = c.train.num_boost_round

    with blocktimer('Train', level=INFO):
        logger.debug(f'Now using the following {len(c.cols)} features.')
        logger.debug(f'{np.array(c.cols)}')

        # CHRIS - TRAIN 75% PREDICT 25%
        idxT = X_train.index[:3 * len(X_train) // 4]
        idxV = X_train.index[3 * len(X_train) // 4:]
        '''
        model = modelfactory.create(c.model)
        model = model.train(X_train.loc[idxT, :], y_train[idxT],
                            X_train.loc[idxV, :], y_train[idxV],
                            num_boost_round=best_iteration)
        importance = pd.DataFrame(model.feature_importance,
                                  index=X_train.columns,
                                  columns=['importance'])

        # save results
        paths.out_model_dir = f'data/model/model_{c.runtime.version}_{c.model.type}{dsize}.pkl'
        paths.importance_path = f'feature/importance/importance_{c.runtime.version}{dsize}.csv'
        model.save(paths.out_model_dir)
        importance.to_csv(paths.importance_path)
        '''

        from sklearn.model_selection import GroupKFold
        from sklearn.metrics import roc_auc_score
        oof = np.zeros(len(X_train))
        preds = np.zeros(len(X_test))

        skf = GroupKFold(n_splits=6)
        for i, (idxT, idxV) in enumerate(
                skf.split(X_train, y_train, groups=X_train['DT_M'])):
            month = X_train.iloc[idxV]['DT_M'].iloc[0]
            logger.info(f'Fold {i+1} withholding month {month}')
            logger.info(
                f'rows of train ={len(idxT)}, rows of holdout ={len(idxV)}')

            categorical_features = [
                'ProductCD',
                'M4',
                'card1',
                'card2',
                'card3',
                'card5',
                'card6',
                'addr1',
                'addr2',
                'dist1',
                'dist2',
                'P_emaildomain',
                'R_emaildomain',
            ]

            model = modelfactory.create(c.model)
            model = model.train(
                X_train[c.cols].iloc[idxT],
                y_train.iloc[idxT],
                X_train[c.cols].iloc[idxV],
                y_train.iloc[idxV],
                num_boost_round=best_iteration,
                early_stopping_rounds=c.train.early_stopping_rounds,
                # categorical_features=categorical_features,
                fold=i + 1)

            oof[idxV] += model.predict(X_train[c.cols].iloc[idxV])
            preds += model.predict(X_test[c.cols]) / skf.n_splits
            del model
        logger.info(f'OOF cv= {roc_auc_score(y_train, oof)}')
        paths.importance_path = f'feature/importance/importance_{c.runtime.version}{dsize}.csv'
        # model.save(paths.out_model_dir)
        '''
        importance = pd.DataFrame(model.feature_importance,
                                  index=X_train.columns,
                                  columns=['importance'])
        importance.to_csv(paths.importance_path)
        '''

    with blocktimer('Predict', level=INFO):
        # y_test = model.predict(X_test)
        sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        sub['TransactionID'] = X_test.reset_index()['TransactionID']
        # sub['isFraud'] = y_test
        sub['isFraud'] = preds

        paths.out_sub_path = f'data/submission/submission_{c.runtime.version}{dsize}.csv'
        sub.to_csv(paths.out_sub_path, index=False)

    result = EasyDict()
    result.update(c)
    result.scores = scores
    result.paths = paths
    return result
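The cross-validation above deserves a note: GroupKFold with groups=X_train['DT_M'] withholds whole transaction months, so a validation fold never shares a month with its training data. A self-contained toy illustration (the column name mirrors the source; the data is synthetic):

import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold

# six synthetic months, ten rows each; with n_splits=6 every fold
# withholds exactly one whole month, as in the pipeline above
df = pd.DataFrame({'DT_M': np.repeat(np.arange(6), 10),
                   'x': np.random.rand(60)})
y = np.random.randint(0, 2, size=60)
for i, (idxT, idxV) in enumerate(
        GroupKFold(n_splits=6).split(df, y, groups=df['DT_M'])):
    print(f'fold {i + 1}: withholding month {df.iloc[idxV]["DT_M"].iloc[0]}')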
Example #18
        Y_valid_augmented = None

    synt_sufix = ''
    datagen = None
    if args.synthetic_data:
        synt_sufix = '_synt_' + args.noise
        datagen = create_data_generator(args.noise)
        print(datagen.__dict__)


    if args.dev:
        models.clear()
        models.append(TrainingParameters('dev', 128))
        models.append(TrainingParameters('dev1', 128))

    mf = ModelFactory()
    scores = []
    for m in models:
        # Create directory for model
        model_dir = os.path.join('models', str(m) + ed_sufix + all_sufix + synt_sufix)
        if os.path.exists(model_dir):
            shutil.rmtree(model_dir)
        os.makedirs(model_dir)

        # JPG only
        keras_model = mf.create_model(m.model_name, in_shape=(m.in_size, m.in_size, 3), parameters=m.parameters)
        print ("Model created successfuly. Compiliing")
        keras_model.model.compile(loss=m.loss, optimizer=m.optimizer, metrics=['accuracy'])
        print ("Compilation done")
        print (keras_model.model.summary())
        if args.multi_gpu:
Example #19
def main(c):
    dsize = '.small' if c.runtime.use_small_data else ''
    paths = EasyDict()
    scores = EasyDict()
    result = EasyDict()
    result.update(c)
    modelfactory = ModelFactory()

    with blocktimer('Preprocess', level=INFO):
        paths.in_train_path = f'data/feature/{c.features[0]}_train.pkl'
        paths.in_test_path = f'data/feature/{c.features[0]}_test.pkl'
        train = pd.read_pickle(paths.in_train_path)
        test = pd.read_pickle(paths.in_test_path)
        logger.debug(f'Loaded feature {c.features[0]}')

        if c.runtime.use_small_data:
            frac = 0.001
            train = train.sample(frac=frac, random_state=42)
            test = test.sample(frac=frac, random_state=42)
        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')

        # Split into X, y
        X_train = train.drop('isFraud', axis=1)
        X_test = test
        y_train = train['isFraud'].copy(deep=True)
        del train, test

    with blocktimer('Optimize num_boost_round', level=INFO):
        if c.train.optimize_num_boost_round:
            # tune the model params
            model = modelfactory.create(c.model)
            best_iteration = optimize_num_boost_round(model, X_train[c.cols],
                                                      y_train,
                                                      c.train.n_splits, dsize,
                                                      paths, scores)
        else:
            logger.debug('Skip optimization')
            best_iteration = c.train.num_boost_round

    with blocktimer('Optimize model params', level=INFO):
        if c.train.optimize_model_params:
            # define objective for optuna
            def objectives(trial):
                max_depth = trial.suggest_int('max_depth', 3, 12)
                params = {
                    'boosting_type': 'gbdt',
                    # num_leaves should be smaller than about 2^max_depth * 0.75
                    'num_leaves': 2 ** max_depth * 3 // 4,
                    'max_depth': max_depth,
                    'learning_rate': 0.05,
                    'objective': 'binary',
                    'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-3, 1e0),  # 0.03454472573214212
                    'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-2, 1e0),  # 0.3899927210061127
                    'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-2, 1e0),  # 0.6485237330340494
                    'random_state': 42,
                    'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 200),  # 106
                    'metric': 'auc',
                    'max_bin': 255,
                }
                c.model.params = params

                # Train by 6-fold CV
                oof = np.zeros(len(X_train))
                preds = np.zeros(len(X_test))

                skf = GroupKFold(n_splits=6)
                for i, (idxT, idxV) in enumerate(
                        skf.split(X_train, y_train, groups=X_train['DT_M'])):
                    fold = i + 1
                    month = X_train.iloc[idxV]['DT_M'].iloc[0]
                    model_fold_path = f'data/model/model_{c.runtime.version}_{c.model.type}_opt_fold{fold}{dsize}.pkl'
                    model = modelfactory.create(c.model)
                    logger.info(f'Fold {fold} withholding month {month}')
                    logger.info(
                        f'rows of train= {len(idxT)}, rows of holdout= {len(idxV)}'
                    )

                    model = model.train(
                        X_train[c.cols].iloc[idxT],
                        y_train.iloc[idxT],
                        X_train[c.cols].iloc[idxV],
                        y_train.iloc[idxV],
                        num_boost_round=best_iteration,
                        early_stopping_rounds=c.train.early_stopping_rounds,
                        # categorical_features=categorical_features,
                        fold=i + 1)

                    oof[idxV] = model.predict(X_train[c.cols].iloc[idxV])
                    preds += model.predict(X_test[c.cols]) / skf.n_splits

                    paths.update({f'model_fold_{fold}_path': model_fold_path})
                    model.save(paths[f'model_fold_{fold}_path'])
                    del model
                score = roc_auc_score(y_train, oof)
                logger.info(f'Fold {fold} OOF cv= {score}')
                return score

            # run optimization
            opt = optuna.create_study(
                direction='maximize',
                study_name=f'parameter_study_0016{dsize}',
                storage=f'sqlite:///data/optimization/parameter_study_0016{dsize}.db',
                load_if_exists=True)
            opt.optimize(objectives, n_trials=20)
            trial = opt.best_trial
            logger.debug(f'Best trial: {trial.value}')
            logger.debug(f'Best params: {trial.params}')
            scores.best_trial = trial.value
            result.optimize = {}
            result.optimize.best_params = trial.params
        else:
            logger.debug('Skip optimization')

    with blocktimer('Train', level=INFO):
        if c.train.train_model:
            logger.debug(f'Now using the following {len(c.cols)} features.')
            logger.debug(f'{np.array(c.cols)}')

            oof = np.zeros(len(X_train))
            preds = np.zeros(len(X_test))

            skf = GroupKFold(n_splits=6)
            for i, (idxT, idxV) in enumerate(
                    skf.split(X_train, y_train, groups=X_train['DT_M'])):
                month = X_train.iloc[idxV]['DT_M'].iloc[0]
                logger.info(f'Fold {i+1} withholding month {month}')
                logger.info(
                    f'rows of train ={len(idxT)}, rows of holdout ={len(idxV)}'
                )
                '''
                categorical_features = ['ProductCD', 'M4',
                                        'card1', 'card2', 'card3', 'card5', 'card6',
                                        'addr1', 'addr2', 'dist1', 'dist2',
                                        'P_emaildomain', 'R_emaildomain',
                                        ]
                '''

                model = modelfactory.create(c.model)
                model = model.train(
                    X_train[c.cols].iloc[idxT],
                    y_train.iloc[idxT],
                    X_train[c.cols].iloc[idxV],
                    y_train.iloc[idxV],
                    num_boost_round=best_iteration,
                    early_stopping_rounds=c.train.early_stopping_rounds,
                    # categorical_features=categorical_features,
                    fold=i + 1)

                oof[idxV] = model.predict(X_train[c.cols].iloc[idxV])
                preds += model.predict(X_test[c.cols]) / skf.n_splits
                del model
            logger.info(f'OOF cv= {roc_auc_score(y_train, oof)}')
            paths.importance_path = f'feature/importance/importance_{c.runtime.version}{dsize}.csv'
            # model.save(paths.out_model_dir)
            '''
            importance = pd.DataFrame(model.feature_importance,
                                      index=X_train.columns,
                                      columns=['importance'])
            importance.to_csv(paths.importance_path)
            '''

    with blocktimer('Predict', level=INFO):
        if c.train.predict:
            sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
            sub['TransactionID'] = X_test.reset_index()['TransactionID']
            sub['isFraud'] = preds

            paths.out_sub_path = f'data/submission/submission_{c.runtime.version}{dsize}.csv'
            sub.to_csv(paths.out_sub_path, index=False)

    result.scores = scores
    result.paths = paths
    return result
Example #20
def generate_model(self):
    model_factory = ModelFactory(self.config)
    model = model_factory.create_model(self.model_name)
    compile_para = self.model_compile_para()
    model.compile_model(**compile_para)
    return model
Example #21
def build_model(problem_name, observation_shape, num_actions, model_params):
    # Model
    model_manager = ModelFactory(problem_name, observation_shape, num_actions, model_params)
    model = model_manager.get_model()
    return model
Example #22
import os

from werkzeug.utils import secure_filename
from flask import Flask, render_template, Response, url_for, request
from model.model_factory import ModelFactory

app = Flask(__name__)
model_factory = ModelFactory()


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/predict', methods=['POST', 'GET'])
def prediction():
    model = model_factory.get_resnet34()

    if request.method == 'POST':
        f = request.files['file']
        basepath = os.path.dirname(__file__)
        file_path = os.path.join(basepath, 'uploads',
                                 secure_filename(f.filename))
        f.save(file_path)
        pred_class = model.predict(file_path)
        return pred_class

    # fall back to the upload form on GET instead of returning None,
    # which Flask rejects as an invalid response
    return render_template('index.html')
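A usage note: assuming this module is saved as app.py with an uploads/ directory beside it, the app can be started with Flask's built-in development server:

if __name__ == '__main__':
    # development server only; use a production WSGI server for deployment
    app.run(debug=True)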