Example #1
def test_save_experiment_and_get_hash():
    # no reason to make assertions on the config itself, use a basic dict
    experiment_config = {'one': 'two'}
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        exp_hash = save_experiment_and_get_hash(experiment_config, engine)
        assert isinstance(exp_hash, str)
        new_hash = save_experiment_and_get_hash(experiment_config, engine)
        assert new_hash == exp_hash
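The test above pins down two properties of save_experiment_and_get_hash: it returns a string, and calling it twice with the same config yields the same hash. A minimal sketch of a function with those properties, assuming the hash is derived from the JSON-serialized config and stored in a results table (the table name and SQL here are illustrative, not the library's actual implementation):

import hashlib
import json


def save_experiment_and_get_hash_sketch(experiment_config, db_engine):
    # Deterministic digest: serialize with sorted keys so the same dict
    # always produces the same hash.
    serialized = json.dumps(experiment_config, sort_keys=True)
    exp_hash = hashlib.md5(serialized.encode('utf-8')).hexdigest()

    # Idempotent write: a second call with the same config hits the same
    # key and is ignored. 'results.experiments' is an assumed table name.
    db_engine.execute(
        'INSERT INTO results.experiments (experiment_hash, config) '
        'VALUES (%s, %s) ON CONFLICT (experiment_hash) DO NOTHING',
        (exp_hash, serialized)
    )
    return exp_hash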
Example #2
    def __init__(self,
                 config,
                 db_engine,
                 model_storage_class=None,
                 project_path=None,
                 replace=True):
        self.config = config
        self.db_engine = db_engine
        if model_storage_class:
            self.model_storage_engine =\
                model_storage_class(project_path=project_path)
        self.project_path = project_path
        self.replace = replace
        ensure_db(self.db_engine)

        self.labels_table_name = 'labels'
        self.features_schema_name = 'features'
        if project_path:
            self.matrices_directory = os.path.join(self.project_path,
                                                   'matrices')
            if not os.path.exists(self.matrices_directory):
                os.makedirs(self.matrices_directory)
        self.experiment_hash = save_experiment_and_get_hash(
            self.config, self.db_engine)
        self._split_definitions = None
        self._matrix_build_tasks = None
        self._feature_table_tasks = None
        self._all_as_of_times = None
        self.initialize_factories()
        self.initialize_components()
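For reference, a hypothetical usage sketch that wires this constructor to a throwaway Postgres database in the style of Example #1. The class name Experiment and the config contents are placeholders, since the snippet only shows the __init__ body:

import testing.postgresql
from sqlalchemy import create_engine

with testing.postgresql.Postgresql() as postgresql:
    engine = create_engine(postgresql.url())
    # 'Experiment' is a stand-in for whichever class owns this __init__;
    # the config dict and project_path are illustrative only.
    experiment = Experiment(
        config={'one': 'two'},
        db_engine=engine,
        project_path='/tmp/experiment-data',  # <project_path>/matrices is created
        replace=True
    )
    print(experiment.experiment_hash)  # persisted via save_experiment_and_get_hash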
Example #3
def main(config_file_name, labels_config_file, args):
    now = datetime.datetime.now().strftime('%d-%m-%y_%H:%M:%S')
    log_filename = 'logs/{}.log'.format(now)
    logging.basicConfig(format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
                        level=logging.DEBUG,
                        handlers=[logging.FileHandler(log_filename), logging.StreamHandler()])
    log = logging.getLogger('eis')

    # read config files
    config = utils.read_yaml(config_file_name)
    labels_config = utils.read_yaml(labels_config_file)

    # If you specify production in the args
    if args.production:
        # print(args.modelgroup)
        # print(args.date)

        db_engine = setup_environment.get_database()

        # Write query to pull all information, write to a new config to pass to populate_features
        config_query = '''
          SELECT config->'feature_blocks',
          config->'unit',
          config->'temporal_info'->'timegated_feature_lookback_duration', --needs to be updated
          model_groups.model_type,
          model_groups.model_parameters,
          model_groups.model_config
          from results.models
          JOIN results.model_groups USING (model_group_id)
          where model_group_id = {} LIMIT 1;'''.format(args.modelgroup)

        config_df = pd.read_sql(config_query, db_engine)

        prod_config = {'feature_blocks': config_df.iloc[0, 0],
                       'unit': config_df.iloc[0, 1],
                       'temporal_info': {'timegated_feature_lookback_duration': config_df.iloc[0, 2]},
                       'model_type': config_df.iloc[0, 3],
                       'model_parameters': config_df.iloc[0, 4]}

        # individual model parameters
        model_config = config_df.iloc[0, 5]

        # set time parameters:
        prod_config['temporal_info']['prediction_window'] = [model_config['prediction_window']]
        prod_config['temporal_info']['train_size'] = [model_config['train_size']]
        prod_config['temporal_info']['features_frequency'] = [model_config['features_frequency']]

        # set the config parameter
        prod_config['officer_features'] = model_config['blocks']

        # Load in arguments, convert to appropriate time deltas
        pred_wind = prod_config['temporal_info']['prediction_window']
        pred_wind_dict = utils.relative_deltas_conditions(pred_wind)
        pred_wind_delta = relativedelta(**pred_wind_dict[pred_wind[0]])

        train_size = prod_config['temporal_info']['train_size']
        train_size_dict = utils.relative_deltas_conditions(train_size)
        train_size_delta = relativedelta(**train_size_dict[train_size[0]])

        start_date = pd.to_datetime(args.date) - pred_wind_delta - train_size_delta

        prod_config['temporal_info']['start_date'] = start_date.strftime("%Y-%m-%d")
        prod_config['temporal_info']['end_date'] = args.date
        prod_config['temporal_info']['update_window'] = [model_config['train_size']] 
        prod_config['temporal_info']['test_time_ahead'] = ['0d']
        prod_config['temporal_info']['test_frequency'] = ['1d']
        prod_config['temporal_info']['officer_past_activity_window'] = ['1y']

        # Specify number of cpus for feature building
        cpu = {'n_cpus': config['n_cpus']}
        prod_config.update(cpu)

        # To generate matrices need this info
        temporal_sets = utils.generate_temporal_info(prod_config['temporal_info'])
        block_sets = utils.feature_blocks_sets(prod_config['officer_features'], 0)

        # each parameter value needs to be wrapped in a list
        params = {key: [value] for key, value in prod_config['model_parameters'].items()}
        grid_config = {prod_config['model_type']: params}

        # More info
        misc_db_parameters = {'config': prod_config,
                              'test': config['test_flag'],
                              'model_comment': config['model_comment'],
                              'batch_comment': config['batch_comment']
                              }
        # TODO: labels need to be read from the database too
        models_args = {'labels': config['labels'],
                       'features': prod_config['feature_blocks'],
                       'schema_name': config["production_schema_feature_blocks"],
                       'feature_lookback_duration': prod_config['temporal_info']['timegated_feature_lookback_duration'],
                       'labels_config': labels_config,
                       'labels_table_name': config['production_officer_label_table_name'],
                       # config['officer_label_table_name'],
                       'grid_config': grid_config,
                       'project_path': config['project_path'],
                       'misc_db_parameters': misc_db_parameters}

        populate_features.populate_features_table(prod_config, config['production_schema_feature_blocks'])

        # Create the labels table. Right now the labels configuration is read from the command line, not from the database.
        populate_labels.create_labels_table(config, config['production_officer_label_table_name'])
        populate_labels.populate_labels_table(config, labels_config, config['production_officer_label_table_name'])

        log.info("Done building the features required for production use")

        # build the matrices
        Parallel(n_jobs=1, verbose=51)(delayed(generate_all_matrices)(temporal_set, blocks, **models_args)
                                       for temporal_set, blocks in product(temporal_sets, block_sets))

        log.info("Done building the matrices required for production use")

        # Run the models
        db_engine = setup_environment.get_database()

        Parallel(n_jobs=1, verbose=51)(
            delayed(apply_score_day)(temporal_set, blocks, args.modelgroup, args.date, **models_args)
            for temporal_set, blocks in product(temporal_sets, block_sets))

        log.info('Done running the model and storing the output')
        sys.exit()

    # If asked to generate features, then do that and stop.
    if args.buildfeatures:
        log.info("Re-building features...")

        # Create the features and labels table.
        populate_labels.create_labels_table(config, config['officer_label_table_name'])

        # Populate the features and labels tables
        populate_features.populate_features_table(config, config["schema_feature_blocks"])
        populate_labels.populate_labels_table(config, labels_config, config['officer_label_table_name'])

        log.info('Done creating features table')
        sys.exit()

    # modify models_config
    grid_config = utils.generate_model_config(config)

    # Generate temporal_sets
    temporal_sets = utils.generate_temporal_info(config['temporal_info'])
    # Combination of blocks for iteration
    block_sets = utils.feature_blocks_sets(config['officer_features'], config['leave_out'])

    # Add more arguments
    misc_db_parameters = {'config': config,
                          'test': config['test_flag'],
                          'model_comment': config['model_comment'],
                          'batch_comment': config['batch_comment']
                          }

    models_args = {'labels': config['labels'],
                   'features': config['feature_blocks'],
                   'schema_name': config["schema_feature_blocks"],
                   'feature_lookback_duration': config['temporal_info']['timegated_feature_lookback_duration'],
                   'labels_config': labels_config,
                   'labels_table_name': config['officer_label_table_name'],
                   'grid_config': grid_config,
                   'project_path': config['project_path'],
                   'misc_db_parameters': misc_db_parameters}

    n_cpus = config['n_cpus']

    if args.generatematrices:
        # Parallelization
        Parallel(n_jobs=n_cpus, verbose=51)(delayed(generate_all_matrices)(temporal_set, blocks, **models_args)
                                            for temporal_set, blocks in product(temporal_sets, block_sets))

        log.info('Done creating all matrices')
        sys.exit()


    # Run models
    db_engine = setup_environment.get_database()
    experiment_hash = save_experiment_and_get_hash(config, db_engine)
    models_args['experiment_hash'] = experiment_hash

    Parallel(n_jobs=n_cpus, verbose=5)(delayed(apply_train_test)(temporal_set, blocks, **models_args)
                                       for temporal_set, blocks in product(temporal_sets, block_sets))

    log.info("Done!")
    return None
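In the production branch above, start_date is computed by subtracting both the prediction window and the training window from the requested scoring date (args.date). A standalone illustration of that relativedelta arithmetic, with assumed window sizes ('1y' prediction window, '2y' train size) standing in for the values pulled from model_config:

import pandas as pd
from dateutil.relativedelta import relativedelta

# Assumed window sizes for illustration; in main() they come from the stored
# model_config ('prediction_window' and 'train_size').
pred_wind_delta = relativedelta(years=1)   # '1y' prediction window
train_size_delta = relativedelta(years=2)  # '2y' train size

end_date = pd.to_datetime('2017-01-01')    # stands in for args.date
start_date = end_date - pred_wind_delta - train_size_delta

print(start_date.strftime('%Y-%m-%d'))  # 2014-01-01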
Example #4
def main(config_file_name, labels_config_file, args):

    now = datetime.datetime.now().strftime('%d-%m-%y_%H:%M:%S')
    log_filename = 'logs/{}.log'.format(now)
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        level=logging.DEBUG,
        handlers=[logging.FileHandler(log_filename),
                  logging.StreamHandler()])
    log = logging.getLogger('eis')

    # read config files
    config = utils.read_yaml(config_file_name)
    labels_config = utils.read_yaml(labels_config_file)

    # If asked to generate features, then do that and stop.
    if args.buildfeatures:

        log.info("Re-building features...")

        # Create the features and labels table.
        populate_labels.create_labels_table(config,
                                            config['officer_label_table_name'])

        # Populate the features and labels tables
        populate_features.populate_features_table(
            config, config["schema_feature_blocks"])
        populate_labels.populate_labels_table(
            config, labels_config, config['officer_label_table_name'])

        log.info('Done creating features table')
        sys.exit()

    # modify models_config
    grid_config = utils.generate_model_config(config)

    # Generate temporal_sets
    temporal_sets = utils.generate_temporal_info(config['temporal_info'])
    # Combination of blocks for iteration
    block_sets = utils.feature_blocks_sets(config['officer_features'],
                                           config['leave_out'])

    # Add more arguments
    misc_db_parameters = {
        'config': config,
        'test': config['test_flag'],
        'model_comment': config['model_comment'],
        'batch_comment': config['batch_comment']
    }

    models_args = {
        'labels': config['labels'],
        'features': config['feature_blocks'],
        'schema_name': config["schema_feature_blocks"],
        'feature_lookback_duration': config['temporal_info']['timegated_feature_lookback_duration'],
        'labels_config': labels_config,
        'labels_table_name': config['officer_label_table_name'],
        'grid_config': grid_config,
        'project_path': config['project_path'],
        'misc_db_parameters': misc_db_parameters
    }

    n_cpus = config['n_cpus']

    if args.generatematrices:
        # Parallelization
        Parallel(n_jobs=n_cpus, verbose=51)(
            delayed(generate_all_matrices)(temporal_set, blocks, **models_args)
            for temporal_set, blocks in product(temporal_sets, block_sets))

        log.info('Done creating all matrices')
        sys.exit()

    # Run models
    db_engine = setup_environment.get_database()
    experiment_hash = save_experiment_and_get_hash(config, db_engine)
    models_args['experiment_hash'] = experiment_hash

    Parallel(n_jobs=n_cpus, verbose=5)(
        delayed(apply_train_test)(temporal_set, blocks, **models_args)
        for temporal_set, blocks in product(temporal_sets, block_sets))

    log.info("Done!")
    return None
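Both of these main() variants fan the matrix-building and model-running work out with Parallel/delayed over the Cartesian product of temporal sets and feature-block sets. A self-contained sketch of that pattern with dummy data, assuming joblib provides Parallel and delayed:

from itertools import product

from joblib import Parallel, delayed


def build_matrix(temporal_set, blocks, **models_args):
    # Stand-in for generate_all_matrices / apply_train_test: each call gets one
    # (temporal_set, feature-block set) combination plus the shared kwargs.
    return (temporal_set, tuple(blocks), models_args.get('labels'))


temporal_sets = ['2015-2016', '2016-2017']
block_sets = [['arrests'], ['arrests', 'dispatches']]

results = Parallel(n_jobs=2, verbose=5)(
    delayed(build_matrix)(temporal_set, blocks, labels=['use_of_force'])
    for temporal_set, blocks in product(temporal_sets, block_sets))

print(results)  # one result per (temporal_set, block_set) combination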
Example #5
def test_integration():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)

        with mock_s3():
            s3_conn = boto3.resource('s3')
            s3_conn.create_bucket(Bucket='econ-dev')
            project_path = 'econ-dev/inspections'

            # create train and test matrices
            train_matrix = pandas.DataFrame.from_dict({
                'entity_id': [1, 2],
                'feature_one': [3, 4],
                'feature_two': [5, 6],
                'label': [7, 8]
            }).set_index('entity_id')
            train_metadata = {
                'beginning_of_time': datetime.date(2012, 12, 20),
                'end_time': datetime.date(2016, 12, 20),
                'label_name': 'label',
                'label_window': '1y',
                'feature_names': ['ft1', 'ft2'],
                'metta-uuid': '1234',
            }

            train_store = InMemoryMatrixStore(train_matrix, train_metadata)

            as_of_dates = [
                datetime.date(2016, 12, 21),
                datetime.date(2017, 1, 21)
            ]

            test_stores = [
                InMemoryMatrixStore(
                    pandas.DataFrame.from_dict({
                        'entity_id': [3],
                        'feature_one': [8],
                        'feature_two': [5],
                        'label': [5]
                    }).set_index('entity_id'),
                    {
                        'label_name': 'label',
                        'label_window': '1y',
                        'end_time': as_of_date,
                        'metta-uuid': '1234',
                    }
                )
                for as_of_date in as_of_dates
            ]

            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)

            experiment_hash = save_experiment_and_get_hash({}, db_engine)
            # instantiate pipeline objects
            trainer = ModelTrainer(
                project_path=project_path,
                experiment_hash=experiment_hash,
                model_storage_engine=model_storage_engine,
                db_engine=db_engine,
                model_group_keys=['label_name', 'label_window']
            )
            predictor = Predictor(
                project_path,
                model_storage_engine,
                db_engine
            )
            model_scorer = ModelScorer(
                [{'metrics': ['precision@'], 'thresholds': {'top_n': [5]}}],
                db_engine
            )

            # run the pipeline
            grid_config = {
                'sklearn.linear_model.LogisticRegression': {
                    'C': [0.00001, 0.0001],
                    'penalty': ['l1', 'l2'],
                    'random_state': [2193]
                }
            }
            model_ids = trainer.train_models(
                grid_config=grid_config,
                misc_db_parameters=dict(),
                matrix_store=train_store
            )

            for model_id in model_ids:
                for as_of_date, test_store in zip(as_of_dates, test_stores):
                    predictions_proba = predictor.predict(
                        model_id,
                        test_store,
                        misc_db_parameters=dict()
                    )

                    model_scorer.score(
                        predictions_proba,
                        test_store.labels(),
                        model_id,
                        as_of_date,
                        as_of_date,
                        '6month'
                    )

            # assert
            # 1. that the predictions table entries are present and
            # can be linked to the original models
            records = [
                row for row in
                db_engine.execute('''select entity_id, model_id, as_of_date
                from results.predictions
                join results.models using (model_id)
                order by 3, 2''')
            ]
            assert records == [
                (3, 1, datetime.datetime(2016, 12, 21)),
                (3, 2, datetime.datetime(2016, 12, 21)),
                (3, 3, datetime.datetime(2016, 12, 21)),
                (3, 4, datetime.datetime(2016, 12, 21)),
                (3, 1, datetime.datetime(2017, 1, 21)),
                (3, 2, datetime.datetime(2017, 1, 21)),
                (3, 3, datetime.datetime(2017, 1, 21)),
                (3, 4, datetime.datetime(2017, 1, 21)),
            ]

            # that evaluations are there
            records = [
                row for row in
                db_engine.execute('''
                    select model_id, evaluation_start_time, metric, parameter
                    from results.evaluations order by 2, 1''')
            ]
            assert records == [
                (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'),
                (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
                (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'),
            ]