def test_creating_transaction(self):
    with testing.postgresql.Postgresql() as postgresql:
        print("url={}".format(postgresql.url()))
        print("data directory={}".format(postgresql.get_data_directory()))
        engine = create_engine(postgresql.url())
        Session = sessionmaker(bind=engine)

        # Create schema
        Base.metadata.create_all(engine)

        # Create session
        session = Session()
        print("session started")

        # Add transactions
        for sub_id in range(20):
            session.add(build_message(
                submission_id=sub_id,
                collection_data={'json_id': sub_id, 'two': 2}
            ))
        session.commit()
        print("added transaction")

        # test standard query
        all_transactions = session.query(Transaction).filter(
            Transaction.status == TransactionStatus.ready).all()
        self.assertEqual(20, len(all_transactions))

        # test json field query
        json_query = session.query(Transaction).filter(
            Transaction.collection_data['json_id'].astext.cast(Integer) >= 10).all()
        self.assertEqual(10, len(json_query))

        session.close()
def test_training_label_generation():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_events(engine)
        labels_table_name = 'labels'
        label_generator = BinaryLabelGenerator(
            events_table='events',
            db_engine=engine,
        )
        label_generator._create_labels_table(labels_table_name)
        label_generator.generate(
            start_date='2014-09-30',
            label_window='6month',
            labels_table=labels_table_name
        )
        result = engine.execute(
            'select * from {} order by entity_id, as_of_date'.format(labels_table_name)
        )
        records = [row for row in result]
        expected = [
            # entity_id, as_of_date, label_window, name, type, label
            (1, date(2014, 9, 30), timedelta(180), 'outcome', 'binary', False),
            (3, date(2014, 9, 30), timedelta(180), 'outcome', 'binary', True),
            (4, date(2014, 9, 30), timedelta(180), 'outcome', 'binary', False),
        ]
        assert records == expected
def pg_engine():
    with testing.postgresql.Postgresql() as postgresql:
        engine = sa.create_engine(postgresql.url())
        models.Base.metadata.create_all(bind=engine)
        yield engine
        engine.dispose()
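# Hedged usage sketch: pg_engine above reads like the body of a pytest fixture, so it is
# presumably registered with @pytest.fixture and consumed by name in tests. The declarative
# Base, the Widget table, and the test below are illustrative stand-ins for the real
# models module, not part of the original suite.
import pytest
import sqlalchemy as sa
import testing.postgresql
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class Widget(Base):
    __tablename__ = "widgets"
    id = sa.Column(sa.Integer, primary_key=True)


@pytest.fixture
def pg_engine():
    # same shape as the snippet above, but building the stand-in schema
    with testing.postgresql.Postgresql() as postgresql:
        engine = sa.create_engine(postgresql.url())
        Base.metadata.create_all(bind=engine)
        yield engine
        engine.dispose()


def test_widgets_table_exists(pg_engine):
    # the temporary server behaves like any other Postgres target for inspection
    assert "widgets" in sa.inspect(pg_engine).get_table_names()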
def test_train_matrix(self): with testing.postgresql.Postgresql() as postgresql: # create an engine and generate a table with fake feature data engine = create_engine(postgresql.url()) create_schemas( engine=engine, features_tables=features_tables, labels=labels, states=states ) dates = [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0), datetime.datetime(2016, 3, 1, 0, 0)] with TemporaryDirectory() as temp_dir: planner = Planner( beginning_of_time = datetime.datetime(2010, 1, 1, 0, 0), label_names = ['booking'], label_types = ['binary'], states = ['state_one AND state_two'], db_config = db_config, matrix_directory = temp_dir, user_metadata = {}, engine = engine ) feature_dictionary = { 'features0': ['f1', 'f2'], 'features1': ['f3', 'f4'], } matrix_metadata = { 'matrix_id': 'hi', 'state': 'state_one AND state_two', 'label_name': 'booking', 'end_time': datetime.datetime(2016, 3, 1, 0, 0), 'beginning_of_time': datetime.datetime(2016, 1, 1, 0, 0), 'label_window': '1 month' } uuid = metta.generate_uuid(matrix_metadata) planner.build_matrix( as_of_times = dates, label_name = 'booking', label_type = 'binary', feature_dictionary = feature_dictionary, matrix_directory = temp_dir, matrix_metadata = matrix_metadata, matrix_uuid = uuid, matrix_type = 'train' ) matrix_filename = os.path.join( temp_dir, '{}.csv'.format(uuid) ) with open(matrix_filename, 'r') as f: reader = csv.reader(f) assert(len([row for row in reader]) == 6)
def test_replace():
    aggregate_config = [{
        'prefix': 'aprefix',
        'aggregates': [
            {'quantity': 'quantity_one', 'metrics': ['sum', 'count']},
        ],
        'categoricals': [
            {
                'column': 'cat_one',
                'choices': ['good', 'bad'],
                'metrics': ['sum']
            },
        ],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data'
    }]

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)

        features_schema_name = 'features'
        feature_tables = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name,
            replace=False
        ).create_all_tables(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )
        assert len(feature_tables) == 1

        table_tasks = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name,
            replace=False
        ).generate_all_table_tasks(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )
        assert len(table_tasks['aprefix_entity_id'].keys()) == 0
def app(request):
    # Create PostgreSQL server on the fly
    postgresql = testing.postgresql.Postgresql()

    # And override the database URL
    app = factory('passgen.config.development')
    app.config['SQLALCHEMY_DATABASE_URI'] = postgresql.url()

    # Set up schema
    with app.app_context():
        migrate_extension.upgrade(revision='head')

    def fin():
        postgresql.stop()
    request.addfinalizer(fin)

    return app
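# Hedged follow-on sketch: with an app fixture like the one above, a Flask test-client
# fixture is the usual next step. Only Flask's test_client() is assumed real API here;
# the fixture/test names and the accepted status codes are illustrative.
import pytest


@pytest.fixture
def client(app):
    # Flask's built-in client issues requests without starting a real server
    return app.test_client()


def test_index_responds(client):
    response = client.get("/")
    assert response.status_code in (200, 302, 404)  # the route set is app-specific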
def test_generate_table_tasks():
    aggregate_config = [{
        'prefix': 'prefix1',
        'categoricals': [
            {
                'column': 'cat_one',
                'choice_query': 'select distinct(cat_one) from data',
                'metrics': ['sum']
            },
        ],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data'
    }, {
        'prefix': 'prefix2',
        'aggregates': [
            {'quantity': 'quantity_one', 'metrics': ['count']},
        ],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data'
    }]

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)

        features_schema_name = 'features'
        table_tasks = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name
        ).generate_all_table_tasks(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )
        for task in table_tasks.values():
            assert 'DROP TABLE' in task['prepare'][0]
            assert 'CREATE TABLE' in str(task['prepare'][1])
            assert 'CREATE INDEX' in task['finalize'][0]
            assert isinstance(task['inserts'], list)
def test_sparse_state_table_generator():
    input_data = [
        (5, 'permitted', datetime(2016, 1, 1), datetime(2016, 6, 1)),
        (6, 'permitted', datetime(2016, 2, 5), datetime(2016, 5, 5)),
        (1, 'injail', datetime(2014, 7, 7), datetime(2014, 7, 15)),
        (1, 'injail', datetime(2016, 3, 7), datetime(2016, 4, 2)),
    ]

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_dense_state_table(engine, 'states', input_data)
        table_generator = StateTableGenerator(engine, 'exp_hash')
        as_of_dates = [
            datetime(2016, 1, 1),
            datetime(2016, 2, 1),
            datetime(2016, 3, 1),
            datetime(2016, 4, 1),
            datetime(2016, 5, 1),
            datetime(2016, 6, 1),
        ]
        table_generator.generate_sparse_table('states', as_of_dates)
        results = [row for row in engine.execute(
            'select entity_id, as_of_date, injail, permitted from {} order by entity_id, as_of_date'.format(
                table_generator.sparse_table_name
            ))]
        expected_output = [
            # entity_id, as_of_date, injail, permitted
            (1, datetime(2016, 4, 1), True, False),
            (5, datetime(2016, 1, 1), False, True),
            (5, datetime(2016, 2, 1), False, True),
            (5, datetime(2016, 3, 1), False, True),
            (5, datetime(2016, 4, 1), False, True),
            (5, datetime(2016, 5, 1), False, True),
            (6, datetime(2016, 3, 1), False, True),
            (6, datetime(2016, 4, 1), False, True),
            (6, datetime(2016, 5, 1), False, True),
        ]
        assert results == expected_output
        assert_index(engine, table_generator.sparse_table_name, 'entity_id')
        assert_index(engine, table_generator.sparse_table_name, 'as_of_date')
def test_write_to_csv():
    """Test the write_to_csv function by checking whether the csv contains
    the correct number of lines.
    """
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states
        )

        with TemporaryDirectory() as temp_dir:
            planner = Planner(
                beginning_of_time=datetime.datetime(2010, 1, 1, 0, 0),
                label_names=['booking'],
                label_types=['binary'],
                states=['state_one AND state_two'],
                db_config=db_config,
                matrix_directory=temp_dir,
                user_metadata={},
                engine=engine,
                builder_class=builders.LowMemoryCSVBuilder
            )

            # for each table, check that corresponding csv has the correct # of rows
            for table in features_tables:
                with NamedTempFile() as f:
                    planner.builder.write_to_csv(
                        '''
                            select *
                            from features.features{}
                        '''.format(features_tables.index(table)),
                        f.name
                    )
                    f.seek(0)
                    reader = csv.reader(f)
                    assert len([row for row in reader]) == len(table) + 1
def test_feature_dictionary_creator():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        engine.execute('create schema features')
        engine.execute('''
            create table features.feature_table_one (
                entity_id int,
                as_of_date date,
                feature_one float,
                feature_two float
            )
        ''')
        engine.execute('''
            create table features.feature_table_two (
                entity_id int,
                as_of_date date,
                feature_three float,
                feature_four float
            )
        ''')
        engine.execute('''
            create table features.random_other_table (
                another_column float
            )
        ''')

        creator = FeatureDictionaryCreator(
            features_schema_name='features',
            db_engine=engine
        )
        feature_dictionary = creator.feature_dictionary(
            ['feature_table_one', 'feature_table_two']
        )
        assert feature_dictionary == {
            'feature_table_one': ['feature_one', 'feature_two'],
            'feature_table_two': ['feature_three', 'feature_four'],
        }
def test_generate_all_labels():
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_events(engine)
        labels_table_name = 'labels'
        label_generator = BinaryLabelGenerator(
            events_table='events',
            db_engine=engine,
        )
        label_generator.generate_all_labels(
            labels_table=labels_table_name,
            as_of_dates=['2014-09-30', '2015-03-30'],
            label_windows=['6month', '3month'],
        )
        result = engine.execute('''
            select * from {}
            order by entity_id, as_of_date, label_window desc
        '''.format(labels_table_name))
        records = [row for row in result]
        expected = [
            # entity_id, as_of_date, label_window, name, type, label
            (1, date(2014, 9, 30), timedelta(180), 'outcome', 'binary', False),
            (1, date(2014, 9, 30), timedelta(90), 'outcome', 'binary', False),
            (2, date(2015, 3, 30), timedelta(180), 'outcome', 'binary', False),
            (2, date(2015, 3, 30), timedelta(90), 'outcome', 'binary', False),
            (3, date(2014, 9, 30), timedelta(180), 'outcome', 'binary', True),
            (3, date(2015, 3, 30), timedelta(180), 'outcome', 'binary', False),
            (4, date(2014, 9, 30), timedelta(180), 'outcome', 'binary', False),
            (4, date(2014, 9, 30), timedelta(90), 'outcome', 'binary', False),
        ]
        assert records == expected
import testing.postgresql

import settings

Postgresql = testing.postgresql.PostgresqlFactory(cache_initialized_db=False)
postgresql = Postgresql()
settings.DB_URI = postgresql.url()
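# Hedged cleanup sketch: the factory above is created at import time and nothing in the
# snippet stops the server it launches. testing.postgresql instances expose stop(); the
# atexit hook below is one way to make sure the throwaway postgres process goes away
# (the hook itself is an assumption, not part of the original module).
import atexit

atexit.register(postgresql.stop)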
def end_to_end_run(self, format_name): config = """ { "format": { "name": "{{format_name}}", "version": 1 }, "queue": "on-us" } """ config = config.replace('{{format_name}}', format_name) # transaction template message_template = { 'source': 'RBA', 'status': TransactionStatus.ready, 'collection_format_name': 'direct_entry', 'collection_format_version': 1, 'collection_data': {}, 'collection_datetime': datetime.today().date(), 'queue': 'default', } # direct entry data de_data = { 'record_type': '1', 'reel_seq_num': '01', 'name_fin_inst': 'SUN', 'user_name': 'DE USER NAME', 'user_num': '654321', 'file_desc': 'DE FILE DESC', 'date_for_process': '011216', 'bsb_number': '484-799', 'account_number': '111111111', 'indicator': ' ', 'tran_code': '53', 'amount': '0000012345', # $2.00 'account_title': 'DE ACCT TITLE', 'lodgement_ref': 'DE LODGE REF', 'trace_bsb_number': '484-799', 'trace_account_number': '222222222', 'name_of_remitter': 'DE REMITTER NAME', 'withholding_tax_amount': '00000000', } # json payment data json_data = { 'from_account': '987654321', 'from_routing': '484-799', 'to_description': 'JSON TO DESC', 'from_name': 'JSON FROM NAME', 'tran_type': 'cr', 'to_name': 'JSON TO NAME', 'to_account': '333333333', 'to_routing': '484-799', 'amount': 54321, 'post_date': date(2016, 12, 2) } tran_list = [] for tran_id in range(0, 100, 2): tran_list.append( build_message( submission_id=str(tran_id), collection_data=de_data, template=message_template, queue='on-us' ) ) tran_list.append( build_message( submission_id=str(tran_id+1), collection_data=json_data, collection_format_name='json', template=message_template, queue='on-us' ) ) tran_list.append( SystemControl(system_id=1, effective_date=date(2000, 1, 1)) ) with testing.postgresql.Postgresql() as postgresql: # setup test database LOGGER.debug('Creating postgresql instance for testing') LOGGER.debug(' url={}'.format(postgresql.url())) LOGGER.debug(' data directory={}'.format(postgresql.get_data_directory())) engine = create_engine(postgresql.url(), json_serializer=dumps) alembic_cfg = Config("alembic.ini") with engine.begin() as connection: alembic_cfg.attributes['connection'] = connection command.upgrade(alembic_cfg, "head") Session = sessionmaker(bind=engine) session = Session() session.add_all(tran_list) session.commit() session.close() # run the job runner = CliRunner() with runner.isolated_filesystem() as fs: with open('test.json', 'w') as fp: fp.write(config) start_time = time.clock() result = runner.invoke( pr_file_distribution, ['test.json', 'out-json.txt', '--db-url', postgresql.url()], catch_exceptions=False ) duration = time.clock() - start_time divider = '.'*20 LOGGER.debug('output:\n%s', result.output) LOGGER.debug('exception:\n%s', result.exception) outfile = os.path.join(fs, 'out-json.txt') print(divider) with open(outfile, 'r') as out_fh: record_count = 0 for line in out_fh: print(line.rstrip()) record_count += 1 print("Processed {} lines".format(record_count)) print("{}\nRun completed in {} seconds\n{}".format(divider, duration, divider))
        super(TestConsumer, self).setUp()

    def tearDown(self):
        self.cur.execute("ROLLBACK;")
        self.cur.close()
        super(TestConsumer, self).tearDown()

    def test_insert_query(self):
        insert_query = """
            INSERT INTO tweets_by_topic (topic, kafka_offset, user_id, tweet, sentiment)
            values (%s, %s, %s, %s, %s)
        """
        self.cur.execute(insert_query, (
            "Quotes",
            "500",
            "1234567890",
            "Someone's sitting in the shade today because someone planted a tree a long time ago.",
            1))
        consumer.conn.commit()
        self.cur.execute(
            'SELECT topic, kafka_offset, user_id, tweet, sentiment FROM tweets_by_topic'
        )
        self.assertEqual(self.cur.fetchone(), (
            'Quotes',
            500,
            "1234567890",
            "Someone's sitting in the shade today because someone planted a tree a long time ago.",
            1))


if __name__ == "__main__":
    consumer.conn = postgresql.url()
    unittest.main()
from sqlalchemy import create_engine
import testing.postgresql

import settings
from sequences import create_sequences

# Launch new PostgreSQL server
Postgres = testing.postgresql.PostgresqlFactory(cache_initialized_db=False)
postgresql = Postgres()
settings.DB_URL = postgresql.url()

engine = create_engine(postgresql.url())
create_sequences(engine)
def test_load_labels_data_include_missing_labels_as_false(): """ Test the load_labels_data function by checking whether the query produces the correct labels """ # set up labeling config variables dates = [ datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0), datetime.datetime(2016, 6, 1, 0, 0) ] # same as the other load_labels_data test, except we include an extra date, 2016-06-01 # this date does have entity 0 included via the states table, but no labels # make a dataframe of labels to test against labels_df = pd.DataFrame(labels, columns=[ 'entity_id', 'as_of_date', 'label_timespan', 'label_name', 'label_type', 'label' ]) labels_df['as_of_date'] = convert_string_column_to_date( labels_df['as_of_date']) labels_df.set_index(['entity_id', 'as_of_date']) # create an engine and generate a table with fake feature data with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) create_schemas(engine, features_tables, labels, states) with get_matrix_storage_engine() as matrix_storage_engine: builder = MatrixBuilder( db_config=db_config, matrix_storage_engine=matrix_storage_engine, engine=engine, include_missing_labels_in_train_as=False, ) # make the entity-date table entity_date_table_name = builder.make_entity_date_table( as_of_times=dates, label_type='binary', label_name='booking', state='state_one AND state_two', matrix_type='train', matrix_uuid='my_uuid', label_timespan='1 month') result = builder.load_labels_data( label_name=label_name, label_type=label_type, label_timespan='1 month', matrix_uuid='my_uuid', entity_date_table_name=entity_date_table_name, ) df = pd.DataFrame.from_dict({ 'entity_id': [0, 2, 3, 4, 4], 'as_of_date': [dates[2], dates[1], dates[1], dates[0], dates[1]], 'booking': [0, 0, 0, 1, 0], }).set_index(['entity_id', 'as_of_date']) # the first row would not be here if we had not configured the Builder # to include missing labels as false test = (result == df) assert (test.all().all())
def test_array_categoricals(): aggregate_config = [{ 'prefix': 'aprefix', 'array_categoricals': [ { 'column': 'cat_one', 'choices': ['good', 'bad', 'inbetween'], 'metrics': ['sum'] }, ], 'groups': ['entity_id'], 'intervals': ['all'], 'knowledge_date_column': 'knowledge_date', 'from_obj': 'data' }] expected_output = { 'aprefix_entity_id': [ { 'entity_id': 3, 'as_of_date': date(2013, 9, 30), 'aprefix_entity_id_all_cat_one_good_sum': 0, 'aprefix_entity_id_all_cat_one_inbetween_sum': 0, 'aprefix_entity_id_all_cat_one_bad_sum': 1 }, { 'entity_id': 1, 'as_of_date': date(2014, 9, 30), 'aprefix_entity_id_all_cat_one_good_sum': 1, 'aprefix_entity_id_all_cat_one_inbetween_sum': 0, 'aprefix_entity_id_all_cat_one_bad_sum': 0 }, { 'entity_id': 3, 'as_of_date': date(2014, 9, 30), 'aprefix_entity_id_all_cat_one_good_sum': 0, 'aprefix_entity_id_all_cat_one_inbetween_sum': 0, 'aprefix_entity_id_all_cat_one_bad_sum': 1 }, { 'entity_id': 4, 'as_of_date': date(2014, 9, 30), 'aprefix_entity_id_all_cat_one_good_sum': 0, 'aprefix_entity_id_all_cat_one_inbetween_sum': 0, 'aprefix_entity_id_all_cat_one_bad_sum': 1 }, ] } with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) input_data = [ # entity_id, knowledge_date, cat_one, quantity_one (1, date(2014, 1, 1), ['good', 'good'], 10000), (1, date(2014, 10, 11), ['good'], None), (3, date(2012, 6, 8), ['bad'], 342), (3, date(2014, 12, 21), ['inbetween'], 600), (4, date(2014, 4, 4), ['bad'], 1236) ] engine.execute(""" create table data ( entity_id int, knowledge_date date, cat_one varchar[], quantity_one float ) """) for row in input_data: engine.execute( 'insert into data values (%s, %s, %s, %s)', row ) features_schema_name = 'features' output_tables = FeatureGenerator( db_engine=engine, features_schema_name=features_schema_name ).create_all_tables( feature_dates=['2013-09-30', '2014-09-30'], feature_aggregation_config=aggregate_config, ) for output_table in output_tables: records = pandas.read_sql( 'select * from {}.{} order by as_of_date, entity_id'.format( features_schema_name, output_table ), engine ).to_dict('records') assert records == expected_output[output_table]
def test_simple_experiment(experiment_class): with testing.postgresql.Postgresql() as postgresql: db_engine = create_engine(postgresql.url()) populate_source_data(db_engine) with TemporaryDirectory() as temp_dir: experiment_class( config=sample_config(), db_engine=db_engine, project_path=os.path.join(temp_dir, "inspections"), cleanup=True, ).run() # assert # 1. that model groups entries are present num_mgs = len([ row for row in db_engine.execute( "select * from triage_metadata.model_groups") ]) assert num_mgs > 0 # 2. that model entries are present, and linked to model groups num_models = len([ row for row in db_engine.execute(""" select * from triage_metadata.model_groups join triage_metadata.models using (model_group_id) where model_comment = 'test2-final-final' """) ]) assert num_models > 0 # 3. predictions, linked to models for both training and testing predictions for set_type in ("train", "test"): num_predictions = len([ row for row in db_engine.execute(""" select * from {}_results.predictions join triage_metadata.models using (model_id)""".format( set_type, set_type)) ]) assert num_predictions > 0 # 4. evaluations linked to predictions linked to models, for training and testing for set_type in ("train", "test"): num_evaluations = len([ row for row in db_engine.execute(""" select * from {}_results.evaluations e join triage_metadata.models using (model_id) join {}_results.predictions p on ( e.model_id = p.model_id and e.evaluation_start_time <= p.as_of_date and e.evaluation_end_time >= p.as_of_date) """.format(set_type, set_type, set_type)) ]) assert num_evaluations > 0 # 5. subset evaluations linked to subsets and predictions linked to # models, for training and testing for set_type in ("train", "test"): num_evaluations = len([ row for row in db_engine.execute(""" select e.model_id, e.subset_hash from {}_results.evaluations e join triage_metadata.models using (model_id) join triage_metadata.subsets using (subset_hash) join {}_results.predictions p on ( e.model_id = p.model_id and e.evaluation_start_time <= p.as_of_date and e.evaluation_end_time >= p.as_of_date) group by e.model_id, e.subset_hash """.format(set_type, set_type)) ]) # 4 model groups trained/tested on 2 splits, with 1 metric + parameter assert num_evaluations == 8 # 6. experiment num_experiments = len([ row for row in db_engine.execute( "select * from triage_metadata.experiments") ]) assert num_experiments == 1 # 7. that models are linked to experiments num_models_with_experiment = len([ row for row in db_engine.execute(""" select * from triage_metadata.experiments join triage_metadata.experiment_models using (experiment_hash) join triage_metadata.models using (model_hash) """) ]) assert num_models == num_models_with_experiment # 8. that models have the train end date and label timespan results = [(model["train_end_time"], model["training_label_timespan"]) for model in db_engine.execute( "select * from triage_metadata.models")] assert sorted(set(results)) == [ (datetime(2012, 6, 1), timedelta(180)), (datetime(2013, 6, 1), timedelta(180)), ] # 9. that the right number of individual importances are present individual_importances = [ row for row in db_engine.execute(""" select * from test_results.individual_importances join triage_metadata.models using (model_id) """) ] assert len( individual_importances) == num_predictions * 2 # only 2 features # 10. 
Checking the proper matrices created and stored matrices = [ row for row in db_engine.execute(""" select matrix_type, num_observations from triage_metadata.matrices""" ) ] types = [i[0] for i in matrices] counts = [i[1] for i in matrices] assert types.count("train") == 2 assert types.count("test") == 2 for i in counts: assert i > 0 assert len(matrices) == 4 # 11. Checking that all matrices are associated with the experiment linked_matrices = list( db_engine.execute("""select * from triage_metadata.matrices join triage_metadata.experiment_matrices using (matrix_uuid) join triage_metadata.experiments using (experiment_hash)""")) assert len(linked_matrices) == len(matrices)
def test_make_entity_date_table_include_missing_labels(): """ Test that the make_entity_date_table function contains the correct values. """ dates = [ datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0), datetime.datetime(2016, 3, 1, 0, 0), datetime.datetime(2016, 6, 1, 0, 0) ] # same as the other make_entity_date_label test except there is an extra date, 2016-06-01 # entity 0 is included in this date via the states table, but has no label # make a dataframe of entity ids and dates to test against ids_dates = create_entity_date_df(labels=labels, states=states, as_of_dates=dates, state_one=True, state_two=True, label_name='booking', label_type='binary', label_timespan='1 month') # this line adds the new entity-date combo as an expected one ids_dates = ids_dates.append( { 'entity_id': 0, 'as_of_date': datetime.date(2016, 6, 1) }, ignore_index=True) with testing.postgresql.Postgresql() as postgresql: # create an engine and generate a table with fake feature data engine = create_engine(postgresql.url()) create_schemas(engine=engine, features_tables=features_tables, labels=labels, states=states) with get_matrix_storage_engine() as matrix_storage_engine: builder = MatrixBuilder( db_config=db_config, matrix_storage_engine=matrix_storage_engine, include_missing_labels_in_train_as=False, engine=engine) engine.execute( 'CREATE TABLE features.tmp_entity_date (a int, b date);') # call the function to test the creation of the table entity_date_table_name = builder.make_entity_date_table( as_of_times=dates, label_type='binary', label_name='booking', state='state_one AND state_two', matrix_uuid='my_uuid', matrix_type='train', label_timespan='1 month') # read in the table result = pd.read_sql( "select * from features.{} order by entity_id, as_of_date". format(entity_date_table_name), engine) # compare the table to the test dataframe assert sorted(result.values.tolist()) == sorted( ids_dates.values.tolist())
def test_model_scoring_inspections(): with testing.postgresql.Postgresql() as postgresql: db_engine = create_engine(postgresql.url()) ensure_db(db_engine) metric_groups = [{ 'metrics': ['precision@', 'recall@', 'fpr@'], 'thresholds': {'percentiles': [50.0], 'top_n': [3]} }, { # ensure we test a non-thresholded metric as well 'metrics': ['accuracy'], }] training_metric_groups = [{'metrics': ['accuracy'], 'thresholds': {'percentiles': [50.0]}}] model_evaluator = ModelEvaluator(metric_groups, training_metric_groups, db_engine) testing_labels = numpy.array([True, False, numpy.nan, True, False]) testing_prediction_probas = numpy.array([0.56, 0.4, 0.55, 0.5, 0.3]) training_labels = numpy.array([False, False, True, True, True, False, True, True]) training_prediction_probas = numpy.array([0.6, 0.4, 0.55, 0.70, 0.3, 0.2, 0.8, 0.6]) fake_train_matrix_store = MockMatrixStore('train', 'efgh', 5, db_engine, training_labels) fake_test_matrix_store = MockMatrixStore('test', '1234', 5, db_engine, testing_labels) trained_model, model_id = fake_trained_model( 'myproject', InMemoryModelStorageEngine('myproject'), db_engine ) # Evaluate testing matrix and test the results model_evaluator.evaluate( testing_prediction_probas, fake_test_matrix_store, model_id, ) for record in db_engine.execute( '''select * from test_results.test_evaluations where model_id = %s and evaluation_start_time = %s order by 1''', (model_id, fake_test_matrix_store.as_of_dates[0]) ): assert record['num_labeled_examples'] == 4 assert record['num_positive_labels'] == 2 if record['parameter'] == '': assert record['num_labeled_above_threshold'] == 4 elif 'pct' in record['parameter']: assert record['num_labeled_above_threshold'] == 1 else: assert record['num_labeled_above_threshold'] == 2 # Evaluate the training matrix and test the results model_evaluator.evaluate( training_prediction_probas, fake_train_matrix_store, model_id, ) for record in db_engine.execute( '''select * from train_results.train_evaluations where model_id = %s and evaluation_start_time = %s order by 1''', (model_id, fake_train_matrix_store.as_of_dates[0]) ): assert record['num_labeled_examples'] == 8 assert record['num_positive_labels'] == 5 assert record['value'] == 0.625
def test_evaluating_early_warning():
    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        metric_groups = [{
            'metrics': ['precision@',
                        'recall@',
                        'true positives@',
                        'true negatives@',
                        'false positives@',
                        'false negatives@'],
            'thresholds': {
                'percentiles': [5.0, 10.0],
                'top_n': [5, 10]
            }
        }, {
            'metrics': ['f1',
                        'mediocre',
                        'accuracy',
                        'roc_auc',
                        'average precision score'],
        }, {
            'metrics': ['fbeta@'],
            'parameters': [{'beta': 0.75}, {'beta': 1.25}]
        }]
        training_metric_groups = [{'metrics': ['accuracy', 'roc_auc']}]

        custom_metrics = {'mediocre': always_half}

        model_evaluator = ModelEvaluator(
            metric_groups,
            training_metric_groups,
            db_engine,
            custom_metrics=custom_metrics
        )

        labels = fake_labels(5)
        fake_train_matrix_store = MockMatrixStore('train', 'efgh', 5, db_engine, labels)
        fake_test_matrix_store = MockMatrixStore('test', '1234', 5, db_engine, labels)
        trained_model, model_id = fake_trained_model(
            'myproject',
            InMemoryModelStorageEngine('myproject'),
            db_engine
        )

        # Evaluate the testing metrics and test for all of them.
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1],
            fake_test_matrix_store,
            model_id,
        )
        records = [
            row[0] for row in db_engine.execute(
                '''select distinct(metric || parameter)
                from test_results.test_evaluations
                where model_id = %s and evaluation_start_time = %s
                order by 1''',
                (model_id, fake_test_matrix_store.as_of_dates[0])
            )
        ]
        assert records == [
            'accuracy',
            'average precision score',
            'f1',
            'false negatives@10.0_pct',
            'false negatives@10_abs',
            'false negatives@5.0_pct',
            'false negatives@5_abs',
            'false positives@10.0_pct',
            'false positives@10_abs',
            'false positives@5.0_pct',
            'false positives@5_abs',
            'fbeta@0.75_beta',
            'fbeta@1.25_beta',
            'mediocre',
            'precision@10.0_pct',
            'precision@10_abs',
            'precision@5.0_pct',
            'precision@5_abs',
            'recall@10.0_pct',
            'recall@10_abs',
            'recall@5.0_pct',
            'recall@5_abs',
            'roc_auc',
            'true negatives@10.0_pct',
            'true negatives@10_abs',
            'true negatives@5.0_pct',
            'true negatives@5_abs',
            'true positives@10.0_pct',
            'true positives@10_abs',
            'true positives@5.0_pct',
            'true positives@5_abs'
        ]

        # Evaluate the training metrics and test
        model_evaluator.evaluate(
            trained_model.predict_proba(labels)[:, 1],
            fake_train_matrix_store,
            model_id,
        )
        records = [
            row[0] for row in db_engine.execute(
                '''select distinct(metric || parameter)
                from train_results.train_evaluations
                where model_id = %s and evaluation_start_time = %s
                order by 1''',
                (model_id, fake_train_matrix_store.as_of_dates[0])
            )
        ]
        assert records == ['accuracy', 'roc_auc']
def test_entity_date_table_generator_replace(): input_data = [ (1, datetime(2016, 1, 1), True), (1, datetime(2016, 4, 1), False), (1, datetime(2016, 3, 1), True), (2, datetime(2016, 1, 1), False), (2, datetime(2016, 1, 1), True), (3, datetime(2016, 1, 1), True), (5, datetime(2016, 3, 1), True), (5, datetime(2016, 4, 1), True), ] with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) utils.create_binary_outcome_events(engine, "events", input_data) table_generator = EntityDateTableGenerator( query= "select entity_id from events where outcome_date < '{as_of_date}'::date", db_engine=engine, entity_date_table_name="exp_hash_entity_date", replace=True) as_of_dates = [ datetime(2016, 1, 1), datetime(2016, 2, 1), datetime(2016, 3, 1), datetime(2016, 4, 1), datetime(2016, 5, 1), datetime(2016, 6, 1), ] table_generator.generate_entity_date_table(as_of_dates) expected_output = [ (1, datetime(2016, 2, 1), True), (1, datetime(2016, 3, 1), True), (1, datetime(2016, 4, 1), True), (1, datetime(2016, 5, 1), True), (1, datetime(2016, 6, 1), True), (2, datetime(2016, 2, 1), True), (2, datetime(2016, 3, 1), True), (2, datetime(2016, 4, 1), True), (2, datetime(2016, 5, 1), True), (2, datetime(2016, 6, 1), True), (3, datetime(2016, 2, 1), True), (3, datetime(2016, 3, 1), True), (3, datetime(2016, 4, 1), True), (3, datetime(2016, 5, 1), True), (3, datetime(2016, 6, 1), True), (5, datetime(2016, 4, 1), True), (5, datetime(2016, 5, 1), True), (5, datetime(2016, 6, 1), True), ] results = list( engine.execute(f""" select entity_id, as_of_date, active from {table_generator.entity_date_table_name} order by entity_id, as_of_date """)) assert results == expected_output utils.assert_index(engine, table_generator.entity_date_table_name, "entity_id") utils.assert_index(engine, table_generator.entity_date_table_name, "as_of_date") table_generator.generate_entity_date_table(as_of_dates) assert results == expected_output
def test_entity_date_table_generator_noreplace(): input_data = [ (1, datetime(2016, 1, 1), True), (1, datetime(2016, 4, 1), False), (1, datetime(2016, 3, 1), True), (2, datetime(2016, 1, 1), False), (2, datetime(2016, 1, 1), True), (3, datetime(2016, 1, 1), True), (5, datetime(2016, 3, 1), True), (5, datetime(2016, 4, 1), True), ] with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) utils.create_binary_outcome_events(engine, "events", input_data) table_generator = EntityDateTableGenerator( query= "select entity_id from events where outcome_date < '{as_of_date}'::date", db_engine=engine, entity_date_table_name="exp_hash_entity_date", replace=False) # 1. generate a cohort for a subset of as-of-dates as_of_dates = [ datetime(2016, 1, 1), datetime(2016, 2, 1), datetime(2016, 3, 1), ] table_generator.generate_entity_date_table(as_of_dates) expected_output = [ (1, datetime(2016, 2, 1), True), (1, datetime(2016, 3, 1), True), (2, datetime(2016, 2, 1), True), (2, datetime(2016, 3, 1), True), (3, datetime(2016, 2, 1), True), (3, datetime(2016, 3, 1), True), ] results = list( engine.execute(f""" select entity_id, as_of_date, active from {table_generator.entity_date_table_name} order by entity_id, as_of_date """)) assert results == expected_output utils.assert_index(engine, table_generator.entity_date_table_name, "entity_id") utils.assert_index(engine, table_generator.entity_date_table_name, "as_of_date") table_generator.generate_entity_date_table(as_of_dates) assert results == expected_output # 2. generate a cohort for a different subset of as-of-dates, # actually including an overlap to make sure that it doesn't double-insert anything as_of_dates = [ datetime(2016, 3, 1), datetime(2016, 4, 1), datetime(2016, 5, 1), datetime(2016, 6, 1), ] table_generator.generate_entity_date_table(as_of_dates) expected_output = [ (1, datetime(2016, 2, 1), True), (1, datetime(2016, 3, 1), True), (1, datetime(2016, 4, 1), True), (1, datetime(2016, 5, 1), True), (1, datetime(2016, 6, 1), True), (2, datetime(2016, 2, 1), True), (2, datetime(2016, 3, 1), True), (2, datetime(2016, 4, 1), True), (2, datetime(2016, 5, 1), True), (2, datetime(2016, 6, 1), True), (3, datetime(2016, 2, 1), True), (3, datetime(2016, 3, 1), True), (3, datetime(2016, 4, 1), True), (3, datetime(2016, 5, 1), True), (3, datetime(2016, 6, 1), True), (5, datetime(2016, 4, 1), True), (5, datetime(2016, 5, 1), True), (5, datetime(2016, 6, 1), True), ] results = list( engine.execute(f""" select entity_id, as_of_date, active from {table_generator.entity_date_table_name} order by entity_id, as_of_date """)) assert results == expected_output
def test_model_trainer(): with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) ensure_db(engine) grid_config = { 'sklearn.linear_model.LogisticRegression': { 'C': [0.00001, 0.0001], 'penalty': ['l1', 'l2'], 'random_state': [2193] } } with mock_s3(): s3_conn = boto3.resource('s3') s3_conn.create_bucket(Bucket='econ-dev') # create training set matrix = pandas.DataFrame.from_dict({ 'entity_id': [1, 2], 'feature_one': [3, 4], 'feature_two': [5, 6], 'label': ['good', 'bad'] }) metadata = { 'beginning_of_time': datetime.date(2012, 12, 20), 'end_time': datetime.date(2016, 12, 20), 'label_name': 'label', 'label_window': '1y', 'metta-uuid': '1234', 'feature_names': ['ft1', 'ft2'] } project_path = 'econ-dev/inspections' model_storage_engine = S3ModelStorageEngine(s3_conn, project_path) trainer = ModelTrainer( project_path=project_path, experiment_hash=None, model_storage_engine=model_storage_engine, db_engine=engine, model_group_keys=['label_name', 'label_window']) matrix_store = InMemoryMatrixStore(matrix, metadata) model_ids = trainer.train_models(grid_config=grid_config, misc_db_parameters=dict(), matrix_store=matrix_store) # assert # 1. that the models and feature importances table entries are present records = [ row for row in engine.execute( 'select * from results.feature_importances') ] assert len(records) == 4 * 3 # maybe exclude entity_id? records = [ row for row in engine.execute( 'select model_hash from results.models') ] assert len(records) == 4 cache_keys = [ model_cache_key(project_path, model_row[0], s3_conn) for model_row in records ] # 2. that the model groups are distinct records = [ row for row in engine.execute( 'select distinct model_group_id from results.models') ] assert len(records) == 4 # 3. that all four models are cached model_pickles = [ pickle.loads(cache_key.get()['Body'].read()) for cache_key in cache_keys ] assert len(model_pickles) == 4 assert len([x for x in model_pickles if x is not None]) == 4 # 4. that their results can have predictions made on it test_matrix = pandas.DataFrame.from_dict({ 'entity_id': [3, 4], 'feature_one': [4, 4], 'feature_two': [6, 5], }) for model_pickle in model_pickles: predictions = model_pickle.predict(test_matrix) assert len(predictions) == 2 # 5. when run again, same models are returned new_model_ids = trainer.train_models(grid_config=grid_config, misc_db_parameters=dict(), matrix_store=matrix_store) assert len([ row for row in engine.execute( 'select model_hash from results.models') ]) == 4 assert model_ids == new_model_ids # 6. if metadata is deleted but the cache is still there, # retrains that one and replaces the feature importance records engine.execute( 'delete from results.feature_importances where model_id = 3') engine.execute('delete from results.models where model_id = 3') new_model_ids = trainer.train_models(grid_config=grid_config, misc_db_parameters=dict(), matrix_store=matrix_store) expected_model_ids = [1, 2, 4, 5] assert expected_model_ids == sorted(new_model_ids) assert [ row['model_id'] for row in engine.execute( 'select model_id from results.models order by 1 asc') ] == expected_model_ids records = [ row for row in engine.execute( 'select * from results.feature_importances') ] assert len(records) == 4 * 3 # maybe exclude entity_id? # 7. 
if the cache is missing but the metadata is still there, reuse the metadata for row in engine.execute('select model_hash from results.models'): model_storage_engine.get_store(row[0]).delete() expected_model_ids = [1, 2, 4, 5] new_model_ids = trainer.train_models(grid_config=grid_config, misc_db_parameters=dict(), matrix_store=matrix_store) assert expected_model_ids == sorted(new_model_ids) # 8. that the generator interface works the same way new_model_ids = trainer.generate_trained_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=matrix_store) assert expected_model_ids == \ sorted([model_id for model_id in new_model_ids])
def test_sparse_table_generator_from_entities(): input_data = [ (1, datetime(2016, 1, 1), True), (1, datetime(2016, 4, 1), False), (1, datetime(2016, 3, 1), True), (2, datetime(2016, 1, 1), False), (2, datetime(2016, 1, 1), True), (3, datetime(2016, 1, 1), True), (5, datetime(2016, 1, 1), True), (5, datetime(2016, 1, 1), True), ] with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) utils.create_binary_outcome_events(engine, "events", input_data) table_generator = StateTableGeneratorFromEntities( entities_table="events", db_engine=engine, experiment_hash="exp_hash") as_of_dates = [ datetime(2016, 1, 1), datetime(2016, 2, 1), datetime(2016, 3, 1), datetime(2016, 4, 1), datetime(2016, 5, 1), datetime(2016, 6, 1), ] table_generator.generate_sparse_table(as_of_dates) expected_output = [ (1, datetime(2016, 1, 1), True), (1, datetime(2016, 2, 1), True), (1, datetime(2016, 3, 1), True), (1, datetime(2016, 4, 1), True), (1, datetime(2016, 5, 1), True), (1, datetime(2016, 6, 1), True), (2, datetime(2016, 1, 1), True), (2, datetime(2016, 2, 1), True), (2, datetime(2016, 3, 1), True), (2, datetime(2016, 4, 1), True), (2, datetime(2016, 5, 1), True), (2, datetime(2016, 6, 1), True), (3, datetime(2016, 1, 1), True), (3, datetime(2016, 2, 1), True), (3, datetime(2016, 3, 1), True), (3, datetime(2016, 4, 1), True), (3, datetime(2016, 5, 1), True), (3, datetime(2016, 6, 1), True), (5, datetime(2016, 1, 1), True), (5, datetime(2016, 2, 1), True), (5, datetime(2016, 3, 1), True), (5, datetime(2016, 4, 1), True), (5, datetime(2016, 5, 1), True), (5, datetime(2016, 6, 1), True), ] results = [ row for row in engine.execute(""" select entity_id, as_of_date, active from {} order by entity_id, as_of_date """.format(table_generator.sparse_table_name)) ] assert results == expected_output utils.assert_index(engine, table_generator.sparse_table_name, "entity_id") utils.assert_index(engine, table_generator.sparse_table_name, "as_of_date")
def test_engine():
    with Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        ((result,),) = engine.execute("SELECT COUNT(*) FROM food_inspections")
        assert result == 966
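# Hedged setup sketch: test_engine above relies on a module-level Postgresql factory whose
# databases already contain a populated food_inspections table. testing.postgresql's
# PostgresqlFactory supports this via cache_initialized_db and an on_initialized handler;
# the handler body, table definition, and fixture-loading helper below are assumptions.
import testing.postgresql
from sqlalchemy import create_engine


def seed_food_inspections(postgresql):
    # runs once against the template data directory; cached copies inherit the data
    engine = create_engine(postgresql.url())
    engine.execute("CREATE TABLE food_inspections (inspection_id int)")
    # load_fixture_rows(engine)  # hypothetical helper that inserts the 966 fixture rows


Postgresql = testing.postgresql.PostgresqlFactory(
    cache_initialized_db=True,
    on_initialized=seed_food_inspections,
)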
def test_predictor(): with testing.postgresql.Postgresql() as postgresql: db_engine = create_engine(postgresql.url()) ensure_db(db_engine) with mock_s3(): s3_conn = boto3.resource('s3') s3_conn.create_bucket(Bucket='econ-dev') project_path = 'econ-dev/inspections' model_storage_engine = S3ModelStorageEngine(s3_conn, project_path) _, model_id = \ fake_trained_model(project_path, model_storage_engine, db_engine) predictor = Predictor(project_path, model_storage_engine, db_engine) # create prediction set matrix = pandas.DataFrame.from_dict({ 'entity_id': [1, 2], 'feature_one': [3, 4], 'feature_two': [5, 6], 'label': [7, 8] }).set_index('entity_id') metadata = { 'label_name': 'label', 'end_time': AS_OF_DATE, 'label_timespan': '3month', 'metta-uuid': '1234', 'indices': ['entity_id'], } matrix_store = InMemoryMatrixStore(matrix, metadata) train_matrix_columns = ['feature_one', 'feature_two'] predict_proba = predictor.predict( model_id, matrix_store, misc_db_parameters=dict(), train_matrix_columns=train_matrix_columns) # assert # 1. that the returned predictions are of the desired length assert len(predict_proba) == 2 # 2. that the predictions table entries are present and # can be linked to the original models records = [ row for row in db_engine.execute('''select entity_id, as_of_date from results.predictions join results.models using (model_id)''') ] assert len(records) == 2 # 3. that the contained as_of_dates match what we sent in for record in records: assert record[1].date() == AS_OF_DATE # 4. that the entity ids match the given dataset assert sorted([record[0] for record in records]) == [1, 2] # 5. running with same model_id, different as of date # then with same as of date only replaces the records # with the same date new_matrix = pandas.DataFrame.from_dict({ 'entity_id': [1, 2], 'feature_one': [3, 4], 'feature_two': [5, 6], 'label': [7, 8] }).set_index('entity_id') new_metadata = { 'label_name': 'label', 'end_time': AS_OF_DATE + datetime.timedelta(days=1), 'label_timespan': '3month', 'metta-uuid': '1234', 'indices': ['entity_id'], } new_matrix_store = InMemoryMatrixStore(new_matrix, new_metadata) predictor.predict(model_id, new_matrix_store, misc_db_parameters=dict(), train_matrix_columns=train_matrix_columns) predictor.predict(model_id, matrix_store, misc_db_parameters=dict(), train_matrix_columns=train_matrix_columns) records = [ row for row in db_engine.execute('''select entity_id, as_of_date from results.predictions join results.models using (model_id)''') ] assert len(records) == 4 # 6. That we can delete the model when done prediction on it predictor.delete_model(model_id) assert predictor.load_model(model_id) == None
def test_replace(self): with testing.postgresql.Postgresql() as postgresql: # create an engine and generate a table with fake feature data engine = create_engine(postgresql.url()) ensure_db(engine) create_schemas(engine=engine, features_tables=features_tables, labels=labels, states=states) dates = [ datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0), datetime.datetime(2016, 3, 1, 0, 0) ] with TemporaryDirectory() as temp_dir: builder = builders.HighMemoryCSVBuilder( db_config=db_config, matrix_directory=temp_dir, engine=engine, replace=False) feature_dictionary = { 'features0': ['f1', 'f2'], 'features1': ['f3', 'f4'], } matrix_metadata = { 'matrix_id': 'hi', 'state': 'state_one AND state_two', 'label_name': 'booking', 'end_time': datetime.datetime(2016, 3, 1, 0, 0), 'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0), 'label_timespan': '1 month', 'test_duration': '1 month' } uuid = metta.generate_uuid(matrix_metadata) builder.build_matrix(as_of_times=dates, label_name='booking', label_type='binary', feature_dictionary=feature_dictionary, matrix_directory=temp_dir, matrix_metadata=matrix_metadata, matrix_uuid=uuid, matrix_type='test') matrix_filename = os.path.join(temp_dir, '{}.csv'.format(uuid)) with open(matrix_filename, 'r') as f: reader = csv.reader(f) assert (len([row for row in reader]) == 6) # rerun builder.make_entity_date_table = Mock() builder.build_matrix(as_of_times=dates, label_name='booking', label_type='binary', feature_dictionary=feature_dictionary, matrix_directory=temp_dir, matrix_metadata=matrix_metadata, matrix_uuid=uuid, matrix_type='test') assert not builder.make_entity_date_table.called
def test_load_features_data(): dates = [ datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0) ] # make dataframe for entity ids and dates ids_dates = create_entity_date_df(labels=labels, states=states, as_of_dates=dates, state_one=True, state_two=True, label_name='booking', label_type='binary', label_timespan='1 month') features = [['f1', 'f2'], ['f3', 'f4']] # make dataframes of features to test against features_dfs = [] for i, table in enumerate(features_tables): cols = ['entity_id', 'as_of_date'] + features[i] temp_df = pd.DataFrame(table, columns=cols) temp_df['as_of_date'] = convert_string_column_to_date( temp_df['as_of_date']) features_dfs.append( ids_dates.merge(right=temp_df, how='left', on=['entity_id', 'as_of_date' ]).set_index(['entity_id', 'as_of_date'])) # create an engine and generate a table with fake feature data with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) create_schemas(engine=engine, features_tables=features_tables, labels=labels, states=states) with get_matrix_storage_engine() as matrix_storage_engine: builder = MatrixBuilder( db_config=db_config, matrix_storage_engine=matrix_storage_engine, engine=engine, ) # make the entity-date table entity_date_table_name = builder.make_entity_date_table( as_of_times=dates, label_type='binary', label_name='booking', state='state_one AND state_two', matrix_type='train', matrix_uuid='my_uuid', label_timespan='1 month') feature_dictionary = dict( ('features{}'.format(i), feature_list) for i, feature_list in enumerate(features)) returned_features_dfs = builder.load_features_data( as_of_times=dates, feature_dictionary=feature_dictionary, entity_date_table_name=entity_date_table_name, matrix_uuid='my_uuid') # get the queries and test them for result, df in zip(returned_features_dfs, features_dfs): test = (result == df) assert (test.all().all())
def test_sparse_table_generator_from_events(): input_data = [ (1, datetime(2016, 1, 1), True), (1, datetime(2016, 4, 1), False), (1, datetime(2016, 3, 1), True), (2, datetime(2016, 1, 1), False), (2, datetime(2016, 1, 1), True), (3, datetime(2016, 1, 1), True), (5, datetime(2016, 1, 1), True), (5, datetime(2016, 1, 1), True), ] with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) utils.create_binary_outcome_events(engine, 'events', input_data) table_generator = StateTableGenerator( engine, 'exp_hash', events_table='events' ) as_of_dates = [ datetime(2016, 1, 1), datetime(2016, 2, 1), datetime(2016, 3, 1), datetime(2016, 4, 1), datetime(2016, 5, 1), datetime(2016, 6, 1), ] table_generator.generate_sparse_table(as_of_dates) expected_output = [ (1, datetime(2016, 1, 1), True), (1, datetime(2016, 2, 1), True), (1, datetime(2016, 3, 1), True), (1, datetime(2016, 4, 1), True), (1, datetime(2016, 5, 1), True), (1, datetime(2016, 6, 1), True), (2, datetime(2016, 1, 1), True), (2, datetime(2016, 2, 1), True), (2, datetime(2016, 3, 1), True), (2, datetime(2016, 4, 1), True), (2, datetime(2016, 5, 1), True), (2, datetime(2016, 6, 1), True), (3, datetime(2016, 1, 1), True), (3, datetime(2016, 2, 1), True), (3, datetime(2016, 3, 1), True), (3, datetime(2016, 4, 1), True), (3, datetime(2016, 5, 1), True), (3, datetime(2016, 6, 1), True), (5, datetime(2016, 1, 1), True), (5, datetime(2016, 2, 1), True), (5, datetime(2016, 3, 1), True), (5, datetime(2016, 4, 1), True), (5, datetime(2016, 5, 1), True), (5, datetime(2016, 6, 1), True), ] results = [row for row in engine.execute( ''' select entity_id, as_of_date, active from {} order by entity_id, as_of_date '''.format( table_generator.sparse_table_name ) )] assert results == expected_output utils.assert_index(engine, table_generator.sparse_table_name, 'entity_id') utils.assert_index(engine, table_generator.sparse_table_name, 'as_of_date')
def basic_integration_test( state_filters, feature_group_create_rules, feature_group_mix_rules, expected_matrix_multiplier, expected_group_lists, ): with testing.postgresql.Postgresql() as postgresql: db_engine = create_engine(postgresql.url()) Base.metadata.create_all(db_engine) populate_source_data(db_engine) with TemporaryDirectory() as temp_dir: chopper = Timechop( feature_start_time=datetime(2010, 1, 1), feature_end_time=datetime(2014, 1, 1), label_start_time=datetime(2011, 1, 1), label_end_time=datetime(2014, 1, 1), model_update_frequency="1year", training_label_timespans=["6months"], test_label_timespans=["6months"], training_as_of_date_frequencies="1day", test_as_of_date_frequencies="3months", max_training_histories=["1months"], test_durations=["1months"], ) state_table_generator = StateTableGeneratorFromDense( db_engine=db_engine, experiment_hash="abcd", dense_state_table="states") label_generator = LabelGenerator( db_engine=db_engine, query=sample_config()["label_config"]["query"]) feature_generator = FeatureGenerator( db_engine=db_engine, features_schema_name="features", replace=True) feature_dictionary_creator = FeatureDictionaryCreator( db_engine=db_engine, features_schema_name="features") feature_group_creator = FeatureGroupCreator( feature_group_create_rules) feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules) project_storage = ProjectStorage(temp_dir) planner = Planner( feature_start_time=datetime(2010, 1, 1), label_names=["outcome"], label_types=["binary"], states=state_filters, user_metadata={}, ) builder = MatrixBuilder( engine=db_engine, db_config={ "features_schema_name": "features", "labels_schema_name": "public", "labels_table_name": "labels", "sparse_state_table_name": "tmp_sparse_states_abcd", }, experiment_hash=None, matrix_storage_engine=project_storage.matrix_storage_engine(), replace=True, ) # chop time split_definitions = chopper.chop_time() num_split_matrices = sum(1 + len(split["test_matrices"]) for split in split_definitions) # generate as_of_times for feature/label/state generation all_as_of_times = [] for split in split_definitions: all_as_of_times.extend(split["train_matrix"]["as_of_times"]) for test_matrix in split["test_matrices"]: all_as_of_times.extend(test_matrix["as_of_times"]) all_as_of_times = list(set(all_as_of_times)) # generate sparse state table state_table_generator.generate_sparse_table( as_of_dates=all_as_of_times) # create labels table label_generator.generate_all_labels( labels_table="labels", as_of_dates=all_as_of_times, label_timespans=["6months"], ) # create feature table tasks # we would use FeatureGenerator#create_all_tables but want to use # the tasks dict directly to create a feature dict aggregations = feature_generator.aggregations( feature_aggregation_config=[ { "prefix": "cat", "from_obj": "cat_complaints", "knowledge_date_column": "as_of_date", "aggregates": [{ "quantity": "cat_sightings", "metrics": ["count", "avg"], "imputation": { "all": { "type": "mean" } }, }], "intervals": ["1y"], "groups": ["entity_id"], }, { "prefix": "dog", "from_obj": "dog_complaints", "knowledge_date_column": "as_of_date", "aggregates_imputation": { "count": { "type": "constant", "value": 7 }, "sum": { "type": "mean" }, "avg": { "type": "zero" }, }, "aggregates": [{ "quantity": "dog_sightings", "metrics": ["count", "avg"] }], "intervals": ["1y"], "groups": ["entity_id"], }, ], feature_dates=all_as_of_times, state_table=state_table_generator.sparse_table_name, ) feature_table_agg_tasks = feature_generator.generate_all_table_tasks( 
aggregations, task_type="aggregation") # create feature aggregation tables feature_generator.process_table_tasks(feature_table_agg_tasks) feature_table_imp_tasks = feature_generator.generate_all_table_tasks( aggregations, task_type="imputation") # create feature imputation tables feature_generator.process_table_tasks(feature_table_imp_tasks) # build feature dictionaries from feature tables and # subsetting config master_feature_dict = feature_dictionary_creator.feature_dictionary( feature_table_names=feature_table_imp_tasks.keys(), index_column_lookup=feature_generator.index_column_lookup( aggregations), ) feature_dicts = feature_group_mixer.generate( feature_group_creator.subsets(master_feature_dict)) # figure out what matrices need to be built _, matrix_build_tasks = planner.generate_plans( split_definitions, feature_dicts) # go and build the matrices builder.build_all_matrices(matrix_build_tasks) # super basic assertion: did matrices we expect get created? matrices_records = list( db_engine.execute( """select matrix_uuid, num_observations, matrix_type from model_metadata.matrices """)) matrix_directory = os.path.join(temp_dir, "matrices") matrices = [ path for path in os.listdir(matrix_directory) if ".csv" in path ] metadatas = [ path for path in os.listdir(matrix_directory) if ".yaml" in path ] assert len( matrices) == num_split_matrices * expected_matrix_multiplier assert len( metadatas) == num_split_matrices * expected_matrix_multiplier assert len(matrices) == len(matrices_records) feature_group_name_lists = [] for metadata_path in metadatas: with open(os.path.join(matrix_directory, metadata_path)) as f: metadata = yaml.load(f) feature_group_name_lists.append(metadata["feature_groups"]) for matrix_uuid, num_observations, matrix_type in matrices_records: assert matrix_uuid in matrix_build_tasks # the hashes of the matrices assert type(num_observations) is int assert matrix_type == matrix_build_tasks[matrix_uuid][ "matrix_type"] def deep_unique_tuple(l): return set([tuple(i) for i in l]) assert deep_unique_tuple( feature_group_name_lists) == deep_unique_tuple( expected_group_lists)
def init_postgres():
    postgresql = testing.postgresql.Postgresql()
    print("postgresql up and running at {}".format(postgresql.url()))
    return postgresql
def setup_static_fetch(postgresql):
    subprocess.check_output(
        ['psql', postgresql.url(), '-q', '-f', 'create_schema.sql'])
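# Hedged teardown sketch: init_postgres above hands back a live server without stopping
# it, so callers presumably handle cleanup themselves. postgresql.stop() is the real
# testing.postgresql call for that; the wrapper function name is an assumption.
def teardown_postgres(postgresql):
    # shuts the temporary server down and removes its data directory
    postgresql.stop()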
def test_write_features_data(): dates = [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0)] # make dataframe for entity ids and dates ids_dates = create_entity_date_df( labels=labels, states=states, as_of_dates=dates, state_one=True, state_two=True, label_name='booking', label_type='binary', label_window='1 month' ) features = [['f1', 'f2'], ['f3', 'f4']] # make dataframes of features to test against features_dfs = [] for i, table in enumerate(features_tables): cols = ['entity_id', 'as_of_date'] + features[i] temp_df = pd.DataFrame( table, columns = cols ) temp_df['as_of_date'] = convert_string_column_to_date(temp_df['as_of_date']) features_dfs.append( ids_dates.merge( right = temp_df, how = 'left', on = ['entity_id', 'as_of_date'] ) ) # create an engine and generate a table with fake feature data with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) create_schemas( engine=engine, features_tables=features_tables, labels=labels, states=states ) with TemporaryDirectory() as temp_dir: planner = Planner( beginning_of_time = datetime.datetime(2010, 1, 1, 0, 0), label_names = ['booking'], label_types = ['binary'], states = ['state_one AND state_two'], db_config = db_config, matrix_directory = temp_dir, user_metadata = {}, engine = engine, builder_class=builders.LowMemoryCSVBuilder ) # make the entity-date table entity_date_table_name = planner.builder.make_entity_date_table( as_of_times=dates, label_type='binary', label_name='booking', state = 'state_one AND state_two', matrix_type='train', matrix_uuid='my_uuid', label_window='1 month' ) feature_dictionary = dict( ('features{}'.format(i), feature_list) for i, feature_list in enumerate(features) ) print(feature_dictionary) features_csv_names = planner.builder.write_features_data( as_of_times=dates, feature_dictionary=feature_dictionary, entity_date_table_name=entity_date_table_name, matrix_uuid='my_uuid' ) # get the queries and test them for feature_csv_name, df in zip(sorted(features_csv_names), features_dfs): df = df.fillna(0) df = df.reset_index() result = pd.read_csv(feature_csv_name).reset_index() result['as_of_date'] = convert_string_column_to_date(result['as_of_date']) test = (result == df) assert(test.all().all())
def test_feature_generation(): aggregate_config = [{ 'prefix': 'aprefix', 'aggregates': [ {'quantity': 'quantity_one', 'metrics': ['sum', 'count']}, ], 'categoricals': [ { 'column': 'cat_one', 'choices': ['good', 'bad'], 'metrics': ['sum'] }, ], 'groups': ['entity_id'], 'intervals': ['all'], 'knowledge_date_column': 'knowledge_date', 'from_obj': 'data' }] expected_output = { 'aprefix_entity_id': [ { 'entity_id': 3, 'as_of_date': date(2013, 9, 30), 'aprefix_entity_id_all_quantity_one_sum': 342, 'aprefix_entity_id_all_quantity_one_count': 1, 'aprefix_entity_id_all_cat_one_good_sum': 0, 'aprefix_entity_id_all_cat_one_bad_sum': 1 }, { 'entity_id': 1, 'as_of_date': date(2014, 9, 30), 'aprefix_entity_id_all_quantity_one_sum': 10000, 'aprefix_entity_id_all_quantity_one_count': 1, 'aprefix_entity_id_all_cat_one_good_sum': 1, 'aprefix_entity_id_all_cat_one_bad_sum': 0 }, { 'entity_id': 3, 'as_of_date': date(2014, 9, 30), 'aprefix_entity_id_all_quantity_one_sum': 342, 'aprefix_entity_id_all_quantity_one_count': 1, 'aprefix_entity_id_all_cat_one_good_sum': 0, 'aprefix_entity_id_all_cat_one_bad_sum': 1 }, { 'entity_id': 4, 'as_of_date': date(2014, 9, 30), 'aprefix_entity_id_all_quantity_one_sum': 1236, 'aprefix_entity_id_all_quantity_one_count': 1, 'aprefix_entity_id_all_cat_one_good_sum': 0, 'aprefix_entity_id_all_cat_one_bad_sum': 1 }, ] } with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) setup_db(engine) features_schema_name = 'features' output_tables = FeatureGenerator( db_engine=engine, features_schema_name=features_schema_name ).create_all_tables( feature_dates=['2013-09-30', '2014-09-30'], feature_aggregation_config=aggregate_config, ) for output_table in output_tables: records = pandas.read_sql( 'select * from {}.{} order by as_of_date, entity_id' .format(features_schema_name, output_table), engine ).to_dict('records') assert records == expected_output[output_table]
@classmethod def setUpClass(cls): consumer.conn = psycopg2.connect(postgresql.url().replace("test", "twitter_sentiment")) consumer.conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
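The setUpClass above is a fragment: it relies on a module-level postgresql instance started elsewhere. A self-contained sketch of the same class-level pattern (names assumed, not the original test class), including the matching teardown:

import unittest

import psycopg2
import testing.postgresql


class ConsumerDbTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # one temporary server shared by every test in the class
        cls.postgresql = testing.postgresql.Postgresql()
        cls.conn = psycopg2.connect(cls.postgresql.url())
        cls.conn.set_isolation_level(
            psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)

    @classmethod
    def tearDownClass(cls):
        cls.conn.close()
        cls.postgresql.stop()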
def basic_integration_test( state_filters, feature_group_create_rules, feature_group_mix_rules, expected_num_matrices ): with testing.postgresql.Postgresql() as postgresql: db_engine = create_engine(postgresql.url()) Base.metadata.create_all(db_engine) populate_source_data(db_engine) with TemporaryDirectory() as temp_dir: chopper = Timechop( beginning_of_time=datetime(2010, 1, 1), modeling_start_time=datetime(2011, 1, 1), modeling_end_time=datetime(2014, 1, 1), update_window='1y', train_label_windows=['6months'], test_label_windows=['6months'], train_example_frequency='1day', test_example_frequency='3months', train_durations=['1months'], test_durations=['1months'], ) state_table_generator = StateTableGenerator( db_engine=db_engine, experiment_hash='abcd' ) label_generator = BinaryLabelGenerator( db_engine=db_engine, events_table='events' ) feature_generator = FeatureGenerator( db_engine=db_engine, features_schema_name='features', replace=True, ) feature_dictionary_creator = FeatureDictionaryCreator( db_engine=db_engine, features_schema_name='features' ) feature_group_creator = FeatureGroupCreator(feature_group_create_rules) feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules) planner = Planner( engine=db_engine, beginning_of_time=datetime(2010, 1, 1), label_names=['outcome'], label_types=['binary'], db_config={ 'features_schema_name': 'features', 'labels_schema_name': 'public', 'labels_table_name': 'labels', 'sparse_state_table_name': 'tmp_sparse_states_abcd', }, matrix_directory=os.path.join(temp_dir, 'matrices'), states=state_filters, user_metadata={}, replace=True ) # chop time split_definitions = chopper.chop_time() # generate as_of_times for feature/label/state generation all_as_of_times = [] for split in split_definitions: all_as_of_times.extend(split['train_matrix']['as_of_times']) for test_matrix in split['test_matrices']: all_as_of_times.extend(test_matrix['as_of_times']) all_as_of_times = list(set(all_as_of_times)) # generate sparse state table state_table_generator.generate_sparse_table( dense_state_table='states', as_of_dates=all_as_of_times ) # create labels table label_generator.generate_all_labels( labels_table='labels', as_of_dates=all_as_of_times, label_windows=['6months'] ) # create feature table tasks # we would use FeatureGenerator#create_all_tables but want to use # the tasks dict directly to create a feature dict feature_table_tasks = feature_generator.generate_all_table_tasks( feature_aggregation_config=[{ 'prefix': 'cat', 'from_obj': 'cat_complaints', 'knowledge_date_column': 'as_of_date', 'aggregates': [{ 'quantity': 'cat_sightings', 'metrics': ['count', 'avg'], }], 'intervals': ['1y'], 'groups': ['entity_id'] }, { 'prefix': 'dog', 'from_obj': 'dog_complaints', 'knowledge_date_column': 'as_of_date', 'aggregates': [{ 'quantity': 'dog_sightings', 'metrics': ['count', 'avg'], }], 'intervals': ['1y'], 'groups': ['entity_id'] }], feature_dates=all_as_of_times, ) # create feature tables feature_generator.process_table_tasks(feature_table_tasks) # build feature dictionaries from feature tables and # subsetting config master_feature_dict = feature_dictionary_creator\ .feature_dictionary(feature_table_tasks.keys()) feature_dicts = feature_group_mixer.generate( feature_group_creator.subsets(master_feature_dict) ) # figure out what matrices need to be built _, matrix_build_tasks =\ planner.generate_plans( split_definitions, feature_dicts ) # go and build the matrices planner.build_all_matrices(matrix_build_tasks) # super basic assertion: did matrices we expect get 
# created? matrix_directory = os.path.join(temp_dir, 'matrices') matrices = [path for path in os.listdir(matrix_directory) if '.csv' in path] metadatas = [path for path in os.listdir(matrix_directory) if '.yaml' in path] assert len(matrices) == expected_num_matrices assert len(metadatas) == expected_num_matrices
def run_import(url, **kwargs): db = Database(url=postgresql.url()) db.create() database_load(os.path.join(root_dir, 'data/performance-dataset.zip'), db_url=url, **kwargs) db.drop_all() def unzip_file(file_path): temp_dir = tempfile.mkdtemp() with closing(zipfile.ZipFile(file_path)) as z: z.extractall(temp_dir) return temp_dir with testing.postgresql.Postgresql() as postgresql: db = Database(url=postgresql.url()) db.engine.execute('create extension postgis;') db.engine.execute('create extension postgis_topology;') run_x = 3 start_time = time.time() batch_size = config #batch_sizes = [1000, 5000, 10000] batch_sizes = [10000] db_threads = [1]#,5,10] result_list = [] for batch_size in batch_sizes: for db_th in db_threads:
def test_thresholder_2014_threshold(self): with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) thresholder = self.setup_data(engine) assert thresholder.model_groups_past_threshold( self.dataframe_as_of(thresholder, "2014-01-01")) == set([1])
async def test_allocate_parking_lot_to_user_fail(event_loop): with Postgresql() as postgresql: db = await DbAccess.create(postgresql.url(), event_loop, reset_tables=True) assert await db.allocate_parking_lot("test_user", 1) is False
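The async test above takes an event_loop fixture, presumably supplied by pytest-asyncio. A hedged sketch of driving the same scenario with a manually managed loop; DbAccess is the project's own accessor class and is used exactly as the test calls it:

import asyncio

from testing.postgresql import Postgresql


def check_allocation_fails():
    loop = asyncio.new_event_loop()
    try:
        with Postgresql() as postgresql:
            async def scenario():
                # DbAccess comes from the code under test (import path not shown here)
                db = await DbAccess.create(postgresql.url(), loop, reset_tables=True)
                return await db.allocate_parking_lot("test_user", 1)

            # mirror the assertion above: the allocation is expected to fail
            assert loop.run_until_complete(scenario()) is False
    finally:
        loop.close()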
def test_thresholder_2015_close(self): with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) thresholder = self.setup_data(engine) assert thresholder.model_groups_close_to_best_case( self.dataframe_as_of(thresholder, "2015-01-01")) == set([2])
def test_integration(): with testing.postgresql.Postgresql() as postgresql: db_engine = create_engine(postgresql.url()) ensure_db(db_engine) init_engine(db_engine) with mock_s3(): s3_conn = boto3.resource('s3') s3_conn.create_bucket(Bucket='econ-dev') project_path = 'econ-dev/inspections' # create train and test matrices train_matrix = pandas.DataFrame.from_dict({ 'entity_id': [1, 2], 'feature_one': [3, 4], 'feature_two': [5, 6], 'label': [7, 8] }).set_index('entity_id') train_metadata = { 'feature_start_time': datetime.date(2012, 12, 20), 'end_time': datetime.date(2016, 12, 20), 'label_name': 'label', 'label_timespan': '1y', 'feature_names': ['ft1', 'ft2'], 'metta-uuid': '1234', 'indices': ['entity_id'], 'matrix_type': 'train' } # Creates a matrix entry in the matrices table with uuid from train_metadata MatrixFactory(matrix_uuid = "1234") session.commit() train_store = InMemoryMatrixStore(train_matrix, sample_metadata()) as_of_dates = [ datetime.date(2016, 12, 21), datetime.date(2017, 1, 21) ] test_stores = [ InMemoryMatrixStore( pandas.DataFrame.from_dict({ 'entity_id': [3], 'feature_one': [8], 'feature_two': [5], 'label': [5] }), { 'label_name': 'label', 'label_timespan': '1y', 'end_time': as_of_date, 'metta-uuid': '1234', 'indices': ['entity_id'], 'matrix_type': 'test', 'as_of_date_frequency': '1month' } ) for as_of_date in as_of_dates ] model_storage_engine = S3ModelStorageEngine(project_path) experiment_hash = save_experiment_and_get_hash({}, db_engine) # instantiate pipeline objects trainer = ModelTrainer( project_path=project_path, experiment_hash=experiment_hash, model_storage_engine=model_storage_engine, db_engine=db_engine, ) predictor = Predictor( project_path, model_storage_engine, db_engine ) model_evaluator = ModelEvaluator( [{'metrics': ['precision@'], 'thresholds': {'top_n': [5]}}], [{}], db_engine ) # run the pipeline grid_config = { 'sklearn.linear_model.LogisticRegression': { 'C': [0.00001, 0.0001], 'penalty': ['l1', 'l2'], 'random_state': [2193] } } model_ids = trainer.train_models( grid_config=grid_config, misc_db_parameters=dict(), matrix_store=train_store ) for model_id in model_ids: for as_of_date, test_store in zip(as_of_dates, test_stores): predictions_proba = predictor.predict( model_id, test_store, misc_db_parameters=dict(), train_matrix_columns=['feature_one', 'feature_two'] ) model_evaluator.evaluate( predictions_proba, test_store, model_id, ) # assert # 1. 
# that the predictions table entries are present and # can be linked to the original models records = [ row for row in db_engine.execute('''select entity_id, model_id, as_of_date from test_results.predictions join model_metadata.models using (model_id) order by 3, 2''') ] assert records == [ (3, 1, datetime.datetime(2016, 12, 21)), (3, 2, datetime.datetime(2016, 12, 21)), (3, 3, datetime.datetime(2016, 12, 21)), (3, 4, datetime.datetime(2016, 12, 21)), (3, 1, datetime.datetime(2017, 1, 21)), (3, 2, datetime.datetime(2017, 1, 21)), (3, 3, datetime.datetime(2017, 1, 21)), (3, 4, datetime.datetime(2017, 1, 21)), ] # that evaluations are there records = [ row for row in db_engine.execute(''' select model_id, evaluation_start_time, metric, parameter from test_results.evaluations order by 2, 1''') ] assert records == [ (1, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'), (2, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'), (3, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'), (4, datetime.datetime(2016, 12, 21), 'precision@', '5_abs'), (1, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'), (2, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'), (3, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'), (4, datetime.datetime(2017, 1, 21), 'precision@', '5_abs'), ]
def test_model_scoring_early_warning(): with testing.postgresql.Postgresql() as postgresql: db_engine = create_engine(postgresql.url()) ensure_db(db_engine) metric_groups = [{ 'metrics': [ 'precision@', 'recall@', 'true positives@', 'true negatives@', 'false positives@', 'false negatives@' ], 'thresholds': { 'percentiles': [5.0, 10.0], 'top_n': [5, 10] } }, { 'metrics': [ 'f1', 'mediocre', 'accuracy', 'roc_auc', 'average precision score' ], }, { 'metrics': ['fbeta@'], 'parameters': [{ 'beta': 0.75 }, { 'beta': 1.25 }] }] custom_metrics = {'mediocre': always_half} model_scorer = ModelScorer(metric_groups, db_engine, custom_metrics=custom_metrics) trained_model, model_id = fake_trained_model( 'myproject', InMemoryModelStorageEngine('myproject'), db_engine) labels = fake_labels(5) as_of_date = datetime.date(2016, 5, 5) model_scorer.score( trained_model.predict_proba(labels)[:, 1], labels, model_id, as_of_date, as_of_date, '1y') # assert # that all of the records are there records = [ row[0] for row in db_engine.execute( '''select distinct(metric || parameter) from results.evaluations where model_id = %s and evaluation_start_time = %s order by 1''', (model_id, as_of_date)) ] assert records == [ 'accuracy', 'average precision score', 'f1', 'false [email protected]_pct', 'false negatives@10_abs', 'false [email protected]_pct', 'false negatives@5_abs', 'false [email protected]_pct', 'false positives@10_abs', 'false [email protected]_pct', 'false positives@5_abs', '[email protected]_beta', '[email protected]_beta', 'mediocre', '[email protected]_pct', 'precision@10_abs', '[email protected]_pct', 'precision@5_abs', '[email protected]_pct', 'recall@10_abs', '[email protected]_pct', 'recall@5_abs', 'roc_auc', 'true [email protected]_pct', 'true negatives@10_abs', 'true [email protected]_pct', 'true negatives@5_abs', 'true [email protected]_pct', 'true positives@10_abs', 'true [email protected]_pct', 'true positives@5_abs' ]
def test_predictor_retrieve(): with testing.postgresql.Postgresql() as postgresql: db_engine = create_engine(postgresql.url()) ensure_db(db_engine) project_path = 'econ-dev/inspections' model_storage_engine = InMemoryModelStorageEngine(project_path) _, model_id = \ fake_trained_model(project_path, model_storage_engine, db_engine) predictor = Predictor(project_path, model_storage_engine, db_engine, replace=False) dayone = datetime.date(2011, 1, 1).strftime(predictor.expected_matrix_ts_format) daytwo = datetime.date(2011, 1, 2).strftime(predictor.expected_matrix_ts_format) # create prediction set matrix_data = { 'entity_id': [1, 2, 1, 2], 'as_of_date': [dayone, dayone, daytwo, daytwo], 'feature_one': [3, 4, 5, 6], 'feature_two': [5, 6, 7, 8], 'label': [7, 8, 8, 7] } matrix = pandas.DataFrame.from_dict(matrix_data)\ .set_index(['entity_id', 'as_of_date']) metadata = { 'label_name': 'label', 'end_time': AS_OF_DATE, 'label_timespan': '3month', 'metta-uuid': '1234', 'indices': ['entity_id'], } matrix_store = InMemoryMatrixStore(matrix, metadata) predict_proba = predictor.predict( model_id, matrix_store, misc_db_parameters=dict(), train_matrix_columns=['feature_one', 'feature_two']) # When run again, the predictions retrieved from the database # should match. # # Some trickiness here. Let's explain: # # If we are not careful, retrieving predictions from the database and # presenting them as a numpy array can result in a bad ordering, # since the given matrix may not be 'ordered' by some criteria # that can be easily represented by an ORDER BY clause. # # It will sometimes work, because without ORDER BY you will get # it back in the table's physical order, which unless something has # happened to the table will be the order you inserted it, # which could very well be the order in the matrix. # So it's not a bug that would necessarily immediately show itself, # but when it does go wrong your scores will be garbage. # # So we simulate a table order mutation that can happen over time: # Remove the first row and put it at the end. # If the Predictor doesn't explicitly reorder the results, this will fail session = sessionmaker(bind=db_engine)() obj = session.query(Prediction).first() session.delete(obj) session.commit() make_transient(obj) session = sessionmaker(bind=db_engine)() session.add(obj) session.commit() predictor.load_model = Mock() new_predict_proba = predictor.predict( model_id, matrix_store, misc_db_parameters=dict(), train_matrix_columns=['feature_one', 'feature_two']) assert_array_equal(new_predict_proba, predict_proba) assert not predictor.load_model.called
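The long comment in test_predictor_retrieve is the heart of the test: without an explicit reordering step, predictions read back from the database only coincidentally line up with the matrix rows. A small hedged sketch (schema and column names assumed from this test) of that reordering:

def scores_in_matrix_order(db_engine, model_id, index_tuples):
    """Return stored scores aligned to the matrix's (entity_id, as_of_date) rows."""
    rows = db_engine.execute(
        '''select entity_id, as_of_date, score
           from results.predictions
           where model_id = %s''',
        (model_id,),
    )
    by_index = {(row[0], row[1]): row[2] for row in rows}
    # key the lookup on the matrix index instead of trusting physical table order
    return [by_index[index] for index in index_tuples]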
def test_make_entity_date_table(): """ Test that the make_entity_date_table function contains the correct values. """ dates = [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0), datetime.datetime(2016, 3, 1, 0, 0)] # make a dataframe of entity ids and dates to test against ids_dates = create_entity_date_df( labels=labels, states=states, as_of_dates=dates, state_one=True, state_two=True, label_name='booking', label_type='binary', label_window='1 month' ) with testing.postgresql.Postgresql() as postgresql: # create an engine and generate a table with fake feature data engine = create_engine(postgresql.url()) create_schemas( engine=engine, features_tables=features_tables, labels=labels, states=states ) with TemporaryDirectory() as temp_dir: planner = Planner( beginning_of_time = datetime.datetime(2010, 1, 1, 0, 0), label_names = ['booking'], label_types = ['binary'], states = ['state_one AND state_two'], db_config = db_config, matrix_directory = temp_dir, user_metadata = {}, engine = engine ) engine.execute( 'CREATE TABLE features.tmp_entity_date (a int, b date);' ) # call the function to test the creation of the table entity_date_table_name = planner.builder.make_entity_date_table( as_of_times=dates, label_type='binary', label_name='booking', state='state_one AND state_two', matrix_uuid='my_uuid', matrix_type='train', label_window='1 month' ) # read in the table result = pd.read_sql( "select * from features.{} order by entity_id, as_of_date".format(entity_date_table_name), engine ) labels_df = pd.read_sql('select * from labels.labels', engine) # compare the table to the test dataframe print("ids_dates") for i, row in ids_dates.iterrows(): print(row.values) print("result") for i, row in result.iterrows(): print(row.values) test = (result == ids_dates) print(test) assert(test.all().all())
def test_feature_dictionary_creator(): with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) engine.execute('create schema features') engine.execute(''' create table features.prefix1_entity_id ( entity_id int, as_of_date date, feature_one float, feature_two float ) ''') engine.execute(''' create table features.prefix1_zipcode ( zipcode text, as_of_date date, feature_three float, feature_four float ) ''') engine.execute(''' create table features.prefix1_aggregation ( entity_id int, as_of_date date, zipcode text, feature_one float, feature_two float, feature_three float, feature_four float ) ''') engine.execute(''' create table features.prefix1_aggregation_imputed ( entity_id int, as_of_date date, zipcode text, feature_one float, feature_two float, feature_three float, feature_three_imp int, feature_four float ) ''') engine.execute(''' create table features.random_other_table ( another_column float ) ''') creator = FeatureDictionaryCreator( features_schema_name='features', db_engine=engine ) feature_dictionary = creator.feature_dictionary( feature_table_names=[ 'prefix1_entity_id', 'prefix1_zip_code', 'prefix1_aggregation', 'prefix1_aggregation_imputed' ], index_column_lookup={ 'prefix1_aggregation_imputed': ['entity_id', 'zipcode', 'as_of_date'] } ) assert feature_dictionary == { 'prefix1_aggregation_imputed': [ 'feature_one', 'feature_two', 'feature_three', 'feature_three_imp', 'feature_four' ], }
def test_write_labels_data(): """ Test the write_labels_data function by checking whether the query produces the correct labels """ # set up labeling config variables dates = [datetime.datetime(2016, 1, 1, 0, 0), datetime.datetime(2016, 2, 1, 0, 0)] # make a dataframe of labels to test against labels_df = pd.DataFrame( labels, columns = [ 'entity_id', 'as_of_date', 'label_window', 'label_name', 'label_type', 'label' ] ) labels_df['as_of_date'] = convert_string_column_to_date(labels_df['as_of_date']) labels_df.set_index(['entity_id', 'as_of_date']) # create an engine and generate a table with fake feature data with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) create_schemas( engine, features_tables, labels, states ) with TemporaryDirectory() as temp_dir: planner = Planner( beginning_of_time = datetime.datetime(2010, 1, 1, 0, 0), label_names = ['booking'], label_types = ['binary'], states = ['state_one AND state_two'], db_config = db_config, matrix_directory = temp_dir, user_metadata = {}, engine = engine, builder_class=builders.LowMemoryCSVBuilder ) # make the entity-date table entity_date_table_name = planner.builder.make_entity_date_table( as_of_times=dates, label_type='binary', label_name='booking', state = 'state_one AND state_two', matrix_type='train', matrix_uuid='my_uuid', label_window='1 month' ) csv_filename = planner.builder.write_labels_data( label_name=label_name, label_type=label_type, label_window='1 month', matrix_uuid='my_uuid', entity_date_table_name=entity_date_table_name, ) df = pd.DataFrame.from_dict({ 'entity_id': [2, 3, 4, 4], 'as_of_date': ['2016-02-01', '2016-02-01', '2016-01-01', '2016-02-01'], 'booking': [0, 0, 1, 0], }).set_index(['entity_id', 'as_of_date']) result = pd.read_csv(csv_filename).set_index(['entity_id', 'as_of_date']) test = (result == df) assert(test.all().all())
def test_baselines_with_missing_features(experiment_class): with testing.postgresql.Postgresql() as postgresql: db_engine = create_engine(postgresql.url()) ensure_db(db_engine) populate_source_data(db_engine) # set up the config with the baseline model and feature group mixing config = sample_config() config['grid_config'] = { 'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature': { 'feature': ['entity_features_entity_id_1year_cat_sightings_count'] } } config['feature_group_definition'] = { 'tables': [ 'entity_features_aggregation_imputed', 'zip_code_features_aggregation_imputed' ] } config['feature_group_strategies'] = ['leave-one-in'] with TemporaryDirectory() as temp_dir: experiment_class(config=config, db_engine=db_engine, model_storage_class=FSModelStorageEngine, project_path=os.path.join(temp_dir, 'inspections')).run() # assert # 1. that model groups entries are present num_mgs = len([ row for row in db_engine.execute( 'select * from model_metadata.model_groups') ]) assert num_mgs > 0 # 2. that model entries are present, and linked to model groups num_models = len([ row for row in db_engine.execute(''' select * from model_metadata.model_groups join model_metadata.models using (model_group_id) where model_comment = 'test2-final-final' ''') ]) assert num_models > 0 # 3. predictions, linked to models num_predictions = len([ row for row in db_engine.execute(''' select * from test_results.predictions join model_metadata.models using (model_id)''') ]) assert num_predictions > 0 # 4. evaluations linked to predictions linked to models num_evaluations = len([ row for row in db_engine.execute(''' select * from test_results.evaluations e join model_metadata.models using (model_id) join test_results.predictions p on ( e.model_id = p.model_id and e.evaluation_start_time <= p.as_of_date and e.evaluation_end_time >= p.as_of_date) ''') ]) assert num_evaluations > 0 # 5. experiment num_experiments = len([ row for row in db_engine.execute( 'select * from model_metadata.experiments') ]) assert num_experiments == 1 # 6. that models are linked to experiments num_models_with_experiment = len([ row for row in db_engine.execute(''' select * from model_metadata.experiments join model_metadata.models using (experiment_hash) ''') ]) assert num_models == num_models_with_experiment # 7. that models have the train end date and label timespan results = [(model['train_end_time'], model['training_label_timespan']) for model in db_engine.execute( 'select * from model_metadata.models')] assert sorted(set(results)) == [ (datetime(2012, 6, 1), timedelta(180)), (datetime(2013, 6, 1), timedelta(180)), ] # 8. that the right number of individual importances are present individual_importances = [ row for row in db_engine.execute(''' select * from test_results.individual_importances join model_metadata.models using (model_id) ''') ] assert len( individual_importances) == num_predictions * 2 # only 2 features
def test_simple_experiment(experiment_class): with testing.postgresql.Postgresql() as postgresql: db_engine = create_engine(postgresql.url()) ensure_db(db_engine) populate_source_data(db_engine) with TemporaryDirectory() as temp_dir: experiment_class( config=sample_config(), db_engine=db_engine, model_storage_class=FSModelStorageEngine, project_path=os.path.join(temp_dir, 'inspections'), cleanup=True, ).run() # assert # 1. that model groups entries are present num_mgs = len([ row for row in db_engine.execute( 'select * from model_metadata.model_groups') ]) assert num_mgs > 0 # 2. that model entries are present, and linked to model groups num_models = len([ row for row in db_engine.execute(''' select * from model_metadata.model_groups join model_metadata.models using (model_group_id) where model_comment = 'test2-final-final' ''') ]) assert num_models > 0 # 3. predictions, linked to models for both training and testing predictions for set_type in ("train", "test"): num_predictions = len([ row for row in db_engine.execute(''' select * from {}_results.predictions join model_metadata.models using (model_id)'''.format( set_type, set_type)) ]) assert num_predictions > 0 # 4. evaluations linked to predictions linked to models, for training and testing for set_type in ("train", "test"): num_evaluations = len([ row for row in db_engine.execute(''' select * from {}_results.evaluations e join model_metadata.models using (model_id) join {}_results.predictions p on ( e.model_id = p.model_id and e.evaluation_start_time <= p.as_of_date and e.evaluation_end_time >= p.as_of_date) '''.format(set_type, set_type, set_type)) ]) assert num_evaluations > 0 # 5. experiment num_experiments = len([ row for row in db_engine.execute( 'select * from model_metadata.experiments') ]) assert num_experiments == 1 # 6. that models are linked to experiments num_models_with_experiment = len([ row for row in db_engine.execute(''' select * from model_metadata.experiments join model_metadata.models using (experiment_hash) ''') ]) assert num_models == num_models_with_experiment # 7. that models have the train end date and label timespan results = [(model['train_end_time'], model['training_label_timespan']) for model in db_engine.execute( 'select * from model_metadata.models')] assert sorted(set(results)) == [ (datetime(2012, 6, 1), timedelta(180)), (datetime(2013, 6, 1), timedelta(180)), ] # 8. that the right number of individual importances are present individual_importances = [ row for row in db_engine.execute(''' select * from test_results.individual_importances join model_metadata.models using (model_id) ''') ] assert len( individual_importances) == num_predictions * 2 # only 2 features # 9. Checking the proper matrices created and stored matrices = [ row for row in db_engine.execute(''' select matrix_type, num_observations from model_metadata.matrices''' ) ] types = [i[0] for i in matrices] counts = [i[1] for i in matrices] assert types.count('train') == 2 assert types.count('test') == 2 for i in counts: assert i > 0 assert len(matrices) == 4
def test_simple_model_trainer(): with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) ensure_db(engine) model_config = { 'sklearn.linear_model.LogisticRegression': { 'C': [0.00001, 0.0001], 'penalty': ['l1', 'l2'], 'random_state': [2193] } } with mock_s3(): s3_conn = boto3.resource('s3') s3_conn.create_bucket(Bucket='econ-dev') # create training set with fake_metta({ 'entity_id': [1, 2], 'feature_one': [3, 4], 'feature_two': [5, 6], 'label': ['good', 'bad'] }, {'label_name': 'label'}) as (matrix_path, metadata_path): trainer = SimpleModelTrainer( training_set_path=matrix_path, training_metadata_path=metadata_path, model_config=model_config, project_path='econ-dev/inspections', s3_conn=s3_conn, db_engine=engine ) cache_keys = trainer.train_models() # assert # 1. that all four models are cached model_pickles = [ pickle.loads(cache_key.get()['Body'].read()) for cache_key in cache_keys ] assert len(model_pickles) == 4 assert len([x for x in model_pickles if x is not None]) == 4 # 2. that their results can have predictions made on it test_matrix = pandas.DataFrame.from_dict({ 'entity_id': [3, 4], 'feature_one': [4, 4], 'feature_two': [6, 5], }) for model_pickle in model_pickles: predictions = model_pickle.predict(test_matrix) assert len(predictions) == 2 # 3. that the models table entries are present records = [ row for row in engine.execute('select * from results.models') ] assert len(records) == 4 records = [ row for row in engine.execute('select * from results.feature_importances') ] assert len(records) == 4 * 3 # maybe exclude entity_id?
def test_replace(): aggregate_config = [{ "prefix": "aprefix", "aggregates_imputation": { "all": { "type": "mean" } }, "aggregates": [{ "quantity": "quantity_one", "metrics": ["sum", "count"] }], "categoricals": [{ "column": "cat_one", "choices": ["good", "bad"], "metrics": ["sum"], "imputation": { "all": { "type": "null_category" } }, }], "groups": ["entity_id"], "intervals": ["all"], "knowledge_date_column": "knowledge_date", "from_obj": "data", }] with testing.postgresql.Postgresql() as postgresql: engine = create_engine(postgresql.url()) setup_db(engine) features_schema_name = "features" feature_tables = FeatureGenerator( db_engine=engine, features_schema_name=features_schema_name, replace=False).create_all_tables( feature_dates=["2013-09-30", "2014-09-30"], feature_aggregation_config=aggregate_config, state_table="states", ) assert len(feature_tables) == 1 assert list(feature_tables)[0] == "aprefix_aggregation_imputed" feature_generator = FeatureGenerator( db_engine=engine, features_schema_name=features_schema_name, replace=False) aggregations = feature_generator.aggregations( feature_dates=["2013-09-30", "2014-09-30"], feature_aggregation_config=aggregate_config, state_table="states", ) table_tasks = feature_generator.generate_all_table_tasks( aggregations, task_type="aggregation") assert len(table_tasks["aprefix_entity_id"].keys()) == 0 imp_tasks = feature_generator.generate_all_table_tasks( aggregations, task_type="imputation") assert len(imp_tasks["aprefix_aggregation_imputed"].keys()) == 0 engine.dispose()
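Nearly every test above runs initdb for a brand-new cluster, which dominates suite runtime. testing.postgresql ships a factory that caches an initialized data directory so later instances start faster; a minimal sketch of adopting it:

import testing.postgresql
from sqlalchemy import create_engine

# generate a Postgresql class whose instances reuse a cached, initialized database
CachedPostgresql = testing.postgresql.PostgresqlFactory(cache_initialized_db=True)


def tearDownModule():
    # clear the cached database once the module's tests are done
    CachedPostgresql.clear_cache()


def test_with_cached_cluster():
    with CachedPostgresql() as postgresql:
        engine = create_engine(postgresql.url())
        # ...set up schema and run assertions as in the tests above...
        engine.dispose()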