def test_test_matrix(self):
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states
        )
        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0)
        ]

        with TemporaryDirectory() as temp_dir:
            planner = Planner(
                feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
                label_names=['booking'],
                label_types=['binary'],
                states=['state_one AND state_two'],
                db_config=db_config,
                matrix_directory=temp_dir,
                user_metadata={},
                engine=engine
            )
            matrix_dates = {
                'first_as_of_time': datetime.datetime(2016, 1, 1, 0, 0),
                'matrix_info_end_time': datetime.datetime(2016, 3, 1, 0, 0),
                'as_of_times': dates
            }
            feature_dictionary = {
                'features0': ['f1', 'f2'],
                'features1': ['f3', 'f4'],
            }
            matrix_metadata = {
                'matrix_id': 'hi',
                'state': 'state_one AND state_two',
                'label_name': 'booking',
                'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                'label_timespan': '1 month'
            }
            uuid = metta.generate_uuid(matrix_metadata)
            planner.build_matrix(
                as_of_times=dates,
                label_name='booking',
                label_type='binary',
                feature_dictionary=feature_dictionary,
                matrix_directory=temp_dir,
                matrix_metadata=matrix_metadata,
                matrix_uuid=uuid,
                matrix_type='test'
            )

            matrix_filename = os.path.join(temp_dir, '{}.csv'.format(uuid))
            with open(matrix_filename, 'r') as f:
                reader = csv.reader(f)
                assert len([row for row in reader]) == 6
def test_write_to_csv():
    """Test the write_to_csv function by checking whether the csv contains
    the correct number of lines.
    """
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states
        )

        with TemporaryDirectory() as temp_dir:
            planner = Planner(
                feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
                label_names=['booking'],
                label_types=['binary'],
                states=['state_one AND state_two'],
                db_config=db_config,
                matrix_directory=temp_dir,
                user_metadata={},
                engine=engine,
                builder_class=builders.HighMemoryCSVBuilder
            )

            # for each table, check that the corresponding csv has the
            # correct number of rows
            for i, table in enumerate(features_tables):
                planner.builder.write_to_csv(
                    '''
                        select *
                        from features.features{}
                    '''.format(i),
                    'test_csv.csv'
                )
                reader = csv.reader(
                    planner.builder.open_fh_for_reading('test_csv.csv')
                )
                assert len([row for row in reader]) == len(table) + 1
def test_make_entity_date_table():
    """Test that the make_entity_date_table function contains the correct
    values.
    """
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0),
        datetime.datetime(2016, 3, 1, 0, 0)
    ]

    # make a dataframe of entity ids and dates to test against
    ids_dates = create_entity_date_df(
        labels=labels,
        states=states,
        as_of_dates=dates,
        state_one=True,
        state_two=True,
        label_name='booking',
        label_type='binary',
        label_timespan='1 month'
    )

    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states
        )

        with TemporaryDirectory() as temp_dir:
            planner = Planner(
                feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
                label_names=['booking'],
                label_types=['binary'],
                states=['state_one AND state_two'],
                db_config=db_config,
                matrix_directory=temp_dir,
                user_metadata={},
                engine=engine
            )
            engine.execute(
                'CREATE TABLE features.tmp_entity_date (a int, b date);'
            )

            # call the function to test the creation of the table
            entity_date_table_name = planner.builder.make_entity_date_table(
                as_of_times=dates,
                label_type='binary',
                label_name='booking',
                state='state_one AND state_two',
                matrix_uuid='my_uuid',
                matrix_type='train',
                label_timespan='1 month'
            )

            # read in the table
            result = pd.read_sql(
                'select * from features.{} order by entity_id, as_of_date'.format(
                    entity_date_table_name),
                engine
            )
            labels_df = pd.read_sql('select * from labels.labels', engine)

            # compare the table to the test dataframe
            test = (result == ids_dates)
            assert test.all().all()
def test_badinput(self):
    """We assert column names, so using 'date' in place of the expected
    'as_of_date' should result in an error
    """
    with TemporaryDirectory() as temp_dir:
        planner = Planner(
            feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
            label_names=['booking'],
            label_types=['binary'],
            states=['state_one AND state_two'],
            db_config=db_config,
            matrix_directory=temp_dir,
            user_metadata={},
            engine=None,
        )
        rowlists = [
            [
                ('entity_id', 'date', 'f1'),
                (1, 3, 3),
                (4, 5, 6),
                (7, 8, 9),
            ],
            [
                ('entity_id', 'date', 'f2'),
                (1, 2, 3),
                (4, 5, 9),
                (7, 8, 15),
            ],
            [
                ('entity_id', 'date', 'f3'),
                (1, 2, 2),
                (4, 5, 20),
                (7, 8, 56),
            ],
        ]
        filekeys = []
        for rows in rowlists:
            filekey = uuid.uuid4()
            planner.builder.open_fh_for_writing(filekey)
            filekeys.append(filekey)
            writer = csv.writer(planner.builder.filehandles[filekey])
            for row in rows:
                writer.writerow(row)
        with self.assertRaises(KeyError):
            planner.builder.merge_feature_csvs(
                filekeys,
                matrix_directory=temp_dir,
                matrix_uuid='1234'
            )
def test_Planner():
    matrix_set_definitions = [
        {
            "feature_start_time": datetime.datetime(1990, 1, 1, 0, 0),
            "modeling_start_time": datetime.datetime(2010, 1, 1, 0, 0),
            "modeling_end_time": datetime.datetime(2010, 1, 16, 0, 0),
            "train_matrix": {
                "first_as_of_time": datetime.datetime(2010, 1, 1, 0, 0),
                "matrix_info_end_time": datetime.datetime(2010, 1, 6, 0, 0),
                "as_of_times": [
                    datetime.datetime(2010, 1, 1, 0, 0),
                    datetime.datetime(2010, 1, 2, 0, 0),
                    datetime.datetime(2010, 1, 3, 0, 0),
                    datetime.datetime(2010, 1, 4, 0, 0),
                    datetime.datetime(2010, 1, 5, 0, 0),
                ],
            },
            "test_matrices": [{
                "first_as_of_time": datetime.datetime(2010, 1, 6, 0, 0),
                "matrix_info_end_time": datetime.datetime(2010, 1, 11, 0, 0),
                "as_of_times": [
                    datetime.datetime(2010, 1, 6, 0, 0),
                    datetime.datetime(2010, 1, 7, 0, 0),
                    datetime.datetime(2010, 1, 8, 0, 0),
                    datetime.datetime(2010, 1, 9, 0, 0),
                    datetime.datetime(2010, 1, 10, 0, 0),
                ],
            }],
        },
        {
            "feature_start_time": datetime.datetime(1990, 1, 1, 0, 0),
            "modeling_start_time": datetime.datetime(2010, 1, 1, 0, 0),
            "modeling_end_time": datetime.datetime(2010, 1, 16, 0, 0),
            "train_matrix": {
                "first_as_of_time": datetime.datetime(2010, 1, 6, 0, 0),
                "matrix_info_end_time": datetime.datetime(2010, 1, 11, 0, 0),
                "as_of_times": [
                    datetime.datetime(2010, 1, 6, 0, 0),
                    datetime.datetime(2010, 1, 7, 0, 0),
                    datetime.datetime(2010, 1, 8, 0, 0),
                    datetime.datetime(2010, 1, 9, 0, 0),
                    datetime.datetime(2010, 1, 10, 0, 0),
                ],
            },
            "test_matrices": [{
                "first_as_of_time": datetime.datetime(2010, 1, 11, 0, 0),
                "matrix_info_end_time": datetime.datetime(2010, 1, 16, 0, 0),
                "as_of_times": [
                    datetime.datetime(2010, 1, 11, 0, 0),
                    datetime.datetime(2010, 1, 12, 0, 0),
                    datetime.datetime(2010, 1, 13, 0, 0),
                    datetime.datetime(2010, 1, 14, 0, 0),
                    datetime.datetime(2010, 1, 15, 0, 0),
                ],
            }],
        },
    ]
    feature_dict_one = FeatureGroup(
        name="first_features",
        features_by_table={
            "features0": ["f1", "f2"],
            "features1": ["f1", "f2"],
        },
    )
    feature_dict_two = FeatureGroup(
        name="second_features",
        features_by_table={
            "features2": ["f3", "f4"],
            "features3": ["f5", "f6"],
        },
    )
    feature_dicts = [feature_dict_one, feature_dict_two]
    planner = Planner(
        feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
        label_names=["booking"],
        label_types=["binary"],
        cohort_names=["prior_bookings"],
        user_metadata={},
    )
    updated_matrix_definitions, build_tasks = planner.generate_plans(
        matrix_set_definitions, feature_dicts)

    # test that it added uuids: we don't much care what they are
    matrix_uuids = []
    for matrix_def in updated_matrix_definitions:
        assert isinstance(matrix_def["train_uuid"], str)
        matrix_uuids.append(matrix_def["train_uuid"])
        for test_uuid in matrix_def["test_uuids"]:
            assert isinstance(test_uuid, str)
    assert len(set(matrix_uuids)) == 4

    # not going to assert anything on the keys (uuids), just get out the values
    build_tasks = build_tasks.values()
    assert len(build_tasks) == 8  # 2 splits * 2 matrices per split * 2 feature dicts
    assert sum(1 for task in build_tasks if task["matrix_type"] == "train") == 4
    assert sum(1 for task in build_tasks if task["matrix_type"] == "test") == 4
    assert sum(
        1 for task in build_tasks
        if task["feature_dictionary"] == feature_dict_one
    ) == 4
    assert sum(
        1 for task in build_tasks
        if task["feature_dictionary"] == feature_dict_two
    ) == 4
    assert sum(
        1 for task in build_tasks
        if task["matrix_metadata"]["feature_groups"] == ["first_features"]
    ) == 4
    assert sum(
        1 for task in build_tasks
        if task["matrix_metadata"]["feature_groups"] == ["second_features"]
    ) == 4
    assert sum(
        1 for task in build_tasks
        if task["matrix_metadata"]["cohort_name"] == "prior_bookings"
    ) == 8
def test_nullcheck(self):
    f0_dict = {(r[0], r[1]): r for r in features0_pre}
    f1_dict = {(r[0], r[1]): r for r in features1_pre}

    features0 = sorted(f0_dict.values(), key=lambda x: (x[1], x[0]))
    features1 = sorted(f1_dict.values(), key=lambda x: (x[1], x[0]))

    features_tables = [features0, features1]

    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states
        )
        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0)
        ]

        with TemporaryDirectory() as temp_dir:
            planner = Planner(
                feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
                label_names=['booking'],
                label_types=['binary'],
                states=['state_one AND state_two'],
                db_config=db_config,
                matrix_directory=temp_dir,
                user_metadata={},
                engine=engine
            )
            matrix_dates = {
                'first_as_of_time': datetime.datetime(2016, 1, 1, 0, 0),
                'matrix_info_end_time': datetime.datetime(2016, 3, 1, 0, 0),
                'as_of_times': dates
            }
            feature_dictionary = {
                'features0': ['f1', 'f2'],
                'features1': ['f3', 'f4'],
            }
            matrix_metadata = {
                'matrix_id': 'hi',
                'state': 'state_one AND state_two',
                'label_name': 'booking',
                'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                'label_timespan': '1 month'
            }
            uuid = metta.generate_uuid(matrix_metadata)
            with self.assertRaises(ValueError):
                planner.build_matrix(
                    as_of_times=dates,
                    label_name='booking',
                    label_type='binary',
                    feature_dictionary=feature_dictionary,
                    matrix_directory=temp_dir,
                    matrix_metadata=matrix_metadata,
                    matrix_uuid=uuid,
                    matrix_type='test'
                )
def test_generate_plans():
    matrix_set_definitions = [
        {
            'feature_start_time': datetime.datetime(1990, 1, 1, 0, 0),
            'modeling_start_time': datetime.datetime(2010, 1, 1, 0, 0),
            'modeling_end_time': datetime.datetime(2010, 1, 16, 0, 0),
            'train_matrix': {
                'first_as_of_time': datetime.datetime(2010, 1, 1, 0, 0),
                'matrix_info_end_time': datetime.datetime(2010, 1, 6, 0, 0),
                'as_of_times': [
                    datetime.datetime(2010, 1, 1, 0, 0),
                    datetime.datetime(2010, 1, 2, 0, 0),
                    datetime.datetime(2010, 1, 3, 0, 0),
                    datetime.datetime(2010, 1, 4, 0, 0),
                    datetime.datetime(2010, 1, 5, 0, 0)
                ]
            },
            'test_matrices': [{
                'first_as_of_time': datetime.datetime(2010, 1, 6, 0, 0),
                'matrix_info_end_time': datetime.datetime(2010, 1, 11, 0, 0),
                'as_of_times': [
                    datetime.datetime(2010, 1, 6, 0, 0),
                    datetime.datetime(2010, 1, 7, 0, 0),
                    datetime.datetime(2010, 1, 8, 0, 0),
                    datetime.datetime(2010, 1, 9, 0, 0),
                    datetime.datetime(2010, 1, 10, 0, 0)
                ]
            }]
        },
        {
            'feature_start_time': datetime.datetime(1990, 1, 1, 0, 0),
            'modeling_start_time': datetime.datetime(2010, 1, 1, 0, 0),
            'modeling_end_time': datetime.datetime(2010, 1, 16, 0, 0),
            'train_matrix': {
                'first_as_of_time': datetime.datetime(2010, 1, 6, 0, 0),
                'matrix_info_end_time': datetime.datetime(2010, 1, 11, 0, 0),
                'as_of_times': [
                    datetime.datetime(2010, 1, 6, 0, 0),
                    datetime.datetime(2010, 1, 7, 0, 0),
                    datetime.datetime(2010, 1, 8, 0, 0),
                    datetime.datetime(2010, 1, 9, 0, 0),
                    datetime.datetime(2010, 1, 10, 0, 0)
                ]
            },
            'test_matrices': [{
                'first_as_of_time': datetime.datetime(2010, 1, 11, 0, 0),
                'matrix_info_end_time': datetime.datetime(2010, 1, 16, 0, 0),
                'as_of_times': [
                    datetime.datetime(2010, 1, 11, 0, 0),
                    datetime.datetime(2010, 1, 12, 0, 0),
                    datetime.datetime(2010, 1, 13, 0, 0),
                    datetime.datetime(2010, 1, 14, 0, 0),
                    datetime.datetime(2010, 1, 15, 0, 0)
                ]
            }]
        }
    ]
    feature_dict_one = {'features0': ['f1', 'f2'], 'features1': ['f1', 'f2']}
    feature_dict_two = {'features2': ['f3', 'f4'], 'features3': ['f5', 'f6']}
    feature_dicts = [feature_dict_one, feature_dict_two]
    planner = Planner(
        feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
        label_names=['booking'],
        label_types=['binary'],
        states=['state_one AND state_two'],
        db_config=db_config,
        user_metadata={},
        matrix_directory='',  # this test won't write anything
        engine=None  # or look at the db!
    )
    updated_matrix_definitions, build_tasks = planner.generate_plans(
        matrix_set_definitions, feature_dicts)

    # test that it added uuids: we don't much care what they are
    matrix_uuids = []
    for matrix_def in updated_matrix_definitions:
        assert isinstance(matrix_def['train_uuid'], str)
        matrix_uuids.append(matrix_def['train_uuid'])
        for test_uuid in matrix_def['test_uuids']:
            assert isinstance(test_uuid, str)
    assert len(set(matrix_uuids)) == 4

    # not going to assert anything on the keys (uuids), just get out the values
    build_tasks = build_tasks.values()
    assert len(build_tasks) == 8  # 2 splits * 2 matrices per split * 2 feature dicts
    assert sum(1 for task in build_tasks if task['matrix_type'] == 'train') == 4
    assert sum(1 for task in build_tasks if task['matrix_type'] == 'test') == 4
    assert all(task['matrix_directory'] == '' for task in build_tasks)
    assert sum(
        1 for task in build_tasks
        if task['feature_dictionary'] == feature_dict_one
    ) == 4
    assert sum(
        1 for task in build_tasks
        if task['feature_dictionary'] == feature_dict_two
    ) == 4
def test_write_labels_data():
    """Test the write_labels_data function by checking whether the query
    produces the correct labels
    """
    # set up labeling config variables
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0)
    ]

    # make a dataframe of labels to test against
    labels_df = pd.DataFrame(
        labels,
        columns=[
            'entity_id',
            'as_of_date',
            'label_timespan',
            'label_name',
            'label_type',
            'label'
        ]
    )
    labels_df['as_of_date'] = convert_string_column_to_date(
        labels_df['as_of_date'])
    labels_df = labels_df.set_index(['entity_id', 'as_of_date'])

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(engine, features_tables, labels, states)

        with TemporaryDirectory() as temp_dir:
            planner = Planner(
                feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
                label_names=['booking'],
                label_types=['binary'],
                states=['state_one AND state_two'],
                db_config=db_config,
                matrix_directory=temp_dir,
                user_metadata={},
                engine=engine,
            )

            # make the entity-date table
            entity_date_table_name = planner.builder.make_entity_date_table(
                as_of_times=dates,
                label_type='binary',
                label_name='booking',
                state='state_one AND state_two',
                matrix_type='train',
                matrix_uuid='my_uuid',
                label_timespan='1 month'
            )

            csv_filename = planner.builder.write_labels_data(
                label_name='booking',
                label_type='binary',
                label_timespan='1 month',
                matrix_uuid='my_uuid',
                entity_date_table_name=entity_date_table_name,
            )
            df = pd.DataFrame.from_dict({
                'entity_id': [2, 3, 4, 4],
                'as_of_date': ['2016-02-01', '2016-02-01', '2016-01-01', '2016-02-01'],
                'booking': [0, 0, 1, 0],
            }).set_index(['entity_id', 'as_of_date'])

            result = pd.read_csv(
                planner.builder.open_fh_for_reading(csv_filename)
            ).set_index(['entity_id', 'as_of_date'])
            test = (result == df)
            assert test.all().all()
def test_write_features_data():
    dates = [
        datetime.datetime(2016, 1, 1, 0, 0),
        datetime.datetime(2016, 2, 1, 0, 0)
    ]

    # make dataframe for entity ids and dates
    ids_dates = create_entity_date_df(
        labels=labels,
        states=states,
        as_of_dates=dates,
        state_one=True,
        state_two=True,
        label_name='booking',
        label_type='binary',
        label_timespan='1 month'
    )
    features = [['f1', 'f2'], ['f3', 'f4']]

    # make dataframes of features to test against
    features_dfs = []
    for i, table in enumerate(features_tables):
        cols = ['entity_id', 'as_of_date'] + features[i]
        temp_df = pd.DataFrame(table, columns=cols)
        temp_df['as_of_date'] = convert_string_column_to_date(
            temp_df['as_of_date'])
        features_dfs.append(
            ids_dates.merge(
                right=temp_df,
                how='left',
                on=['entity_id', 'as_of_date']
            )
        )

    # create an engine and generate a table with fake feature data
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states
        )

        with TemporaryDirectory() as temp_dir:
            planner = Planner(
                feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
                label_names=['booking'],
                label_types=['binary'],
                states=['state_one AND state_two'],
                db_config=db_config,
                matrix_directory=temp_dir,
                user_metadata={},
                engine=engine,
            )

            # make the entity-date table
            entity_date_table_name = planner.builder.make_entity_date_table(
                as_of_times=dates,
                label_type='binary',
                label_name='booking',
                state='state_one AND state_two',
                matrix_type='train',
                matrix_uuid='my_uuid',
                label_timespan='1 month'
            )

            feature_dictionary = dict(
                ('features{}'.format(i), feature_list)
                for i, feature_list in enumerate(features)
            )
            features_csv_names = planner.builder.write_features_data(
                as_of_times=dates,
                feature_dictionary=feature_dictionary,
                entity_date_table_name=entity_date_table_name,
                matrix_uuid='my_uuid'
            )

            # get the queries and test them
            for feature_csv_name, df in zip(sorted(features_csv_names),
                                            features_dfs):
                df = df.reset_index()
                result = pd.read_csv(
                    planner.builder.open_fh_for_reading(feature_csv_name)
                ).reset_index()
                result['as_of_date'] = convert_string_column_to_date(
                    result['as_of_date'])
                test = (result == df)
                assert test.all().all()