def test_valid_input(self):
    """Happy path: calculate_train_test_split_times returns one split time
    per model-update-frequency step ('3 months') across the labeling period,
    stopping early enough to leave room for the test duration and label span.
    """
    expected_result = [
        datetime.datetime(2015, 3, 1, 0, 0),
        datetime.datetime(2015, 6, 1, 0, 0),
        datetime.datetime(2015, 9, 1, 0, 0),
        datetime.datetime(2015, 12, 1, 0, 0),
        datetime.datetime(2016, 3, 1, 0, 0),
        datetime.datetime(2016, 6, 1, 0, 0)
    ]
    chopper = Timechop(
        feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
        feature_end_time=datetime.datetime(2017, 1, 1, 0, 0),
        label_start_time=datetime.datetime(2015, 1, 1, 0, 0),
        label_end_time=datetime.datetime(2017, 1, 1, 0, 0),
        model_update_frequency='3 months',
        training_as_of_date_frequencies=['1 day'],
        test_as_of_date_frequencies=['1 day'],
        max_training_histories=['1 year'],
        test_durations=['6 months'],
        test_label_timespans=['1 months'],
        training_label_timespans=['3 days'])
    # NOTE(review): a comment here previously claimed "this should throw an
    # exception" — that was copy-pasted from test_labels_after_features.
    # This test exercises the valid-input path and asserts the split times.
    result = chopper.calculate_train_test_split_times(
        training_label_timespan=convert_str_to_relativedelta('3 days'),
        test_duration='6 months',
        test_label_timespan=convert_str_to_relativedelta('1 month'))
    assert result == expected_result
def test_calculate_as_of_times_one_day_freq():
    """With a one-day data frequency, calculate_as_of_times yields every
    day in the inclusive [start_limit, end_limit] range."""
    # Every day from 2011-01-01 through 2011-01-11, inclusive.
    expected_result = [
        datetime.datetime(2011, 1, day, 0, 0) for day in range(1, 12)
    ]
    chopper = Timechop(
        feature_start_time=datetime.datetime(1990, 1, 1, 0, 0),
        feature_end_time=datetime.datetime(2012, 1, 1, 0, 0),
        label_start_time=datetime.datetime(2010, 1, 1, 0, 0),
        label_end_time=datetime.datetime(2012, 1, 1, 0, 0),
        model_update_frequency='1 year',
        training_as_of_date_frequencies=['1 days'],
        test_as_of_date_frequencies=['7 days'],
        max_training_histories=['10 days', '1 year'],
        test_durations=['1 month'],
        test_label_timespans=['1 day'],
        training_label_timespans=['3 months'],
    )
    result = chopper.calculate_as_of_times(
        as_of_start_limit=datetime.datetime(2011, 1, 1, 0, 0),
        as_of_end_limit=datetime.datetime(2011, 1, 11, 0, 0),
        data_frequency=convert_str_to_relativedelta('1 days'),
    )
    assert result == expected_result
def matrices():
    """HTTP endpoint: run Timechop over a JSON config and return the splits.

    Parses the four boundary datetimes from 'YYYY-MM-DD' strings, chops
    time, and returns ``{"data": splits, "error": ""}`` — or an empty data
    list plus the error message if anything goes wrong.
    """
    timechop_config = snakify_keys(request.get_json())
    # The boundary times arrive as ISO date strings; parse them in place.
    for datetime_key in ('feature_start_time', 'feature_end_time',
                         'label_start_time', 'label_end_time'):
        timechop_config[datetime_key] = datetime.strptime(
            timechop_config[datetime_key], '%Y-%m-%d')
    try:
        chopper = Timechop(**timechop_config)
        # Success response is built inside the try so any failure —
        # construction, chopping, or serialization — yields the error shape.
        return jsonify(data=chopper.chop_time(), error='')
    except Exception as e:
        return jsonify(data=[], error=str(e))
def test_no_valid_label_dates(self): chopper = Timechop( feature_start_time=datetime.datetime(2010, 1, 1, 0, 0), feature_end_time=datetime.datetime(2016, 1, 1, 0, 0), label_start_time=datetime.datetime(2015, 1, 1, 0, 0), label_end_time=datetime.datetime(2015, 2, 1, 0, 0), model_update_frequency='3 months', training_as_of_date_frequencies=['1 day'], test_as_of_date_frequencies=['1 day'], max_training_histories=['1 year'], test_durations=['6 months'], test_label_timespans=['1 months'], training_label_timespans=['3 days']) # this should raise an error because there are no valid label dates in # the labeling time (label span is longer than labeling time) with self.assertRaises(ValueError): chopper.calculate_train_test_split_times( training_label_timespan=convert_str_to_relativedelta('3 days'), test_duration='6 months', test_label_timespan=convert_str_to_relativedelta('1 month'))
def test_labels_after_features(self):
    """calculate_train_test_split_times raises ValueError when the last
    possible label date (label_end_time 2017) falls after the end of
    feature time (feature_end_time 2016)."""
    chopper = Timechop(
        feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
        feature_end_time=datetime.datetime(2016, 1, 1, 0, 0),
        label_start_time=datetime.datetime(2015, 1, 1, 0, 0),
        label_end_time=datetime.datetime(2017, 1, 1, 0, 0),
        model_update_frequency='3 months',
        training_as_of_date_frequencies=['1 day'],
        test_as_of_date_frequencies=['1 day'],
        max_training_histories=['1 year'],
        test_durations=['6 months'],
        test_label_timespans=['1 months'],
        training_label_timespans=['3 days'])
    # this should throw an exception because last possible label date is
    # after end of feature time
    with self.assertRaises(ValueError):
        # No binding to a local: the call is expected to raise, so the
        # previously-assigned `result` variable was dead code.
        chopper.calculate_train_test_split_times(
            training_label_timespan=convert_str_to_relativedelta('3 days'),
            test_duration='6 months',
            test_label_timespan=convert_str_to_relativedelta('1 month'))
def test_bad_feature_start_time(self):
    """The Timechop constructor raises ValueError for an invalid (too-late)
    feature_start_time — here 2011-01-01, which falls after both the
    label_start_time and the feature_end_time."""
    with self.assertRaises(ValueError):
        # The constructor itself is expected to raise, so there is no
        # instance to bind — the previous `chopper =` assignment was dead.
        Timechop(
            feature_start_time=datetime.datetime(2011, 1, 1, 0, 0),
            feature_end_time=datetime.datetime(2010, 1, 16, 0, 0),
            label_start_time=datetime.datetime(2010, 1, 3, 0, 0),
            label_end_time=datetime.datetime(2010, 1, 16, 0, 0),
            model_update_frequency='5 days',
            training_as_of_date_frequencies=['1 days'],
            test_as_of_date_frequencies=['1 days'],
            max_training_histories=['5 days'],
            test_durations=['5 days'],
            test_label_timespans=['1 day'],
            training_label_timespans=['1 day'])
def basic_integration_test(state_filters, feature_group_create_rules, feature_group_mix_rules, expected_num_matrices):
    """End-to-end pipeline smoke test.

    Spins up a throwaway Postgres, chops time, generates the sparse state
    table, labels, and feature tables, plans and builds matrices, then
    asserts that the expected number of matrix (.csv) and metadata (.yaml)
    files landed in the matrix directory.

    Args:
        state_filters: state filters forwarded to the Planner.
        feature_group_create_rules: rules for FeatureGroupCreator.
        feature_group_mix_rules: rules for FeatureGroupMixer.
        expected_num_matrices: expected count of built matrix files.
    """
    with testing.postgresql.Postgresql() as postgresql:
        # Fresh, disposable database per run.
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)
        with TemporaryDirectory() as temp_dir:
            # NOTE(review): these Timechop keyword names (beginning_of_time,
            # modeling_start_time, update_window, ...) differ from the names
            # used by the unit tests in this file (feature_start_time, ...) —
            # presumably an older API version; confirm before reuse.
            chopper = Timechop(
                beginning_of_time=datetime(2010, 1, 1),
                modeling_start_time=datetime(2011, 1, 1),
                modeling_end_time=datetime(2014, 1, 1),
                update_window='1y',
                train_label_windows=['6months'],
                test_label_windows=['6months'],
                train_example_frequency='1day',
                test_example_frequency='3months',
                train_durations=['1months'],
                test_durations=['1months'],
            )
            state_table_generator = StateTableGenerator(db_engine=db_engine, experiment_hash='abcd')
            label_generator = BinaryLabelGenerator(db_engine=db_engine, events_table='events')
            feature_generator = FeatureGenerator(
                db_engine=db_engine,
                features_schema_name='features',
                replace=True,
            )
            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine, features_schema_name='features')
            feature_group_creator = FeatureGroupCreator(feature_group_create_rules)
            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)
            planner = Planner(engine=db_engine,
                              beginning_of_time=datetime(2010, 1, 1),
                              label_names=['outcome'],
                              label_types=['binary'],
                              db_config={
                                  'features_schema_name': 'features',
                                  'labels_schema_name': 'public',
                                  'labels_table_name': 'labels',
                                  'sparse_state_table_name': 'tmp_sparse_states_abcd',
                              },
                              matrix_directory=os.path.join(temp_dir, 'matrices'),
                              states=state_filters,
                              user_metadata={},
                              replace=True)
            # chop time
            split_definitions = chopper.chop_time()
            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split['train_matrix']['as_of_times'])
                for test_matrix in split['test_matrices']:
                    all_as_of_times.extend(test_matrix['as_of_times'])
            # de-duplicate dates shared between matrices
            all_as_of_times = list(set(all_as_of_times))
            # generate sparse state table
            state_table_generator.generate_sparse_table(
                dense_state_table='states', as_of_dates=all_as_of_times)
            # create labels table
            label_generator.generate_all_labels(labels_table='labels',
                                                as_of_dates=all_as_of_times,
                                                label_windows=['6months'])
            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            feature_table_tasks = feature_generator.generate_all_table_tasks(
                feature_aggregation_config=[{
                    'prefix': 'cat',
                    'from_obj': 'cat_complaints',
                    'knowledge_date_column': 'as_of_date',
                    'aggregates': [{
                        'quantity': 'cat_sightings',
                        'metrics': ['count', 'avg'],
                    }],
                    'intervals': ['1y'],
                    'groups': ['entity_id']
                }, {
                    'prefix': 'dog',
                    'from_obj': 'dog_complaints',
                    'knowledge_date_column': 'as_of_date',
                    'aggregates': [{
                        'quantity': 'dog_sightings',
                        'metrics': ['count', 'avg'],
                    }],
                    'intervals': ['1y'],
                    'groups': ['entity_id']
                }],
                feature_dates=all_as_of_times,
            )
            # create feature tables
            feature_generator.process_table_tasks(feature_table_tasks)
            # build feature dictionaries from feature tables and
            # subsetting config
            master_feature_dict = feature_dictionary_creator\
                .feature_dictionary(feature_table_tasks.keys())
            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict))
            # figure out what matrices need to be built
            _, matrix_build_tasks =\
                planner.generate_plans(
                    split_definitions,
                    feature_dicts
                )
            # go and build the matrices
            planner.build_all_matrices(matrix_build_tasks)
            # super basic assertion: did matrices we expect get created?
            matrix_directory = os.path.join(temp_dir, 'matrices')
            matrices = [
                path for path in os.listdir(matrix_directory)
                if '.csv' in path
            ]
            metadatas = [
                path for path in os.listdir(matrix_directory)
                if '.yaml' in path
            ]
            assert len(matrices) == expected_num_matrices
            assert len(metadatas) == expected_num_matrices
def test_unevenly_divisible_update_window(self):
    """chop_time still produces well-formed splits when the labeling period
    (Jan 3 – Jan 16) is not evenly divisible by the '5 days' model update
    frequency — the first (oldest) split absorbs the remainder, giving it a
    shorter train matrix."""
    expected_result = [{
        'feature_start_time': datetime.datetime(1990, 1, 1, 0, 0),
        'label_start_time': datetime.datetime(2010, 1, 3, 0, 0),
        'feature_end_time': datetime.datetime(2010, 1, 16, 0, 0),
        'label_end_time': datetime.datetime(2010, 1, 16, 0, 0),
        'train_matrix': {
            'first_as_of_time': datetime.datetime(2010, 1, 3, 0, 0),
            'last_as_of_time': datetime.datetime(2010, 1, 4, 0, 0),
            'matrix_info_end_time': datetime.datetime(2010, 1, 5, 0, 0),
            'as_of_times': [
                datetime.datetime(2010, 1, 3, 0, 0),
                datetime.datetime(2010, 1, 4, 0, 0)
            ],
            'training_label_timespan': '1 day',
            'training_as_of_date_frequency': '1 days',
            'max_training_history': '5 days'
        },
        'test_matrices': [{
            'first_as_of_time': datetime.datetime(2010, 1, 5, 0, 0),
            'last_as_of_time': datetime.datetime(2010, 1, 9, 0, 0),
            'matrix_info_end_time': datetime.datetime(2010, 1, 10, 0, 0),
            'as_of_times': [
                datetime.datetime(2010, 1, 5, 0, 0),
                datetime.datetime(2010, 1, 6, 0, 0),
                datetime.datetime(2010, 1, 7, 0, 0),
                datetime.datetime(2010, 1, 8, 0, 0),
                datetime.datetime(2010, 1, 9, 0, 0)
            ],
            'test_label_timespan': '1 day',
            'test_as_of_date_frequency': '1 days',
            'test_duration': '5 days'
        }]
    }, {
        'feature_start_time': datetime.datetime(1990, 1, 1, 0, 0),
        'label_start_time': datetime.datetime(2010, 1, 3, 0, 0),
        'feature_end_time': datetime.datetime(2010, 1, 16, 0, 0),
        'label_end_time': datetime.datetime(2010, 1, 16, 0, 0),
        'train_matrix': {
            'first_as_of_time': datetime.datetime(2010, 1, 4, 0, 0),
            'last_as_of_time': datetime.datetime(2010, 1, 9, 0, 0),
            'matrix_info_end_time': datetime.datetime(2010, 1, 10, 0, 0),
            'as_of_times': [
                datetime.datetime(2010, 1, 4, 0, 0),
                datetime.datetime(2010, 1, 5, 0, 0),
                datetime.datetime(2010, 1, 6, 0, 0),
                datetime.datetime(2010, 1, 7, 0, 0),
                datetime.datetime(2010, 1, 8, 0, 0),
                datetime.datetime(2010, 1, 9, 0, 0)
            ],
            'training_label_timespan': '1 day',
            'training_as_of_date_frequency': '1 days',
            'max_training_history': '5 days'
        },
        'test_matrices': [{
            'first_as_of_time': datetime.datetime(2010, 1, 10, 0, 0),
            'last_as_of_time': datetime.datetime(2010, 1, 14, 0, 0),
            'matrix_info_end_time': datetime.datetime(2010, 1, 15, 0, 0),
            'as_of_times': [
                datetime.datetime(2010, 1, 10, 0, 0),
                datetime.datetime(2010, 1, 11, 0, 0),
                datetime.datetime(2010, 1, 12, 0, 0),
                datetime.datetime(2010, 1, 13, 0, 0),
                datetime.datetime(2010, 1, 14, 0, 0)
            ],
            'test_label_timespan': '1 day',
            'test_as_of_date_frequency': '1 days',
            'test_duration': '5 days'
        }]
    }]
    chopper = Timechop(
        feature_start_time=datetime.datetime(1990, 1, 1, 0, 0),
        feature_end_time=datetime.datetime(2010, 1, 16, 0, 0),
        label_start_time=datetime.datetime(2010, 1, 3, 0, 0),
        label_end_time=datetime.datetime(2010, 1, 16, 0, 0),
        model_update_frequency='5 days',
        training_as_of_date_frequencies=['1 days'],
        test_as_of_date_frequencies=['1 days'],
        max_training_histories=['5 days'],
        test_durations=['5 days'],
        test_label_timespans=['1 day'],
        training_label_timespans=['1 day'])
    result = chopper.chop_time()
    assert (result == expected_result)
def test_look_back_time_before_modeling_start(self):
    """When the '10 days' max_training_history would reach back before
    label_start_time, the training as_of_times are truncated to begin at
    label_start_time (2010-01-01). Also covers two test as-of frequencies
    ('3 days' and '6 days') producing one test matrix each."""
    expected_result = {
        'feature_start_time': datetime.datetime(1990, 1, 1, 0, 0),
        'label_start_time': datetime.datetime(2010, 1, 1, 0, 0),
        'feature_end_time': datetime.datetime(2010, 1, 11, 0, 0),
        'label_end_time': datetime.datetime(2010, 1, 11, 0, 0),
        'train_matrix': {
            'first_as_of_time': datetime.datetime(2010, 1, 1, 0, 0),
            'last_as_of_time': datetime.datetime(2010, 1, 5, 0, 0),
            'matrix_info_end_time': datetime.datetime(2010, 1, 6, 0, 0),
            'as_of_times': [
                datetime.datetime(2010, 1, 1, 0, 0),
                datetime.datetime(2010, 1, 2, 0, 0),
                datetime.datetime(2010, 1, 3, 0, 0),
                datetime.datetime(2010, 1, 4, 0, 0),
                datetime.datetime(2010, 1, 5, 0, 0)
            ],
            'training_label_timespan': '1 day',
            'training_as_of_date_frequency': '1 days',
            'max_training_history': '10 days'
        },
        'test_matrices': [{
            'first_as_of_time': datetime.datetime(2010, 1, 6, 0, 0),
            'last_as_of_time': datetime.datetime(2010, 1, 9, 0, 0),
            'matrix_info_end_time': datetime.datetime(2010, 1, 10, 0, 0),
            'as_of_times': [
                datetime.datetime(2010, 1, 6, 0, 0),
                datetime.datetime(2010, 1, 9, 0, 0)
            ],
            'test_label_timespan': '1 day',
            'test_as_of_date_frequency': '3 days',
            'test_duration': '5 days'
        }, {
            'first_as_of_time': datetime.datetime(2010, 1, 6, 0, 0),
            'last_as_of_time': datetime.datetime(2010, 1, 6, 0, 0),
            'matrix_info_end_time': datetime.datetime(2010, 1, 7, 0, 0),
            'as_of_times': [
                datetime.datetime(2010, 1, 6, 0, 0),
            ],
            'test_label_timespan': '1 day',
            'test_as_of_date_frequency': '6 days',
            'test_duration': '5 days'
        }]
    }
    chopper = Timechop(
        feature_start_time=datetime.datetime(1990, 1, 1, 0, 0),
        feature_end_time=datetime.datetime(2010, 1, 11, 0, 0),
        label_start_time=datetime.datetime(2010, 1, 1, 0, 0),
        label_end_time=datetime.datetime(2010, 1, 11, 0, 0),
        model_update_frequency='5 days',
        training_as_of_date_frequencies=['1 days'],
        test_as_of_date_frequencies=['3 days', '6 days'],
        max_training_histories=['10 days'],
        test_durations=['5 days'],
        test_label_timespans=['1 day'],
        training_label_timespans=['1 day'])
    result = chopper.generate_matrix_definitions(
        train_test_split_time=datetime.datetime(2010, 1, 6, 0, 0),
        training_as_of_date_frequency='1 days',
        max_training_history='10 days',
        test_duration='5 days',
        test_label_timespan='1 day',
        training_label_timespan='1 day')
    assert result == expected_result
def test_look_back_time_equal_modeling_start(self):
    """When the '5 days' max_training_history reaches back exactly to
    label_start_time, the train matrix starts precisely at
    label_start_time (2010-01-01) with no truncation needed."""
    # TODO: rework this test since the test label window of 3 months
    # cannot be satisfied by the 10 day difference between modeling
    # start and end times, so it's not a very realistic case
    expected_result = {
        'feature_start_time': datetime.datetime(1990, 1, 1, 0, 0),
        'label_start_time': datetime.datetime(2010, 1, 1, 0, 0),
        'feature_end_time': datetime.datetime(2010, 1, 11, 0, 0),
        'label_end_time': datetime.datetime(2010, 1, 11, 0, 0),
        'train_matrix': {
            'first_as_of_time': datetime.datetime(2010, 1, 1, 0, 0),
            'last_as_of_time': datetime.datetime(2010, 1, 5, 0, 0),
            'matrix_info_end_time': datetime.datetime(2010, 1, 6, 0, 0),
            'as_of_times': [
                datetime.datetime(2010, 1, 1, 0, 0),
                datetime.datetime(2010, 1, 2, 0, 0),
                datetime.datetime(2010, 1, 3, 0, 0),
                datetime.datetime(2010, 1, 4, 0, 0),
                datetime.datetime(2010, 1, 5, 0, 0)
            ],
            'training_label_timespan': '1 day',
            'training_as_of_date_frequency': '1 days',
            'max_training_history': '5 days'
        },
        'test_matrices': [{
            'first_as_of_time': datetime.datetime(2010, 1, 6, 0, 0),
            'last_as_of_time': datetime.datetime(2010, 1, 9, 0, 0),
            'matrix_info_end_time': datetime.datetime(2010, 1, 10, 0, 0),
            'as_of_times': [
                datetime.datetime(2010, 1, 6, 0, 0),
                datetime.datetime(2010, 1, 9, 0, 0)
            ],
            'test_label_timespan': '1 day',
            'test_as_of_date_frequency': '3 days',
            'test_duration': '5 days'
        }]
    }
    chopper = Timechop(
        feature_start_time=datetime.datetime(1990, 1, 1, 0, 0),
        feature_end_time=datetime.datetime(2010, 1, 11, 0, 0),
        label_start_time=datetime.datetime(2010, 1, 1, 0, 0),
        label_end_time=datetime.datetime(2010, 1, 11, 0, 0),
        model_update_frequency='5 days',
        training_as_of_date_frequencies=['1 days'],
        test_as_of_date_frequencies=['3 days'],
        max_training_histories=['5 days'],
        test_durations=['5 days'],
        test_label_timespans=['1 day'],
        training_label_timespans=['1 day'])
    result = chopper.generate_matrix_definitions(
        train_test_split_time=datetime.datetime(2010, 1, 6, 0, 0),
        training_as_of_date_frequency='1 days',
        max_training_history='5 days',
        test_duration='5 days',
        test_label_timespan='1 day',
        training_label_timespan='1 day')
    assert result == expected_result
def basic_integration_test(
    state_filters,
    feature_group_create_rules,
    feature_group_mix_rules,
    expected_num_matrices
):
    """Integration smoke test for the full matrix-building pipeline.

    Runs against a disposable Postgres: chops time with Timechop, builds
    the sparse state table, labels, and feature tables, then plans and
    builds matrices, finally checking that exactly the expected number of
    matrix (.csv) and metadata (.yaml) files were written.

    Args:
        state_filters: state filters forwarded to the Planner.
        feature_group_create_rules: rules for FeatureGroupCreator.
        feature_group_mix_rules: rules for FeatureGroupMixer.
        expected_num_matrices: expected count of built matrix files.
    """
    with testing.postgresql.Postgresql() as postgresql:
        # New throwaway database each run; schema + fixture data loaded here.
        db_engine = create_engine(postgresql.url())
        Base.metadata.create_all(db_engine)
        populate_source_data(db_engine)
        with TemporaryDirectory() as temp_dir:
            # NOTE(review): these Timechop keyword names (beginning_of_time,
            # modeling_start_time, ...) differ from those used by the unit
            # tests in this file — likely an earlier API; verify before reuse.
            chopper = Timechop(
                beginning_of_time=datetime(2010, 1, 1),
                modeling_start_time=datetime(2011, 1, 1),
                modeling_end_time=datetime(2014, 1, 1),
                update_window='1y',
                train_label_windows=['6months'],
                test_label_windows=['6months'],
                train_example_frequency='1day',
                test_example_frequency='3months',
                train_durations=['1months'],
                test_durations=['1months'],
            )
            state_table_generator = StateTableGenerator(
                db_engine=db_engine,
                experiment_hash='abcd'
            )
            label_generator = BinaryLabelGenerator(
                db_engine=db_engine,
                events_table='events'
            )
            feature_generator = FeatureGenerator(
                db_engine=db_engine,
                features_schema_name='features',
                replace=True,
            )
            feature_dictionary_creator = FeatureDictionaryCreator(
                db_engine=db_engine,
                features_schema_name='features'
            )
            feature_group_creator = FeatureGroupCreator(feature_group_create_rules)
            feature_group_mixer = FeatureGroupMixer(feature_group_mix_rules)
            planner = Planner(
                engine=db_engine,
                beginning_of_time=datetime(2010, 1, 1),
                label_names=['outcome'],
                label_types=['binary'],
                db_config={
                    'features_schema_name': 'features',
                    'labels_schema_name': 'public',
                    'labels_table_name': 'labels',
                    'sparse_state_table_name': 'tmp_sparse_states_abcd',
                },
                matrix_directory=os.path.join(temp_dir, 'matrices'),
                states=state_filters,
                user_metadata={},
                replace=True
            )
            # chop time
            split_definitions = chopper.chop_time()
            # generate as_of_times for feature/label/state generation
            all_as_of_times = []
            for split in split_definitions:
                all_as_of_times.extend(split['train_matrix']['as_of_times'])
                for test_matrix in split['test_matrices']:
                    all_as_of_times.extend(test_matrix['as_of_times'])
            # collapse duplicates shared between matrices
            all_as_of_times = list(set(all_as_of_times))
            # generate sparse state table
            state_table_generator.generate_sparse_table(
                dense_state_table='states',
                as_of_dates=all_as_of_times
            )
            # create labels table
            label_generator.generate_all_labels(
                labels_table='labels',
                as_of_dates=all_as_of_times,
                label_windows=['6months']
            )
            # create feature table tasks
            # we would use FeatureGenerator#create_all_tables but want to use
            # the tasks dict directly to create a feature dict
            feature_table_tasks = feature_generator.generate_all_table_tasks(
                feature_aggregation_config=[{
                    'prefix': 'cat',
                    'from_obj': 'cat_complaints',
                    'knowledge_date_column': 'as_of_date',
                    'aggregates': [{
                        'quantity': 'cat_sightings',
                        'metrics': ['count', 'avg'],
                    }],
                    'intervals': ['1y'],
                    'groups': ['entity_id']
                }, {
                    'prefix': 'dog',
                    'from_obj': 'dog_complaints',
                    'knowledge_date_column': 'as_of_date',
                    'aggregates': [{
                        'quantity': 'dog_sightings',
                        'metrics': ['count', 'avg'],
                    }],
                    'intervals': ['1y'],
                    'groups': ['entity_id']
                }],
                feature_dates=all_as_of_times,
            )
            # create feature tables
            feature_generator.process_table_tasks(feature_table_tasks)
            # build feature dictionaries from feature tables and
            # subsetting config
            master_feature_dict = feature_dictionary_creator\
                .feature_dictionary(feature_table_tasks.keys())
            feature_dicts = feature_group_mixer.generate(
                feature_group_creator.subsets(master_feature_dict)
            )
            # figure out what matrices need to be built
            _, matrix_build_tasks =\
                planner.generate_plans(
                    split_definitions,
                    feature_dicts
                )
            # go and build the matrices
            planner.build_all_matrices(matrix_build_tasks)
            # super basic assertion: did matrices we expect get created?
            matrix_directory = os.path.join(temp_dir, 'matrices')
            matrices = [path for path in os.listdir(matrix_directory)
                        if '.csv' in path]
            metadatas = [path for path in os.listdir(matrix_directory)
                         if '.yaml' in path]
            assert len(matrices) == expected_num_matrices
            assert len(metadatas) == expected_num_matrices