def test_test_matrix(self):
    """Building a 'test' matrix from the good fixture metadata yields 5 rows."""
    with testing.postgresql.Postgresql() as postgresql:
        # Spin up a throwaway Postgres and seed it with fake feature data.
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        create_schemas(
            engine=db_engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                engine=db_engine,
            )
            matrix_uuid = metta.generate_uuid(self.good_metadata)
            build_kwargs = dict(
                as_of_times=self.good_dates,
                label_name='booking',
                label_type='binary',
                feature_dictionary=self.good_feature_dictionary,
                matrix_metadata=self.good_metadata,
                matrix_uuid=matrix_uuid,
                matrix_type='test',
            )
            builder.build_matrix(**build_kwargs)
            stored_matrix = matrix_storage_engine.get_store(matrix_uuid).matrix
            assert len(stored_matrix) == 5
def test_hdf_matrix(self):
    # Verify that build_matrix also works when the storage engine is
    # configured to persist matrices through the HDF-backed store class.
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        with get_matrix_storage_engine() as matrix_storage_engine:
            # Swap the storage class so the built matrix is written as HDF.
            matrix_storage_engine.matrix_storage_class = HDFMatrixStore
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
            )
            uuid = metta.generate_uuid(self.good_metadata)
            builder.build_matrix(
                as_of_times=self.good_dates,
                label_name="booking",
                label_type="binary",
                feature_dictionary=self.good_feature_dictionary,
                matrix_metadata=self.good_metadata,
                matrix_uuid=uuid,
                matrix_type="test",
            )
            # The good fixture data is expected to produce a 5-row matrix.
            assert len(matrix_storage_engine.get_store(uuid).matrix) == 5
def test_nullcheck(self):
    # Deduplicate the raw feature fixtures on their first two columns
    # (presumably entity id and date — TODO confirm against the fixtures)
    # and sort for deterministic table contents.
    f0_dict = {(r[0], r[1]): r for r in features0_pre}
    f1_dict = {(r[0], r[1]): r for r in features1_pre}
    features0 = sorted(f0_dict.values(), key=lambda x: (x[1], x[0]))
    features1 = sorted(f1_dict.values(), key=lambda x: (x[1], x[0]))
    features_tables = [features0, features1]
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0),
        ]
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
            )
            feature_dictionary = {
                "features0": ["f1", "f2"],
                "features1": ["f3", "f4"],
            }
            matrix_metadata = {
                "matrix_id": "hi",
                "state": "active",
                "label_name": "booking",
                "end_time": datetime.datetime(2016, 3, 1, 0, 0),
                "feature_start_time": datetime.datetime(2016, 1, 1, 0, 0),
                "label_timespan": "1 month",
                "test_duration": "1 month",
                "indices": ["entity_id", "as_of_date"],
            }
            uuid = metta.generate_uuid(matrix_metadata)
            # Building from this (pre-deduplicated) data must fail loudly.
            with self.assertRaises(ValueError):
                builder.build_matrix(
                    as_of_times=dates,
                    label_name="booking",
                    label_type="binary",
                    feature_dictionary=feature_dictionary,
                    matrix_metadata=matrix_metadata,
                    matrix_uuid=uuid,
                    matrix_type="test",
                )
def test_test_matrix(self):
    """Build a 'test' matrix through the Planner and check the CSV output.

    The written file is expected to contain a header line plus five data
    rows. Fix: drop the ``matrix_dates`` dict that was constructed but
    never passed to anything (dead local), and count rows with a generator
    instead of materializing a throwaway list.
    """
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0),
        ]
        with TemporaryDirectory() as temp_dir:
            planner = Planner(
                feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
                label_names=['booking'],
                label_types=['binary'],
                states=['state_one AND state_two'],
                db_config=db_config,
                matrix_directory=temp_dir,
                user_metadata={},
                engine=engine,
            )
            feature_dictionary = {
                'features0': ['f1', 'f2'],
                'features1': ['f3', 'f4'],
            }
            matrix_metadata = {
                'matrix_id': 'hi',
                'state': 'state_one AND state_two',
                'label_name': 'booking',
                'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                'label_timespan': '1 month',
            }
            uuid = metta.generate_uuid(matrix_metadata)
            planner.build_matrix(
                as_of_times=dates,
                label_name='booking',
                label_type='binary',
                feature_dictionary=feature_dictionary,
                matrix_directory=temp_dir,
                matrix_metadata=matrix_metadata,
                matrix_uuid=uuid,
                matrix_type='test',
            )
            matrix_filename = os.path.join(temp_dir, '{}.csv'.format(uuid))
            # header + 5 data rows
            with open(matrix_filename, 'r') as f:
                reader = csv.reader(f)
                assert sum(1 for _ in reader) == 6
def test_replace_false_rerun(self):
    """With replace=False a rebuild of an existing matrix skips the heavy work."""
    with testing.postgresql.Postgresql() as postgresql:
        # Throwaway Postgres seeded with fake feature data.
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        create_schemas(
            engine=db_engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        as_of_dates = [
            datetime.datetime(2016, month, 1, 0, 0) for month in (1, 2, 3)
        ]
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                engine=db_engine,
                replace=False,
            )
            metadata = {
                'matrix_id': 'hi',
                'state': 'state_one AND state_two',
                'label_name': 'booking',
                'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                'label_timespan': '1 month',
                'test_duration': '1 month',
                'indices': ['entity_id', 'as_of_date'],
            }
            matrix_uuid = metta.generate_uuid(metadata)
            build_args = dict(
                as_of_times=as_of_dates,
                label_name='booking',
                label_type='binary',
                feature_dictionary={
                    'features0': ['f1', 'f2'],
                    'features1': ['f3', 'f4'],
                },
                matrix_metadata=metadata,
                matrix_uuid=matrix_uuid,
                matrix_type='test',
            )
            builder.build_matrix(**build_args)
            assert len(matrix_storage_engine.get_store(matrix_uuid).matrix) == 5
            # On rerun the entity-date table must not be rebuilt.
            builder.make_entity_date_table = Mock()
            builder.build_matrix(**build_args)
            assert not builder.make_entity_date_table.called
def test_replace_true_rerun(self):
    # With replace=True a second build_matrix call for the same uuid must
    # succeed and leave both the stored matrix and its DB record intact.
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        matrix_metadata = matrix_metadata_creator(
            state="state_one and state_two", test_duration="1month")
        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0),
        ]
        feature_dictionary = {
            "features0": ["f1", "f2"],
            "features1": ["f3", "f4"]
        }
        uuid = metta.generate_uuid(matrix_metadata)
        # Shared kwargs so the initial build and the rerun are identical.
        build_args = dict(
            as_of_times=dates,
            label_name="booking",
            label_type="binary",
            feature_dictionary=feature_dictionary,
            matrix_metadata=matrix_metadata,
            matrix_uuid=uuid,
            matrix_type="test",
        )
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
                replace=True,
            )
            builder.build_matrix(**build_args)
            assert len(matrix_storage_engine.get_store(uuid).matrix) == 5
            assert builder.sessionmaker().query(Matrix).get(uuid)
            # rerun
            builder.build_matrix(**build_args)
            assert len(matrix_storage_engine.get_store(uuid).matrix) == 5
            assert builder.sessionmaker().query(Matrix).get(uuid)
def test_nullcheck(self):
    # Deduplicate the raw feature fixtures on their first two columns
    # (presumably entity id and date — TODO confirm against the fixtures)
    # and sort for deterministic table contents.
    f0_dict = {(r[0], r[1]): r for r in features0_pre}
    f1_dict = {(r[0], r[1]): r for r in features1_pre}
    features0 = sorted(f0_dict.values(), key=lambda x: (x[1], x[0]))
    features1 = sorted(f1_dict.values(), key=lambda x: (x[1], x[0]))
    features_tables = [features0, features1]
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states)
        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0)
        ]
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                engine=engine)
            feature_dictionary = {
                'features0': ['f1', 'f2'],
                'features1': ['f3', 'f4'],
            }
            matrix_metadata = {
                'matrix_id': 'hi',
                'state': 'state_one AND state_two',
                'label_name': 'booking',
                'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                'label_timespan': '1 month',
                'test_duration': '1 month',
                'indices': ['entity_id', 'as_of_date'],
            }
            uuid = metta.generate_uuid(matrix_metadata)
            # Building from this (pre-deduplicated) data must fail loudly.
            with self.assertRaises(ValueError):
                builder.build_matrix(
                    as_of_times=dates,
                    label_name='booking',
                    label_type='binary',
                    feature_dictionary=feature_dictionary,
                    matrix_metadata=matrix_metadata,
                    matrix_uuid=uuid,
                    matrix_type='test')
def test_train_matrix(self):
    # Build a 'train' matrix to CSV via the high-memory CSV builder and
    # verify the file holds a header line plus five data rows.
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states)
        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0)
        ]
        with TemporaryDirectory() as temp_dir:
            builder = builders.HighMemoryCSVBuilder(
                db_config=db_config,
                matrix_directory=temp_dir,
                engine=engine)
            # Features wrapped in a named FeatureGroup rather than a bare dict.
            feature_dictionary = FeatureGroup(
                name='mygroup',
                features_by_table={
                    'features0': ['f1', 'f2'],
                    'features1': ['f3', 'f4'],
                })
            matrix_metadata = {
                'matrix_id': 'hi',
                'state': 'state_one AND state_two',
                'label_name': 'booking',
                'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                'label_timespan': '1 month',
                'max_training_history': '1 month'
            }
            uuid = metta.generate_uuid(matrix_metadata)
            builder.build_matrix(
                as_of_times=dates,
                label_name='booking',
                label_type='binary',
                feature_dictionary=feature_dictionary,
                matrix_directory=temp_dir,
                matrix_metadata=matrix_metadata,
                matrix_uuid=uuid,
                matrix_type='train')
            matrix_filename = os.path.join(temp_dir, '{}.csv'.format(uuid))
            # header + 5 data rows
            with open(matrix_filename, 'r') as f:
                reader = csv.reader(f)
                assert (len([row for row in reader]) == 6)
def test_replace_true_rerun(self):
    """With replace=True, both the first build and a rerun for the same uuid
    succeed, producing the same 5-row matrix and a queryable Matrix record."""
    with testing.postgresql.Postgresql() as postgresql:
        # Throwaway Postgres seeded with fake feature data.
        db_engine = create_engine(postgresql.url())
        ensure_db(db_engine)
        create_schemas(
            engine=db_engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        metadata = matrix_metadata_creator(
            state='state_one and state_two',
            test_duration='1month',
        )
        as_of_dates = [
            datetime.datetime(2016, month, 1, 0, 0) for month in (1, 2, 3)
        ]
        matrix_uuid = metta.generate_uuid(metadata)
        build_args = dict(
            as_of_times=as_of_dates,
            label_name='booking',
            label_type='binary',
            feature_dictionary={
                'features0': ['f1', 'f2'],
                'features1': ['f3', 'f4'],
            },
            matrix_metadata=metadata,
            matrix_uuid=matrix_uuid,
            matrix_type='test',
        )
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                engine=db_engine,
                replace=True,
            )
            # Initial build, then an identical rerun: same expectations both times.
            for _attempt in ('initial', 'rerun'):
                builder.build_matrix(**build_args)
                assert len(matrix_storage_engine.get_store(matrix_uuid).matrix) == 5
                assert builder.sessionmaker().query(Matrix).get(matrix_uuid)
def generate_plans(self, matrix_set_definitions, feature_dictionaries):
    """Create build tasks and update the matrix definitions with UUIDs

    :param matrix_set_definitions: the temporal information needed to
        generate each matrix
    :param feature_dictionaries: combinations of features to include in
        matrices
    :type matrix_set_definitions: list
    :type feature_dictionaries: list

    :return: matrix set definitions (updated with matrix uuids) and build
        tasks
    :rtype: tuple (list, dict)
    """
    updated_definitions = []
    # Keyed by matrix uuid so identical matrices requested by multiple
    # definitions get exactly one build task.
    build_tasks = dict()
    for matrix_set in matrix_set_definitions:
        logging.info("Making plans for matrix set %s", matrix_set)
        logging.info(
            "Iterating over %s label names, %s label_types, %s states, "
            "%s feature dictionaries",
            len(self.label_names),
            len(self.label_types),
            len(self.states),
            len(feature_dictionaries),
        )
        train_matrix = matrix_set["train_matrix"]
        # Every combination of label name/type, state filter and feature
        # dictionary yields its own train/test plan.
        for (
            label_name,
            label_type,
            state,
            feature_dictionary,
        ) in itertools.product(
            self.label_names, self.label_types, self.states, feature_dictionaries
        ):
            # Deep-copy so each combination annotates its own clone with
            # uuids instead of mutating the caller's definition.
            matrix_set_clone = copy.deepcopy(matrix_set)
            # get a uuid
            train_metadata = self._make_metadata(
                train_matrix,
                feature_dictionary,
                label_name,
                label_type,
                state,
                "train",
            )
            train_uuid = metta.generate_uuid(train_metadata)
            logging.info(
                "Matrix UUID %s found for train metadata %s",
                train_uuid,
                train_metadata,
            )
            if train_uuid not in build_tasks:
                build_tasks[train_uuid] = self._generate_build_task(
                    train_metadata, train_uuid, train_matrix, feature_dictionary
                )
                logging.info(
                    "Train uuid %s not found in build tasks yet, "
                    "so added",
                    train_uuid,
                )
            else:
                logging.info(
                    "Train uuid %s already found in build tasks", train_uuid
                )
            matrix_set_clone["train_uuid"] = train_uuid
            test_uuids = []
            for test_matrix in matrix_set_clone["test_matrices"]:
                test_metadata = self._make_metadata(
                    test_matrix,
                    feature_dictionary,
                    label_name,
                    label_type,
                    state,
                    "test",
                )
                test_uuid = metta.generate_uuid(test_metadata)
                logging.info(
                    "Matrix UUID %s found for test metadata %s",
                    test_uuid,
                    test_metadata,
                )
                if test_uuid not in build_tasks:
                    build_tasks[test_uuid] = self._generate_build_task(
                        test_metadata, test_uuid, test_matrix, feature_dictionary
                    )
                    logging.info(
                        "Test uuid %s not found in build tasks "
                        "yet, so added",
                        test_uuid,
                    )
                else:
                    logging.info(
                        "Test uuid %s already found in build tasks", test_uuid
                    )
                test_uuids.append(test_uuid)
            matrix_set_clone["test_uuids"] = test_uuids
            updated_definitions.append(matrix_set_clone)
    logging.info(
        "Planner is finished generating matrix plans. "
        "%s matrix definitions and %s unique build tasks found",
        len(updated_definitions),
        len(build_tasks.keys()),
    )
    # NOTE(review): nothing in this function writes to a database — the
    # log line below may be a leftover from an earlier version; verify.
    logging.info("Associated all tasks with experiment in database")
    return updated_definitions, build_tasks
def test_replace_false_rerun(self):
    # With replace=False a rerun for an already-built uuid must skip the
    # expensive work: make_entity_date_table is mocked and must stay uncalled.
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        ensure_db(engine)
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0),
        ]
        with get_matrix_storage_engine() as matrix_storage_engine:
            builder = MatrixBuilder(
                db_config=db_config,
                matrix_storage_engine=matrix_storage_engine,
                experiment_hash=experiment_hash,
                engine=engine,
                replace=False,
            )
            feature_dictionary = {
                "features0": ["f1", "f2"],
                "features1": ["f3", "f4"],
            }
            matrix_metadata = {
                "matrix_id": "hi",
                "state": "active",
                "label_name": "booking",
                "end_time": datetime.datetime(2016, 3, 1, 0, 0),
                "feature_start_time": datetime.datetime(2016, 1, 1, 0, 0),
                "label_timespan": "1 month",
                "test_duration": "1 month",
                "indices": ["entity_id", "as_of_date"],
            }
            uuid = metta.generate_uuid(matrix_metadata)
            builder.build_matrix(
                as_of_times=dates,
                label_name="booking",
                label_type="binary",
                feature_dictionary=feature_dictionary,
                matrix_metadata=matrix_metadata,
                matrix_uuid=uuid,
                matrix_type="test",
            )
            assert len(matrix_storage_engine.get_store(uuid).matrix) == 5
            # rerun
            builder.make_entity_date_table = Mock()
            builder.build_matrix(
                as_of_times=dates,
                label_name="booking",
                label_type="binary",
                feature_dictionary=feature_dictionary,
                matrix_metadata=matrix_metadata,
                matrix_uuid=uuid,
                matrix_type="test",
            )
            assert not builder.make_entity_date_table.called
def test_nullcheck(self):
    """Planner.build_matrix must raise ValueError on data containing nulls.

    Fix: remove the ``matrix_dates`` dict that was constructed but never
    used anywhere in the test (dead local).
    """
    # Deduplicate the raw feature fixtures on their first two columns and
    # sort for deterministic table contents.
    f0_dict = {(r[0], r[1]): r for r in features0_pre}
    f1_dict = {(r[0], r[1]): r for r in features1_pre}
    features0 = sorted(f0_dict.values(), key=lambda x: (x[1], x[0]))
    features1 = sorted(f1_dict.values(), key=lambda x: (x[1], x[0]))
    features_tables = [features0, features1]
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states,
        )
        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0),
        ]
        with TemporaryDirectory() as temp_dir:
            planner = Planner(
                feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
                label_names=['booking'],
                label_types=['binary'],
                states=['state_one AND state_two'],
                db_config=db_config,
                matrix_directory=temp_dir,
                user_metadata={},
                engine=engine,
            )
            feature_dictionary = {
                'features0': ['f1', 'f2'],
                'features1': ['f3', 'f4'],
            }
            matrix_metadata = {
                'matrix_id': 'hi',
                'state': 'state_one AND state_two',
                'label_name': 'booking',
                'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                'label_timespan': '1 month',
            }
            uuid = metta.generate_uuid(matrix_metadata)
            # Building from this data must fail loudly rather than write
            # a matrix with missing values.
            with self.assertRaises(ValueError):
                planner.build_matrix(
                    as_of_times=dates,
                    label_name='booking',
                    label_type='binary',
                    feature_dictionary=feature_dictionary,
                    matrix_directory=temp_dir,
                    matrix_metadata=matrix_metadata,
                    matrix_uuid=uuid,
                    matrix_type='test',
                )