def generate_plans(self, matrix_set_definitions, feature_dictionaries):
    """Create build tasks and update the matrix definitions with UUIDs

    :param matrix_set_definitions: the temporal information needed to
        generate each matrix
    :param feature_dictionaries: combinations of features to include in
        matrices
    :type matrix_set_definitions: list
    :type feature_dictionaries: list
    :return: matrix set definitions (updated with matrix uuids) and build tasks
    :rtype: tuple (list, dict)
    """
    updated_definitions = []
    build_tasks = {}
    for matrix_set in matrix_set_definitions:
        train_matrix = matrix_set['train_matrix']
        # one cloned definition per combination of labeling/state/feature
        # configuration
        for label_name, label_type, state, feature_dictionary in itertools.product(
            self.label_names,
            self.label_types,
            self.states,
            feature_dictionaries
        ):
            matrix_set_clone = copy.deepcopy(matrix_set)
            train_metadata = self._make_metadata(
                train_matrix,
                feature_dictionary,
                label_name,
                label_type,
                state,
                'train',
            )
            # NOTE: removed stray debug print(train_metadata) left over from
            # development — it spammed stdout on every combination.
            # uuid is derived from the metadata, so identical matrices across
            # sets map to a single shared build task
            train_uuid = metta.generate_uuid(train_metadata)
            if train_uuid not in build_tasks:
                build_tasks[train_uuid] = self._generate_build_task(
                    train_metadata,
                    train_uuid,
                    train_matrix,
                    feature_dictionary
                )
            matrix_set_clone['train_uuid'] = train_uuid
            test_uuids = []
            for test_matrix in matrix_set_clone['test_matrices']:
                test_metadata = self._make_metadata(
                    test_matrix,
                    feature_dictionary,
                    label_name,
                    label_type,
                    state,
                    'test',
                )
                test_uuid = metta.generate_uuid(test_metadata)
                if test_uuid not in build_tasks:
                    build_tasks[test_uuid] = self._generate_build_task(
                        test_metadata,
                        test_uuid,
                        test_matrix,
                        feature_dictionary
                    )
                test_uuids.append(test_uuid)
            matrix_set_clone['test_uuids'] = test_uuids
            updated_definitions.append(matrix_set_clone)
    return updated_definitions, build_tasks
def test_test_matrix(self):
    """Build a 'test'-type matrix end-to-end against a throwaway Postgres
    and verify the written CSV has the expected number of rows.
    """
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states
        )
        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0)
        ]
        with TemporaryDirectory() as temp_dir:
            planner = Planner(
                feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
                label_names=['booking'],
                label_types=['binary'],
                states=['state_one AND state_two'],
                db_config=db_config,
                matrix_directory=temp_dir,
                user_metadata={},
                engine=engine
            )
            # (removed unused local `matrix_dates` — it was assigned but
            # never passed to build_matrix)
            feature_dictionary = {
                'features0': ['f1', 'f2'],
                'features1': ['f3', 'f4'],
            }
            matrix_metadata = {
                'matrix_id': 'hi',
                'state': 'state_one AND state_two',
                'label_name': 'booking',
                'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                'label_timespan': '1 month'
            }
            uuid = metta.generate_uuid(matrix_metadata)
            planner.build_matrix(
                as_of_times=dates,
                label_name='booking',
                label_type='binary',
                feature_dictionary=feature_dictionary,
                matrix_directory=temp_dir,
                matrix_metadata=matrix_metadata,
                matrix_uuid=uuid,
                matrix_type='test'
            )
            matrix_filename = os.path.join(temp_dir, '{}.csv'.format(uuid))
            # 6 rows expected in the written CSV
            with open(matrix_filename, 'r') as f:
                reader = csv.reader(f)
                assert len([row for row in reader]) == 6
def test_train_matrix(self):
    """Build a 'train'-type matrix end-to-end against a throwaway Postgres
    and verify the written CSV has the expected number of rows.
    """
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states
        )
        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0)
        ]
        with TemporaryDirectory() as temp_dir:
            # keyword names updated to the current Planner interface
            # (feature_start_time / label_timespan), matching the other
            # tests in this file; PEP 8 spacing applied to keyword args
            planner = Planner(
                feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
                label_names=['booking'],
                label_types=['binary'],
                states=['state_one AND state_two'],
                db_config=db_config,
                matrix_directory=temp_dir,
                user_metadata={},
                engine=engine
            )
            feature_dictionary = {
                'features0': ['f1', 'f2'],
                'features1': ['f3', 'f4'],
            }
            matrix_metadata = {
                'matrix_id': 'hi',
                'state': 'state_one AND state_two',
                'label_name': 'booking',
                'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                'label_timespan': '1 month'
            }
            uuid = metta.generate_uuid(matrix_metadata)
            planner.build_matrix(
                as_of_times=dates,
                label_name='booking',
                label_type='binary',
                feature_dictionary=feature_dictionary,
                matrix_directory=temp_dir,
                matrix_metadata=matrix_metadata,
                matrix_uuid=uuid,
                matrix_type='train'
            )
            matrix_filename = os.path.join(temp_dir, '{}.csv'.format(uuid))
            # 6 rows expected in the written CSV
            with open(matrix_filename, 'r') as f:
                reader = csv.reader(f)
                assert len([row for row in reader]) == 6
def generate_plans(self, matrix_set_definitions, feature_dictionaries):
    """Create build tasks and update the matrix definitions with UUIDs

    :param matrix_set_definitions: the temporal information needed to
        generate each matrix
    :param feature_dictionaries: combinations of features to include in
        matrices
    :type matrix_set_definitions: list
    :type feature_dictionaries: list
    :return: matrix set definitions (updated with matrix uuids) and build tasks
    :rtype: tuple (list, dict)
    """
    updated_definitions = []
    build_tasks = {}
    for matrix_set in matrix_set_definitions:
        train_matrix = matrix_set['train_matrix']
        # one cloned definition per combination of labeling/state/feature
        # configuration
        for label_name, label_type, state, feature_dictionary in itertools.product(
            self.label_names,
            self.label_types,
            self.states,
            feature_dictionaries
        ):
            matrix_set_clone = copy.deepcopy(matrix_set)
            train_metadata = self._make_metadata(
                train_matrix,
                feature_dictionary,
                label_name,
                label_type,
                state,
                'train',
            )
            # NOTE: removed stray debug print(train_metadata) left over from
            # development — it spammed stdout on every combination.
            # uuid is derived from the metadata, so identical matrices across
            # sets map to a single shared build task
            train_uuid = metta.generate_uuid(train_metadata)
            if train_uuid not in build_tasks:
                build_tasks[train_uuid] = self._generate_build_task(
                    train_metadata,
                    train_uuid,
                    train_matrix,
                    feature_dictionary
                )
            matrix_set_clone['train_uuid'] = train_uuid
            test_uuids = []
            for test_matrix in matrix_set_clone['test_matrices']:
                test_metadata = self._make_metadata(
                    test_matrix,
                    feature_dictionary,
                    label_name,
                    label_type,
                    state,
                    'test',
                )
                test_uuid = metta.generate_uuid(test_metadata)
                if test_uuid not in build_tasks:
                    build_tasks[test_uuid] = self._generate_build_task(
                        test_metadata,
                        test_uuid,
                        test_matrix,
                        feature_dictionary
                    )
                test_uuids.append(test_uuid)
            matrix_set_clone['test_uuids'] = test_uuids
            updated_definitions.append(matrix_set_clone)
    return updated_definitions, build_tasks
def generate_plans(self, matrix_set_definitions, feature_dictionaries):
    """Plan out every matrix that needs to be built.

    For each matrix set, walks the cross product of label names, label
    types, states, and feature dictionaries, assigns a deterministic uuid
    to each train/test matrix, and records one build task per unique uuid.

    :param matrix_set_definitions: the temporal information needed to
        generate each matrix
    :param feature_dictionaries: combinations of features to include in
        matrices
    :type matrix_set_definitions: list
    :type feature_dictionaries: list
    :return: matrix set definitions (updated with matrix uuids) and build tasks
    :rtype: tuple (list, dict)
    """
    definitions = []
    tasks = {}
    for matrix_set in matrix_set_definitions:
        logging.info('Making plans for matrix set %s', matrix_set)
        logging.info(
            'Iterating over %s label names, %s label_types, %s states, %s feature dictionaries',
            len(self.label_names),
            len(self.label_types),
            len(self.states),
            len(feature_dictionaries)
        )
        train_matrix = matrix_set['train_matrix']
        combinations = itertools.product(
            self.label_names,
            self.label_types,
            self.states,
            feature_dictionaries
        )
        for label_name, label_type, state, feature_dictionary in combinations:
            clone = copy.deepcopy(matrix_set)
            train_metadata = self._make_metadata(
                train_matrix,
                feature_dictionary,
                label_name,
                label_type,
                state,
                'train',
            )
            train_uuid = metta.generate_uuid(train_metadata)
            logging.info('Matrix UUID %s found for train metadata %s',
                         train_uuid, train_metadata)
            if train_uuid in tasks:
                logging.info('Train uuid %s already found in build tasks',
                             train_uuid)
            else:
                tasks[train_uuid] = self._generate_build_task(
                    train_metadata,
                    train_uuid,
                    train_matrix,
                    feature_dictionary
                )
                logging.info(
                    'Train uuid %s not found in build tasks yet, so added',
                    train_uuid)
            clone['train_uuid'] = train_uuid
            uuids_for_tests = []
            for test_matrix in clone['test_matrices']:
                test_metadata = self._make_metadata(
                    test_matrix,
                    feature_dictionary,
                    label_name,
                    label_type,
                    state,
                    'test',
                )
                test_uuid = metta.generate_uuid(test_metadata)
                logging.info('Matrix UUID %s found for test metadata %s',
                             test_uuid, test_metadata)
                if test_uuid in tasks:
                    logging.info(
                        'Test uuid %s already found in build tasks',
                        test_uuid)
                else:
                    tasks[test_uuid] = self._generate_build_task(
                        test_metadata,
                        test_uuid,
                        test_matrix,
                        feature_dictionary
                    )
                    logging.info(
                        'Test uuid %s not found in build tasks yet, so added',
                        test_uuid)
                uuids_for_tests.append(test_uuid)
            clone['test_uuids'] = uuids_for_tests
            definitions.append(clone)
    logging.info(
        'Planner is finished generating matrix plans. %s matrix definitions and %s unique build tasks found',
        len(definitions),
        len(tasks)
    )
    return definitions, tasks
def test_nullcheck(self):
    """Build a 'test'-type matrix from deduplicated feature fixtures and
    expect build_matrix to raise ValueError.

    NOTE(review): presumably the deduped fixtures leave null feature
    values that build_matrix rejects — confirm against build_matrix.
    """
    # deduplicate the fixture rows by (entity, date) key before loading
    f0_dict = {(r[0], r[1]): r for r in features0_pre}
    f1_dict = {(r[0], r[1]): r for r in features1_pre}
    features0 = sorted(f0_dict.values(), key=lambda x: (x[1], x[0]))
    features1 = sorted(f1_dict.values(), key=lambda x: (x[1], x[0]))
    features_tables = [features0, features1]
    with testing.postgresql.Postgresql() as postgresql:
        # create an engine and generate a table with fake feature data
        engine = create_engine(postgresql.url())
        create_schemas(
            engine=engine,
            features_tables=features_tables,
            labels=labels,
            states=states
        )
        dates = [
            datetime.datetime(2016, 1, 1, 0, 0),
            datetime.datetime(2016, 2, 1, 0, 0),
            datetime.datetime(2016, 3, 1, 0, 0)
        ]
        with TemporaryDirectory() as temp_dir:
            planner = Planner(
                feature_start_time=datetime.datetime(2010, 1, 1, 0, 0),
                label_names=['booking'],
                label_types=['binary'],
                states=['state_one AND state_two'],
                db_config=db_config,
                matrix_directory=temp_dir,
                user_metadata={},
                engine=engine
            )
            # (removed unused local `matrix_dates` — it was assigned but
            # never passed to build_matrix)
            feature_dictionary = {
                'features0': ['f1', 'f2'],
                'features1': ['f3', 'f4'],
            }
            matrix_metadata = {
                'matrix_id': 'hi',
                'state': 'state_one AND state_two',
                'label_name': 'booking',
                'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                'label_timespan': '1 month'
            }
            uuid = metta.generate_uuid(matrix_metadata)
            with self.assertRaises(ValueError):
                planner.build_matrix(
                    as_of_times=dates,
                    label_name='booking',
                    label_type='binary',
                    feature_dictionary=feature_dictionary,
                    matrix_directory=temp_dir,
                    matrix_metadata=matrix_metadata,
                    matrix_uuid=uuid,
                    matrix_type='test'
                )