示例#1
0
    def test_test_matrix(self):
        """Build a 'test' matrix with MatrixBuilder and check its stored length.

        The as-of dates, feature dictionary and metadata come from the
        ``good_*`` attributes on the test case. The matrix read back from
        the storage engine is expected to have length 5.
        """
        with testing.postgresql.Postgresql() as pg_instance:
            # point an engine at the test database and load the fake feature data
            db_engine = create_engine(pg_instance.url())
            ensure_db(db_engine)
            create_schemas(
                engine=db_engine,
                features_tables=features_tables,
                labels=labels,
                states=states,
            )

            with get_matrix_storage_engine() as storage:
                matrix_builder = MatrixBuilder(
                    db_config=db_config,
                    matrix_storage_engine=storage,
                    engine=db_engine,
                )

                matrix_uuid = metta.generate_uuid(self.good_metadata)
                build_kwargs = {
                    'as_of_times': self.good_dates,
                    'label_name': 'booking',
                    'label_type': 'binary',
                    'feature_dictionary': self.good_feature_dictionary,
                    'matrix_metadata': self.good_metadata,
                    'matrix_uuid': matrix_uuid,
                    'matrix_type': 'test',
                }
                matrix_builder.build_matrix(**build_kwargs)

                stored_matrix = storage.get_store(matrix_uuid).matrix
                assert len(stored_matrix) == 5
示例#2
0
    def test_hdf_matrix(self):
        """Build a 'test' matrix with HDFMatrixStore as the storage class.

        The storage engine's ``matrix_storage_class`` is switched to
        ``HDFMatrixStore`` before the builder is created. The matrix read
        back from the storage engine is expected to have length 5.
        """
        with testing.postgresql.Postgresql() as pg_instance:
            # point an engine at the test database and load the fake feature data
            db_engine = create_engine(pg_instance.url())
            ensure_db(db_engine)
            schema_kwargs = {
                "engine": db_engine,
                "features_tables": features_tables,
                "labels": labels,
                "states": states,
            }
            create_schemas(**schema_kwargs)

            with get_matrix_storage_engine() as storage:
                # swap the storage class before the builder is constructed
                storage.matrix_storage_class = HDFMatrixStore
                matrix_builder = MatrixBuilder(
                    db_config=db_config,
                    matrix_storage_engine=storage,
                    experiment_hash=experiment_hash,
                    engine=db_engine,
                )

                matrix_uuid = metta.generate_uuid(self.good_metadata)
                build_kwargs = {
                    "as_of_times": self.good_dates,
                    "label_name": "booking",
                    "label_type": "binary",
                    "feature_dictionary": self.good_feature_dictionary,
                    "matrix_metadata": self.good_metadata,
                    "matrix_uuid": matrix_uuid,
                    "matrix_type": "test",
                }
                matrix_builder.build_matrix(**build_kwargs)

                stored_matrix = storage.get_store(matrix_uuid).matrix
                assert len(stored_matrix) == 5
示例#3
0
    def test_nullcheck(self):
        """Expect ``build_matrix`` to raise ValueError for the ``*_pre`` feature rows.

        The raw ``features0_pre`` / ``features1_pre`` rows are de-duplicated
        on their first two fields (last occurrence wins) and sorted by
        (second field, first field) before being loaded.
        NOTE(review): presumably the ``*_pre`` rows contain nulls, which is
        what triggers the ValueError -- confirm against the fixtures.
        """

        def unique_sorted(rows):
            # keep one row per (row[0], row[1]) pair, then order by (row[1], row[0])
            by_key = {}
            for row in rows:
                by_key[(row[0], row[1])] = row
            return sorted(by_key.values(), key=lambda row: (row[1], row[0]))

        null_features_tables = [
            unique_sorted(features0_pre),
            unique_sorted(features1_pre),
        ]

        with testing.postgresql.Postgresql() as pg_instance:
            # point an engine at the test database and load the fake feature data
            db_engine = create_engine(pg_instance.url())
            create_schemas(
                engine=db_engine,
                features_tables=null_features_tables,
                labels=labels,
                states=states,
            )

            as_of_dates = [
                datetime.datetime(2016, month, 1, 0, 0) for month in (1, 2, 3)
            ]

            with get_matrix_storage_engine() as storage:
                matrix_builder = MatrixBuilder(
                    db_config=db_config,
                    matrix_storage_engine=storage,
                    experiment_hash=experiment_hash,
                    engine=db_engine,
                )

                metadata = {
                    "matrix_id": "hi",
                    "state": "active",
                    "label_name": "booking",
                    "end_time": datetime.datetime(2016, 3, 1, 0, 0),
                    "feature_start_time": datetime.datetime(2016, 1, 1, 0, 0),
                    "label_timespan": "1 month",
                    "test_duration": "1 month",
                    "indices": ["entity_id", "as_of_date"],
                }
                build_kwargs = {
                    "as_of_times": as_of_dates,
                    "label_name": "booking",
                    "label_type": "binary",
                    "feature_dictionary": {
                        "features0": ["f1", "f2"],
                        "features1": ["f3", "f4"],
                    },
                    "matrix_metadata": metadata,
                    "matrix_uuid": metta.generate_uuid(metadata),
                    "matrix_type": "test",
                }
                with self.assertRaises(ValueError):
                    matrix_builder.build_matrix(**build_kwargs)
示例#4
0
    def test_test_matrix(self):
        """Build a 'test' matrix through Planner and check the CSV row count.

        The planner writes ``<uuid>.csv`` into a temporary directory; the
        file is expected to contain 6 CSV rows in total.
        """
        with testing.postgresql.Postgresql() as postgresql:
            # create an engine and generate a table with fake feature data
            engine = create_engine(postgresql.url())
            create_schemas(engine=engine,
                           features_tables=features_tables,
                           labels=labels,
                           states=states)

            dates = [
                datetime.datetime(2016, 1, 1, 0, 0),
                datetime.datetime(2016, 2, 1, 0, 0),
                datetime.datetime(2016, 3, 1, 0, 0)
            ]

            with TemporaryDirectory() as temp_dir:
                planner = Planner(feature_start_time=datetime.datetime(
                    2010, 1, 1, 0, 0),
                                  label_names=['booking'],
                                  label_types=['binary'],
                                  states=['state_one AND state_two'],
                                  db_config=db_config,
                                  matrix_directory=temp_dir,
                                  user_metadata={},
                                  engine=engine)

                # (the unused ``matrix_dates`` dict that used to be built
                # here was dead code and has been dropped)
                feature_dictionary = {
                    'features0': ['f1', 'f2'],
                    'features1': ['f3', 'f4'],
                }
                matrix_metadata = {
                    'matrix_id': 'hi',
                    'state': 'state_one AND state_two',
                    'label_name': 'booking',
                    'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                    'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                    'label_timespan': '1 month'
                }
                uuid = metta.generate_uuid(matrix_metadata)
                planner.build_matrix(as_of_times=dates,
                                     label_name='booking',
                                     label_type='binary',
                                     feature_dictionary=feature_dictionary,
                                     matrix_directory=temp_dir,
                                     matrix_metadata=matrix_metadata,
                                     matrix_uuid=uuid,
                                     matrix_type='test')
                matrix_filename = os.path.join(temp_dir, '{}.csv'.format(uuid))

                with open(matrix_filename, 'r') as f:
                    # count every CSV row in the written file
                    assert len(list(csv.reader(f))) == 6
示例#5
0
    def test_replace_false_rerun(self):
        """With ``replace=False``, a second build must not remake the entity-date table.

        The matrix is built once and its stored length checked (5). Then
        ``make_entity_date_table`` is swapped for a Mock and the same build
        is requested again; the Mock must not have been called.
        """
        with testing.postgresql.Postgresql() as pg_instance:
            # point an engine at the test database and load the fake feature data
            db_engine = create_engine(pg_instance.url())
            ensure_db(db_engine)
            create_schemas(
                engine=db_engine,
                features_tables=features_tables,
                labels=labels,
                states=states,
            )

            as_of_dates = [
                datetime.datetime(2016, month, 1, 0, 0) for month in (1, 2, 3)
            ]

            with get_matrix_storage_engine() as storage:
                matrix_builder = MatrixBuilder(
                    db_config=db_config,
                    matrix_storage_engine=storage,
                    engine=db_engine,
                    replace=False,
                )

                metadata = {
                    'matrix_id': 'hi',
                    'state': 'state_one AND state_two',
                    'label_name': 'booking',
                    'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                    'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                    'label_timespan': '1 month',
                    'test_duration': '1 month',
                    'indices': ['entity_id', 'as_of_date'],
                }
                matrix_uuid = metta.generate_uuid(metadata)
                # both builds use exactly the same arguments
                build_kwargs = {
                    'as_of_times': as_of_dates,
                    'label_name': 'booking',
                    'label_type': 'binary',
                    'feature_dictionary': {
                        'features0': ['f1', 'f2'],
                        'features1': ['f3', 'f4'],
                    },
                    'matrix_metadata': metadata,
                    'matrix_uuid': matrix_uuid,
                    'matrix_type': 'test',
                }

                matrix_builder.build_matrix(**build_kwargs)
                assert len(storage.get_store(matrix_uuid).matrix) == 5

                # rerun with the table-building step replaced by a Mock
                matrix_builder.make_entity_date_table = Mock()
                matrix_builder.build_matrix(**build_kwargs)
                assert not matrix_builder.make_entity_date_table.called
示例#6
0
    def test_replace_true_rerun(self):
        """With ``replace=True``, building the same matrix twice leaves it in place.

        After each of the two builds, the stored matrix must have length 5
        and a ``Matrix`` row for the UUID must be retrievable through the
        builder's session.
        """
        with testing.postgresql.Postgresql() as pg_instance:
            # point an engine at the test database and load the fake feature data
            db_engine = create_engine(pg_instance.url())
            ensure_db(db_engine)
            create_schemas(
                engine=db_engine,
                features_tables=features_tables,
                labels=labels,
                states=states,
            )
            metadata = matrix_metadata_creator(
                state="state_one and state_two", test_duration="1month"
            )

            as_of_dates = [
                datetime.datetime(2016, month, 1, 0, 0) for month in (1, 2, 3)
            ]
            features = {
                "features0": ["f1", "f2"],
                "features1": ["f3", "f4"],
            }
            matrix_uuid = metta.generate_uuid(metadata)
            build_kwargs = {
                "as_of_times": as_of_dates,
                "label_name": "booking",
                "label_type": "binary",
                "feature_dictionary": features,
                "matrix_metadata": metadata,
                "matrix_uuid": matrix_uuid,
                "matrix_type": "test",
            }

            with get_matrix_storage_engine() as storage:
                matrix_builder = MatrixBuilder(
                    db_config=db_config,
                    matrix_storage_engine=storage,
                    experiment_hash=experiment_hash,
                    engine=db_engine,
                    replace=True,
                )

                def build_and_check():
                    # build, then verify both the stored matrix and its DB record
                    matrix_builder.build_matrix(**build_kwargs)
                    assert len(storage.get_store(matrix_uuid).matrix) == 5
                    assert matrix_builder.sessionmaker().query(Matrix).get(matrix_uuid)

                build_and_check()
                # rerun
                build_and_check()
示例#7
0
    def test_nullcheck(self):
        """Expect ``build_matrix`` to raise ValueError for the ``*_pre`` feature rows.

        The raw ``features0_pre`` / ``features1_pre`` rows are de-duplicated
        on their first two fields (last occurrence wins) and sorted by
        (second field, first field) before being loaded.
        NOTE(review): presumably the ``*_pre`` rows contain nulls, which is
        what triggers the ValueError -- confirm against the fixtures.
        """

        def unique_sorted(rows):
            # keep one row per (row[0], row[1]) pair, then order by (row[1], row[0])
            by_key = {}
            for row in rows:
                by_key[(row[0], row[1])] = row
            return sorted(by_key.values(), key=lambda row: (row[1], row[0]))

        null_features_tables = [
            unique_sorted(features0_pre),
            unique_sorted(features1_pre),
        ]

        with testing.postgresql.Postgresql() as pg_instance:
            # point an engine at the test database and load the fake feature data
            db_engine = create_engine(pg_instance.url())
            create_schemas(
                engine=db_engine,
                features_tables=null_features_tables,
                labels=labels,
                states=states,
            )

            as_of_dates = [
                datetime.datetime(2016, month, 1, 0, 0) for month in (1, 2, 3)
            ]

            with get_matrix_storage_engine() as storage:
                matrix_builder = MatrixBuilder(
                    db_config=db_config,
                    matrix_storage_engine=storage,
                    engine=db_engine,
                )

                metadata = {
                    'matrix_id': 'hi',
                    'state': 'state_one AND state_two',
                    'label_name': 'booking',
                    'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                    'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                    'label_timespan': '1 month',
                    'test_duration': '1 month',
                    'indices': ['entity_id', 'as_of_date'],
                }
                build_kwargs = {
                    'as_of_times': as_of_dates,
                    'label_name': 'booking',
                    'label_type': 'binary',
                    'feature_dictionary': {
                        'features0': ['f1', 'f2'],
                        'features1': ['f3', 'f4'],
                    },
                    'matrix_metadata': metadata,
                    'matrix_uuid': metta.generate_uuid(metadata),
                    'matrix_type': 'test',
                }
                with self.assertRaises(ValueError):
                    matrix_builder.build_matrix(**build_kwargs)
示例#8
0
    def test_train_matrix(self):
        """Build a 'train' matrix with HighMemoryCSVBuilder and check the CSV row count.

        The builder writes ``<uuid>.csv`` into a temporary directory; the
        file is expected to contain 6 CSV rows in total.
        """
        with testing.postgresql.Postgresql() as pg_instance:
            # point an engine at the test database and load the fake feature data
            db_engine = create_engine(pg_instance.url())
            ensure_db(db_engine)
            create_schemas(
                engine=db_engine,
                features_tables=features_tables,
                labels=labels,
                states=states,
            )

            as_of_dates = [
                datetime.datetime(2016, month, 1, 0, 0) for month in (1, 2, 3)
            ]

            with TemporaryDirectory() as output_dir:
                csv_builder = builders.HighMemoryCSVBuilder(
                    db_config=db_config,
                    matrix_directory=output_dir,
                    engine=db_engine,
                )
                feature_group = FeatureGroup(
                    name='mygroup',
                    features_by_table={
                        'features0': ['f1', 'f2'],
                        'features1': ['f3', 'f4'],
                    },
                )
                metadata = {
                    'matrix_id': 'hi',
                    'state': 'state_one AND state_two',
                    'label_name': 'booking',
                    'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                    'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                    'label_timespan': '1 month',
                    'max_training_history': '1 month',
                }
                matrix_uuid = metta.generate_uuid(metadata)
                csv_builder.build_matrix(
                    as_of_times=as_of_dates,
                    label_name='booking',
                    label_type='binary',
                    feature_dictionary=feature_group,
                    matrix_directory=output_dir,
                    matrix_metadata=metadata,
                    matrix_uuid=matrix_uuid,
                    matrix_type='train',
                )

                csv_path = os.path.join(output_dir, '{}.csv'.format(matrix_uuid))
                with open(csv_path, 'r') as csv_file:
                    # count every CSV row in the written file
                    csv_rows = list(csv.reader(csv_file))
                assert len(csv_rows) == 6
示例#9
0
    def test_replace_true_rerun(self):
        """With ``replace=True``, building the same matrix twice leaves it in place.

        After each of the two builds, the stored matrix must have length 5
        and a ``Matrix`` row for the UUID must be retrievable through the
        builder's session.
        """
        with testing.postgresql.Postgresql() as pg_instance:
            # point an engine at the test database and load the fake feature data
            db_engine = create_engine(pg_instance.url())
            ensure_db(db_engine)
            create_schemas(
                engine=db_engine,
                features_tables=features_tables,
                labels=labels,
                states=states,
            )
            metadata = matrix_metadata_creator(
                state='state_one and state_two', test_duration='1month'
            )

            as_of_dates = [
                datetime.datetime(2016, month, 1, 0, 0) for month in (1, 2, 3)
            ]
            features = {
                'features0': ['f1', 'f2'],
                'features1': ['f3', 'f4'],
            }
            matrix_uuid = metta.generate_uuid(metadata)
            build_kwargs = {
                'as_of_times': as_of_dates,
                'label_name': 'booking',
                'label_type': 'binary',
                'feature_dictionary': features,
                'matrix_metadata': metadata,
                'matrix_uuid': matrix_uuid,
                'matrix_type': 'test',
            }

            with get_matrix_storage_engine() as storage:
                matrix_builder = MatrixBuilder(
                    db_config=db_config,
                    matrix_storage_engine=storage,
                    engine=db_engine,
                    replace=True,
                )

                # pass 1 is the initial build, pass 2 is the rerun; the same
                # checks apply after each
                for _ in range(2):
                    matrix_builder.build_matrix(**build_kwargs)
                    assert len(storage.get_store(matrix_uuid).matrix) == 5
                    assert matrix_builder.sessionmaker().query(Matrix).get(matrix_uuid)
示例#10
0
文件: planner.py 项目: SPETHW/triage
    def generate_plans(self, matrix_set_definitions, feature_dictionaries):
        """Plan one build task per unique matrix and tag matrix sets with UUIDs

        Each matrix set is expanded over the product of the planner's label
        names, label types and states with the given feature dictionaries.
        Every expansion is a deep copy of the matrix set that gains a
        ``train_uuid`` and a ``test_uuids`` entry. Build tasks are keyed by
        matrix UUID, so a matrix reached from several expansions is only
        added once.

        :param matrix_set_definitions: the temporal information needed to generate each matrix
        :param feature_dictionaries: combinations of features to include in matrices
        :type matrix_set_definitions: list
        :type feature_dictionaries: list

        :return: matrix set definitions (updated with matrix uuids) and build tasks
        :rtype: tuple (list, dict)
        """
        planned_sets = []
        tasks_by_uuid = {}
        for matrix_set in matrix_set_definitions:
            logging.info("Making plans for matrix set %s", matrix_set)
            logging.info(
                "Iterating over %s label names, %s label_types, %s states, "
                "%s feature dictionaries",
                len(self.label_names),
                len(self.label_types),
                len(self.states),
                len(feature_dictionaries),
            )
            # the train definition is read from the original matrix set,
            # not from the per-combination copy
            train_definition = matrix_set["train_matrix"]
            combinations = itertools.product(
                self.label_names, self.label_types, self.states, feature_dictionaries
            )
            for label_name, label_type, state, feature_dictionary in combinations:
                planned_set = copy.deepcopy(matrix_set)

                # train matrix: derive its uuid from the metadata
                train_metadata = self._make_metadata(
                    train_definition,
                    feature_dictionary,
                    label_name,
                    label_type,
                    state,
                    "train",
                )
                train_uuid = metta.generate_uuid(train_metadata)
                logging.info(
                    "Matrix UUID %s found for train metadata %s",
                    train_uuid,
                    train_metadata,
                )
                if train_uuid in tasks_by_uuid:
                    logging.info(
                        "Train uuid %s already found in build tasks", train_uuid
                    )
                else:
                    tasks_by_uuid[train_uuid] = self._generate_build_task(
                        train_metadata, train_uuid, train_definition, feature_dictionary
                    )
                    logging.info(
                        "Train uuid %s not found in build tasks yet, so added",
                        train_uuid,
                    )
                planned_set["train_uuid"] = train_uuid

                # test matrices: same procedure, one uuid per test definition
                collected_test_uuids = []
                for test_definition in planned_set["test_matrices"]:
                    test_metadata = self._make_metadata(
                        test_definition,
                        feature_dictionary,
                        label_name,
                        label_type,
                        state,
                        "test",
                    )
                    test_uuid = metta.generate_uuid(test_metadata)
                    logging.info(
                        "Matrix UUID %s found for test metadata %s",
                        test_uuid,
                        test_metadata,
                    )
                    if test_uuid in tasks_by_uuid:
                        logging.info(
                            "Test uuid %s already found in build tasks", test_uuid
                        )
                    else:
                        tasks_by_uuid[test_uuid] = self._generate_build_task(
                            test_metadata, test_uuid, test_definition, feature_dictionary
                        )
                        logging.info(
                            "Test uuid %s not found in build tasks yet, so added",
                            test_uuid,
                        )
                    collected_test_uuids.append(test_uuid)

                planned_set["test_uuids"] = collected_test_uuids
                planned_sets.append(planned_set)

        logging.info(
            "Planner is finished generating matrix plans. "
            "%s matrix definitions and %s unique build tasks found",
            len(planned_sets),
            len(tasks_by_uuid),
        )
        logging.info("Associated all tasks with experiment in database")
        return planned_sets, tasks_by_uuid
示例#11
0
    def test_replace_false_rerun(self):
        """With ``replace=False``, a second build must not remake the entity-date table.

        The matrix is built once and its stored length checked (5). Then
        ``make_entity_date_table`` is swapped for a Mock and the same build
        is requested again; the Mock must not have been called.
        """
        with testing.postgresql.Postgresql() as pg_instance:
            # point an engine at the test database and load the fake feature data
            db_engine = create_engine(pg_instance.url())
            ensure_db(db_engine)
            schema_kwargs = {
                "engine": db_engine,
                "features_tables": features_tables,
                "labels": labels,
                "states": states,
            }
            create_schemas(**schema_kwargs)

            as_of_dates = [
                datetime.datetime(2016, month, 1, 0, 0) for month in (1, 2, 3)
            ]

            with get_matrix_storage_engine() as storage:
                matrix_builder = MatrixBuilder(
                    db_config=db_config,
                    matrix_storage_engine=storage,
                    experiment_hash=experiment_hash,
                    engine=db_engine,
                    replace=False,
                )

                metadata = {
                    "matrix_id": "hi",
                    "state": "active",
                    "label_name": "booking",
                    "end_time": datetime.datetime(2016, 3, 1, 0, 0),
                    "feature_start_time": datetime.datetime(2016, 1, 1, 0, 0),
                    "label_timespan": "1 month",
                    "test_duration": "1 month",
                    "indices": ["entity_id", "as_of_date"],
                }
                matrix_uuid = metta.generate_uuid(metadata)
                # both builds use exactly the same arguments
                build_kwargs = {
                    "as_of_times": as_of_dates,
                    "label_name": "booking",
                    "label_type": "binary",
                    "feature_dictionary": {
                        "features0": ["f1", "f2"],
                        "features1": ["f3", "f4"],
                    },
                    "matrix_metadata": metadata,
                    "matrix_uuid": matrix_uuid,
                    "matrix_type": "test",
                }

                matrix_builder.build_matrix(**build_kwargs)
                assert len(storage.get_store(matrix_uuid).matrix) == 5

                # rerun with the table-building step replaced by a Mock
                matrix_builder.make_entity_date_table = Mock()
                matrix_builder.build_matrix(**build_kwargs)
                assert not matrix_builder.make_entity_date_table.called
示例#12
0
    def test_nullcheck(self):
        """Expect ``Planner.build_matrix`` to raise ValueError for the ``*_pre`` rows.

        The raw ``features0_pre`` / ``features1_pre`` rows are de-duplicated
        on their first two fields (last occurrence wins) and sorted by
        (second field, first field) before being loaded.
        NOTE(review): presumably the ``*_pre`` rows contain nulls, which is
        what triggers the ValueError -- confirm against the fixtures.
        """
        f0_dict = {(r[0], r[1]): r for r in features0_pre}
        f1_dict = {(r[0], r[1]): r for r in features1_pre}

        features0 = sorted(f0_dict.values(), key=lambda x: (x[1], x[0]))
        features1 = sorted(f1_dict.values(), key=lambda x: (x[1], x[0]))

        features_tables = [features0, features1]

        with testing.postgresql.Postgresql() as postgresql:
            # create an engine and generate a table with fake feature data
            engine = create_engine(postgresql.url())
            create_schemas(engine=engine,
                           features_tables=features_tables,
                           labels=labels,
                           states=states)

            dates = [
                datetime.datetime(2016, 1, 1, 0, 0),
                datetime.datetime(2016, 2, 1, 0, 0),
                datetime.datetime(2016, 3, 1, 0, 0)
            ]

            with TemporaryDirectory() as temp_dir:
                planner = Planner(feature_start_time=datetime.datetime(
                    2010, 1, 1, 0, 0),
                                  label_names=['booking'],
                                  label_types=['binary'],
                                  states=['state_one AND state_two'],
                                  db_config=db_config,
                                  matrix_directory=temp_dir,
                                  user_metadata={},
                                  engine=engine)

                # (the unused ``matrix_dates`` dict that used to be built
                # here was dead code and has been dropped)
                feature_dictionary = {
                    'features0': ['f1', 'f2'],
                    'features1': ['f3', 'f4'],
                }
                matrix_metadata = {
                    'matrix_id': 'hi',
                    'state': 'state_one AND state_two',
                    'label_name': 'booking',
                    'end_time': datetime.datetime(2016, 3, 1, 0, 0),
                    'feature_start_time': datetime.datetime(2016, 1, 1, 0, 0),
                    'label_timespan': '1 month'
                }
                uuid = metta.generate_uuid(matrix_metadata)
                with self.assertRaises(ValueError):
                    planner.build_matrix(as_of_times=dates,
                                         label_name='booking',
                                         label_type='binary',
                                         feature_dictionary=feature_dictionary,
                                         matrix_directory=temp_dir,
                                         matrix_metadata=matrix_metadata,
                                         matrix_uuid=uuid,
                                         matrix_type='test')