예제 #1
0
    def run_fn(self):
        """Build train/eval datasets, fit the model, and export it for serving."""
        # Resolve the feature spec once and reuse it for both splits.
        spec = get_feature_spec_from_schema(self.schema_path)

        datasets = {
            split: self.input_fn(files, spec)
            for split, files in (('train', self.train_files),
                                 ('eval', self.eval_files))
        }

        model = self.model_fn(train_dataset=datasets['train'],
                              eval_dataset=datasets['eval'])

        # saved_model=True additionally exports a TF SavedModel for serving.
        model.save_pretrained(self.serving_model_dir, saved_model=True)
예제 #2
0
    def get_predictions(self, sample_size: int = 100000):
        """
        Samples prediction data as a pandas DataFrame.

        Args:
            sample_size: # of rows to sample.
        """
        # Inference results are stored as GZIP TFRecords under 'examples'
        # inside the Inferrer component's first artifact.
        inferrer_uri = self.get_artifacts_uri_by_component(
            GDPComponent.Inferrer.name)[0]
        record_files = path_utils.list_dir(
            os.path.join(inferrer_uri, 'examples'))

        # Parse the records with the feature spec derived from the data schema.
        schema_uri = self.get_artifacts_uri_by_component(
            GDPComponent.DataSchema.name)[0]
        feature_spec = get_feature_spec_from_schema(schema_uri)

        raw_dataset = tf.data.TFRecordDataset(record_files,
                                              compression_type='GZIP')
        return convert_raw_dataset_to_pandas(raw_dataset, feature_spec,
                                             sample_size)
예제 #3
0
    def sample_data(self, sample_size: int = 100000):
        """
        Samples data from the datasource as a pandas DataFrame.

        Args:
            sample_size: # of rows to sample.
        """
        # Use one representative pipeline to locate the raw data files.
        pipeline = self._get_one_pipeline()
        data_files = self._get_data_file_paths(pipeline)

        # Parse the records with the feature spec derived from the data schema.
        schema_uri = pipeline.get_artifacts_uri_by_component(
            GDPComponent.DataSchema.name)[0]
        spec = get_feature_spec_from_schema(schema_uri)

        dataset = tf.data.TFRecordDataset(data_files, compression_type='GZIP')
        return convert_raw_dataset_to_pandas(dataset, spec, sample_size)
예제 #4
0
    def sample_transformed_data(self,
                                split_name: Text = 'eval',
                                sample_size: int = 100000):
        """
        Samples transformed data as a pandas DataFrame.

        Args:
            split_name: name of split to see
            sample_size: # of rows to sample.
        """
        # The Transform artifact stores the post-transform schema under
        # 'transformed_metadata'.
        transform_uri = self.get_artifacts_uri_by_component(
            GDPComponent.Transform.name)[0]
        feature_spec = get_feature_spec_from_schema(
            os.path.join(transform_uri, 'transformed_metadata'))

        # Transformed examples live in a sibling tree keyed by the same
        # artifact id: <root>/transformed_examples/<id>/<split>.
        artifact_dir = Path(transform_uri)
        split_dir = (artifact_dir.parent.parent / 'transformed_examples'
                     / artifact_dir.name / split_name)

        record_files = path_utils.list_dir(str(split_dir))
        raw_dataset = tf.data.TFRecordDataset(record_files,
                                              compression_type='GZIP')
        return convert_raw_dataset_to_pandas(raw_dataset, feature_spec,
                                             sample_size)