예제 #1
0
    def input_fn(self,
                 file_pattern: List[Text],
                 tf_transform_output: tft.TFTransformOutput):
        """
        Load transformed TFRecords from disk into pandas DataFrames.

        Args:
            file_pattern: File patterns matching saved TFRecords on disk.
             Only the first pattern is used; its `*` wildcards are removed
             and the remaining path is listed to find the record files.
            tf_transform_output: Output of the preceding Transform /
             Preprocessing component.

        Returns:
            X: DataFrame with every column whose name does not contain
             `label_`.
            y: DataFrame with every column whose name contains `label_`.

        Raises:
            IndexError: If `file_pattern` is empty.
        """
        full_feature_spec = tf_transform_output.transformed_feature_spec()

        # Only features carrying the `_xf` suffix are parsed.
        xf_feature_spec = {name: full_feature_spec[name]
                           for name in full_feature_spec
                           if name.endswith('_xf')}

        # Only the first pattern matters, so strip the wildcard from that one
        # directly instead of transforming the whole list.
        root_path = file_pattern[0].replace("*", "")
        dataset = tf.data.TFRecordDataset(
            path_utils.list_dir(root_path),  # a bit ugly
            compression_type='GZIP')
        # 100000 is handed to the helper as its row-sample size.
        df = convert_raw_dataset_to_pandas(dataset, xf_feature_spec, 100000)

        # Separate labels from features by column name.
        label_columns = [col for col in df.columns if 'label_' in col]
        feature_columns = [col for col in df.columns if 'label_' not in col]
        X = df[feature_columns]
        y = df[label_columns]
        return X, y
예제 #2
0
    def get_predictions(self, sample_size: int = 100000):
        """
        Load the Inferrer component's output examples as a pandas DataFrame.

        Args:
            sample_size: Number of rows to sample; forwarded unchanged to
             `convert_raw_dataset_to_pandas`.

        Returns:
            The result of `convert_raw_dataset_to_pandas` applied to the
            GZIP-compressed TFRecords listed under the `examples` directory
            of the Inferrer artifact.
        """
        # Use the first URI returned for the Inferrer component.
        inferrer_uri = self.get_artifacts_uri_by_component(
            GDPComponent.Inferrer.name)[0]
        examples_dir = os.path.join(inferrer_uri, 'examples')
        record_files = path_utils.list_dir(examples_dir)
        records = tf.data.TFRecordDataset(
            record_files,
            compression_type='GZIP')

        # The feature spec is derived from the DataSchema artifact.
        feature_spec = get_feature_spec_from_schema(
            self.get_artifacts_uri_by_component(
                GDPComponent.DataSchema.name)[0])
        return convert_raw_dataset_to_pandas(
            records, feature_spec, sample_size)
예제 #3
0
    def sample_data(self, sample_size: int = 100000):
        """
        Samples data from the datasource as a pandas DataFrame.

        Args:
            sample_size: Number of rows to sample; forwarded unchanged to
             `convert_raw_dataset_to_pandas`.

        Returns:
            The result of `convert_raw_dataset_to_pandas` applied to the
            GZIP-compressed TFRecords of one pipeline of this datasource.
        """
        source_pipeline = self._get_one_pipeline()
        record_files = self._get_data_file_paths(source_pipeline)

        # The feature spec comes from the first DataSchema artifact URI of
        # the same pipeline the data files were taken from.
        feature_spec = get_feature_spec_from_schema(
            source_pipeline.get_artifacts_uri_by_component(
                GDPComponent.DataSchema.name)[0])

        records = tf.data.TFRecordDataset(
            record_files,
            compression_type='GZIP')
        return convert_raw_dataset_to_pandas(
            records, feature_spec, sample_size)
예제 #4
0
    def sample_transformed_data(self,
                                split_name: Text = 'eval',
                                sample_size: int = 100000):
        """
        Load one split of the transformed examples as a pandas DataFrame.

        The records are read from
        `<grandparent of the Transform URI>/transformed_examples/
        <basename of the Transform URI>/<split_name>`.

        Args:
            split_name: Name of the split directory to read.
            sample_size: Number of rows to sample; forwarded unchanged to
             `convert_raw_dataset_to_pandas`.

        Returns:
            The result of `convert_raw_dataset_to_pandas` applied to the
            GZIP-compressed TFRecords of the requested split.
        """
        # Use the first URI returned for the Transform component.
        transform_uri = self.get_artifacts_uri_by_component(
            GDPComponent.Transform.name)[0]
        feature_spec = get_feature_spec_from_schema(
            os.path.join(transform_uri, 'transformed_metadata'))

        # The transformed examples sit two levels above the Transform URI,
        # in a directory named after the URI's last path component.
        uri_path = Path(transform_uri)
        examples_root = os.path.join(
            str(uri_path.parent.parent),
            'transformed_examples',
            uri_path.name)

        split_files = path_utils.list_dir(
            os.path.join(examples_root, split_name))
        records = tf.data.TFRecordDataset(
            split_files,
            compression_type='GZIP')
        return convert_raw_dataset_to_pandas(
            records, feature_spec, sample_size)