def input_fn(self, file_pattern: List[Text],
             tf_transform_output: tft.TFTransformOutput):
    """Load transformed TFRecords from disk and split them into features
    and labels as pandas DataFrames.

    Args:
        file_pattern: File patterns matching saved TFRecords on disk.
            Only the first pattern is used.
        tf_transform_output: Output of the preceding Transform /
            Preprocessing component.

    Returns:
        X: pandas DataFrame containing the feature columns (column names
            not containing 'label_').
        y: pandas DataFrame containing the label columns (column names
            containing 'label_').
    """
    xf_feature_spec = tf_transform_output.transformed_feature_spec()
    # Keep only transformed features (the '_xf' suffix marks them).
    xf_feature_spec = {name: spec
                       for name, spec in xf_feature_spec.items()
                       if name.endswith('_xf')}

    # Strip the glob character to recover the directory of the first
    # pattern; the remaining patterns are ignored.
    root_path = file_pattern[0].replace("*", "")
    dataset = tf.data.TFRecordDataset(
        path_utils.list_dir(root_path),  # a bit ugly
        compression_type='GZIP')

    df = convert_raw_dataset_to_pandas(dataset, xf_feature_spec, 100000)

    # Separate labels from features by column-name convention.
    X = df[[col for col in df.columns if 'label_' not in col]]
    y = df[[col for col in df.columns if 'label_' in col]]
    return X, y
def get_predictions(self, sample_size: int = 100000):
    """Sample prediction data as a pandas DataFrame.

    Args:
        sample_size: # of rows to sample.
    """
    # Resolve the feature spec from the recorded data schema artifact.
    schema_uri = self.get_artifacts_uri_by_component(
        GDPComponent.DataSchema.name)[0]
    feature_spec = get_feature_spec_from_schema(schema_uri)

    # Inference examples are stored under the Inferrer artifact URI.
    inferrer_uri = self.get_artifacts_uri_by_component(
        GDPComponent.Inferrer.name)[0]
    record_files = path_utils.list_dir(
        os.path.join(inferrer_uri, 'examples'))

    raw_dataset = tf.data.TFRecordDataset(record_files,
                                          compression_type='GZIP')
    return convert_raw_dataset_to_pandas(raw_dataset, feature_spec,
                                         sample_size)
def sample_data(self, sample_size: int = 100000):
    """Sample data from the datasource as a pandas DataFrame.

    Args:
        sample_size: # of rows to sample.
    """
    pipeline = self._get_one_pipeline()

    # Feature spec comes from the pipeline's data schema artifact.
    schema_uri = pipeline.get_artifacts_uri_by_component(
        GDPComponent.DataSchema.name)[0]
    feature_spec = get_feature_spec_from_schema(schema_uri)

    record_files = self._get_data_file_paths(pipeline)
    raw_dataset = tf.data.TFRecordDataset(record_files,
                                          compression_type='GZIP')
    return convert_raw_dataset_to_pandas(raw_dataset, feature_spec,
                                         sample_size)
def sample_transformed_data(self, split_name: Text = 'eval',
                            sample_size: int = 100000):
    """Sample transformed data as a pandas DataFrame.

    Args:
        split_name: name of split to see
        sample_size: # of rows to sample.
    """
    transform_uri = self.get_artifacts_uri_by_component(
        GDPComponent.Transform.name)[0]
    # The transformed feature spec is stored alongside the Transform
    # artifact under 'transformed_metadata'.
    feature_spec = get_feature_spec_from_schema(
        os.path.join(transform_uri, 'transformed_metadata'))

    # Transformed examples live two levels up from the Transform artifact,
    # under transformed_examples/<artifact-id>/<split>.
    uri_path = Path(transform_uri)
    split_dir = os.path.join(str(uri_path.parent.parent),
                             'transformed_examples',
                             uri_path.name,
                             split_name)

    record_files = path_utils.list_dir(split_dir)
    raw_dataset = tf.data.TFRecordDataset(record_files,
                                          compression_type='GZIP')
    return convert_raw_dataset_to_pandas(raw_dataset, feature_spec,
                                         sample_size)