def load_vocab(self, path_to_vocab: Text):
    """
    Re-instantiate the class tokenizer with output vocabulary / merges.

    Args:
        path_to_vocab: Path to vocab / merges files from a training run.
    """
    # inspect contents of output dir
    contents = path_utils.list_dir(path_to_vocab)

    try:
        vocab_file = next(f for f in contents if "vocab" in f)
    except StopIteration:
        vocab_file = None

    # update tokenizer params with vocab file name
    self.tokenizer_params.update({"vocab": vocab_file})

    # merges are only needed for BPE tokenizers
    if "bpe" in self.tokenizer_name:
        try:
            merges_file = next(f for f in contents if "merge" in f)
        except StopIteration:
            merges_file = None
        self.tokenizer_params.update({"merges": merges_file})

    # reconstruct tokenizer object
    self.tokenizer = tokenizer_map.get(
        self.tokenizer_name)(**self.tokenizer_params)
    self.tokenizer.enable_padding(length=self.sentence_length)
    self.tokenizer.enable_truncation(max_length=self.sentence_length)
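# --- Illustrative sketch (not from the source): one plausible shape for the
# `tokenizer_map` consulted above. The assumption is that it maps a tokenizer
# name to a class from the HuggingFace `tokenizers` library whose constructor
# accepts the `vocab` / `merges` keyword arguments that load_vocab assembles.
# The concrete keys and class choices here are hypothetical.
from tokenizers import BertWordPieceTokenizer, ByteLevelBPETokenizer

tokenizer_map = {
    "bert-wordpiece": BertWordPieceTokenizer,  # WordPiece: needs only a vocab file
    "bpe": ByteLevelBPETokenizer,              # BPE: needs vocab + merges files
}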
def input_fn(self, file_pattern: List[Text],
             tf_transform_output: tft.TFTransformOutput):
    """
    Load transformed TFRecords from disk into feature and label DataFrames.

    Args:
        file_pattern: File pattern matching saved TFRecords on disk.
        tf_transform_output: Output of the preceding Transform /
            Preprocessing component.

    Returns:
        X: pandas DataFrame of transformed feature columns.
        y: pandas DataFrame of label columns.
    """
    xf_feature_spec = tf_transform_output.transformed_feature_spec()

    # keep only the transformed features (suffixed with '_xf')
    xf_feature_spec = {x: xf_feature_spec[x]
                       for x in xf_feature_spec
                       if x.endswith('_xf')}

    root_path = [x.replace("*", "") for x in file_pattern][0]
    dataset = tf.data.TFRecordDataset(
        path_utils.list_dir(root_path),  # a bit ugly
        compression_type='GZIP')

    df = convert_raw_dataset_to_pandas(dataset, xf_feature_spec, 100000)

    # Separate features from labels
    X = df[[x for x in df.columns if 'label_' not in x]]
    y = df[[x for x in df.columns if 'label_' in x]]
    return X, y
def wrapper():
    repo: Repository = Repository.get_instance()
    repo.zenml_config.set_pipelines_dir(pipeline_root)

    # load every pipeline config found under the pipeline root and run it
    for p_config in path_utils.list_dir(pipeline_root):
        y = yaml_utils.read_yaml(p_config)
        p: TrainingPipeline = TrainingPipeline.from_config(y)
        p.run()
def wrapper():
    repo: Repository = Repository.get_instance()
    pipelines_dir = repo.zenml_config.get_pipelines_dir()

    # remove any pipeline configs left over from previous runs
    for p_config in path_utils.list_dir(pipelines_dir):
        try:
            os.remove(p_config)
        except Exception as e:
            print(e)
def get_pipeline_file_paths(self, only_file_names: bool = False) -> \
        Optional[List[Text]]:
    """Gets list of pipeline file paths."""
    self._check_if_initialized()

    pipelines_dir = self.zenml_config.get_pipelines_dir()

    if not path_utils.is_dir(pipelines_dir):
        return []

    return path_utils.list_dir(pipelines_dir, only_file_names)
def read_files_from_disk(pipeline: beam.Pipeline,
                         base_path: Text) -> beam.pvalue.PCollection:
    """
    The Beam PTransform used to read data from a collection of CSV files
    on a local file system.

    Args:
        pipeline: Input beam.Pipeline object coming from a TFX Executor.
        base_path: Base path pointing either to the directory containing the
            CSV files, or to a (single) CSV file.

    Returns:
        A beam.PCollection of data points. Each row in the collection of
        CSV files represents a single data point.
    """
    wildcard_qualifier = "*"
    file_pattern = os.path.join(base_path, wildcard_qualifier)

    if path_utils.is_dir(base_path):
        csv_files = path_utils.list_dir(base_path)
        if not csv_files:
            raise RuntimeError(
                'Split pattern {} does not match any files.'.format(
                    file_pattern))
    else:
        if path_utils.file_exists(base_path):
            csv_files = [base_path]
        else:
            raise RuntimeError(f'{base_path} does not exist.')

    # weed out bad file exts with this logic
    allowed_file_exts = [".csv", ".txt"]  # ".dat"
    csv_files = [
        uri for uri in csv_files
        if os.path.splitext(uri)[1] in allowed_file_exts
    ]

    logger.info(f'Matched {len(csv_files)}: {csv_files}')

    # Always use header from file
    logger.info(f'Using header from file: {csv_files[0]}.')
    column_names = path_utils.load_csv_header(csv_files[0])
    logger.info(f'Header: {column_names}.')

    parsed_csv_lines = (
        pipeline
        | 'ReadFromText' >> beam.io.ReadFromText(file_pattern=base_path,
                                                 skip_header_lines=1)
        | 'ParseCSVLine' >> beam.ParDo(csv_decoder.ParseCSVLine(delimiter=','))
        | 'ExtractParsedCSVLines' >> beam.Map(
            lambda x: dict(zip(column_names, x[0]))))
    return parsed_csv_lines
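# --- Minimal usage sketch (assumed, not from the source): running the CSV
# read as a standalone Beam pipeline on a local directory. The path below is
# hypothetical; in the actual codebase this PTransform is driven by a TFX
# executor instead.
import apache_beam as beam

with beam.Pipeline() as p:
    rows = read_files_from_disk(p, '/tmp/csv_data')  # hypothetical directory
    _ = rows | 'PrintRows' >> beam.Map(print)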
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    source = exec_properties[StepKeys.SOURCE]
    args = exec_properties[StepKeys.ARGS]
    c = source_utils.load_source_path_class(source)
    tokenizer_step: BaseTokenizer = c(**args)

    tokenizer_location = artifact_utils.get_single_uri(
        output_dict["tokenizer"])

    split_uris, split_names, all_files = [], [], []
    for artifact in input_dict["examples"]:
        for split in artifact_utils.decode_split_names(
                artifact.split_names):
            split_names.append(split)
            uri = os.path.join(artifact.uri, split)
            split_uris.append((split, uri))
            all_files += path_utils.list_dir(uri)

    # Get output split path
    output_examples = artifact_utils.get_single_instance(
        output_dict["output_examples"])
    output_examples.split_names = artifact_utils.encode_split_names(
        split_names)

    if not tokenizer_step.skip_training:
        tokenizer_step.train(files=all_files)
        tokenizer_step.save(output_dir=tokenizer_location)

    with self._make_beam_pipeline() as p:
        for split, uri in split_uris:
            input_uri = io_utils.all_files_pattern(uri)

            _ = (p
                 | 'ReadData.' + split >> beam.io.ReadFromTFRecord(
                        file_pattern=input_uri)
                 | "ParseTFExFromString." + split >> beam.Map(
                        tf.train.Example.FromString)
                 | "AddTokens." + split >> beam.Map(
                        append_tf_example, tokenizer_step=tokenizer_step)
                 | 'Serialize.' + split >> beam.Map(
                        lambda x: x.SerializeToString())
                 | 'WriteSplit.' + split >> WriteSplit(
                        get_split_uri(
                            output_dict["output_examples"], split)))
def get_predictions(self, sample_size: int = 100000):
    """
    Samples prediction data as a pandas DataFrame.

    Args:
        sample_size: # of rows to sample.
    """
    base_uri = self.get_artifacts_uri_by_component(
        GDPComponent.Inferrer.name)[0]
    data_files = path_utils.list_dir(os.path.join(base_uri, 'examples'))
    dataset = tf.data.TFRecordDataset(data_files, compression_type='GZIP')

    schema_uri = self.get_artifacts_uri_by_component(
        GDPComponent.DataSchema.name)[0]
    spec = get_feature_spec_from_schema(schema_uri)
    return convert_raw_dataset_to_pandas(dataset, spec, sample_size)
def _get_data_file_paths(self, pipeline):
    """
    Gets the paths where data is stored, as a list of file paths.

    Args:
        pipeline: a pipeline with this datasource embedded
    """
    if pipeline.datasource._id != self._id:
        raise AssertionError('This pipeline does not belong to this '
                             'datasource.')

    # Take any pipeline and get the datagen
    data_uri = os.path.join(
        pipeline.get_artifacts_uri_by_component(
            GDPComponent.DataGen.name)[0], 'examples')
    data_files = path_utils.list_dir(data_uri)
    return data_files
def test_get_artifacts_uri_by_component(repo):
    test_component_name = GDPComponent.SplitGen.name
    p_names = sorted(repo.get_pipeline_names())
    p: BasePipeline = repo.get_pipeline_by_name(p_names[0])
    uri_list = p.get_artifacts_uri_by_component(test_component_name)

    # assert it is not empty
    assert uri_list

    # assert artifact was written
    uri = uri_list[0]
    written_artifacts = path_utils.list_dir(uri)
    assert written_artifacts

    # TODO: Ugly TFRecord validation
    # every file under the artifact URI must be a gzipped TFRecord
    assert all(
        "tfrecord" in name and os.path.splitext(name)[-1] == ".gz"
        for _, _, files in os.walk(uri)
        for name in files
    )
def check_module_clean(self, source: Text):
    """
    Returns True if all files within source's module are committed.

    Args:
        source (str): relative module path pointing to a Class.
    """
    # import here to resolve circular dependency
    from zenml.utils import source_utils

    # Get the module path
    module_path = source_utils.get_module_source_from_source(source)

    # Get relative path of module because check_file_committed needs that
    module_dir = source_utils.get_relative_path_from_module_source(
        module_path)

    # Get absolute path of module because path_utils.list_dir needs that
    mod_abs_dir = source_utils.get_absolute_path_from_module_source(
        module_path)
    module_file_names = path_utils.list_dir(mod_abs_dir,
                                            only_file_names=True)

    # Go through each file in module and see if there are uncommitted ones
    for file_path in module_file_names:
        path = os.path.join(module_dir, file_path)

        # if it's .gitignored, skip it and don't do anything
        if len(self.git_repo.ignored(path)) > 0:
            continue

        if path_utils.is_dir(os.path.join(mod_abs_dir, file_path)):
            logger.warning(
                f'The step {source} is contained inside a module that '
                f'has sub-directories (the sub-directory {file_path} at '
                f'{mod_abs_dir}). For now, ZenML supports only a flat '
                f'directory structure in which to place Steps. Please make '
                f'sure that the Step does not utilize the sub-directory.')
        if not self.check_file_committed(path):
            return False
    return True
def sample_transformed_data(self,
                            split_name: Text = 'eval',
                            sample_size: int = 100000):
    """
    Samples transformed data as a pandas DataFrame.

    Args:
        split_name: name of the split to sample from.
        sample_size: # of rows to sample.
    """
    base_uri = self.get_artifacts_uri_by_component(
        GDPComponent.Transform.name)[0]
    transform_schema = os.path.join(base_uri, 'transformed_metadata')
    spec = get_feature_spec_from_schema(transform_schema)

    base_uri = Path(base_uri)
    id_ = base_uri.name
    transform_data_path = os.path.join(str(base_uri.parent.parent),
                                       'transformed_examples', id_)

    split_data_path = os.path.join(transform_data_path, split_name)
    data_files = path_utils.list_dir(split_data_path)
    dataset = tf.data.TFRecordDataset(data_files, compression_type='GZIP')
    return convert_raw_dataset_to_pandas(dataset, spec, sample_size)
def Do(self, input_dict: Dict[Text, List[types.Artifact]],
       output_dict: Dict[Text, List[types.Artifact]],
       exec_properties: Dict[Text, Any]) -> None:
    # Check the inputs
    if constants.EXAMPLES not in input_dict:
        raise ValueError(f'{constants.EXAMPLES} is missing from inputs')
    examples_artifact = input_dict[constants.EXAMPLES]
    input_uri = artifact_utils.get_single_uri(examples_artifact)

    if len(zenml_path_utils.list_dir(input_uri)) == 0:
        raise AssertionError(
            'ZenML can not run the evaluation as the provided input '
            'configuration does not point towards any data. Specifically, '
            'if you are using the agnostic evaluator, please make sure '
            'that you are using a proper test_fn in your trainer step to '
            'write these results.')
    else:
        # Check the outputs
        if constants.EVALUATION not in output_dict:
            raise ValueError(
                f'{constants.EVALUATION} is missing from outputs')
        evaluation_artifact = output_dict[constants.EVALUATION]
        output_uri = artifact_utils.get_single_uri(evaluation_artifact)

        # Resolve the schema
        schema = None
        if constants.SCHEMA in input_dict:
            schema_artifact = input_dict[constants.SCHEMA]
            schema_uri = artifact_utils.get_single_uri(schema_artifact)
            reader = io_utils.SchemaReader()
            schema = reader.read(io_utils.get_only_uri_in_dir(schema_uri))

        # Create the step with the schema attached if provided
        source = exec_properties[StepKeys.SOURCE]
        args = exec_properties[StepKeys.ARGS]
        c = source_utils.load_source_path_class(source)
        evaluator_step: BaseEvaluatorStep = c(**args)

        # Check the execution parameters
        eval_config = evaluator_step.build_config()
        eval_config = tfma.update_eval_config_with_defaults(eval_config)
        tfma.verify_eval_config(eval_config)

        # Resolve the model
        if constants.MODEL in input_dict:
            model_artifact = input_dict[constants.MODEL]
            model_uri = artifact_utils.get_single_uri(model_artifact)
            model_path = path_utils.serving_model_path(model_uri)
            model_fn = try_get_fn(evaluator_step.CUSTOM_MODULE,
                                  'custom_eval_shared_model'
                                  ) or tfma.default_eval_shared_model
            eval_shared_model = model_fn(
                model_name='',  # TODO: Fix with model names
                eval_saved_model_path=model_path,
                eval_config=eval_config)
        else:
            eval_shared_model = None

        self._log_startup(input_dict, output_dict, exec_properties)

        # Main pipeline
        logging.info('Evaluating model.')
        with self._make_beam_pipeline() as pipeline:
            examples_list = []
            tensor_adapter_config = None

            if tfma.is_batched_input(eval_shared_model, eval_config):
                tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
                    examples=[
                        artifact_utils.get_single_instance(examples_artifact)
                    ],
                    telemetry_descriptors=_TELEMETRY_DESCRIPTORS,
                    schema=schema,
                    raw_record_column_name=tfma_constants.ARROW_INPUT_COLUMN)
                for split in evaluator_step.splits:
                    file_pattern = io_utils.all_files_pattern(
                        artifact_utils.get_split_uri(
                            examples_artifact, split))
                    tfxio = tfxio_factory(file_pattern)
                    data = (pipeline
                            | 'ReadFromTFRecordToArrow[%s]' % split >>
                            tfxio.BeamSource())
                    examples_list.append(data)
                if schema is not None:
                    tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
                        arrow_schema=tfxio.ArrowSchema(),
                        tensor_representations=tfxio.TensorRepresentations())
            else:
                for split in evaluator_step.splits:
                    file_pattern = io_utils.all_files_pattern(
                        artifact_utils.get_split_uri(
                            examples_artifact, split))
                    data = (pipeline
                            | 'ReadFromTFRecord[%s]' % split >>
                            beam.io.ReadFromTFRecord(
                                file_pattern=file_pattern))
                    examples_list.append(data)

            # Resolve custom extractors
            custom_extractors = try_get_fn(evaluator_step.CUSTOM_MODULE,
                                           'custom_extractors')
            extractors = None
            if custom_extractors:
                extractors = custom_extractors(
                    eval_shared_model=eval_shared_model,
                    eval_config=eval_config,
                    tensor_adapter_config=tensor_adapter_config)

            # Resolve custom evaluators
            custom_evaluators = try_get_fn(evaluator_step.CUSTOM_MODULE,
                                           'custom_evaluators')
            evaluators = None
            if custom_evaluators:
                evaluators = custom_evaluators(
                    eval_shared_model=eval_shared_model,
                    eval_config=eval_config,
                    tensor_adapter_config=tensor_adapter_config)

            # Extract, evaluate and write
            (examples_list
             | 'FlattenExamples' >> beam.Flatten()
             | 'ExtractEvaluateAndWriteResults' >>
             tfma.ExtractEvaluateAndWriteResults(
                 eval_config=eval_config,
                 eval_shared_model=eval_shared_model,
                 output_path=output_uri,
                 extractors=extractors,
                 evaluators=evaluators,
                 tensor_adapter_config=tensor_adapter_config))

        logging.info('Evaluation complete. Results written to %s.',
                     output_uri)
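# --- Sketch of what `try_get_fn` is assumed to do (hypothetical, not the
# actual implementation): resolve an optional hook such as
# `custom_extractors` or `custom_evaluators` from the user's custom module,
# returning None when the module or attribute is absent so the caller can
# fall back to defaults.
import importlib
from typing import Any, Callable, Optional, Text


def try_get_fn(module_path: Text,
               fn_name: Text) -> Optional[Callable[..., Any]]:
    try:
        module = importlib.import_module(module_path)
    except ImportError:
        return None
    return getattr(module, fn_name, None)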