def ReadCSVFilesPlain(
    p: beam.Pipeline,
    file_pattern: str,
    fieldnames: typing.List[str],
):
  return (p
          | "Read csv files" >> beam_io.ReadFromText(
              file_pattern=file_pattern, skip_header_lines=1)
          | "Chunk for parsing" >> beam.BatchElements()
          | "Parse csv lines to dicts" >> beam.FlatMap(
              lambda x: map(dict, csv.DictReader(x, fieldnames=fieldnames)))
          | "Chunk for processing" >> beam.BatchElements()
          | "Convert chunks to DF" >> beam.Map(_to_dataframe))
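# A minimal sketch (not from the original source) of the `_to_dataframe`
# helper that the snippet above assumes: it converts one BatchElements batch
# of parsed CSV row dicts into a pandas DataFrame.
import pandas as pd


def _to_dataframe(rows):
  """Converts a list of row dicts (one batch) into a pandas DataFrame."""
  return pd.DataFrame(rows)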
def RunInferenceImpl(  # pylint: disable=invalid-name
    examples: beam.pvalue.PCollection,
    inference_spec_type: model_spec_pb2.InferenceSpecType
) -> beam.pvalue.PCollection:
  """Implementation of RunInference API.

  Args:
    examples: A PCollection containing examples.
    inference_spec_type: Model inference endpoint.

  Returns:
    A PCollection containing prediction logs.

  Raises:
    ValueError: when the operation is not supported.
  """
  logging.info('RunInference on model: %s', inference_spec_type)

  batched_examples = examples | 'BatchExamples' >> beam.BatchElements()
  operation_type = _get_operation_type(inference_spec_type)
  if operation_type == OperationType.CLASSIFICATION:
    return batched_examples | 'Classify' >> _Classify(inference_spec_type)
  elif operation_type == OperationType.REGRESSION:
    return batched_examples | 'Regress' >> _Regress(inference_spec_type)
  elif operation_type == OperationType.PREDICTION:
    return batched_examples | 'Predict' >> _Predict(inference_spec_type)
  elif operation_type == OperationType.MULTIHEAD:
    return (batched_examples
            | 'MultiInference' >> _MultiInference(inference_spec_type))
  else:
    raise ValueError('Unsupported operation_type %s' % operation_type)
def _ExtractPredictions(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection, eval_config: config.EvalConfig,
    eval_shared_models: List[types.EvalSharedModel]
) -> beam.pvalue.PCollection:
  """A PTransform that adds predictions and possibly other tensors to extracts.

  Args:
    extracts: PCollection of extracts containing model inputs keyed by
      tfma.FEATURES_KEY (if model inputs are named) or tfma.INPUTS_KEY (if
      model takes raw tf.Examples as input).
    eval_config: Eval config.
    eval_shared_models: Shared model parameters.

  Returns:
    PCollection of Extracts updated with the predictions.
  """
  batch_args = {}
  if eval_config.options.HasField('desired_batch_size'):
    batch_args = dict(
        min_batch_size=eval_config.options.desired_batch_size.value,
        max_batch_size=eval_config.options.desired_batch_size.value)

  extracts = (
      extracts
      | 'Batch' >> beam.BatchElements(**batch_args)
      | 'Predict' >> beam.ParDo(
          _PredictionDoFn(
              eval_config=eval_config,
              eval_shared_models=eval_shared_models)))

  return extracts
def expand(self, pcoll: beam.PCollection) -> beam.PCollection:
  return (pcoll
          | beam.BatchElements()
          | beam.ParDo(
              _RunInferenceDoFn(shared.Shared(), self._model_loader,
                                self._clock))
          | beam.FlatMap(_unbatch))
def _PTransformFn(raw_record_pcoll: beam.pvalue.PCollection):
  return (
      raw_record_pcoll
      | 'Batch' >> beam.BatchElements(
          **record_based_tfxio.GetBatchElementsKwargs(batch_size))
      | 'ToRecordBatch' >> beam.Map(_BatchedRecordsToArrow,
                                    self.raw_record_column_name))
def _ExtractTFLitePredictions(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_config: config.EvalConfig,
    eval_shared_models: Dict[Text, types.EvalSharedModel],
    desired_batch_size: Optional[int]) -> beam.pvalue.PCollection:
  """A PTransform that adds predictions and possibly other tensors to extracts.

  Args:
    extracts: PCollection of extracts containing model inputs keyed by
      tfma.FEATURES_KEY.
    eval_config: Eval config.
    eval_shared_models: Shared model parameters keyed by model name.
    desired_batch_size: Optional batch size.

  Returns:
    PCollection of Extracts updated with the predictions.
  """
  batch_args = {}
  # TODO(b/143484017): Consider removing this option if autotuning is better
  # able to handle batch size selection.
  if desired_batch_size is not None:
    batch_args = dict(
        min_batch_size=desired_batch_size, max_batch_size=desired_batch_size)
  else:
    # TODO(b/155887292): Remove the following and allow dynamic batch sizing
    # once the bug is addressed. Also add unit tests to exercise.
    batch_args = dict(min_batch_size=1, max_batch_size=1)

  return (
      extracts
      | 'Batch' >> beam.BatchElements(**batch_args)
      | 'Predict' >> beam.ParDo(
          _TFLitePredictionDoFn(
              eval_config=eval_config,
              eval_shared_models=eval_shared_models)))
def expand(self, lines: beam.pvalue.PCollection):
  """Decodes the input CSV records into an in-memory dict representation.

  Args:
    lines: A PCollection of strings representing the lines in the CSV file.

  Returns:
    A PCollection of dicts representing the CSV records.
  """
  csv_lines = (lines
               | 'ParseCSVLines' >> beam.ParDo(
                   csv_decoder.ParseCSVLine(self._delimiter)))

  if self._infer_type_from_schema:
    column_infos = _get_feature_types_from_schema(self._schema,
                                                  self._column_names)
  else:
    # TODO(b/72746442): Consider using a DeepCopy optimization similar to TFT.
    # Do first pass to infer the feature types.
    column_infos = beam.pvalue.AsSingleton(
        csv_lines
        | 'InferColumnTypes' >> beam.CombineGlobally(
            csv_decoder.ColumnTypeInferrer(
                column_names=self._column_names,
                skip_blank_lines=self._skip_blank_lines)))

  # Do second pass to generate the in-memory dict representation.
  return (
      csv_lines
      | 'BatchCSVLines' >> beam.BatchElements(
          **batch_util.GetBeamBatchKwargs(self._desired_batch_size))
      | 'BatchedCSVRowsToArrow' >> beam.ParDo(
          _BatchedCSVRowsToArrow(skip_blank_lines=self._skip_blank_lines),
          column_infos))
def test_mini_beam_pipeline_batched(self):

  def test_call_fn(batched_model_input, sr, mod, key, name):
    del sr, mod, key, name
    return np.zeros([batched_model_input.shape[0], 5, 1024], np.float32)

  with beam.Pipeline() as root:
    _ = (root
         | beam.Create([('k1', make_tfexample(5)),
                        ('k2', make_tfexample(5))])
         | 'Batch' >> beam.BatchElements(min_batch_size=2, max_batch_size=2)
         | beam.ParDo(
             beam_dofns.ComputeBatchedChunkedSingleEmbeddings(
                 name='all',
                 module='dummy_mod_loc',
                 output_key=['k1'],
                 audio_key='audio',
                 sample_rate_key='sample_rate',
                 sample_rate=None,
                 chunk_len=2,
                 average_over_time=True,
                 feature_fn=None,
                 setup_fn=lambda _: MockModule(['k1']),
                 module_call_fn=test_call_fn))
         | beam.Map(data_prep_utils.single_audio_emb_to_tfex,
                    embedding_name='ename',
                    audio_key='audio',
                    embedding_length=1024))
def run(argv=None):
    """Main entry point; defines and runs the pipeline."""
    logging.info("Starting pipeline.")

    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--project=voterdb-test',
        '--job_name=news-pipeline',
        '--temp_location=gs://voterdb-test-dataflow-temp/',
        '--staging_location=gs://voterdb-test-dataflow-staging/',
        '--requirements_file=requirements.txt',
        '--max_num_workers=8',
        '--disk_size_gb=50'
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        # Creating an initial PCollection of one value is necessary to ensure
        # the HTTP gets are deferred until the DataflowRunner starts in the
        # cloud. beam.Create(read_zip_csv()) otherwise creates a pickled
        # Python image which is bigger than the upload limit, and fails.
        raw = (p
               | "beam.Create" >> beam.Create(KEYWORDS)
               | "get_news_items" >> beam.FlatMap(get_news_items)
               | "BatchElements" >> beam.BatchElements()
               | "BatchRunner analyze" >> beam.ParDo(BatchRunner(), analyze)
               | "format_bq" >> beam.Map(format_bq)
               | "News.Semantic" >> beam.io.WriteToBigQuery(
                   table='News.Semantic',
                   schema=gen_schema(SCHEMA_FIELDS),
                   write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                   create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        help='Input folder to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output folder to write results to.')
    parser.add_argument('--models',
                        dest='models',
                        help='Input folder to read model parameters.')
    parser.add_argument('--batchsize',
                        dest='batchsize',
                        type=int,
                        help='Batch size for processing')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True

    with beam.Pipeline(options=pipeline_options) as p:
        filtered_images = (
            p
            | "Read Images" >> beam.Create(
                glob.glob(known_args.input + '*wms*' + '.png'))
            | "Batch elements" >> beam.BatchElements(20, known_args.batchsize)
            | "Filter Cloudy images" >> beam.ParDo(
                FilterCloudyFn.FilterCloudyFn(known_args.models)))

        filtered_images | "Segment for Land use" >> beam.ParDo(
            UNetInference.UNetInferenceFn(known_args.models, known_args.output))
def _ExtractPredictions(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_shared_model: types.EvalSharedModel,
    desired_batch_size: Optional[int] = None) -> beam.pvalue.PCollection:
  """A PTransform that adds predictions and possibly other tensors to extracts.

  Args:
    extracts: PCollection of extracts containing model inputs keyed by
      tfma.FEATURES_KEY (if model inputs are named) or tfma.INPUTS_KEY (if
      model takes raw tf.Examples as input).
    eval_shared_model: Shared model parameters.
    desired_batch_size: Optional batch size for prediction.

  Returns:
    PCollection of Extracts updated with the predictions.
  """
  batch_args = {}
  if desired_batch_size:
    batch_args = dict(min_batch_size=desired_batch_size,
                      max_batch_size=desired_batch_size)

  extracts = (extracts
              | 'Batch' >> beam.BatchElements(**batch_args)
              | 'Predict' >> beam.ParDo(
                  _PredictionDoFn(eval_shared_model=eval_shared_model)))

  return extracts
def ptransform_fn(raw_records_pcoll: beam.pvalue.PCollection):
  return (raw_records_pcoll
          | "Batch" >> beam.BatchElements(
              **batch_util.GetBatchElementsKwargs(batch_size))
          | "Decode" >> beam.ParDo(
              _DecodeBatchExamplesDoFn(self._GetSchemaForDecoding(),
                                       self.raw_record_column_name)))
def ModelAgnosticExtract(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    model_agnostic_config: agnostic_predict.ModelAgnosticConfig,
    desired_batch_size: Optional[int] = None) -> beam.pvalue.PCollection:
  """A PTransform that generates features, predictions, labels.

  Args:
    extracts: PCollection of Extracts containing a serialized example to be
      fed to the model.
    model_agnostic_config: A config specifying how to extract
      FeaturesPredictionsLabels from the input Extracts.
    desired_batch_size: Optional batch size for batching in Aggregate.

  Returns:
    PCollection of Extracts, where the extracts contains the features,
    predictions, labels retrieved.
  """
  batch_args = {}
  if desired_batch_size:
    batch_args = dict(
        min_batch_size=desired_batch_size, max_batch_size=desired_batch_size)

  return (extracts
          | 'Batch' >> beam.BatchElements(**batch_args)
          | 'ModelAgnosticExtract' >> beam.ParDo(
              _ModelAgnosticExtractDoFn(
                  model_agnostic_config=model_agnostic_config)))
def _PTransformFn(raw_record_pcoll: beam.pvalue.PCollection):
  return (raw_record_pcoll
          | "Batch" >> beam.BatchElements(
              **batch_util.GetBatchElementsKwargs(batch_size))
          | "ToRecordBatch" >> beam.Map(
              _BatchedRecordsToArrow, self.raw_record_column_name,
              self._can_produce_large_types))
def _ExtractPredictions(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_config: config.EvalConfig,
    eval_shared_models: Dict[Text, types.EvalSharedModel],
    desired_batch_size: Optional[int]) -> beam.pvalue.PCollection:
  """A PTransform that adds predictions and possibly other tensors to extracts.

  Args:
    extracts: PCollection of extracts containing model inputs keyed by
      tfma.FEATURES_KEY (if model inputs are named) or tfma.INPUTS_KEY (if
      model takes raw tf.Examples as input).
    eval_config: Eval config.
    eval_shared_models: Shared model parameters keyed by model name.
    desired_batch_size: Optional batch size.

  Returns:
    PCollection of Extracts updated with the predictions.
  """
  batch_args = {}
  # TODO(b/143484017): Consider removing this option if autotuning is better
  # able to handle batch size selection.
  if desired_batch_size is not None:
    batch_args = dict(min_batch_size=desired_batch_size,
                      max_batch_size=desired_batch_size)

  return (extracts
          | 'Batch' >> beam.BatchElements(**batch_args)
          | 'Predict' >> beam.ParDo(
              _PredictionDoFn(eval_config=eval_config,
                              eval_shared_models=eval_shared_models)))
def BatchExamplesToArrowTables(
    examples: beam.pvalue.PCollection,
    desired_batch_size: Optional[int] = constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE
) -> beam.pvalue.PCollection:
  """Batches example dicts into Arrow tables.

  Args:
    examples: A PCollection of example dicts.
    desired_batch_size: Batch size. The output Arrow tables will have as many
      rows as the `desired_batch_size`.

  Returns:
    A PCollection of Arrow tables.
  """
  # DecodedExamplesToTable should be called within a lambda function instead
  # of specifying the function name in beam.Map for the reasons discussed in
  # b/143648957.
  # TODO(b/131315065): Remove the comment above when the CSV decoder no longer
  # uses BatchExamplesToArrowTables.
  return (
      examples
      | "BatchBeamExamples" >> beam.BatchElements(
          **GetBeamBatchKwargs(desired_batch_size))
      | "DecodeExamplesToTable" >>
      # pylint: disable=unnecessary-lambda
      beam.Map(lambda x: decoded_examples_to_arrow.DecodedExamplesToTable(x)))
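# Hypothetical illustration (an assumption, not the library's code) of what a
# batch-kwargs helper like the `GetBeamBatchKwargs` used above typically
# returns: fixed min/max batch sizes when a desired batch size is given,
# otherwise empty kwargs so beam.BatchElements falls back to adaptive batching.
from typing import Dict, Optional


def GetBeamBatchKwargs(desired_batch_size: Optional[int]) -> Dict[str, int]:
  if desired_batch_size is None:
    return {}
  return dict(
      min_batch_size=desired_batch_size, max_batch_size=desired_batch_size)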
def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
  return (raw_records_pcoll
          | "BatchElements" >> beam.BatchElements(
              **batch_util.GetBatchElementsKwargs(batch_size))
          | "Decode" >> beam.ParDo(
              _RecordsToRecordBatch(self._saved_decoder_path,
                                    self.raw_record_column_name,
                                    self._can_produce_large_types)))
def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
  return (raw_records_pcoll
          | "Batch" >> beam.BatchElements(
              **batch_util.GetBatchElementsKwargs(batch_size))
          | "Decode" >> beam.ParDo(
              _DecodeBatchExamplesDoFn(self._schema,
                                       self.raw_record_column_name,
                                       self._can_produce_large_types)))
def _ptransform_fn(raw_records_pcoll: beam.pvalue.PCollection):
  return (
      raw_records_pcoll
      | "Batch" >> beam.BatchElements(
          **record_based_tfxio.GetBatchElementsKwargs(batch_size))
      | "Decode" >> beam.ParDo(
          _DecodeBatchExamplesDoFn(self._schema,
                                   self.raw_record_column_name)))
def expand(self, pcoll: beam.pvalue.PCollection):
  record_batches = (pcoll
                    | beam.BatchElements(
                        min_batch_size=self._desired_batch_size,
                        max_batch_size=self._desired_batch_size)
                    | beam.ParDo(RecordsToTable(), self._column_specs))
  return record_batches
def expand(self, pcoll):
  return (
      pcoll
      | 'BatchElements' >> beam.BatchElements(
          min_batch_size=self.num_threads,
          max_batch_size=self.num_threads,
      )
      | 'ParDo' >> beam.ParDo(self.get_dofn(), *self.args, **self.kwargs))
def _ptransform_fn(pipeline: beam.pvalue.PCollection):
  # TODO(zhuo): collect telemetry from RecordBatches.
  return (
      pipeline
      | "ReadExamples" >> self._SerializedExamplesSource()
      | "Batch" >> beam.BatchElements(**_GetBatchElementsKwargs(batch_size))
      | "Decode" >> beam.ParDo(_DecodeBatchExamplesDoFn(self._schema)))
def _TFMAPredict(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_shared_models: Dict[str, types.EvalSharedModel],
    desired_batch_size: Optional[int] = None,
    materialize: Optional[bool] = True,
    eval_config: Optional[config_pb2.EvalConfig] = None
) -> beam.pvalue.PCollection:
  """A PTransform that adds predictions to Extracts.

  Args:
    extracts: PCollection of Extracts containing a serialized example to be
      fed to the model.
    eval_shared_models: Shared model parameters keyed by model name.
    desired_batch_size: Optional. Desired batch size for prediction.
    materialize: True to call the FeatureExtractor to add MaterializedColumn
      entries for the features, predictions, and labels.
    eval_config: Eval config.

  Returns:
    PCollection of Extracts, where the extracts contains the features,
    predictions, labels retrieved.
  """
  if not eval_config:
    batch_args = {}
    # TODO(b/143484017): Consider removing this option if autotuning is better
    # able to handle batch size selection.
    if desired_batch_size:
      batch_args = dict(
          min_batch_size=desired_batch_size, max_batch_size=desired_batch_size)
    extracts = (extracts | 'Batch' >> beam.BatchElements(**batch_args))
  else:
    extracts = (
        extracts
        | 'UnwrapBatchedExtract' >> beam.Map(_unwrap_batched_extract))

  # We don't actually need the add_metrics_callbacks to do Predict, but if we
  # want to share the model between Predict and subsequent stages (i.e. we use
  # the same shared handle for this and subsequent stages), the metrics
  # callbacks must be added here; otherwise they won't be present in the model
  # when later stages reuse it from this stage.
  extracts = (
      extracts
      | 'Predict' >> beam.ParDo(
          _TFMAPredictionDoFn(
              eval_shared_models=eval_shared_models, eval_config=eval_config)))

  if materialize and not eval_config:
    additional_fetches = []
    for m in eval_shared_models.values():
      if m.additional_fetches:
        additional_fetches.extend(m.additional_fetches)
    return extracts | 'ExtractFeatures' >> legacy_feature_extractor._ExtractFeatures(  # pylint: disable=protected-access
        additional_extracts=additional_fetches or None)

  return extracts
def _PTransformFn(raw_records_pcoll: beam.pvalue.PCollection):
  return (
      raw_records_pcoll
      | "BatchElements" >> beam.BatchElements(
          **batch_util.GetBatchElementsKwargs(batch_size))
      | "Decode" >> beam.ParDo(
          _RecordsToRecordBatch(
              self._saved_decoder_path,
              self.telemetry_descriptors,
              shared.Shared() if self._use_singleton_decoder else None,
              self.raw_record_column_name,
              self._record_index_column_name)))
def expand(
    self,
    pcoll: beam.PCollection[ExampleT]) -> beam.PCollection[PredictionT]:
  resource_hints = self._model_handler.get_resource_hints()
  return (
      pcoll
      # TODO(https://github.com/apache/beam/issues/21440): Hook into the
      # batching DoFn APIs.
      | beam.BatchElements(**self._model_handler.batch_elements_kwargs())
      | (beam.ParDo(
          _RunInferenceDoFn(self._model_handler, self._clock),
          self._inference_args).with_resource_hints(**resource_hints)))
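# A usage sketch of the expand() above through Beam's public RunInference
# transform, assuming the built-in scikit-learn model handler and a pickled
# model at a hypothetical path. The BatchElements step is configured
# internally from the handler's batch_elements_kwargs().
import numpy as np
import apache_beam as beam
from apache_beam.ml.inference.base import RunInference
from apache_beam.ml.inference.sklearn_inference import SklearnModelHandlerNumpy

model_handler = SklearnModelHandlerNumpy(model_uri='/tmp/model.pkl')  # hypothetical path

with beam.Pipeline() as p:
  _ = (p
       | beam.Create([np.array([1.0, 2.0]), np.array([3.0, 4.0])])
       | RunInference(model_handler)
       | beam.Map(print))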
def _RawRecordsToRecordBatch(pcoll, batch_size):
  batch_size = 1 if not batch_size else batch_size

  class _CreateRBDoFn(beam.DoFn):

    def process(self, examples):
      return [
          pa.RecordBatch.from_arrays([pa.array(examples)], ["column_name"])
      ]

  return (pcoll
          | beam.BatchElements(batch_size)
          | beam.ParDo(_CreateRBDoFn()))
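# A small usage sketch (with assumed input values) of the helper above: raw
# records are grouped by BatchElements (minimum batch size 2 here) and each
# batch becomes a single-column Arrow RecordBatch.
import apache_beam as beam

with beam.Pipeline() as p:
  _ = (_RawRecordsToRecordBatch(
      p | beam.Create([b'rec1', b'rec2', b'rec3']), batch_size=2)
       | beam.Map(print))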
def durations(root, ds_file, ds_name, reader_type, suffix):
  """Beam pipeline for durations from a particular file or glob."""
  logging.info('Reading from %s: (%s, %s)', reader_type, ds_name, ds_file)
  input_examples = utils.reader_functions[reader_type](root, ds_file,
                                                       f'Read-{suffix}')
  if FLAGS.batch_size:
    input_examples = input_examples | f'Batch-{suffix}' >> beam.BatchElements(
        min_batch_size=FLAGS.batch_size, max_batch_size=FLAGS.batch_size)
    return input_examples | f'Lens-{suffix}' >> beam.FlatMap(
        durations_from_tfexs)
  else:
    return input_examples | f'Lens-{suffix}' >> beam.Map(duration_from_tfex)
def _Predict(  # pylint: disable=invalid-name
    examples, eval_saved_model_path, desired_batch_size=None):
  batch_args = {}
  if desired_batch_size:
    batch_args = dict(min_batch_size=desired_batch_size,
                      max_batch_size=desired_batch_size)
  return (examples
          | 'Batch' >> beam.BatchElements(**batch_args)
          | beam.ParDo(
              _PredictionDoFn(eval_saved_model_path=eval_saved_model_path,
                              add_metrics_callbacks=None,
                              shared_handle=shared.Shared())))
def run(args=None):
    options = PipelineOptions()
    pipeline = beam.Pipeline(options=options)
    runtime_options = options.view_as(RunTimeOptions)
    gcp_project = options.view_as(GoogleCloudOptions).project

    (pipeline
     | "Read Files" >> beam.io.ReadFromText(runtime_options.input)
     | "Parse Event" >> beam.ParDo(ParseEventFn())
     | "BatchElements" >> beam.BatchElements(max_batch_size=200)
     | "Apply DLP" >> beam.ParDo(
         TokenizationFxn(gcp_project, runtime_options.deIdentiyTemplateId))
     | "Write to BigQuery" >> beam.io.WriteToBigQuery(
         runtime_options.bq,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))

    pipeline.run()
def run(argv=None):
    pipeline_options = PipelineOptions(argv)
    lda_options = pipeline_options.view_as(LdaOptions)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True

    with apache_beam.Pipeline(options=pipeline_options) as p:
        articles = (
            p
            | "Read Articles" >> apache_beam.Create(
                glob.glob(lda_options.input + '*.txt'))
            | apache_beam.Map(load_text)
            | "Batch elements" >> apache_beam.BatchElements(
                lda_options.batchsize, lda_options.batchsize))

        (articles
         | apache_beam.ParDo(
             LdaFn(lda_options.K, lda_options.tau0, lda_options.kappa))
         | "Write" >> WriteToText("test.txt"))