def main(): """Configures pipeline and spawns preprocessing job.""" args = _parse_arguments(sys.argv) config_path = os.path.abspath( os.path.join(__file__, os.pardir, 'preprocessing_config.ini')) config = _parse_config('CLOUD' if args.cloud else 'LOCAL', config_path) ml_project = args.project_id options = {'project': ml_project} if args.cloud: if not args.job_name: raise ValueError('Job name must be specified for cloud runs.') options.update({ 'job_name': args.job_name, 'num_workers': int(config.get('num_workers')), 'max_num_workers': int(config.get('max_num_workers')), 'staging_location': os.path.join(args.job_dir, 'staging'), 'temp_location': os.path.join(args.job_dir, 'tmp'), 'region': config.get('region'), 'setup_file': os.path.abspath( os.path.join(__file__, '../..', 'dataflow_setup.py')), }) pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options) _set_logging(config.get('log_level')) with beam.Pipeline(config.get('runner'), options=pipeline_options) as p: preprocess.run(p, args.input_data, args.job_dir)
def main(): """Configures pipeline and spawns preprocessing job.""" args = _parse_arguments(sys.argv) config_path = os.path.abspath( os.path.join(__file__, os.pardir, 'preprocessing_config.ini')) config = _parse_config('CLOUD' if args.cloud else 'LOCAL', config_path) ml_project = args.project_id options = {'project': ml_project} if args.cloud: if not args.job_name: raise ValueError('Job name must be specified for cloud runs.') options.update({ 'job_name': args.job_name, 'num_workers': int(config.get('num_workers')), 'max_num_workers': int(config.get('max_num_workers')), 'staging_location': os.path.join(args.job_dir, 'staging'), 'temp_location': os.path.join(args.job_dir, 'tmp'), 'region': config.get('region'), 'setup_file': os.path.abspath( os.path.join(__file__, '../..', 'dataflow_setup.py')), }) pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options) _set_logging(config.get('log_level')) with beam.Pipeline( config.get('runner'), options=pipeline_options) as pipeline: preprocess.run(pipeline, args.input_data, args.job_dir)
def main(): """Configures and runs a pipeline.""" args = parse_arguments(sys.argv) config = parse_config("CLOUD" if args.cloud else "LOCAL", get_relative_path("config.ini")) set_logging(config.get("log_level")) options = get_pipeline_options(args, config) runner = str(config.get("runner")) with beam.Pipeline(runner, options=options) as pipeline: with beam_impl.Context( temp_dir=os.path.join(args.tft_dir, constants.TMP_DIR)): preprocess.run(pipeline, args)
def run(params):
  """Sets up and runs Beam preprocessing pipeline.

  Args:
    params: Object holding a set of parameters as name-value pairs.

  Raises:
    ValueError: If `gcp` argument is `True` and `project_id` or `job_name`
      are not specified.
  """
  options = {}
  if params.gcp:
    if not (params.project_id and params.job_name):
      raise ValueError(
          'project_id and job_name must be specified for GCP runs.')
    options = {
        'project': params.project_id,
        'job_name': params.job_name,
        'temp_location': os.path.join(params.output_dir, 'temp'),
        'staging_location': os.path.join(params.output_dir, 'staging'),
        'setup_file': os.path.abspath(
            os.path.join(os.path.dirname(__file__), 'setup.py')),
    }

    def _update(param_name):
      # Only forward optional Dataflow parameters that were actually set.
      param_value = getattr(params, param_name)
      if param_value:
        options.update({param_name: param_value})

    _update('worker_machine_type')
    _update('num_workers')
    _update('region')

  pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
  runner = _DATAFLOW_RUNNER if params.gcp else _DIRECT_RUNNER
  with beam.Pipeline(runner, options=pipeline_options) as p:
    preprocess.run(p=p, params=params)
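# Illustrative invocation of run(); the attribute names mirror the parameters
# read above, while the placeholder values and the runner constants below are
# assumptions, not part of the original sample.
_DATAFLOW_RUNNER = 'DataflowRunner'
_DIRECT_RUNNER = 'DirectRunner'

if __name__ == '__main__':
  import argparse

  example_params = argparse.Namespace(
      gcp=True,
      project_id='my-project',
      job_name='my-preprocessing-job',
      output_dir='gs://my-bucket/preprocessing',
      worker_machine_type='n1-standard-4',
      num_workers=8,
      region='us-central1')
  run(example_params)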