Example #1
import azureml.core
from azureml.core import Workspace, Dataset, Experiment
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep

print("SDK version:", azureml.core.VERSION)

dataset_name = 'grib-dataset'

ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\n')

datastore = ws.get_default_datastore()

input_ds = Dataset.get_by_name(ws, dataset_name)
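# Mount (rather than download) the dataset on the compute target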
batch_data = DatasetConsumptionConfig("batch_dataset", input_ds, mode='mount')

output_dir = PipelineData(name='batch_output', datastore=datastore)

parallel_run_config = ParallelRunConfig.load_yaml(workspace=ws,
                                                  path='convert_parallel.yml')

# The output location is passed both as the step output and as a CLI
# argument so the entry script knows where to write converted results
batch_step = ParallelRunStep(name="batch-conversion-step",
                             parallel_run_config=parallel_run_config,
                             arguments=['--data_output_path', output_dir],
                             inputs=[batch_data],
                             output=output_dir,
                             allow_reuse=False)

steps = [batch_step]

pipeline = Pipeline(workspace=ws, steps=steps)
pipeline.validate()

pipeline_run = Experiment(ws, 'convert-batch-pipeline').submit(pipeline)
pipeline_run.wait_for_completion()
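The convert_parallel.yml loaded above mirrors the ParallelRunConfig constructor. A minimal in-code equivalent might look like the sketch below; the entry script, environment, and compute target names are assumptions, not taken from the YAML.

from azureml.core import Environment
from azureml.pipeline.steps import ParallelRunConfig

# Hypothetical in-code equivalent of convert_parallel.yml
parallel_run_config = ParallelRunConfig(
    entry_script='convert.py',                     # assumed entry script
    environment=Environment.get(ws, 'batch-env'),  # assumed registered environment
    compute_target='cpu-cluster',                  # assumed AmlCompute cluster name
    error_threshold=10,            # failed mini-batches tolerated before the job aborts
    output_action='summary_only',  # the entry script writes its own output
    mini_batch_size='5',           # files per run() call for a FileDataset
    node_count=2,
)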
Example #2

import argparse

from azureml.core import Workspace, Dataset
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.pipeline.core import PipelineParameter
from azureml.pipeline.steps import ParallelRunConfig

parser = argparse.ArgumentParser()
parser.add_argument("--runconfig",
                    type=str,
                    help="Path to the parallel runconfig for pipeline",
                    dest="runconfig",
                    required=True)
parser.add_argument("--dataset",
                    type=str,
                    help="Name of the registered batch dataset",
                    dest="dataset",
                    required=True)
args = parser.parse_args()
print(f'Arguments: {args}')

print('Connecting to workspace')
ws = Workspace.from_config()
print(
    f'WS name: {ws.name}\nRegion: {ws.location}\nSubscription id: {ws.subscription_id}\nResource group: {ws.resource_group}'
)

print('Loading parallel runconfig for pipeline')
parallel_run_config = ParallelRunConfig.load_yaml(workspace=ws,
                                                  path=args.runconfig)

print('Loading default batch dataset')
batch_dataset = Dataset.get_by_name(ws, args.dataset)

# Parametrize the input dataset and the name of the output dataset (batch scoring results) for the pipeline
batch_dataset_parameter = PipelineParameter(name="batch_dataset",
                                            default_value=batch_dataset)
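# Sketch (not part of the original script): with the dataset wrapped in a
# PipelineParameter, a different registered dataset can be supplied at
# submission time without rebuilding the pipeline, e.g.:
#   run = experiment.submit(pipeline,
#                           pipeline_parameters={'batch_dataset': other_dataset})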
batch_dataset_consumption = DatasetConsumptionConfig(
    "batch_dataset", batch_dataset_parameter).as_mount()

datastore = ws.get_default_datastore()
output_dataset_name = "batch_scoring_results"

# Existing GA code - does not allow specifying the path on the datastore
# output_dataset = PipelineData(name='batch_output', datastore=datastore).as_dataset()
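The snippet ends here in the original. One newer option that does allow an explicit path on the datastore is OutputFileDatasetConfig; below is a sketch under that assumption, with an illustrative folder layout and registration name.

from azureml.data import OutputFileDatasetConfig

# Write results to a named folder on the default datastore and register
# the output as a dataset when the run completes
output_dataset = OutputFileDatasetConfig(
    name='batch_output',
    destination=(datastore, 'batch_scoring_results/{run-id}')  # assumed layout
).register_on_complete(name=output_dataset_name)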