import os

import great_expectations.exceptions as ge_exceptions
from great_expectations.data_context import DataContext
from great_expectations.data_context.types.base import DatasourceConfigSchema
from kedro.io import AbstractDataSet


def _add_spark_datasource(
    datasource_name: str, dataset: AbstractDataSet, ge_context: DataContext
) -> str:
    """Register a SparkDFDatasource pointing at a kedro dataset's parent directory."""
    from great_expectations.datasource import SparkDFDatasource

    # Use the directory containing the dataset file, relative to the project root.
    path = str(dataset._filepath.parent)
    if path.startswith("./"):
        path = path[2:]

    configuration = SparkDFDatasource.build_configuration(
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                # base_directory is resolved relative to the great_expectations/ folder.
                "base_directory": os.path.join("..", path),
            }
        }
    )
    configuration["class_name"] = "SparkDFDatasource"

    errors = DatasourceConfigSchema().validate(configuration)
    if len(errors) != 0:
        # errors is a dict; plain {} formatting avoids the TypeError that ":s" raises.
        raise ge_exceptions.GreatExpectationsError(
            "Invalid Datasource configuration: {}".format(errors)
        )

    ge_context.add_datasource(name=datasource_name, **configuration)
    return datasource_name
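# Hypothetical usage sketch for the helper above. The dataset class, file
# path, and project layout are illustrative assumptions, not part of the
# original; running this requires an initialized Great Expectations project
# and pyspark installed.
if __name__ == "__main__":
    from kedro.extras.datasets.spark import SparkDataSet

    dataset = SparkDataSet(filepath="data/01_raw/transactions.parquet")
    ge_context = DataContext()  # resolves the nearest great_expectations/ dir
    print(_add_spark_datasource("transactions__dir", dataset, ge_context))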
import os

import click
from great_expectations.datasource import SparkDFDatasource

# msg_prompt_filesys_enter_base_path and msg_prompt_datasource_name are
# module-level prompt strings defined alongside this helper.


def _add_spark_datasource(context):
    """Interactively prompt for a base directory and register it as a SparkDFDatasource."""
    path = click.prompt(
        msg_prompt_filesys_enter_base_path,
        # default='/data/',
        type=click.Path(exists=True, file_okay=False, dir_okay=True, readable=True),
        show_default=True,
    )
    if path.startswith("./"):
        path = path[2:]
    if path.endswith("/"):
        path = path[:-1]

    # Derive a default datasource name from the directory name.
    default_data_source_name = os.path.basename(path) + "__dir"
    data_source_name = click.prompt(
        msg_prompt_datasource_name, default=default_data_source_name, show_default=True
    )

    # base_directory is resolved relative to the great_expectations/ folder.
    configuration = SparkDFDatasource.build_configuration(
        base_directory=os.path.join("..", path)
    )
    context.add_datasource(
        name=data_source_name, class_name="SparkDFDatasource", **configuration
    )
    return data_source_name
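# Sketch of wiring the prompt-driven helper above into a click command. The
# command name and the assumption that a DataContext can be built from the
# current directory are illustrative, not from the original module.
@click.command(name="add-spark-datasource")
def add_spark_datasource_command():
    from great_expectations.data_context import DataContext

    context = DataContext()  # assumes we run inside an initialized project
    data_source_name = _add_spark_datasource(context)
    click.echo("Added Spark datasource: {}".format(data_source_name))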
import os
import textwrap

import click
import great_expectations.exceptions as ge_exceptions
from great_expectations.cli import toolkit
from great_expectations.cli.util import cli_message
from great_expectations.data_context.types.base import DatasourceConfigSchema
from great_expectations.datasource import SparkDFDatasource

# msg_prompt_* and _verify_pyspark_dependent_modules are defined alongside
# this helper in the CLI datasource module.


def _add_spark_datasource(
    context, passthrough_generator_only=True, prompt_for_datasource_name=True
):
    """Add a SparkDFDatasource to the project, confirming with the user first."""
    toolkit.send_usage_message(
        data_context=context,
        event="cli.new_ds_choice",
        event_payload={"type": "spark"},
        success=True,
    )

    if not _verify_pyspark_dependent_modules():
        return None

    if passthrough_generator_only:
        datasource_name = "files_spark_datasource"

        # configuration = SparkDFDatasource.build_configuration(batch_kwargs_generators={
        #     "default": {
        #         "class_name": "PassthroughGenerator",
        #     }
        # }
        # )
        configuration = SparkDFDatasource.build_configuration()

    else:
        path = click.prompt(
            msg_prompt_filesys_enter_base_path,
            type=click.Path(exists=True, file_okay=False),
        ).strip()
        if path.startswith("./"):
            path = path[2:]

        if path.endswith("/"):
            basenamepath = path[:-1]
        else:
            basenamepath = path

        datasource_name = os.path.basename(basenamepath) + "__dir"
        if prompt_for_datasource_name:
            datasource_name = click.prompt(
                msg_prompt_datasource_name, default=datasource_name
            )

        configuration = SparkDFDatasource.build_configuration(
            batch_kwargs_generators={
                "subdir_reader": {
                    "class_name": "SubdirReaderBatchKwargsGenerator",
                    # base_directory is resolved relative to the great_expectations/ folder.
                    "base_directory": os.path.join("..", path),
                }
            }
        )

    configuration["class_name"] = "SparkDFDatasource"
    configuration["module_name"] = "great_expectations.datasource"

    errors = DatasourceConfigSchema().validate(configuration)
    if len(errors) != 0:
        # errors is a dict; plain {} formatting avoids the TypeError that ":s" raises.
        raise ge_exceptions.GreatExpectationsError(
            "Invalid Datasource configuration: {}".format(errors)
        )

    cli_message(
        """
Great Expectations will now add a new Datasource '{:s}' to your deployment, by
adding this entry to your great_expectations.yml:

{:s}
""".format(
            datasource_name,
            textwrap.indent(toolkit.yaml.dump({datasource_name: configuration}), "  "),
        )
    )
    toolkit.confirm_proceed_or_exit()

    context.add_datasource(name=datasource_name, **configuration)
    return datasource_name
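# Sketch: scripted use of the variant above. With the default
# passthrough_generator_only=True, the base-directory and name prompts are
# skipped, though toolkit.confirm_proceed_or_exit() still asks for
# confirmation before the entry is written. The wrapper name and project
# path are illustrative assumptions.
def add_files_spark_datasource(project_dir="great_expectations"):
    from great_expectations.data_context import DataContext

    context = DataContext(project_dir)
    return _add_spark_datasource(context)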