def process_recipe(path,
                   tdir,
                   dataframe_source,
                   cov_report_file,
                   test_case,
                   spark_session):
    """Prepare the recipe and trigger the recipe execution."""
    with open(path) as f:
        code = f.read()
    code_w_reporting = prepare(code)
    cov_dict = {
        "cov_report_file": cov_report_file,
        "test_case": test_case
    }
    globals_dict = {
        "BIRGITTA_SPARK_SESSION_TYPE": "LOCAL",
        "BIRGITTA_TEST_COVERAGE": cov_dict,
        "BIRGITTA_DBG_COUNTS": dbg_counts()
    }
    full_code = script_prepend.code(tdir) + code_w_reporting
    dump_test_recipe(test_case, tdir, full_code)
    timing.time("execute_recipe before exec")
    runner.exec_code(full_code, globals_dict)
    timing.time("execute_recipe after exec")
def default_server_session(*, conf):
    """Get or create the default server spark session.

    Don't override app_name, since the calling context might already have
    given it a useful name.
    """
    session = (SparkSession.builder
               .config(conf=conf)
               .getOrCreate())
    timing.time("spark.default_server_session created/gotten")
    return session
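
# Usage sketch (illustrative, not part of the library API): build a SparkConf
# and hand it to default_server_session(). The conf key/value below is an
# example setting only.
def _example_default_server_session():
    from pyspark import SparkConf
    conf = SparkConf().set("spark.sql.shuffle.partitions", "1")
    return default_server_session(conf=conf)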
def write_fixtures(fixtures, variant_name, spark_session, dataframe_source):
    """Write fixtures to storage.

    Args:
        fixtures (dict): Dict of fixtures
        variant_name (str): Name of fixture variant
        spark_session (SparkSession): Spark session used to create fixtures
        dataframe_source (DataframeSource): The source to write to, e.g. FS
    """
    timing.time("write_fixtures start")
    dfs = dataframes(fixtures, variant_name, spark_session)
    for ds_name, df in dfs.items():
        dataframe_source.write(df, ds_name)
    timing.time("write_fixtures end")
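
# Usage sketch (illustrative): write an already-assembled fixtures dict for one
# fixture variant into a local filesystem source. The fixtures structure is
# whatever fixturing.dataframes() expects and is not constructed here; the
# variant name "default" is an example value, and LocalSource is assumed to be
# in scope, as in run_case further down.
def _example_write_fixtures(fixtures, spark_session, tmp_path):
    dataframe_source = LocalSource(dataset_dir=str(tmp_path))
    write_fixtures(fixtures, "default", spark_session, dataframe_source)
    return dataframe_source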
def log_entry(test_case, line_no, line_str, report_file, metrics):
    """Log a report entry for a line of the recipe."""
    timing.time(line_str)
    if dbg_counts() and (metrics['var_type'] == 'DataFrame'):
        print("l:", line_no, repr(line_str), "count:", metrics['count'])
    with open(report_file, 'a') as f:
        json_dict = {
            "test_case": test_case,
            "line_no": line_no,
            "line_str": line_str,
            "metrics": metrics,
        }
        f.write(json.dumps(json_dict))
        f.write("\n")
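
# Usage sketch (illustrative): log_entry() above reads the 'var_type' and
# 'count' keys of the metrics dict; the concrete values and the recipe line
# shown here are example data only.
def _example_log_entry(report_file):
    metrics = {"var_type": "DataFrame", "count": 42}
    log_entry("test_default", 7, "df = df.filter(df.age > 0)",
              report_file, metrics)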
def local_session(*, app_name='birgitta_spark_test'):
    """Get a local spark session.

    Used for recipe tests, both for running them and for creating fixtures.
    """
    conf = local_conf_spark()
    # Sets the Spark master URL to connect to, such as:
    #
    # "local" to run locally,
    # "local[4]" to run locally with 4 cores,
    # "local[*]" to run locally with as many worker threads as logical cores
    # on your machine,
    # "spark://89.9.250.25:7077" or "spark://master:7077" to run on a Spark
    # standalone cluster.
    master_spark_url = 'local[*]'
    session = (SparkSession.builder
               .config(conf=conf)
               .master(master_spark_url)
               .appName(app_name)
               .getOrCreate())
    timing.time("spark.local_session created/gotten")
    return session
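
# Usage sketch (illustrative): get a local session and build a small dataframe,
# e.g. when preparing fixtures. The app name, column names and rows are example
# data only.
def _example_local_session():
    session = local_session(app_name="birgitta_example")
    df = session.createDataFrame([(1, "a"), (2, "b")], ["id", "letter"])
    return df.count()  # 2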
def run_case(tmpdir, cov_report_file, cov_results,  # noqa F811
             test_case, spark_session, in_fixtures, out_fixtures,
             recipe_path, fixture_name):
    """Run a test case. Does the following:

    * Reporting and timing.
    * Set up input and expected result fixtures.
    * Run the spark script in recipe_path.
    * Assert the outputs.
    * Collect and present the report.

    Returns:
        None
    """
    timing.time("run_case_fn start: %s" % (fixture_name))
    tdir = tmpdir.strpath
    dataframe_source = LocalSource(dataset_dir=tdir)
    fixture_name = test_case[5:]  # Lose 'test_' prefix
    fixturing.write_fixtures(in_fixtures,
                             fixture_name,
                             spark_session,
                             dataframe_source)
    expected_dfs = fixturing.dataframes(out_fixtures,
                                        fixture_name,
                                        spark_session)
    localtest.process_recipe(recipe_path,
                             tdir,
                             dataframe_source,
                             cov_report_file,
                             test_case,
                             spark_session)
    timing.time("run_case_fn run_script done: %s" % (fixture_name))
    assertion.assert_outputs(expected_dfs, dataframe_source, spark_session)
    report.collect(cov_report_file, test_case, cov_results)
    timing.time("run_case_fn end: %s" % (fixture_name))
    timing.print_results(test_case)
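
# Usage sketch (illustrative): wiring run_case() into a pytest-style test.
# The tmpdir, cov_report_file, cov_results and spark_session arguments are
# assumed to come from the surrounding test setup (e.g. conftest fixtures);
# in_fixtures, out_fixtures and the recipe path are project-specific
# placeholders.
def _example_test_recipe(tmpdir, cov_report_file, cov_results, spark_session,
                         in_fixtures, out_fixtures):
    run_case(tmpdir, cov_report_file, cov_results, "test_default",
             spark_session, in_fixtures, out_fixtures,
             "recipes/compute_example.py", "default")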