def extract(spark_docs_markdown_text):
    """Parse the Spark config tables out of configuration.md text.

    Returns the root SparkConfigNode of a tree whose interior nodes are the
    dotted segments of each config path and whose leaves carry the parsed
    SparkConfig in ``.value``.
    """
    configs = []
    for section_name, table_html in re.findall(
            TABLE_REGEX, spark_docs_markdown_text, re.DOTALL | re.MULTILINE):
        table_data = list(ptr.HtmlTableTextLoader(table_html).load())[0]
        frame = table_data.as_dataframe()
        for _idx, row in frame.iterrows():
            configs.append(
                SparkConfig(
                    row['Property Name'],
                    row['Default'],
                    section_name + ": " + row['Meaning'],
                )
            )

    root = SparkConfigNode()
    for config in configs:
        # TODO: we should handle this thing
        if config.path == 'spark.executorEnv.[EnvironmentVariableName]':
            continue
        # Walk the dotted key path, creating a SparkConfigNode for each
        # segment; the SparkConfig itself lands in .value at the leaf.
        print(config.path, file=sys.stderr)
        remaining = config.split_path
        node = root
        while remaining:
            segment = remaining.pop(0)
            if segment not in node.children:
                node.children[segment] = SparkConfigNode()
            node = node.children[segment]
        node.value = config
    return root
def test_exception_null(self, table_text, expected):
    """A table with no resolvable content/name must raise ``expected``."""
    loader = ptr.HtmlTableTextLoader(table_text)
    loader.table_name = "dummy"
    with pytest.raises(expected):
        # load() is lazy — drain the generator so the error actually fires.
        list(loader.load())
def test_normal(self, table_text, table_name, expected_tabletuple_list):
    """Every table parsed from the HTML must appear in the expected list."""
    loader = ptr.HtmlTableTextLoader(table_text)
    loader.table_name = table_name
    for actual in loader.load():
        # Dump the parsed table for easier debugging on failure.
        print("[actual]\n{}".format(dump_tabledata(actual)))
        assert actual.in_tabledata_list(expected_tabletuple_list)
def main():
    """Fetch Spark's configuration.md, parse it into a config tree, and
    print an auto-generated ``spark_config`` Python module to stdout.

    Diagnostics (each config path seen) go to stderr via extract().
    """
    r = requests.get(
        'https://raw.githubusercontent.com/apache/spark/{}/docs/configuration.md'
        .format(SPARK_VERSION))
    # FIX: this function previously duplicated extract()'s entire
    # table-parsing / tree-building loop inline. Delegate to extract() so
    # the parsing logic lives in exactly one place.
    result = extract(r.text)

    with IndentingBufferPrinter() as printer:
        # Header: mark the emitted file as generated and show how to
        # regenerate it.
        printer.line("'''NOTE: THIS FILE IS AUTO-GENERATED. DO NOT EDIT")
        printer.blank_line()
        printer.line('Produced via:')
        printer.line(
            ' python parse_spark_configs.py > '
            '../event-pipeline-demo/event_pipeline_demo/configs_spark.py')
        printer.blank_line()
        printer.line("'''")
        printer.blank_line()
        printer.blank_line()
        printer.line(
            'from dagster import Bool, Field, Float, Int, PermissiveDict, String'
        )
        printer.blank_line()
        printer.blank_line()
        # The generated lines can be very long; silence the linter around
        # the emitted function.
        printer.line('# pylint: disable=line-too-long')
        printer.line('def spark_config():')
        with printer.with_indent():
            printer.append('return ')
            # SparkConfigNode.print renders the whole tree as source text.
            result.print(printer)
        printer.line('# pylint: enable=line-too-long')
        print(printer.read().strip())
def test_exception_HtmlTableTextLoader(self, monkeypatch, value, source, expected):
    """An invalid table-name value must make name resolution raise."""
    # Patch table_id so the formatter takes the tag-derived naming path.
    monkeypatch.setattr(
        HtmlTableFormatter, "table_id", self.valid_tag_property)

    loader = ptr.HtmlTableTextLoader(source)
    loader.table_name = value

    fmt = HtmlTableFormatter(source)
    fmt.accept(loader)
    with pytest.raises(expected):
        print(fmt._make_table_name())
def test_normal_HtmlTableTextLoader_null_tag(self, monkeypatch, value, expected):
    """With a null table_id tag, the fallback table name must be used."""
    # Patch table_id to simulate a <table> element with no id attribute.
    monkeypatch.setattr(
        HtmlTableFormatter, "table_id", self.null_tag_property)

    source = """
        <title>nulltag</title>
        <table></table>
        """
    loader = ptr.HtmlTableTextLoader(source)
    loader.table_name = value

    fmt = HtmlTableFormatter(source)
    fmt.accept(loader)
    assert fmt._make_table_name() == expected