def extract(spark_docs_markdown_text):
    """Build a tree of Spark config entries from the given documentation text.

    Every ``(name, table)`` pair found by ``TABLE_REGEX`` is parsed as an HTML
    table. Each row becomes a ``SparkConfig`` built from the 'Property Name',
    'Default' and 'Meaning' columns, with the meaning prefixed by ``name``.
    The configs are then arranged into a tree keyed by the components of
    ``SparkConfig.split_path``.

    Args:
        spark_docs_markdown_text: Text that is searched with ``TABLE_REGEX``.

    Returns:
        The root ``SparkConfigNode``. Interior nodes are reached through
        ``children``; the node at the end of a config's key path holds that
        ``SparkConfig`` in ``value``.

    Raises:
        IndexError: If the HTML loader yields no table for a matched block.
    """
    tables = re.findall(TABLE_REGEX, spark_docs_markdown_text,
                        re.DOTALL | re.MULTILINE)

    spark_configs = []
    for name, table in tables:
        # Only the first table produced by the loader is used.
        parsed_table = list(ptr.HtmlTableTextLoader(table).load())[0]
        df = parsed_table.as_dataframe()
        spark_configs.extend(
            SparkConfig(row['Property Name'], row['Default'],
                        name + ": " + row['Meaning'])
            for _, row in df.iterrows())

    result = SparkConfigNode()
    for spark_config in spark_configs:
        # TODO: we should handle this thing
        if spark_config.path == 'spark.executorEnv.[EnvironmentVariableName]':
            continue

        # Progress trace goes to stderr so stdout stays free for other output.
        print(spark_config.path, file=sys.stderr)

        # Walk the key path, creating a SparkConfigNode at each missing step.
        # Iterating (rather than popping from the front) leaves the config's
        # own split_path list intact and avoids an O(n) shift per key.
        node = result
        for key in spark_config.split_path:
            if key not in node.children:
                node.children[key] = SparkConfigNode()
            node = node.children[key]
        node.value = spark_config

    return result
示例#2
0
    def test_exception_null(self, table_text, expected):
        """Consuming ``load()`` for ``table_text`` must raise ``expected``."""
        text_loader = ptr.HtmlTableTextLoader(table_text)
        text_loader.table_name = "dummy"

        with pytest.raises(expected):
            # Exhaust the loader; the yielded tables themselves are irrelevant.
            list(text_loader.load())
示例#3
0
    def test_normal(self, table_text, table_name, expected_tabletuple_list):
        """Each table loaded from ``table_text`` must be among the expected ones."""
        text_loader = ptr.HtmlTableTextLoader(table_text)
        text_loader.table_name = table_name

        for loaded in text_loader.load():
            # Dump the actual table first so a failing assertion is easy to debug.
            dumped = dump_tabledata(loaded)
            print("[actual]\n{}".format(dumped))

            assert loaded.in_tabledata_list(expected_tabletuple_list)
示例#4
0
def main():
    """Download Spark's configuration docs and print a generated config module.

    Fetches ``docs/configuration.md`` for ``SPARK_VERSION`` from the Apache
    Spark GitHub repository, parses every table matched by ``TABLE_REGEX``
    into ``SparkConfig`` entries, arranges them into a ``SparkConfigNode``
    tree keyed by ``split_path``, and prints the generated Python source to
    stdout. Each processed config path is echoed to stderr.

    Raises:
        requests.HTTPError: If the documentation download returns an error
            status (for example, an unknown ``SPARK_VERSION``).
        IndexError: If the HTML loader yields no table for a matched block.
    """
    r = requests.get(
        'https://raw.githubusercontent.com/apache/spark/{}/docs/configuration.md'
        .format(SPARK_VERSION))
    # Fail loudly on 4xx/5xx instead of parsing an error page and emitting a
    # silently empty generated module.
    r.raise_for_status()

    tables = re.findall(TABLE_REGEX, r.text, re.DOTALL | re.MULTILINE)

    spark_configs = []
    for name, table in tables:
        # Only the first table produced by the loader is used.
        parsed_table = list(ptr.HtmlTableTextLoader(table).load())[0]
        df = parsed_table.as_dataframe()
        spark_configs.extend(
            SparkConfig(row['Property Name'], row['Default'],
                        name + ": " + row['Meaning'])
            for _, row in df.iterrows())

    result = SparkConfigNode()
    for s in spark_configs:
        # TODO: we should handle this thing
        if s.path == 'spark.executorEnv.[EnvironmentVariableName]':
            continue

        # Progress trace goes to stderr; stdout carries the generated module.
        print(s.path, file=sys.stderr)

        # Walk the key path, creating a SparkConfigNode at each missing step.
        # Iterating (rather than popping from the front) leaves the config's
        # own split_path list intact.
        node = result
        for key in s.split_path:
            if key not in node.children:
                node.children[key] = SparkConfigNode()
            node = node.children[key]
        node.value = s

    with IndentingBufferPrinter() as printer:
        # Module docstring of the generated file.
        printer.line("'''NOTE: THIS FILE IS AUTO-GENERATED. DO NOT EDIT")
        printer.blank_line()
        printer.line('Produced via:')
        printer.line(
            '  python parse_spark_configs.py > '
            '../event-pipeline-demo/event_pipeline_demo/configs_spark.py')
        printer.blank_line()
        printer.line("'''")
        printer.blank_line()
        printer.blank_line()
        printer.line(
            'from dagster import Bool, Field, Float, Int, PermissiveDict, String'
        )
        printer.blank_line()
        printer.blank_line()
        printer.line('# pylint: disable=line-too-long')
        printer.line('def spark_config():')
        with printer.with_indent():
            printer.append('return ')
            result.print(printer)
        printer.line('# pylint: enable=line-too-long')
        print(printer.read().strip())
示例#5
0
    def test_exception_HtmlTableTextLoader(self, monkeypatch, value, source,
                                           expected):
        """``_make_table_name`` must raise ``expected`` for the given table name."""
        # Replace the formatter's table_id with the fixture's valid-tag property.
        monkeypatch.setattr(HtmlTableFormatter, "table_id",
                            self.valid_tag_property)

        text_loader = ptr.HtmlTableTextLoader(source)
        text_loader.table_name = value

        html_formatter = HtmlTableFormatter(source)
        html_formatter.accept(text_loader)

        with pytest.raises(expected):
            made_name = html_formatter._make_table_name()
            print(made_name)
示例#6
0
    def test_normal_HtmlTableTextLoader_null_tag(self, monkeypatch, value,
                                                 expected):
        """``_make_table_name`` must return ``expected`` for an empty table tag."""
        # Replace the formatter's table_id with the fixture's null-tag property.
        monkeypatch.setattr(HtmlTableFormatter, "table_id",
                            self.null_tag_property)

        html = """
        <title>nulltag</title>
        <table></table>
        """
        text_loader = ptr.HtmlTableTextLoader(html)
        text_loader.table_name = value

        html_formatter = HtmlTableFormatter(html)
        html_formatter.accept(text_loader)

        actual_name = html_formatter._make_table_name()
        assert actual_name == expected