Exemplo n.º 1
0
def create_and_run_tasks_from_yaml(is_full_extraction_enabled=False,
                                   verbose=True):
    """Run one extraction task for each connection listed in CONNECTION_PATH.

    Every entry of the YAML file is converted with
    ``dump_connection_config_in_schema`` and dispatched on its ``type``:

    * ``'presto'``, ``'neo4j'`` and ``'bigquery'`` entries get an extractor
      and config from the matching ``configure_*_extractor`` helper and are
      run as a ``DefaultTask`` using ``MarkdownTransformer`` and
      ``MetaframeLoader``.
    * ``'build_script'`` entries are handed to ``run_build_script`` and need
      no task.
    * Entries of any other type are skipped with a warning.

    Args:
        is_full_extraction_enabled: Forwarded to
            ``configure_presto_extractor``; not used for other types.
        verbose: Accepted for interface compatibility; not used by this
            function.
    """
    import logging

    with open(CONNECTION_PATH) as f:
        # safe_load returns None for an empty document; treat that as
        # "no connections" instead of failing on iteration.
        raw_connection_dicts = yaml.safe_load(f) or []

    for raw_connection_dict in raw_connection_dicts:
        connection = dump_connection_config_in_schema(raw_connection_dict)

        if connection.type == 'presto':
            extractor, conf = configure_presto_extractor(
                connection,
                is_full_extraction_enabled=is_full_extraction_enabled)
        elif connection.type == 'neo4j':
            extractor, conf = configure_neo4j_extractor(connection)
        elif connection.type == 'bigquery':
            extractor, conf = configure_bigquery_extractor(connection)
        elif connection.type == 'build_script':
            run_build_script(connection)
            # Nothing to extract for this entry, but later connections must
            # still be processed, so move on rather than leaving the loop.
            continue
        else:
            # An unrecognised entry should not prevent the remaining
            # connections from running.
            logging.getLogger(__name__).warning(
                'Skipping connection %r: unsupported type %r',
                connection.name, connection.type)
            continue

        conf.put('loader.metaframe.database_name', connection.name)

        task = DefaultTask(
            extractor=extractor,
            transformer=MarkdownTransformer(),
            loader=MetaframeLoader(),
        )
        task.init(conf)
        task.run()
Exemplo n.º 2
0
from databuilder.task.task import DefaultTask
from databuilder.extractor.bigquery_metadata_extractor import BigQueryMetadataExtractor
from databuilder.models.table_metadata import TableMetadata
from metaframe.extractor.presto_loop_extractor import PrestoLoopExtractor
from metaframe.loader.metaframe_loader import MetaframeLoader
from metaframe.transformer.markdown_transformer import MarkdownTransformer


# Settings for a single PrestoLoopExtractor run. Table metadata and full
# extraction are switched on; watermark, stats and analyze are switched off.
# Database, cluster and schema filters are left unset (None).
_presto_loop_settings = {
    'extractor.presto_loop.conn_string': 'PUT CONN STRING HERE',
    'extractor.presto_loop.is_table_metadata_enabled': True,
    'extractor.presto_loop.is_full_extraction_enabled': True,
    'extractor.presto_loop.is_watermark_enabled': False,
    'extractor.presto_loop.is_stats_enabled': False,
    'extractor.presto_loop.is_analyze_enabled': False,
    'extractor.presto_loop.database': None,
    'extractor.presto_loop.cluster': None,
    'extractor.presto_loop.included_schemas': None,
    'extractor.presto_loop.excluded_schemas': None,
    'loader.metaframe.database_name': 'presto-test',
}
conf = ConfigFactory.from_dict(_presto_loop_settings)

# Wire extractor -> transformer -> loader, then initialise and run the task.
task = DefaultTask(extractor=PrestoLoopExtractor(),
                   transformer=MarkdownTransformer(),
                   loader=MetaframeLoader())
task.init(conf)
task.run()
Exemplo n.º 3
0
def main(is_full_extraction_enabled=False, verbose=True):
    """Extract metadata for every connection in ``config/connections.yaml``.

    Each entry must provide ``host`` and ``type``. ``username``,
    ``password``, ``name`` and the cluster/schema/key filter entries are
    optional and default to ``None``. Entries of type ``'presto'`` and
    ``'neo4j'`` are configured and run as a ``DefaultTask`` with a
    ``MarkdownLoader``; entries of any other type are skipped with a warning.

    Args:
        is_full_extraction_enabled: Stored under
            ``extractor.presto_loop.is_full_extraction_enabled`` for Presto
            connections; not used for other types.
        verbose: Accepted for interface compatibility; not used by this
            function.

    Raises:
        KeyError: If a connection entry lacks ``host`` or ``type``.
    """
    import logging

    with open(os.path.join(BASE_DIR, 'config/connections.yaml')) as f:
        # safe_load returns None for an empty document; treat that as
        # "no connections" instead of failing on iteration.
        connections = yaml.safe_load(f) or []

    for connection in connections:
        # Required configuration.
        host = connection['host']
        connection_type = connection['type']

        # Optional configuration; missing keys become None.
        username = connection.get('username')
        password = connection.get('password')
        name = connection.get('name')
        cluster = connection.get('cluster')
        included_schemas = connection.get('included_schemas')
        excluded_schemas = connection.get('excluded_schemas')
        included_keys = connection.get('included_keys')
        excluded_keys = connection.get('excluded_keys')
        included_key_regex = connection.get('included_key_regex')
        excluded_key_regex = connection.get('excluded_key_regex')

        if connection_type == 'presto':
            extractor = PrestoLoopExtractor()
            scope = extractor.get_scope()
            conn_string_key = '{}.conn_string'.format(scope)

            # Credentials are only embedded when a password is configured.
            # The trailing '@' separates them from the host; without it the
            # password and host run together into an invalid URL.
            if password is not None:
                credentials = '{}:{}@'.format(username, password)
            else:
                credentials = ''

            conn_string = '{connection_type}://{credentials}{host}'.format(
                connection_type=connection_type,
                credentials=credentials,
                host=host)

            conf = ConfigFactory.from_dict({
                conn_string_key: conn_string,
                'extractor.presto_loop.is_table_metadata_enabled': True,
                'extractor.presto_loop.is_full_extraction_enabled':
                    is_full_extraction_enabled,
                'extractor.presto_loop.is_watermark_enabled': False,
                'extractor.presto_loop.is_stats_enabled': False,
                'extractor.presto_loop.is_analyze_enabled': False,
                'extractor.presto_loop.database': name,
                'extractor.presto_loop.cluster': cluster,
                'extractor.presto_loop.included_schemas': included_schemas,
                'extractor.presto_loop.excluded_schemas': excluded_schemas,
            })

        elif connection_type == 'neo4j':
            extractor = AmundsenNeo4jMetadataExtractor()
            scope = extractor.get_scope()
            conf = ConfigFactory.from_dict({
                '{}.graph_url'.format(scope): 'bolt://' + host,
                '{}.neo4j_auth_user'.format(scope): username,
                '{}.neo4j_auth_pw'.format(scope): password,
                '{}.included_keys'.format(scope): included_keys,
                '{}.excluded_keys'.format(scope): excluded_keys,
                '{}.included_key_regex'.format(scope): included_key_regex,
                '{}.excluded_key_regex'.format(scope): excluded_key_regex,
            })

        else:
            # Without this guard an unsupported type would either hit an
            # undefined `conf` or silently reuse the extractor and config
            # left over from the previous connection.
            logging.getLogger(__name__).warning(
                'Skipping connection %r: unsupported type %r',
                name, connection_type)
            continue

        conf.put('loader.markdown.database_name', name)

        task = DefaultTask(
            extractor=extractor,
            loader=MarkdownLoader(),
        )
        task.init(conf)
        task.run()