def create_dashboard_tables_job():
    """Build the DefaultJob that publishes dashboard->table relations to Neptune.

    Reads the sample dashboard-table CSV, applies ``_str_to_list`` to each
    row's ``table_ids`` field, converts the row dict into a DashboardTable
    model, and configures the Neptune loader/publisher pair.

    :return: the configured (not yet launched) DefaultJob
    """
    # Loader writes node/relationship CSVs here; the publisher reads them back.
    base_dir = '/var/tmp/amundsen/dashboard_table'
    nodes_dir = f'{base_dir}/nodes'
    relationships_dir = f'{base_dir}/relationships'

    extractor = CsvExtractor()
    neptune_loader = FSNeptuneCSVLoader()
    neptune_publisher = NeptuneCSVPublisher()

    field_transformer = GenericTransformer()
    model_transformer = DictToModel()
    chained = ChainedTransformer(
        transformers=[field_transformer, model_transformer],
        is_init_transformers=True,
    )

    task = DefaultTask(extractor=extractor,
                       loader=neptune_loader,
                       transformer=chained)

    conf = ConfigFactory.from_dict({
        extractor.get_scope(): {
            CsvExtractor.FILE_LOCATION: 'example/sample_data/sample_dashboard_table.csv'
        },
        chained.get_scope(): {
            field_transformer.get_scope(): {
                FIELD_NAME: 'table_ids',
                CALLBACK_FUNCTION: _str_to_list
            },
            model_transformer.get_scope(): {
                MODEL_CLASS: 'databuilder.models.dashboard.dashboard_table.DashboardTable',
            }
        },
        neptune_loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: nodes_dir,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: relationships_dir,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag'
        },
        neptune_publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: nodes_dir,
            NeptuneCSVPublisher.RELATION_FILES_DIR: relationships_dir,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
            NeptuneCSVPublisher.AWS_REGION: AWS_REGION,
            NeptuneCSVPublisher.AWS_ACCESS_KEY: aws_access_key,
            NeptuneCSVPublisher.AWS_SECRET_ACCESS_KEY: aws_access_secret,
            NeptuneCSVPublisher.AWS_SESSION_TOKEN: aws_token
        }
    })
    return DefaultJob(conf=conf, task=task, publisher=neptune_publisher)
def create_dashboard_tables_job():
    """Create a DefaultJob that publishes dashboard->table relations to Neo4j.

    Reads the sample dashboard-table CSV, applies ``_str_to_list`` to each
    row's ``table_ids`` field, converts the row dict into a DashboardTable
    model, and configures the Neo4j loader/publisher pair.

    :return: the configured (not yet launched) DefaultJob
    """
    # loader saves data to these folders and publisher reads it from here
    tmp_folder = '/var/tmp/amundsen/dashboard_table'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(
        tmp_folder=tmp_folder)

    csv_extractor = CsvExtractor()
    csv_loader = FsNeo4jCSVLoader()

    generic_transformer = GenericTransformer()
    dict_to_model_transformer = DictToModel()
    transformer = ChainedTransformer(
        transformers=[generic_transformer, dict_to_model_transformer],
        is_init_transformers=True)

    task = DefaultTask(extractor=csv_extractor,
                       loader=csv_loader,
                       transformer=transformer)
    publisher = Neo4jCsvPublisher()

    job_config = ConfigFactory.from_dict({
        '{}.file_location'.format(csv_extractor.get_scope()): 'example/sample_data/sample_dashboard_table.csv',
        '{}.{}.{}'.format(transformer.get_scope(),
                          generic_transformer.get_scope(),
                          FIELD_NAME): 'table_ids',
        '{}.{}.{}'.format(transformer.get_scope(),
                          generic_transformer.get_scope(),
                          CALLBACK_FUNCTION): _str_to_list,
        '{}.{}.{}'.format(transformer.get_scope(),
                          dict_to_model_transformer.get_scope(),
                          MODEL_CLASS): 'databuilder.models.dashboard.dashboard_table.DashboardTable',
        '{}.node_dir_path'.format(csv_loader.get_scope()): node_files_folder,
        '{}.relationship_dir_path'.format(csv_loader.get_scope()): relationship_files_folder,
        '{}.delete_created_directories'.format(csv_loader.get_scope()): True,
        '{}.node_files_directory'.format(publisher.get_scope()): node_files_folder,
        '{}.relation_files_directory'.format(publisher.get_scope()): relationship_files_folder,
        '{}.neo4j_endpoint'.format(publisher.get_scope()): neo4j_endpoint,
        '{}.neo4j_user'.format(publisher.get_scope()): neo4j_user,
        '{}.neo4j_password'.format(publisher.get_scope()): neo4j_password,
        '{}.neo4j_encrypted'.format(publisher.get_scope()): False,
        '{}.job_publish_tag'.format(publisher.get_scope()): 'unique_tag',  # should use unique tag here like {ds}
    })
    # BUG FIX: the original ended with a bare ``return`` while the
    # ``DefaultJob(...)`` expression sat on the next line, so the function
    # returned None and the constructed job was discarded.
    return DefaultJob(conf=job_config, task=task, publisher=publisher)
def test_extraction_with_model_class(self) -> None:
    """Extraction with a model_class yields TableMetadata records per CSV row."""
    self.conf = ConfigFactory.from_dict({
        f'extractor.csv.{CsvExtractor.FILE_LOCATION}': 'example/sample_data/sample_table.csv',
        'extractor.csv.model_class': 'databuilder.models.table_metadata.TableMetadata',
    })

    extractor = CsvExtractor()
    scoped_conf = Scoped.get_scoped_conf(conf=self.conf,
                                         scope=extractor.get_scope())
    extractor.init(scoped_conf)

    # First row is fully checked against the sample CSV contents.
    first = extractor.extract()
    self.assertEqual(first.name, 'test_table1')
    self.assertEqual(first.description.text, '1st test table')
    self.assertEqual(first.database, 'hive')
    self.assertEqual(first.cluster, 'gold')
    self.assertEqual(first.schema, 'test_schema')
    self.assertEqual(first.tags, ['tag1', 'tag2'])
    self.assertEqual(first.is_view, 'false')

    # Subsequent rows are spot-checked.
    second = extractor.extract()
    self.assertEqual(second.name, 'test_table2')
    self.assertEqual(second.is_view, 'false')

    third = extractor.extract()
    self.assertEqual(third.name, 'test_view1')
    self.assertEqual(third.is_view, 'true')
def run_csv_job(file_loc, job_name, model):
    """Extract ``model`` records from the CSV at ``file_loc`` and publish them to Neo4j.

    :param file_loc: path of the CSV file to extract from
    :param job_name: used to build the per-job staging directory name
    :param model: fully-qualified model class name for the extractor
    """
    # Staging area shared by the loader (writer) and the publisher (reader).
    staging_dir = f'/var/tmp/amundsen/{job_name}'
    nodes_dir = f'{staging_dir}/nodes'
    relationships_dir = f'{staging_dir}/relationships'

    extractor = CsvExtractor()
    loader = FsNeo4jCSVLoader()
    task = DefaultTask(extractor=extractor,
                       loader=loader,
                       transformer=NoopTransformer())

    conf = ConfigFactory.from_dict({
        'extractor.csv.file_location': file_loc,
        'extractor.csv.model_class': model,
        'loader.filesystem_csv_neo4j.node_dir_path': nodes_dir,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relationships_dir,
        'loader.filesystem_csv_neo4j.delete_created_directories': True,
        'publisher.neo4j.node_files_directory': nodes_dir,
        'publisher.neo4j.relation_files_directory': relationships_dir,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        # should use unique tag here like {ds}
        'publisher.neo4j.job_publish_tag': 'unique_tag',
    })

    DefaultJob(conf=conf, task=task, publisher=Neo4jCsvPublisher()).launch()
def run_csv_job(file_loc, job_name, model):
    """Extract ``model`` records from the CSV at ``file_loc`` and publish them to MySQL.

    :param file_loc: path of the CSV file to extract from
    :param job_name: used to build the per-job staging directory name
    :param model: fully-qualified model class name for the extractor
    """
    # Staging area shared by the loader (writer) and the publisher (reader).
    staging_dir = f'/var/tmp/amundsen/{job_name}'
    records_dir = f'{staging_dir}/records'

    extractor = CsvExtractor()
    loader = FSMySQLCSVLoader()
    task = DefaultTask(extractor=extractor,
                       loader=loader,
                       transformer=NoopTransformer())

    conf = ConfigFactory.from_dict({
        'extractor.csv.file_location': file_loc,
        'extractor.csv.model_class': model,
        'loader.mysql_filesystem_csv.record_dir_path': records_dir,
        'loader.mysql_filesystem_csv.delete_created_directories': True,
        'publisher.mysql.record_files_directory': records_dir,
        'publisher.mysql.conn_string': mysql_conn_string,
        'publisher.mysql.job_publish_tag': 'unique_tag',
    })

    DefaultJob(conf=conf, task=task, publisher=MySQLCSVPublisher()).launch()
def run_csv_job(file_loc, job_name, model):
    """Extract ``model`` records from the CSV at ``file_loc`` and publish them to Neptune.

    :param file_loc: path of the CSV file to extract from
    :param job_name: used to build the per-job staging directory name
    :param model: fully-qualified model class name for the extractor
    """
    # Staging area shared by the loader (writer) and the publisher (reader).
    staging_dir = f'/var/tmp/amundsen/{job_name}'
    nodes_dir = f'{staging_dir}/nodes'
    relationships_dir = f'{staging_dir}/relationships'

    extractor = CsvExtractor()
    neptune_loader = FSNeptuneCSVLoader()
    neptune_publisher = NeptuneCSVPublisher()
    task = DefaultTask(extractor=extractor,
                       loader=neptune_loader,
                       transformer=NoopTransformer())

    conf = ConfigFactory.from_dict({
        'extractor.csv.file_location': file_loc,
        'extractor.csv.model_class': model,
        neptune_loader.get_scope(): {
            FSNeptuneCSVLoader.NODE_DIR_PATH: nodes_dir,
            FSNeptuneCSVLoader.RELATION_DIR_PATH: relationships_dir,
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag'
        },
        neptune_publisher.get_scope(): {
            NeptuneCSVPublisher.NODE_FILES_DIR: nodes_dir,
            NeptuneCSVPublisher.RELATION_FILES_DIR: relationships_dir,
            NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
            NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
            NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
            NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name
        },
    })

    DefaultJob(conf=conf, task=task, publisher=neptune_publisher).launch()
def create_dashboard_tables_job():
    """Build the DefaultJob that publishes dashboard->table relations to Atlas.

    Reads the sample dashboard-table CSV, applies ``_str_to_list`` to each
    row's ``table_ids`` field, converts the row dict into a DashboardTable
    model, and configures the Atlas loader/publisher pair.

    :return: the configured (not yet launched) DefaultJob
    """
    # Loader writes entity/relationship CSVs here; the publisher reads them back.
    staging_dir = '/var/tmp/amundsen/dashboard_table'
    entities_dir = f'{staging_dir}/nodes'
    relationships_dir = f'{staging_dir}/relationships'

    extractor = CsvExtractor()
    atlas_loader = FsAtlasCSVLoader()

    field_transformer = GenericTransformer()
    model_transformer = DictToModel()
    chained = ChainedTransformer(
        transformers=[field_transformer, model_transformer],
        is_init_transformers=True,
    )

    task = DefaultTask(extractor=extractor,
                       loader=atlas_loader,
                       transformer=chained)
    atlas_publisher = AtlasCSVPublisher()

    conf = ConfigFactory.from_dict({
        f'{extractor.get_scope()}.file_location': 'example/sample_data/sample_dashboard_table.csv',
        f'{chained.get_scope()}.{field_transformer.get_scope()}.{FIELD_NAME}': 'table_ids',
        f'{chained.get_scope()}.{field_transformer.get_scope()}.{CALLBACK_FUNCTION}': _str_to_list,
        f'{chained.get_scope()}.{model_transformer.get_scope()}.{MODEL_CLASS}':
            'databuilder.models.dashboard.dashboard_table.DashboardTable',
        f'loader.filesystem_csv_atlas.{FsAtlasCSVLoader.ENTITY_DIR_PATH}': entities_dir,
        f'loader.filesystem_csv_atlas.{FsAtlasCSVLoader.RELATIONSHIP_DIR_PATH}': relationships_dir,
        f'loader.filesystem_csv_atlas.{FsAtlasCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.ATLAS_CLIENT}':
            AtlasClient(atlas_endpoint, (atlas_user, atlas_password)),
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.ENTITY_DIR_PATH}': entities_dir,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.RELATIONSHIP_DIR_PATH}': relationships_dir,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.ATLAS_ENTITY_CREATE_BATCH_SIZE}': ATLAS_CREATE_BATCH_SIZE,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.REGISTER_ENTITY_TYPES}': False,
    })
    return DefaultJob(conf=conf, task=task, publisher=atlas_publisher)
def test_extraction_with_model_class(self) -> None:
    """Extraction with a model_class yields a populated record for the first CSV row."""
    extractor = CsvExtractor()
    scoped_conf = Scoped.get_scoped_conf(conf=self.conf,
                                         scope=extractor.get_scope())
    extractor.init(scoped_conf)

    record = extractor.extract()
    self.assertEqual(record.name, 'test_table1')
    self.assertEqual(record.description._text, '1st test table')
    self.assertEqual(record.database, 'hive')
    self.assertEqual(record.cluster, 'gold')
    self.assertEqual(record.schema, 'test_schema')
def create_dashboard_tables_job():
    """Build the DefaultJob that publishes dashboard->table relations to MySQL.

    Reads the sample dashboard-table CSV, applies ``_str_to_list`` to each
    row's ``table_ids`` field, converts the row dict into a DashboardTable
    model, and configures the MySQL loader/publisher pair.

    :return: the configured (not yet launched) DefaultJob
    """
    # Loader writes record CSVs here; the publisher reads them back.
    staging_dir = '/var/tmp/amundsen/dashboard_table'
    records_dir = f'{staging_dir}/records'
    model_class = 'databuilder.models.dashboard.dashboard_table.DashboardTable'

    extractor = CsvExtractor()
    mysql_loader = FSMySQLCSVLoader()

    field_transformer = GenericTransformer()
    model_transformer = DictToModel()
    chained = ChainedTransformer(
        transformers=[field_transformer, model_transformer],
        is_init_transformers=True,
    )

    task = DefaultTask(extractor=extractor,
                       loader=mysql_loader,
                       transformer=chained)
    mysql_publisher = MySQLCSVPublisher()

    conf = ConfigFactory.from_dict({
        'extractor.csv.file_location': 'example/sample_data/sample_dashboard_table.csv',
        'transformer.chained.transformer.generic.field_name': 'table_ids',
        'transformer.chained.transformer.generic.callback_function': _str_to_list,
        'transformer.chained.transformer.dict_to_model.model_class': model_class,
        'loader.mysql_filesystem_csv.record_dir_path': records_dir,
        'loader.mysql_filesystem_csv.delete_created_directories': True,
        'publisher.mysql.record_files_directory': records_dir,
        'publisher.mysql.conn_string': mysql_conn_string,
        'publisher.mysql.job_publish_tag': 'unique_tag',
    })
    return DefaultJob(conf=conf, task=task, publisher=mysql_publisher)
def run_csv_job(file_loc, table_name, model):
    """Extract ``model`` records from the CSV at ``file_loc`` and publish them to Neo4j.

    :param file_loc: path of the CSV file to extract from
    :param table_name: used to build the per-job staging directory name
    :param model: fully-qualified model class name for the extractor
    """
    # Staging area shared by the loader (writer) and the publisher (reader).
    staging_dir = f'/var/tmp/amundsen/{table_name}'
    nodes_dir = f'{staging_dir}/nodes'
    relationships_dir = f'{staging_dir}/relationships'

    extractor = CsvExtractor()
    loader = FsNeo4jCSVLoader()
    task = DefaultTask(extractor=extractor,
                       loader=loader,
                       transformer=NoopTransformer())

    conf = ConfigFactory.from_dict({
        f'extractor.csv.{CsvExtractor.FILE_LOCATION}': file_loc,
        'extractor.csv.model_class': model,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationships_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationships_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        # should use unique tag here like {ds}
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag',
    })

    DefaultJob(conf=conf, task=task, publisher=Neo4jCsvPublisher()).launch()
def create_model_job(self, file_loc, metadata_name, model):
    # type: (str, str, str) -> DefaultJob
    """Build a DefaultJob that loads ``model`` rows from ``file_loc`` into Neo4j.

    Connection settings come from this instance (``self.neo4j_endpoint`` etc.);
    when ``self.use_unique_id`` is set, a fresh UUID is appended to the
    publish tag.
    """
    # Staging area shared by the loader (writer) and the publisher (reader).
    staging_dir = f'/tmp/amundsen/{metadata_name}'
    nodes_dir = f'{staging_dir}/nodes'
    relationships_dir = f'{staging_dir}/relationships'

    extractor = CsvExtractor()
    loader = FsNeo4jCSVLoader()
    task = DefaultTask(extractor=extractor,
                       loader=loader,
                       transformer=NoopTransformer())

    publish_tag = 'unique_tag'
    if self.use_unique_id:
        publish_tag += str(uuid.uuid4())

    conf = ConfigFactory.from_dict({
        f'extractor.csv.{CsvExtractor.FILE_LOCATION}': file_loc,
        'extractor.csv.model_class': model,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': nodes_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationships_dir,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': nodes_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationships_dir,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': self.neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': self.neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': self.neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': publish_tag
    })
    return DefaultJob(conf=conf, task=task, publisher=Neo4jCsvPublisher())
def run_csv_job(file_loc, job_name, model):
    """Extract ``model`` records from the CSV at ``file_loc`` and publish them to Atlas.

    :param file_loc: path of the CSV file to extract from
    :param job_name: used to build the per-job staging directory name
    :param model: fully-qualified model class name for the extractor
    """
    # Staging area shared by the loader (writer) and the publisher (reader).
    staging_dir = f'/var/tmp/amundsen/{job_name}'
    entities_dir = f'{staging_dir}/nodes'
    relationships_dir = f'{staging_dir}/relationships'

    extractor = CsvExtractor()
    atlas_loader = FsAtlasCSVLoader()
    task = DefaultTask(extractor=extractor,
                       loader=atlas_loader,
                       transformer=NoopTransformer())

    conf = ConfigFactory.from_dict({
        'extractor.csv.file_location': file_loc,
        'extractor.csv.model_class': model,
        f'loader.filesystem_csv_atlas.{FsAtlasCSVLoader.ENTITY_DIR_PATH}': entities_dir,
        f'loader.filesystem_csv_atlas.{FsAtlasCSVLoader.RELATIONSHIP_DIR_PATH}': relationships_dir,
        f'loader.filesystem_csv_atlas.{FsAtlasCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.ATLAS_CLIENT}':
            AtlasClient(atlas_endpoint, (atlas_user, atlas_password)),
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.ENTITY_DIR_PATH}': entities_dir,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.RELATIONSHIP_DIR_PATH}': relationships_dir,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.ATLAS_ENTITY_CREATE_BATCH_SIZE}': ATLAS_CREATE_BATCH_SIZE,
        f'publisher.atlas_csv_publisher.{AtlasCSVPublisher.REGISTER_ENTITY_TYPES}': False,
    })

    DefaultJob(conf=conf, task=task, publisher=AtlasCSVPublisher()).launch()