def run_csv_job(file_loc, job_name, model):
    """Build and launch a CSV-extractor job published through ``NeptuneCSVPublisher``.

    Args:
        file_loc: Value stored under ``extractor.csv.file_location``.
        job_name: Used to name the scratch directory
            ``/var/tmp/amundsen/<job_name>``.
        model: Value stored under ``extractor.csv.model_class``.
    """
    # The loader and the publisher are both pointed at these two directories.
    base_dir = f'/var/tmp/amundsen/{job_name}'
    nodes_dir = f'{base_dir}/nodes'
    relations_dir = f'{base_dir}/relationships'

    extractor = CsvExtractor()
    fs_loader = FSNeptuneCSVLoader()
    neptune_publisher = NeptuneCSVPublisher()
    csv_task = DefaultTask(
        extractor=extractor,
        loader=fs_loader,
        transformer=NoopTransformer(),
    )

    loader_settings = {
        FSNeptuneCSVLoader.NODE_DIR_PATH: nodes_dir,
        FSNeptuneCSVLoader.RELATION_DIR_PATH: relations_dir,
        FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
        FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag',
    }
    publisher_settings = {
        NeptuneCSVPublisher.NODE_FILES_DIR: nodes_dir,
        NeptuneCSVPublisher.RELATION_FILES_DIR: relations_dir,
        NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
        NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
        NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
        NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
    }
    conf = ConfigFactory.from_dict({
        'extractor.csv.file_location': file_loc,
        'extractor.csv.model_class': model,
        fs_loader.get_scope(): loader_settings,
        neptune_publisher.get_scope(): publisher_settings,
    })

    job = DefaultJob(conf=conf, task=csv_task, publisher=neptune_publisher)
    job.launch()
def create_last_updated_job():
    """Assemble (without launching) the job built around ``EsLastUpdatedExtractor``.

    Returns:
        A ``DefaultJob`` wired with an ``FSNeptuneCSVLoader`` and a
        ``NeptuneCSVPublisher``; the caller is responsible for launching it.
    """
    # loader saves data to these folders and publisher reads it from here
    base_dir = '/var/tmp/amundsen/last_updated_data'
    nodes_dir = f'{base_dir}/nodes'
    relations_dir = f'{base_dir}/relationships'

    fs_loader = FSNeptuneCSVLoader()
    last_updated_task = DefaultTask(
        extractor=EsLastUpdatedExtractor(),
        loader=fs_loader,
    )
    neptune_publisher = NeptuneCSVPublisher()

    loader_settings = {
        FSNeptuneCSVLoader.NODE_DIR_PATH: nodes_dir,
        FSNeptuneCSVLoader.RELATION_DIR_PATH: relations_dir,
        FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
        FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag',
    }
    publisher_settings = {
        NeptuneCSVPublisher.NODE_FILES_DIR: nodes_dir,
        NeptuneCSVPublisher.RELATION_FILES_DIR: relations_dir,
        NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
        NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
        NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
        NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
        # Literal key kept exactly as written; it sits in the publisher scope.
        'job_publish_tag': 'unique_lastupdated_tag',
    }
    conf = ConfigFactory.from_dict({
        'extractor.es_last_updated.model_class':
            'databuilder.models.es_last_updated.ESLastUpdated',
        fs_loader.get_scope(): loader_settings,
        neptune_publisher.get_scope(): publisher_settings,
    })

    return DefaultJob(conf=conf, task=last_updated_task, publisher=neptune_publisher)
def run_table_column_job(table_path, column_path):
    """Build and launch a job that reads a table CSV and a column CSV.

    Args:
        table_path: Value stored under
            ``extractor.csvtablecolumn.table_file_location``.
        column_path: Value stored under
            ``extractor.csvtablecolumn.column_file_location``.
    """
    # The loader and the publisher are both pointed at these two directories.
    base_dir = '/var/tmp/amundsen/table_column'
    nodes_dir = f'{base_dir}/nodes'
    relations_dir = f'{base_dir}/relationships'

    table_column_extractor = CsvTableColumnExtractor()
    fs_loader = FSNeptuneCSVLoader()
    neptune_publisher = NeptuneCSVPublisher()
    table_column_task = DefaultTask(
        table_column_extractor,
        loader=fs_loader,
        transformer=NoopTransformer(),
    )

    loader_settings = {
        FSNeptuneCSVLoader.NODE_DIR_PATH: nodes_dir,
        FSNeptuneCSVLoader.RELATION_DIR_PATH: relations_dir,
        FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
        FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag',
    }
    publisher_settings = {
        NeptuneCSVPublisher.NODE_FILES_DIR: nodes_dir,
        NeptuneCSVPublisher.RELATION_FILES_DIR: relations_dir,
        NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
        NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
        NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
        NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
    }
    conf = ConfigFactory.from_dict({
        'extractor.csvtablecolumn.table_file_location': table_path,
        'extractor.csvtablecolumn.column_file_location': column_path,
        fs_loader.get_scope(): loader_settings,
        neptune_publisher.get_scope(): publisher_settings,
    })

    DefaultJob(conf=conf, task=table_column_task, publisher=neptune_publisher).launch()
def create_dashboard_tables_job():
    """Assemble (without launching) the dashboard-table job.

    Rows come from ``example/sample_data/sample_dashboard_table.csv`` and pass
    through a ``GenericTransformer`` (applying ``_str_to_list`` to the
    ``table_ids`` field) followed by a ``DictToModel`` transformer configured
    with the ``DashboardTable`` model class.

    Returns:
        A ``DefaultJob`` ready to be launched by the caller.
    """
    # loader saves data to these folders and publisher reads it from here
    base_dir = '/var/tmp/amundsen/dashboard_table'
    nodes_dir = f'{base_dir}/nodes'
    relations_dir = f'{base_dir}/relationships'

    extractor = CsvExtractor()
    fs_loader = FSNeptuneCSVLoader()
    neptune_publisher = NeptuneCSVPublisher()

    field_transformer = GenericTransformer()
    model_transformer = DictToModel()
    chain = ChainedTransformer(
        transformers=[field_transformer, model_transformer],
        is_init_transformers=True,
    )

    dashboard_task = DefaultTask(extractor=extractor, loader=fs_loader, transformer=chain)

    extractor_settings = {
        CsvExtractor.FILE_LOCATION: 'example/sample_data/sample_dashboard_table.csv',
    }
    # Each chained transformer gets its own sub-scope inside the chain's scope.
    transformer_settings = {
        field_transformer.get_scope(): {
            FIELD_NAME: 'table_ids',
            CALLBACK_FUNCTION: _str_to_list,
        },
        model_transformer.get_scope(): {
            MODEL_CLASS: 'databuilder.models.dashboard.dashboard_table.DashboardTable',
        },
    }
    loader_settings = {
        FSNeptuneCSVLoader.NODE_DIR_PATH: nodes_dir,
        FSNeptuneCSVLoader.RELATION_DIR_PATH: relations_dir,
        FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
        FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag',
    }
    publisher_settings = {
        NeptuneCSVPublisher.NODE_FILES_DIR: nodes_dir,
        NeptuneCSVPublisher.RELATION_FILES_DIR: relations_dir,
        NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
        NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
        NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
        NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
        NeptuneCSVPublisher.AWS_REGION: AWS_REGION,
        NeptuneCSVPublisher.AWS_ACCESS_KEY: aws_access_key,
        NeptuneCSVPublisher.AWS_SECRET_ACCESS_KEY: aws_access_secret,
        NeptuneCSVPublisher.AWS_SESSION_TOKEN: aws_token,
    }
    conf = ConfigFactory.from_dict({
        extractor.get_scope(): extractor_settings,
        chain.get_scope(): transformer_settings,
        fs_loader.get_scope(): loader_settings,
        neptune_publisher.get_scope(): publisher_settings,
    })

    return DefaultJob(conf=conf, task=dashboard_task, publisher=neptune_publisher)
def test_load(self) -> None:
    """Load one ``Movie`` via ``FSNeptuneCSVLoader`` and compare its CSV output to fixtures.

    Node rows and relationship rows are read back with ``self._get_csv_rows``
    (keyed on the ``~id`` column) from both the fixture directories and the
    directories named in ``self._conf``, and are expected to match.
    """
    cast = [Actor('Tom Cruise'), Actor('Meg Ryan')]
    locations = [City('San Diego'), City('Oakland')]
    top_gun = Movie('Top Gun', cast, locations)

    csv_loader = FSNeptuneCSVLoader()
    csv_loader.init(self._conf)
    csv_loader.load(top_gun)
    csv_loader.close()

    by_id = itemgetter('~id')
    # Fixture directories are resolved relative to this test module.
    here = os.path.join(os.path.dirname(__file__))
    fixture_root = f'{here}/../resources/fs_neptune_csv_loader'

    # --- nodes ---
    wanted_nodes = self._get_csv_rows(f'{fixture_root}/nodes', by_id)
    node_output_dir = self._conf.get_string(FSNeptuneCSVLoader.NODE_DIR_PATH)
    written_nodes = self._get_csv_rows(node_output_dir, by_id)
    self.maxDiff = None  # show the full diff on mismatch
    self.assertEqual(wanted_nodes, written_nodes)

    # --- relationships ---
    wanted_relations = self._get_csv_rows(f'{fixture_root}/relationships', by_id)
    relation_output_dir = self._conf.get_string(FSNeptuneCSVLoader.RELATION_DIR_PATH)
    written_relations = self._get_csv_rows(relation_output_dir, by_id)
    self.assertListEqual(list(wanted_relations), list(written_relations))
def run_postgres_job(job_name):
    """Build and launch a ``PostgresMetadataExtractor`` job published via ``NeptuneCSVPublisher``.

    The extractor is restricted by a where-clause on the module-level
    ``postgres_schema`` and connects with the string returned by
    ``connection_string()``.

    Args:
        job_name: Used to name the scratch directory
            ``/var/tmp/amundsen/<job_name>``.
    """
    # The loader and the publisher are both pointed at these two directories.
    base_dir = f'/var/tmp/amundsen/{job_name}'
    nodes_dir = f'{base_dir}/nodes'
    relations_dir = f'{base_dir}/relationships'

    fs_loader = FSNeptuneCSVLoader()
    neptune_publisher = NeptuneCSVPublisher()

    schema_filter = textwrap.dedent(
        """ where table_schema = '{}' """.format(postgres_schema)
    )

    extractor_scope = 'extractor.postgres_metadata'
    loader_settings = {
        FSNeptuneCSVLoader.NODE_DIR_PATH: nodes_dir,
        FSNeptuneCSVLoader.RELATION_DIR_PATH: relations_dir,
        FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
        FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag',
    }
    publisher_settings = {
        NeptuneCSVPublisher.NODE_FILES_DIR: nodes_dir,
        NeptuneCSVPublisher.RELATION_FILES_DIR: relations_dir,
        NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
        NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
        NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
        NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
        NeptuneCSVPublisher.AWS_REGION: AWS_REGION,
    }
    conf = ConfigFactory.from_dict({
        f'{extractor_scope}.{PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}':
            schema_filter,
        f'{extractor_scope}.{PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME}':
            True,
        f'{extractor_scope}.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}':
            connection_string(),
        fs_loader.get_scope(): loader_settings,
        neptune_publisher.get_scope(): publisher_settings,
    })

    postgres_task = DefaultTask(extractor=PostgresMetadataExtractor(), loader=fs_loader)
    job = DefaultJob(conf=conf, task=postgres_task, publisher=neptune_publisher)
    job.launch()