Example #1
0
    def test_scope(self):
        """The transformer must report the 'transformer.bigquery_usage' scope."""
        transformer = BigqueryUsageTransformer()
        transformer.init(ConfigFactory.from_dict({}))

        self.assertEqual('transformer.bigquery_usage', transformer.get_scope())
Example #2
0
    def test_transform_function(self) -> None:
        """Transform one (usage-key, read_count) tuple and check every field
        of the resulting TableColumnUsage record survives the round trip.
        """
        config = ConfigFactory.from_dict({})

        transformer = BigqueryUsageTransformer()
        transformer.init(config)

        key = TableColumnUsageTuple(
            database=TestBigQueryUsageTransform.DATABASE,
            cluster=TestBigQueryUsageTransform.CLUSTER,
            schema=TestBigQueryUsageTransform.DATASET,
            table=TestBigQueryUsageTransform.TABLE,
            column=TestBigQueryUsageTransform.COLUMN,
            email=TestBigQueryUsageTransform.EMAIL)

        t1 = (key, TestBigQueryUsageTransform.READ_COUNT)
        xformed = transformer.transform(t1)

        self.assertIsInstance(xformed, TableColumnUsage)
        self.assertEqual(len(xformed.col_readers), 1)
        col_reader = xformed.col_readers[0]
        # Each attribute of the single reader must match the fixture constants.
        self.assertEqual(col_reader.cluster,
                         TestBigQueryUsageTransform.CLUSTER)
        self.assertEqual(col_reader.database,
                         TestBigQueryUsageTransform.DATABASE)
        self.assertEqual(col_reader.schema, TestBigQueryUsageTransform.DATASET)
        self.assertEqual(col_reader.table, TestBigQueryUsageTransform.TABLE)
        self.assertEqual(col_reader.column, TestBigQueryUsageTransform.COLUMN)
        self.assertEqual(col_reader.user_email,
                         TestBigQueryUsageTransform.EMAIL)
        self.assertEqual(col_reader.read_count,
                         TestBigQueryUsageTransform.READ_COUNT)
Example #3
0
    def test_transform_function(self) -> None:
        """transform() should yield exactly one column reader built from the fixture."""
        transformer = BigqueryUsageTransformer()
        transformer.init(ConfigFactory.from_dict({}))

        usage_key = TableColumnUsageTuple(
            database=TestBigQueryUsageTransform.DATABASE,
            cluster=TestBigQueryUsageTransform.CLUSTER,
            schema=TestBigQueryUsageTransform.DATASET,
            table=TestBigQueryUsageTransform.TABLE,
            column=TestBigQueryUsageTransform.COLUMN,
            email=TestBigQueryUsageTransform.EMAIL)

        result = transformer.transform(
            (usage_key, TestBigQueryUsageTransform.READ_COUNT))

        assert result is not None
        self.assertIsInstance(result, TableColumnUsage)
        readers = list(result.col_readers)
        self.assertEqual(1, len(readers))
        reader = readers[0]
        self.assertEqual('Table', reader.start_label)
        self.assertEqual(TestBigQueryUsageTransform.TABLE_KEY, reader.start_key)
        self.assertEqual(TestBigQueryUsageTransform.EMAIL, reader.user_email)
        self.assertEqual(TestBigQueryUsageTransform.READ_COUNT, reader.read_count)
Example #4
0
def create_transformer(metadata_type):
    """Return the transformer matching *metadata_type*.

    :param metadata_type: a MetadataType member (DSL or USAGE).
    :return: NoopTransformer for DSL, BigqueryUsageTransformer for USAGE.
    :raises ValueError: for any other value; the message names the value
        so the failure is diagnosable from the log alone.
    """
    if metadata_type == MetadataType.DSL:
        return NoopTransformer()
    if metadata_type == MetadataType.USAGE:
        return BigqueryUsageTransformer()
    raise ValueError('Invalid metadata_type: {}'.format(metadata_type))
Example #5
0
def run_bq_tu_job(job_name, gcloud_project="peya-data-pocs"):
    """Extract BigQuery table-usage data and publish it to Neo4j.

    Stages extractor output as CSVs under a per-job temp folder, then
    launches an extract -> transform -> load -> publish job.

    :param job_name: namespaces the temporary CSV staging folders so
        concurrent jobs do not collide.
    :param gcloud_project: GCP project to read usage from. Defaults to the
        previously hard-coded project so existing callers are unaffected.

    NOTE(review): relies on module-level ``neo4j_endpoint`` / ``neo4j_user`` /
    ``neo4j_password`` globals defined elsewhere in this file.
    """
    tmp_folder = '/var/tmp/amundsen/{job_name}'.format(job_name=job_name)
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    bq_usage_extractor = BigQueryTableUsageExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=bq_usage_extractor,
                       loader=csv_loader,
                       transformer=BigqueryUsageTransformer())

    job_config = ConfigFactory.from_dict({
        'extractor.bigquery_table_usage.{}'.format(BigQueryTableUsageExtractor.PROJECT_ID_KEY):
            gcloud_project,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH):
            node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH):
            relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR):
            True,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR):
            node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR):
            relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY):
            neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER):
            neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD):
            neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG):
            'unique_tag',  # should use unique tag here like {ds}
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())

    job.launch()
Example #6
0
def create_bq_job(metadata_type, gcloud_project):
    """Build (without launching) a DefaultJob that extracts BigQuery table
    usage for *gcloud_project*, stages it as CSVs under a per-type temp
    folder, and publishes it to Neo4j.
    """
    base_folder = f'/var/tmp/amundsen/{metadata_type}'
    nodes_folder = f'{base_folder}/nodes'
    relationships_folder = f'{base_folder}/relationships'

    # Extractor -> transformer -> CSV loader pipeline.
    task = DefaultTask(extractor=BigQueryTableUsageExtractor(),
                       loader=FsNeo4jCSVLoader(),
                       transformer=BigqueryUsageTransformer())

    config_dict = {}
    config_dict[f'extractor.bigquery_table_usage.{BigQueryTableUsageExtractor.PROJECT_ID_KEY}'] = \
        gcloud_project
    config_dict[f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}'] = \
        nodes_folder
    config_dict[f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}'] = \
        relationships_folder
    config_dict[f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}'] = \
        True
    config_dict[f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}'] = \
        nodes_folder
    config_dict[f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}'] = \
        relationships_folder
    config_dict[f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}'] = \
        neo4j_endpoint
    config_dict[f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}'] = \
        neo4j_user
    config_dict[f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}'] = \
        neo4j_password
    config_dict[f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}'] = \
        'unique_tag'  # should use unique tag here like {ds}

    return DefaultJob(conf=ConfigFactory.from_dict(config_dict),
                      task=task,
                      publisher=Neo4jCsvPublisher())