def test_extraction_with_database_specified(self) -> None:
    """
    Test DATABASE_KEY in extractor result
    """
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute
        sql_execute.return_value = [{
            'schema': 'test_schema',
            'table_name': 'test_table',
            'last_updated_time': 1000,
            'cluster': 'MY_CLUSTER',
        }]

        extractor = SnowflakeTableLastUpdatedExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableLastUpdated(schema='test_schema',
                                    table_name='test_table',
                                    last_updated_time_epoch=1000,
                                    db=self.database_key,
                                    cluster='MY_CLUSTER')

        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def test_extraction_with_single_result(self) -> None:
    """
    Test Extraction with default cluster and database and with one table as result
    """
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute
        sql_execute.return_value = [{
            'schema': 'test_schema',
            'table_name': 'test_table',
            'last_updated_time': 1000,
            'cluster': self.conf['extractor.snowflake_table_last_updated.{}'.format(
                SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY)],
        }]

        extractor = SnowflakeTableLastUpdatedExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableLastUpdated(schema='test_schema',
                                    table_name='test_table',
                                    last_updated_time_epoch=1000,
                                    db='snowflake',
                                    cluster='MY_CLUSTER')

        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def setUp(self) -> None:
    super(TestTableLastUpdated, self).setUp()

    self.tableLastUpdated = TableLastUpdated(table_name='test_table',
                                             last_updated_time_epoch=25195665,
                                             schema='default')

    self.expected_node_results = [{
        NODE_KEY: 'hive://gold.default/test_table/timestamp',
        NODE_LABEL: 'Timestamp',
        'last_updated_timestamp:UNQUOTED': 25195665,
        timestamp_constants.TIMESTAMP_PROPERTY + ":UNQUOTED": 25195665,
        'name': 'last_updated_timestamp'
    }]

    self.expected_relation_results = [{
        RELATION_START_KEY: 'hive://gold.default/test_table',
        RELATION_START_LABEL: 'Table',
        RELATION_END_KEY: 'hive://gold.default/test_table/timestamp',
        RELATION_END_LABEL: 'Timestamp',
        RELATION_TYPE: 'LAST_UPDATED_AT',
        RELATION_REVERSE_TYPE: 'LAST_UPDATED_TIME_OF'
    }]
def test_extraction(self) -> None:
    old_datetime = datetime(2018, 8, 14, 4, 12, 3, tzinfo=UTC)
    new_datetime = datetime(2018, 11, 14, 4, 12, 3, tzinfo=UTC)

    fs = MagicMock()
    fs.ls = MagicMock(return_value=['/foo/bar', '/foo/baz'])
    fs.is_file = MagicMock(return_value=True)
    fs.info = MagicMock(side_effect=[
        FileMetadata(path='/foo/bar', last_updated=old_datetime, size=15093),
        FileMetadata(path='/foo/baz', last_updated=new_datetime, size=15094)
    ])

    pt_alchemy_extractor_instance = MagicMock()
    non_pt_alchemy_extractor_instance = MagicMock()
    with patch.object(HiveTableLastUpdatedExtractor, '_get_partitioned_table_sql_alchemy_extractor',
                      return_value=pt_alchemy_extractor_instance), \
            patch.object(HiveTableLastUpdatedExtractor, '_get_non_partitioned_table_sql_alchemy_extractor',
                         return_value=non_pt_alchemy_extractor_instance), \
            patch.object(HiveTableLastUpdatedExtractor, '_get_filesystem', return_value=fs):
        pt_alchemy_extractor_instance.extract = MagicMock(return_value=None)

        non_pt_alchemy_extractor_instance.extract = MagicMock(side_effect=null_iterator([
            {'schema': 'foo_schema', 'table_name': 'table_1', 'location': '/foo/bar'},
        ]))

        extractor = HiveTableLastUpdatedExtractor()
        extractor.init(ConfigFactory.from_dict({}))

        result = extractor.extract()
        expected = TableLastUpdated(schema='foo_schema', table_name='table_1',
                                    last_updated_time_epoch=1542168723,
                                    db='hive', cluster='gold')

        self.assertEqual(result.__repr__(), expected.__repr__())
        self.assertIsNone(extractor.extract())
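# The test above depends on a `null_iterator` helper so that the mocked
# extract() keeps returning None once the prepared rows run out (a bare
# side_effect list would raise StopIteration instead). The helper is not
# shown in this section; a minimal sketch of what it presumably looks like:
from typing import Any, Iterator, List, Optional


def null_iterator(items: List[Any]) -> Iterator[Optional[Any]]:
    # Yield each prepared row once, then yield None forever so that
    # repeated extract() calls see exhaustion as None rather than an error.
    for item in items:
        yield item
    while True:
        yield None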
def _retrieve_tables(self, dataset):
    # type: (Any) -> Iterator[TableLastUpdated]
    for page in self._page_table_list_results(dataset):
        if 'tables' not in page:
            continue

        for table in page['tables']:
            tableRef = table['tableReference']
            table_id = tableRef['tableId']

            # BigQuery tables that have 8 digits as last characters are
            # considered date range tables and are grouped together in the UI.
            # ( e.g. ga_sessions_20190101, ga_sessions_20190102, etc. )
            if self._is_sharded_table(table_id):
                # If the last eight characters are digits, we assume the table
                # is of a table date range type and we only need one schema definition
                table_prefix = table_id[:-BigQueryLastUpdatedExtractor.DATE_LENGTH]
                if table_prefix in self.grouped_tables:
                    # If one table in the date range is processed, then ignore other ones
                    # (it adds too much metadata)
                    continue

                table_id = table_prefix
                self.grouped_tables.add(table_prefix)

            table = self.bigquery_service.tables().get(
                projectId=tableRef['projectId'],
                datasetId=tableRef['datasetId'],
                tableId=tableRef['tableId']).execute(num_retries=BigQueryLastUpdatedExtractor.NUM_RETRIES)

            table_last_upd = TableLastUpdated(table_name=table_id,
                                              last_updated_time_epoch=int(table['lastModifiedTime']) // 1000,
                                              schema=tableRef['datasetId'],
                                              db='bigquery',
                                              cluster=tableRef['projectId'])

            yield table_last_upd
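# `_is_sharded_table` is referenced above but not shown in this section.
# Going by the inline comments ("the last eight characters are digits"), a
# minimal sketch of the check might look like the following; the actual
# extractor may use a stricter pattern (e.g. a regex anchored on the suffix):
def _is_sharded_table(self, table_id):
    # type: (str) -> bool
    # Assumed check: treat the table as one shard of a date-range table when
    # its DATE_LENGTH-character (8) suffix is all digits, e.g. ga_sessions_20190101.
    suffix = table_id[-BigQueryLastUpdatedExtractor.DATE_LENGTH:]
    return len(table_id) > BigQueryLastUpdatedExtractor.DATE_LENGTH and suffix.isdigit()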
def _get_last_updated_datetime_from_filesystem(self,
                                               table: str,
                                               schema: str,
                                               storage_location: str,
                                               ) -> Union[TableLastUpdated, None]:
    """
    Fetches metadata of files under storage_location to get the latest timestamp.
    (First level only under storage_location)
    Utilizes a thread pool to enhance performance. Not using a process pool, as it's
    an almost entirely IO-bound operation.
    :param table: table name
    :param schema: schema name
    :param storage_location: storage location (path) of the table
    :return: TableLastUpdated if a timestamp could be derived, otherwise None
    """
    if LOGGER.isEnabledFor(logging.DEBUG):
        LOGGER.debug(f'Getting last updated datetime for {schema}.{table} in {storage_location}')

    last_updated = OLDEST_TIMESTAMP

    paths = self._ls(storage_location)
    if not paths:
        LOGGER.info(f'{schema}.{table} does not have any file in path {storage_location}. Skipping')
        return None

    LOGGER.info(f'Fetching metadata for {schema}.{table} of {len(paths)} files')

    if 0 < self._last_updated_filecheck_threshold < len(paths):
        LOGGER.info(f'Skipping {schema}.{table} due to too many files. '
                    f'{len(paths)} files exist in {storage_location}')
        return None

    time_stamp_futures = \
        [self._fs_worker_pool.apply_async(self._get_timestamp, (path, schema, table, storage_location))
         for path in paths]
    for time_stamp_future in time_stamp_futures:
        try:
            time_stamp = time_stamp_future.get(timeout=self._fs_worker_timeout)
            if time_stamp:
                last_updated = max(time_stamp, last_updated)
        except TimeoutError:
            LOGGER.warning('Timed out on paths %s . Skipping', paths)

    if last_updated == OLDEST_TIMESTAMP:
        LOGGER.info(f'No timestamp was derived on {schema}.{table} from location: {storage_location} . Skipping')
        return None

    result = TableLastUpdated(table_name=table,
                              last_updated_time_epoch=int((last_updated - OLDEST_TIMESTAMP).total_seconds()),
                              schema=schema,
                              db=HiveTableLastUpdatedExtractor.DATABASE,
                              cluster=self._cluster)

    return result
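# The epoch conversion above, int((last_updated - OLDEST_TIMESTAMP).total_seconds()),
# only yields a Unix timestamp if OLDEST_TIMESTAMP is the Unix epoch in UTC.
# The constant is not shown in this section; presumably it is defined along
# these lines:
from datetime import datetime

import pytz

# Assumed definition: the Unix epoch, made timezone-aware so it can be
# compared with and subtracted from the tz-aware file timestamps.
OLDEST_TIMESTAMP = datetime(1970, 1, 1, tzinfo=pytz.UTC)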
def _get_extract_iter(self) -> Iterator[TableLastUpdated]:
    """
    Provides iterator of result row from SQLAlchemy extractor
    """
    tbl_last_updated_row = self._alchemy_extractor.extract()
    while tbl_last_updated_row:
        yield TableLastUpdated(table_name=tbl_last_updated_row['table_name'],
                               last_updated_time_epoch=tbl_last_updated_row['last_updated_time'],
                               schema=tbl_last_updated_row['schema'],
                               db=self._database,
                               cluster=tbl_last_updated_row['cluster'])
        tbl_last_updated_row = self._alchemy_extractor.extract()
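# The tests in this section call extract() repeatedly and expect None once
# the iterator is exhausted. That behavior comes from the extractor's public
# extract() wrapping _get_extract_iter(); a sketch of the conventional
# pairing (assumed here, the real method is not shown in this section):
from typing import Any, Iterator, Optional


class _ExtractorSketch:
    def __init__(self) -> None:
        self._extract_iter: Optional[Iterator[Any]] = None

    def _get_extract_iter(self) -> Iterator[Any]:
        yield from ()  # stand-in for the real record generator

    def extract(self) -> Optional[Any]:
        # Drain the generator one record at a time, translating
        # exhaustion (StopIteration) into None.
        if self._extract_iter is None:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None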
def _get_extract_iter(self):
    # type: () -> Iterator[TableLastUpdated]
    """
    An iterator that utilizes the generator pattern. First it provides TableLastUpdated
    objects for partitioned tables, straight from partitioned_table_extractor
    (SQLAlchemyExtractor).
    Once partitioned tables are done, it uses non_partitioned_table_extractor to get the
    storage location of each table, and probes files under the storage location to get
    the max timestamp per table.
    :return:
    """
    partitioned_tbl_row = self._partitioned_table_extractor.extract()
    while partitioned_tbl_row:
        yield TableLastUpdated(table_name=partitioned_tbl_row['table_name'],
                               last_updated_time_epoch=partitioned_tbl_row['last_updated_time'],
                               schema_name=partitioned_tbl_row['schema_name'],
                               db=HiveTableLastUpdatedExtractor.DATABASE,
                               cluster=self._cluster)
        partitioned_tbl_row = self._partitioned_table_extractor.extract()

    LOGGER.info('Extracting non-partitioned tables')
    count = 0
    non_partitioned_tbl_row = self._non_partitioned_table_extractor.extract()
    while non_partitioned_tbl_row:
        count += 1
        if count % 10 == 0:
            LOGGER.info('Processed {} non-partitioned tables'.format(count))

        if not non_partitioned_tbl_row['location']:
            LOGGER.warning('Skipping as no storage location available. {}'.format(non_partitioned_tbl_row))
            non_partitioned_tbl_row = self._non_partitioned_table_extractor.extract()
            continue

        start = time.time()
        table_last_updated = self._get_last_updated_datetime_from_filesystem(
            table=non_partitioned_tbl_row['table_name'],
            schema=non_partitioned_tbl_row['schema_name'],
            storage_location=non_partitioned_tbl_row['location'])
        LOGGER.info('Elapsed: {} seconds'.format(time.time() - start))

        if table_last_updated:
            yield table_last_updated

        non_partitioned_tbl_row = self._non_partitioned_table_extractor.extract()
def create_table_last_updated(self, table: ScrapedTableMetadata) -> Optional[TableLastUpdated]:
    '''Creates the Amundsen table last updated metadata object from the ScrapedTableMetadata object.'''
    last_modified = table.get_last_modified()
    if last_modified:
        return TableLastUpdated(table_name=table.table,
                                last_updated_time_epoch=int(last_modified.timestamp()),
                                schema=table.schema,
                                db=self._db,
                                cluster=self._cluster)
    else:
        return None
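# A quick usage sketch for create_table_last_updated, stubbing the
# ScrapedTableMetadata shape with a MagicMock (hypothetical values; the
# datetime matches the 1542168723 epoch used in the Hive extractor test above):
from datetime import datetime, timezone
from unittest.mock import MagicMock

scraped = MagicMock()
scraped.table = 'test_table'
scraped.schema = 'default'
scraped.get_last_modified.return_value = datetime(2018, 11, 14, 4, 12, 3, tzinfo=timezone.utc)

# extractor.create_table_last_updated(scraped) would then return a
# TableLastUpdated with last_updated_time_epoch == 1542168723.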
class TestTableLastUpdated(unittest.TestCase):

    def setUp(self) -> None:
        super(TestTableLastUpdated, self).setUp()

        self.tableLastUpdated = TableLastUpdated(table_name='test_table',
                                                 last_updated_time_epoch=25195665,
                                                 schema='default')

        self.expected_node_result = {
            NODE_KEY: 'hive://gold.default/test_table/timestamp',
            NODE_LABEL: 'Timestamp',
            'last_updated_timestamp:UNQUOTED': 25195665,
            timestamp_constants.TIMESTAMP_PROPERTY + ":UNQUOTED": 25195665,
            'name': 'last_updated_timestamp'
        }

        self.expected_relation_result = {
            RELATION_START_KEY: 'hive://gold.default/test_table',
            RELATION_START_LABEL: 'Table',
            RELATION_END_KEY: 'hive://gold.default/test_table/timestamp',
            RELATION_END_LABEL: 'Timestamp',
            RELATION_TYPE: 'LAST_UPDATED_AT',
            RELATION_REVERSE_TYPE: 'LAST_UPDATED_TIME_OF'
        }

    def test_create_next_node(self) -> None:
        next_node = self.tableLastUpdated.create_next_node()
        next_node_serialized = neo4_serializer.serialize_node(next_node)
        self.assertEqual(next_node_serialized, self.expected_node_result)

    def test_create_next_relation(self) -> None:
        next_relation = self.tableLastUpdated.create_next_relation()
        next_relation_serialized = neo4_serializer.serialize_relationship(next_relation)
        self.assertEqual(next_relation_serialized, self.expected_relation_result)

    def test_get_table_model_key(self) -> None:
        table = self.tableLastUpdated.get_table_model_key()
        self.assertEqual(table, 'hive://gold.default/test_table')

    def test_get_last_updated_model_key(self) -> None:
        last_updated = self.tableLastUpdated.get_last_updated_model_key()
        self.assertEqual(last_updated, 'hive://gold.default/test_table/timestamp')

    def test_create_nodes(self) -> None:
        nodes = self.tableLastUpdated.create_nodes()
        self.assertEqual(len(nodes), 1)
        serialize_node = neo4_serializer.serialize_node(nodes[0])
        self.assertEqual(serialize_node, self.expected_node_result)

    def test_create_relation(self) -> None:
        relation = self.tableLastUpdated.create_relation()
        self.assertEqual(len(relation), 1)
        serialized_relation = neo4_serializer.serialize_relationship(relation[0])
        self.assertEqual(serialized_relation, self.expected_relation_result)
class TestTableLastUpdated(unittest.TestCase):

    def setUp(self):
        # type: () -> None
        super(TestTableLastUpdated, self).setUp()

        self.tableLastUpdated = TableLastUpdated(table_name='test_table',
                                                 last_updated_time_epoch=25195665,
                                                 schema='default')

        self.expected_node_result = {
            NODE_KEY: 'hive://gold.default/test_table/timestamp',
            NODE_LABEL: 'Timestamp',
            'last_updated_timestamp': 25195665,
            'name': 'last_updated_timestamp'
        }

        self.expected_relation_result = {
            RELATION_START_KEY: 'hive://gold.default/test_table',
            RELATION_START_LABEL: 'Table',
            RELATION_END_KEY: 'hive://gold.default/test_table/timestamp',
            RELATION_END_LABEL: 'Timestamp',
            RELATION_TYPE: 'LAST_UPDATED_AT',
            RELATION_REVERSE_TYPE: 'LAST_UPDATED_TIME_OF'
        }

    def test_create_next_node(self):
        # type: () -> None
        next_node = self.tableLastUpdated.create_next_node()
        self.assertEqual(next_node, self.expected_node_result)

    def test_create_next_relation(self):
        # type: () -> None
        next_relation = self.tableLastUpdated.create_next_relation()
        self.assertEqual(next_relation, self.expected_relation_result)

    def test_get_table_model_key(self):
        # type: () -> None
        table = self.tableLastUpdated.get_table_model_key()
        self.assertEqual(table, 'hive://gold.default/test_table')

    def test_get_last_updated_model_key(self):
        # type: () -> None
        last_updated = self.tableLastUpdated.get_last_updated_model_key()
        self.assertEqual(last_updated, 'hive://gold.default/test_table/timestamp')

    def test_create_nodes(self):
        # type: () -> None
        nodes = self.tableLastUpdated.create_nodes()
        self.assertEqual(len(nodes), 1)
        self.assertEqual(nodes[0], self.expected_node_result)

    def test_create_relation(self):
        # type: () -> None
        relation = self.tableLastUpdated.create_relation()
        self.assertEqual(len(relation), 1)
        self.assertEqual(relation[0], self.expected_relation_result)
def setUp(self):
    # type: () -> None
    super(TestTableLastUpdated, self).setUp()

    self.tableLastUpdated = TableLastUpdated(table_name='test_table',
                                             last_updated_time_epoch=25195665,
                                             schema_name='default')

    self.expected_node_result = {
        NODE_KEY: 'hive://gold.default/test_table/timestamp',
        NODE_LABEL: 'Timestamp',
        'last_updated_timestamp': 25195665,
        'name': 'last_updated_timestamp'
    }

    self.expected_relation_result = {
        RELATION_START_KEY: 'hive://gold.default/test_table',
        RELATION_START_LABEL: 'Table',
        RELATION_END_KEY: 'hive://gold.default/test_table/timestamp',
        RELATION_END_LABEL: 'Timestamp',
        RELATION_TYPE: 'LAST_UPDATED_AT',
        RELATION_REVERSE_TYPE: 'LAST_UPDATED_TIME_OF'
    }
def test_extraction_with_partition_table_result(self):
    # type: () -> None
    config_dict = {
        'filesystem.{}'.format(FileSystem.DASK_FILE_SYSTEM): MagicMock()
    }
    conf = ConfigFactory.from_dict(config_dict)

    pt_alchemy_extractor_instance = MagicMock()
    non_pt_alchemy_extractor_instance = MagicMock()
    with patch.object(HiveTableLastUpdatedExtractor, '_get_partitioned_table_sql_alchemy_extractor',
                      return_value=pt_alchemy_extractor_instance), \
            patch.object(HiveTableLastUpdatedExtractor, '_get_non_partitioned_table_sql_alchemy_extractor',
                         return_value=non_pt_alchemy_extractor_instance):
        pt_alchemy_extractor_instance.extract = MagicMock(side_effect=[
            {'schema': 'foo_schema', 'table_name': 'table_1', 'last_updated_time': 1},
            {'schema': 'foo_schema', 'table_name': 'table_2', 'last_updated_time': 2}
        ])
        non_pt_alchemy_extractor_instance.extract = MagicMock(return_value=None)

        extractor = HiveTableLastUpdatedExtractor()
        extractor.init(conf)

        result = extractor.extract()
        expected = TableLastUpdated(schema='foo_schema', table_name='table_1', last_updated_time_epoch=1,
                                    db='hive', cluster='gold')
        self.assertEqual(result.__repr__(), expected.__repr__())

        result = extractor.extract()
        expected = TableLastUpdated(schema='foo_schema', table_name='table_2', last_updated_time_epoch=2,
                                    db='hive', cluster='gold')
        self.assertEqual(result.__repr__(), expected.__repr__())

        self.assertIsNone(extractor.extract())
def _get_last_updated_datetime_from_filesystem(self,
                                               table,  # type: str
                                               schema,  # type: str
                                               storage_location,  # type: str
                                               ):
    # type: (...) -> Union[TableLastUpdated, None]
    """
    Fetches metadata of files under storage_location to get the latest timestamp.
    (First level only under storage_location)
    Utilizes a thread pool to enhance performance. Not using a process pool, as it's
    an almost entirely IO-bound operation.
    :param table: table name
    :param schema: schema name
    :param storage_location: storage location (path) of the table
    :return: TableLastUpdated if a timestamp could be derived, otherwise None
    """
    if LOGGER.isEnabledFor(logging.DEBUG):
        LOGGER.debug('Getting last updated datetime for {}.{} in {}'.format(schema, table, storage_location))

    last_updated = OLDEST_TIMESTAMP

    paths = self._ls(storage_location)
    if not paths:
        LOGGER.info('{schema}.{table} does not have any file in path {path}. Skipping'
                    .format(schema=schema, table=table, path=storage_location))
        return None

    LOGGER.info('Fetching metadata for {schema}.{table} of {num_files} files'
                .format(schema=schema, table=table, num_files=len(paths)))

    if self._last_updated_filecheck_threshold > 0 and len(paths) > self._last_updated_filecheck_threshold:
        LOGGER.info('Skipping {schema}.{table} due to too many files. {len_files} files exist in {location}'
                    .format(schema=schema, table=table, len_files=len(paths), location=storage_location))
        return None

    time_stamp_futures = \
        [self._fs_worker_pool.apply_async(self._get_timestamp, (path, schema, table, storage_location))
         for path in paths]
    for time_stamp_future in time_stamp_futures:
        try:
            time_stamp = time_stamp_future.get(timeout=self._fs_worker_timeout)
            if time_stamp:
                last_updated = max(time_stamp, last_updated)
        except Exception as e:
            # apply_async().get() raises multiprocessing.TimeoutError on timeout;
            # it is matched by class name here rather than by type.
            if e.__class__.__name__ == 'TimeoutError':
                LOGGER.warning('Timed out on paths {} . Skipping'.format(paths))
            else:
                raise e

    if last_updated == OLDEST_TIMESTAMP:
        LOGGER.info('No timestamp was derived on {schema}.{table} from location: {location} . Skipping'
                    .format(schema=schema, table=table, location=storage_location))
        return None

    result = TableLastUpdated(table_name=table,
                              last_updated_time_epoch=int((last_updated - OLDEST_TIMESTAMP).total_seconds()),
                              schema_name=schema,
                              db=HiveTableLastUpdatedExtractor.DATABASE,
                              cluster=self._cluster)

    return result
def test_extraction_with_multiple_result(self) -> None:
    """
    Test Extraction with default cluster and database and with multiple tables as result
    """
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute

        default_cluster = self.conf['extractor.snowflake_table_last_updated.{}'.format(
            SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY)]

        table = {'schema': 'test_schema1',
                 'table_name': 'test_table1',
                 'last_updated_time': 1000,
                 'cluster': default_cluster}

        table1 = {'schema': 'test_schema1',
                  'table_name': 'test_table2',
                  'last_updated_time': 2000,
                  'cluster': default_cluster}

        table2 = {'schema': 'test_schema2',
                  'table_name': 'test_table3',
                  'last_updated_time': 3000,
                  'cluster': default_cluster}

        sql_execute.return_value = [table, table1, table2]

        extractor = SnowflakeTableLastUpdatedExtractor()
        extractor.init(self.conf)

        expected = TableLastUpdated(schema='test_schema1', table_name='test_table1',
                                    last_updated_time_epoch=1000,
                                    db='snowflake', cluster='MY_CLUSTER')
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        expected = TableLastUpdated(schema='test_schema1', table_name='test_table2',
                                    last_updated_time_epoch=2000,
                                    db='snowflake', cluster='MY_CLUSTER')
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        expected = TableLastUpdated(schema='test_schema2', table_name='test_table3',
                                    last_updated_time_epoch=3000,
                                    db='snowflake', cluster='MY_CLUSTER')
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        self.assertIsNone(extractor.extract())
class TestTableLastUpdated(unittest.TestCase):

    def setUp(self) -> None:
        super(TestTableLastUpdated, self).setUp()

        self.tableLastUpdated = TableLastUpdated(table_name='test_table',
                                                 last_updated_time_epoch=25195665,
                                                 schema='default')

        self.expected_node_results = [{
            NODE_KEY: 'hive://gold.default/test_table/timestamp',
            NODE_LABEL: 'Timestamp',
            'last_updated_timestamp:UNQUOTED': 25195665,
            timestamp_constants.TIMESTAMP_PROPERTY + ":UNQUOTED": 25195665,
            'name': 'last_updated_timestamp'
        }]

        self.expected_relation_results = [{
            RELATION_START_KEY: 'hive://gold.default/test_table',
            RELATION_START_LABEL: 'Table',
            RELATION_END_KEY: 'hive://gold.default/test_table/timestamp',
            RELATION_END_LABEL: 'Timestamp',
            RELATION_TYPE: 'LAST_UPDATED_AT',
            RELATION_REVERSE_TYPE: 'LAST_UPDATED_TIME_OF'
        }]

    def test_get_table_model_key(self) -> None:
        table = self.tableLastUpdated.get_table_model_key()
        self.assertEqual(table, 'hive://gold.default/test_table')

    def test_get_last_updated_model_key(self) -> None:
        last_updated = self.tableLastUpdated.get_last_updated_model_key()
        self.assertEqual(last_updated, 'hive://gold.default/test_table/timestamp')

    def test_create_nodes(self) -> None:
        actual = []
        node = self.tableLastUpdated.create_next_node()
        while node:
            serialize_node = neo4_serializer.serialize_node(node)
            actual.append(serialize_node)
            node = self.tableLastUpdated.create_next_node()

        self.assertEqual(actual, self.expected_node_results)

    def test_create_nodes_neptune(self) -> None:
        node_id = TableLastUpdated.LAST_UPDATED_NODE_LABEL + ":" + self.tableLastUpdated.get_last_updated_model_key()
        expected_nodes = [{
            NEPTUNE_HEADER_ID: node_id,
            METADATA_KEY_PROPERTY_NAME: node_id,
            NEPTUNE_HEADER_LABEL: TableLastUpdated.LAST_UPDATED_NODE_LABEL,
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
            'name:String(single)': 'last_updated_timestamp',
            'last_updated_timestamp:Long(single)': 25195665,
            timestamp_constants.TIMESTAMP_PROPERTY + ":Long(single)": 25195665,
        }]

        actual = []
        next_node = self.tableLastUpdated.create_next_node()
        while next_node:
            next_node_serialized = neptune_serializer.convert_node(next_node)
            actual.append(next_node_serialized)
            next_node = self.tableLastUpdated.create_next_node()

        self.assertEqual(actual, expected_nodes)

    def test_create_relation(self) -> None:
        actual = []
        relation = self.tableLastUpdated.create_next_relation()
        while relation:
            serialized_relation = neo4_serializer.serialize_relationship(relation)
            actual.append(serialized_relation)
            relation = self.tableLastUpdated.create_next_relation()

        self.assertEqual(actual, self.expected_relation_results)

    def test_create_relation_neptune(self) -> None:
        actual = []
        next_relation = self.tableLastUpdated.create_next_relation()
        while next_relation:
            next_relation_serialized = neptune_serializer.convert_relationship(next_relation)
            actual.append(next_relation_serialized)
            next_relation = self.tableLastUpdated.create_next_relation()

        expected = [
            [
                {
                    NEPTUNE_HEADER_ID: "{label}:{from_vertex_id}_{to_vertex_id}".format(
                        from_vertex_id='Table:hive://gold.default/test_table',
                        to_vertex_id='Timestamp:hive://gold.default/test_table/timestamp',
                        label='LAST_UPDATED_AT'
                    ),
                    METADATA_KEY_PROPERTY_NAME: "{label}:{from_vertex_id}_{to_vertex_id}".format(
                        from_vertex_id='Table:hive://gold.default/test_table',
                        to_vertex_id='Timestamp:hive://gold.default/test_table/timestamp',
                        label='LAST_UPDATED_AT'
                    ),
                    NEPTUNE_RELATIONSHIP_HEADER_FROM: 'Table:hive://gold.default/test_table',
                    NEPTUNE_RELATIONSHIP_HEADER_TO: 'Timestamp:hive://gold.default/test_table/timestamp',
                    NEPTUNE_HEADER_LABEL: 'LAST_UPDATED_AT',
                    NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
                    NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
                },
                {
                    NEPTUNE_HEADER_ID: "{label}:{from_vertex_id}_{to_vertex_id}".format(
                        from_vertex_id='Timestamp:hive://gold.default/test_table/timestamp',
                        to_vertex_id='Table:hive://gold.default/test_table',
                        label='LAST_UPDATED_TIME_OF'
                    ),
                    METADATA_KEY_PROPERTY_NAME: "{label}:{from_vertex_id}_{to_vertex_id}".format(
                        from_vertex_id='Timestamp:hive://gold.default/test_table/timestamp',
                        to_vertex_id='Table:hive://gold.default/test_table',
                        label='LAST_UPDATED_TIME_OF'
                    ),
                    NEPTUNE_RELATIONSHIP_HEADER_FROM: 'Timestamp:hive://gold.default/test_table/timestamp',
                    NEPTUNE_RELATIONSHIP_HEADER_TO: 'Table:hive://gold.default/test_table',
                    NEPTUNE_HEADER_LABEL: 'LAST_UPDATED_TIME_OF',
                    NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
                    NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
                }
            ]
        ]

        self.assertEqual(actual, expected)