def __init__(self,
             extractor: Extractor,
             transformer: Transformer = NoopTransformer()) -> None:
    self.extractor = extractor
    self.transformer = transformer

    self._closer = Closer()
    self._closer.register(self.extractor.close)
    self._closer.register(self.transformer.close)
def __init__(self, extractor, loader, transformer=NoopTransformer()):
    # type: (Extractor, Loader, Transformer) -> None
    self.extractor = extractor
    self.transformer = transformer
    self.loader = loader

    self._closer = Closer()
    self._closer.register(self.extractor.close)
    self._closer.register(self.transformer.close)
    self._closer.register(self.loader.close)
class DefaultTask(Task):
    """
    A default task expecting to extract, transform and load.
    """
    # Determines the frequency of the log on task progress
    PROGRESS_REPORT_FREQUENCY = 'progress_report_frequency'

    def __init__(self,
                 extractor: Extractor,
                 loader: Loader,
                 transformer: Transformer = NoopTransformer()) -> None:
        self.extractor = extractor
        self.transformer = transformer
        self.loader = loader

        self._closer = Closer()
        self._closer.register(self.extractor.close)
        self._closer.register(self.transformer.close)
        self._closer.register(self.loader.close)

    def init(self, conf: ConfigTree) -> None:
        self._progress_report_frequency = \
            conf.get_int('{}.{}'.format(self.get_scope(), DefaultTask.PROGRESS_REPORT_FREQUENCY), 500)

        self.extractor.init(Scoped.get_scoped_conf(conf, self.extractor.get_scope()))
        self.transformer.init(Scoped.get_scoped_conf(conf, self.transformer.get_scope()))
        self.loader.init(Scoped.get_scoped_conf(conf, self.loader.get_scope()))

    def run(self) -> None:
        """
        Runs a task
        :return:
        """
        LOGGER.info('Running a task')
        try:
            record = self.extractor.extract()
            count = 1
            while record:
                record = self.transformer.transform(record)
                if not record:
                    record = self.extractor.extract()
                    continue

                self.loader.load(record)
                record = self.extractor.extract()
                count += 1
                if count > 0 and count % self._progress_report_frequency == 0:
                    LOGGER.info('Extracted {} records so far'.format(count))
        finally:
            self._closer.close()
class DefaultTask(Task):
    """
    A default task expecting to extract, transform and load.
    """

    def __init__(self, extractor, loader, transformer=NoopTransformer()):
        # type: (Extractor, Loader, Transformer) -> None
        self.extractor = extractor
        self.transformer = transformer
        self.loader = loader

        self._closer = Closer()
        self._closer.register(self.extractor.close)
        self._closer.register(self.transformer.close)
        self._closer.register(self.loader.close)

    def init(self, conf):
        # type: (ConfigTree) -> None
        self.extractor.init(Scoped.get_scoped_conf(conf, self.extractor.get_scope()))
        self.transformer.init(Scoped.get_scoped_conf(conf, self.transformer.get_scope()))
        self.loader.init(Scoped.get_scoped_conf(conf, self.loader.get_scope()))

    def run(self):
        # type: () -> None
        """
        Runs a task
        :return:
        """
        logging.info('Running a task')
        try:
            record = self.extractor.extract()
            while record:
                record = self.transformer.transform(record)
                if not record:
                    # Fetch the next record so a filtered record does not end the run
                    record = self.extractor.extract()
                    continue

                self.loader.load(record)
                record = self.extractor.extract()
        finally:
            self._closer.close()
class Job(Scoped):
    closer = Closer()
    """
    A Databuilder job that represents a single work unit.
    """

    @abc.abstractmethod
    def init(self, conf: ConfigTree) -> None:
        pass

    @abc.abstractmethod
    def launch(self) -> None:
        """
        Launch a job
        :return: None
        """
        pass

    def get_scope(self) -> str:
        return 'job'
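# Illustration only (not from the excerpt above): a minimal concrete Job wiring a
# single Task. The class name and constructor are assumptions; it simply shows the
# contract Job defines (init/launch) and how the shared Job.closer is expected to
# run registered cleanup callables (e.g. loader directory deletion) after launch.
from pyhocon import ConfigTree

from databuilder.job.base_job import Job  # assumed import path
from databuilder.task.base_task import Task  # assumed import path


class SingleTaskJob(Job):
    def __init__(self, conf: ConfigTree, task: Task) -> None:
        self.conf = conf
        self.task = task

    def init(self, conf: ConfigTree) -> None:
        self.task.init(conf)

    def launch(self) -> None:
        self.init(self.conf)
        try:
            self.task.run()
        finally:
            # Cleanup callables registered by loaders run once the job is done
            Job.closer.close()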
def __init__(self):
    # type: () -> None
    self._node_file_mapping = {}  # type: Dict[Any, DictWriter]
    self._relation_file_mapping = {}  # type: Dict[Any, DictWriter]
    self._closer = Closer()
class FsNeo4jCSVLoader(Loader):
    """
    Write node and relationship CSV file(s) that can be consumed by
    Neo4jCsvPublisher. It assumes that the record it consumes is an instance
    of Neo4jCsvSerializable.
    """
    # Config keys
    NODE_DIR_PATH = 'node_dir_path'
    RELATION_DIR_PATH = 'relationship_dir_path'
    SHOULD_DELETE_CREATED_DIR = 'delete_created_directories'

    _DEFAULT_CONFIG = ConfigFactory.from_dict(
        {SHOULD_DELETE_CREATED_DIR: True})

    def __init__(self):
        # type: () -> None
        self._node_file_mapping = {}  # type: Dict[Any, DictWriter]
        self._relation_file_mapping = {}  # type: Dict[Any, DictWriter]
        self._closer = Closer()

    def init(self, conf):
        # type: (ConfigTree) -> None
        """
        Initializing FsNeo4jCsvLoader by creating directories for node files
        and relationship files. Note that the directories defined in
        configuration should not exist.
        :param conf:
        :return:
        """
        conf = conf.with_fallback(FsNeo4jCSVLoader._DEFAULT_CONFIG)

        self._node_dir = conf.get_string(FsNeo4jCSVLoader.NODE_DIR_PATH)
        self._relation_dir = \
            conf.get_string(FsNeo4jCSVLoader.RELATION_DIR_PATH)
        self._delete_created_dir = \
            conf.get_bool(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR)
        self._create_directory(self._node_dir)
        self._create_directory(self._relation_dir)

    def _create_directory(self, path):
        # type: (str) -> None
        """
        Validate that the directory does not exist, create it, and register
        a deletion function for the created directory with Job.closer.
        :param path:
        :return:
        """
        if os.path.exists(path):
            raise RuntimeError('Directory should not exist: {}'.format(path))

        os.makedirs(path)

        def _delete_dir():
            # type: () -> None
            if not self._delete_created_dir:
                LOGGER.warning('Skip deleting directory {}'.format(path))
                return

            LOGGER.info('Deleting directory {}'.format(path))
            shutil.rmtree(path)

        # Directory should be deleted after publish is finished
        Job.closer.register(_delete_dir)

    def load(self, csv_serializable):
        # type: (Neo4jCsvSerializable) -> None
        """
        Writes Neo4jCsvSerializable into CSV files. This method writes multiple
        CSV files because a record can produce not just nodes and relationships,
        but different kinds of nodes and relationships.

        Common pattern for both nodes and relations:
         1. retrieve csv row (a dict where keys represent a header,
            values represent a row)
         2. use this dict to get an appropriate csv writer and write to it
         3. repeat 1 and 2

        :param csv_serializable:
        :return:
        """
        node_dict = csv_serializable.next_node()
        while node_dict:
            key = (node_dict[NODE_LABEL], len(node_dict))
            file_suffix = '{}_{}'.format(*key)
            node_writer = self._get_writer(node_dict,
                                           self._node_file_mapping,
                                           key,
                                           self._node_dir,
                                           file_suffix)
            node_writer.writerow(node_dict)
            node_dict = csv_serializable.next_node()

        relation_dict = csv_serializable.next_relation()
        while relation_dict:
            key2 = (relation_dict[RELATION_START_LABEL],
                    relation_dict[RELATION_END_LABEL],
                    relation_dict[RELATION_TYPE],
                    len(relation_dict))

            file_suffix = '{}_{}_{}'.format(key2[0], key2[1], key2[2])
            relation_writer = self._get_writer(relation_dict,
                                               self._relation_file_mapping,
                                               key2,
                                               self._relation_dir,
                                               file_suffix)
            relation_writer.writerow(relation_dict)
            relation_dict = csv_serializable.next_relation()

    def _get_writer(self,
                    csv_record_dict,  # type: Dict[str, Any]
                    file_mapping,  # type: Dict[Any, DictWriter]
                    key,  # type: Any
                    dir_path,  # type: str
                    file_suffix  # type: str
                    ):
        # type: (...) -> DictWriter
        """
        Finds a writer based on csv record and key. If the writer does not
        exist, it creates a csv writer and updates the mapping.

        :param csv_record_dict:
        :param file_mapping:
        :param key:
        :param file_suffix:
        :return:
        """
        writer = file_mapping.get(key)
        if writer:
            return writer

        LOGGER.info('Creating file for {}'.format(key))
        file_out = open('{}/{}.csv'.format(dir_path, file_suffix), 'w')

        def file_out_close():
            # type: () -> None
            LOGGER.info('Closing file IO {}'.format(file_out))
            file_out.close()

        self._closer.register(file_out_close)

        writer = csv.DictWriter(file_out,
                                fieldnames=csv_record_dict.keys(),
                                quoting=csv.QUOTE_NONNUMERIC)
        writer.writeheader()
        file_mapping[key] = writer

        return writer

    def close(self):
        # type: () -> None
        """
        Closes any closeable callable registered in _closer.
        :return:
        """
        self._closer.close()

    def get_scope(self):
        # type: () -> str
        return "loader.filesystem_csv_neo4j"
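# Hypothetical configuration sketch for FsNeo4jCSVLoader (not from the source).
# The scoped prefix comes from get_scope() above; the directory paths are made up,
# and pyhocon's ConfigFactory plus Scoped.get_scoped_conf are assumed to be the
# same helpers used elsewhere in these excerpts.
from pyhocon import ConfigFactory

node_files_folder = '/tmp/amundsen/nodes'
relationship_files_folder = '/tmp/amundsen/relationships'

neo4j_csv_loader_conf = ConfigFactory.from_dict({
    f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
    f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
    f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
})

loader = FsNeo4jCSVLoader()
loader.init(Scoped.get_scoped_conf(neo4j_csv_loader_conf, loader.get_scope()))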
def __init__(self) -> None:
    self._node_file_mapping: Dict[Any, DictWriter] = {}
    self._relation_file_mapping: Dict[Any, DictWriter] = {}
    self._closer = Closer()
class FSNeptuneCSVLoader(Loader):
    """
    Write node and relationship CSV file(s) that can be consumed by
    NeptuneCsvPublisher. It assumes that the record it consumes is an instance
    of GraphSerializable.
    """
    # Config keys
    NODE_DIR_PATH = 'node_dir_path'
    RELATION_DIR_PATH = 'relationship_dir_path'
    FORCE_CREATE_DIR = 'force_create_directory'
    SHOULD_DELETE_CREATED_DIR = 'delete_created_directories'
    JOB_PUBLISHER_TAG = 'job_publisher_tag'

    _DEFAULT_CONFIG = ConfigFactory.from_dict({
        SHOULD_DELETE_CREATED_DIR: True,
        FORCE_CREATE_DIR: False
    })

    def __init__(self) -> None:
        self._node_file_mapping: Dict[Any, DictWriter] = {}
        self._relation_file_mapping: Dict[Any, DictWriter] = {}
        self._closer = Closer()

    def init(self, conf: ConfigTree) -> None:
        """
        Initializing FSNeptuneCSVLoader by creating directories for node files
        and relationship files. Note that the directories defined in
        configuration should not exist.
        """
        conf = conf.with_fallback(FSNeptuneCSVLoader._DEFAULT_CONFIG)

        self._node_dir = conf.get_string(FSNeptuneCSVLoader.NODE_DIR_PATH)
        self._relation_dir = conf.get_string(
            FSNeptuneCSVLoader.RELATION_DIR_PATH)
        self._delete_created_dir = conf.get_bool(
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR)
        self._force_create_dir = conf.get_bool(
            FSNeptuneCSVLoader.FORCE_CREATE_DIR)
        self._create_directory(self._node_dir)
        self._create_directory(self._relation_dir)
        self.job_publisher_tag = conf.get_string(
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG)

    def _create_directory(self, path: str) -> None:
        """
        Validate that the directory does not exist, create it, and register
        a deletion function for the created directory with Job.closer.
        """
        if os.path.exists(path):
            if self._force_create_dir:
                LOGGER.info('Directory exists. Deleting directory {}'.format(path))
                shutil.rmtree(path)
            else:
                raise RuntimeError(
                    'Directory should not exist: {}'.format(path))

        os.makedirs(path)

        def _delete_dir() -> None:
            if not self._delete_created_dir:
                LOGGER.warning('Skip deleting directory {}'.format(path))
                return

            LOGGER.info('Deleting directory {}'.format(path))
            shutil.rmtree(path)

        # Directory should be deleted after publish is finished
        Job.closer.register(_delete_dir)

    def load(self, csv_serializable: GraphSerializable) -> None:
        """
        Writes GraphSerializable into CSV files. This method writes multiple
        CSV files because a record can produce not just nodes and relationships,
        but different kinds of nodes and relationships.

        Common pattern for both nodes and relations:
         1. retrieve csv row (a dict where keys represent a header,
            values represent a row)
         2. use this dict to get an appropriate csv writer and write to it
         3. repeat 1 and 2

        :param csv_serializable:
        :return:
        """
        node = csv_serializable.next_node()
        while node:
            node.attributes[PUBLISHED_TAG_PROPERTY_NAME] = self.job_publisher_tag
            node_dict = neptune_serializer.convert_node(node)
            if node_dict:
                key = (node.label, len(node_dict))
                file_suffix = '{}_{}'.format(*key)
                node_writer = self._get_writer(node_dict,
                                               self._node_file_mapping,
                                               key,
                                               self._node_dir,
                                               file_suffix)
                node_writer.writerow(node_dict)
            node = csv_serializable.next_node()

        relation = csv_serializable.next_relation()
        while relation:
            relation.attributes[PUBLISHED_TAG_PROPERTY_NAME] = self.job_publisher_tag
            relation_dicts = neptune_serializer.convert_relationship(relation)
            if relation_dicts:
                key2 = (relation.start_label,
                        relation.end_label,
                        relation.type,
                        len(relation_dicts[0]))

                file_suffix = '{}_{}_{}'.format(key2[0], key2[1], key2[2])
                relation_writer = self._get_writer(relation_dicts[0],
                                                   self._relation_file_mapping,
                                                   key2,
                                                   self._relation_dir,
                                                   file_suffix)
                relation_writer.writerows(relation_dicts)
            relation = csv_serializable.next_relation()

    def _get_writer(self,
                    csv_record_dict: Dict[str, Any],
                    file_mapping: Dict[Any, DictWriter],
                    key: Any,
                    dir_path: str,
                    file_suffix: str) -> DictWriter:
        """
        Finds a writer based on csv record and key. If the writer does not
        exist, it creates a csv writer and updates the mapping.
        """
        writer = file_mapping.get(key)
        if writer:
            return writer

        LOGGER.info('Creating file for {}'.format(key))

        file_out = open('{}/{}.csv'.format(dir_path, file_suffix), 'w', encoding='utf8')
        writer = csv.DictWriter(file_out,
                                fieldnames=csv_record_dict.keys(),
                                quoting=csv.QUOTE_NONNUMERIC)

        def file_out_close() -> None:
            LOGGER.info('Closing file IO {}'.format(file_out))
            file_out.close()

        self._closer.register(file_out_close)

        writer.writeheader()
        file_mapping[key] = writer

        return writer

    def close(self) -> None:
        """
        Closes any closeable callable registered in _closer.
        """
        self._closer.close()

    def get_scope(self) -> str:
        return "loader.neptune_filesystem_csv"
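# Hypothetical config for FSNeptuneCSVLoader (illustrative values only). Note that,
# unlike the Neo4j loader above, JOB_PUBLISHER_TAG has no entry in _DEFAULT_CONFIG,
# so init() raises if the tag is not supplied; FORCE_CREATE_DIR allows reusing an
# existing directory by deleting it first.
from pyhocon import ConfigFactory

neptune_csv_loader_conf = ConfigFactory.from_dict({
    f'loader.neptune_filesystem_csv.{FSNeptuneCSVLoader.NODE_DIR_PATH}': '/tmp/amundsen/neptune/nodes',
    f'loader.neptune_filesystem_csv.{FSNeptuneCSVLoader.RELATION_DIR_PATH}': '/tmp/amundsen/neptune/relationships',
    f'loader.neptune_filesystem_csv.{FSNeptuneCSVLoader.FORCE_CREATE_DIR}': True,
    f'loader.neptune_filesystem_csv.{FSNeptuneCSVLoader.JOB_PUBLISHER_TAG}': 'unique_tag_20240101',
})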
class SearchMetadatatoElasticasearchTask(Task):
    ENTITY_TYPE = 'doc_type'
    ELASTICSEARCH_CLIENT_CONFIG_KEY = 'client'
    MAPPING_CLASS = 'document_mapping'
    ELASTICSEARCH_ALIAS_CONFIG_KEY = 'alias'
    ELASTICSEARCH_NEW_INDEX = 'new_index'
    ELASTICSEARCH_PUBLISHER_BATCH_SIZE = 'batch_size'
    ELASTICSEARCH_TIMEOUT_SEC = 'es_timeout_sec'
    DATE = 'date'

    today = date.today().strftime("%Y%m%d")

    def __init__(self,
                 extractor: Extractor,
                 transformer: Transformer = NoopTransformer()) -> None:
        self.extractor = extractor
        self.transformer = transformer

        self._closer = Closer()
        self._closer.register(self.extractor.close)
        self._closer.register(self.transformer.close)

    def init(self, conf: ConfigTree) -> None:
        # initialize extractor with configuration
        self.extractor.init(Scoped.get_scoped_conf(conf, self.extractor.get_scope()))
        # initialize transformer with configuration
        self.transformer.init(Scoped.get_scoped_conf(conf, self.transformer.get_scope()))

        # task configuration
        conf = Scoped.get_scoped_conf(conf, self.get_scope())
        self.date = conf.get_string(SearchMetadatatoElasticasearchTask.DATE, self.today)
        self.entity = conf.get_string(SearchMetadatatoElasticasearchTask.ENTITY_TYPE).lower()
        self.elasticsearch_client = conf.get(
            SearchMetadatatoElasticasearchTask.ELASTICSEARCH_CLIENT_CONFIG_KEY
        )
        self.elasticsearch_alias = conf.get(
            SearchMetadatatoElasticasearchTask.ELASTICSEARCH_ALIAS_CONFIG_KEY
        )
        self.elasticsearch_new_index = conf.get(
            SearchMetadatatoElasticasearchTask.ELASTICSEARCH_NEW_INDEX,
            self.create_new_index_name())

        self.document_mapping = conf.get(SearchMetadatatoElasticasearchTask.MAPPING_CLASS,
                                         RESOURCE_TO_MAPPING[self.entity])
        LOGGER.info(issubclass(self.document_mapping, SearchableResource))
        if not issubclass(self.document_mapping, SearchableResource):
            msg = "Provided document_mapping should be a subclass" \
                  f" of SearchableResource, not {type(self.document_mapping)}"
            LOGGER.error(msg)
            raise TypeError(msg)

        self.elasticsearch_batch_size = conf.get(
            SearchMetadatatoElasticasearchTask.ELASTICSEARCH_PUBLISHER_BATCH_SIZE,
            10000
        )
        self.elasticsearch_timeout_sec = conf.get(
            SearchMetadatatoElasticasearchTask.ELASTICSEARCH_TIMEOUT_SEC,
            120
        )

    def create_new_index_name(self) -> str:
        hex_string = uuid4().hex
        return f"{self.elasticsearch_alias}_{self.date}_{hex_string}"

    def to_document(self, metadata: Any) -> Document:
        return self.document_mapping(_index=self.elasticsearch_new_index,
                                     **metadata)

    def generate_documents(self, record: Any) -> Generator:
        # iterate through records
        while record:
            record = self.transformer.transform(record)
            if not record:
                # Move on if the transformer filtered the record out
                record = self.extractor.extract()
                continue

            document = self.to_document(metadata=record).to_dict(True)
            document['_source']['resource_type'] = self.entity

            yield document
            record = self.extractor.extract()

    def _get_old_index(self, connection: Connections) -> List[str]:
        """
        Retrieve all indices that currently have {elasticsearch_alias} alias
        :return: list of elasticsearch indices
        """
        try:
            indices = connection.indices.get_alias(self.elasticsearch_alias).keys()
            return indices
        except NotFoundError:
            LOGGER.warning("Received index not found error from Elasticsearch. "
                           "The index doesn't exist for a newly created ES. It's OK on first run.")
            # return empty list on exception
            return []

    def _delete_old_index(self, connection: Connections, document_index: Index) -> None:
        alias_updates = []
        previous_index = self._get_old_index(connection=connection)
        for previous_index_name in previous_index:
            if previous_index_name != document_index._name:
                LOGGER.info(f"Deleting old index {previous_index_name}")
                alias_updates.append({"remove_index": {"index": previous_index_name}})
        alias_updates.append({"add": {
            "index": self.elasticsearch_new_index,
            "alias": self.elasticsearch_alias}})
        connection.indices.update_aliases({"actions": alias_updates})

    def run(self) -> None:
        LOGGER.info('Running search metadata to Elasticsearch task')
        try:
            # extract records from metadata store
            record = self.extractor.extract()

            # create connection
            connections.add_connection('default', self.elasticsearch_client)
            connection = connections.get_connection()

            # health check ES
            health = connection.cluster.health()
            status = health["status"]
            if status not in ("green", "yellow"):
                msg = f"Elasticsearch healthcheck failed: {status}"
                LOGGER.error(msg)
                raise Exception(msg)

            # create index
            LOGGER.info(f"Creating ES index {self.elasticsearch_new_index}")
            index = Index(name=self.elasticsearch_new_index, using=self.elasticsearch_client)
            index.document(self.document_mapping)
            index.create()

            # publish search metadata to ES
            cnt = 0
            for success, info in parallel_bulk(connection,
                                               self.generate_documents(record=record),
                                               raise_on_error=False,
                                               chunk_size=self.elasticsearch_batch_size,
                                               request_timeout=self.elasticsearch_timeout_sec):
                if not success:
                    LOGGER.warning(f"There was an error while indexing a document to ES: {info}")
                else:
                    cnt += 1
                if cnt > 0 and cnt % self.elasticsearch_batch_size == 0:
                    LOGGER.info(f'Published {cnt} records to ES')

            # delete old index
            self._delete_old_index(connection=connection, document_index=index)

            LOGGER.info("Elasticsearch Indexing completed")
        finally:
            self._closer.close()

    def get_scope(self) -> str:
        return 'task.search_metadata_to_elasticsearch'
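# Illustrative, hedged configuration for SearchMetadatatoElasticasearchTask. The scoped
# prefix comes from get_scope() above and the keys mirror the class constants; the
# Elasticsearch client, alias and entity type values are assumptions for the example.
# A complete job config would also carry the extractor's own scoped keys, since init()
# forwards Scoped.get_scoped_conf(conf, extractor.get_scope()) to the extractor.
from elasticsearch import Elasticsearch
from pyhocon import ConfigFactory

search_task_conf = ConfigFactory.from_dict({
    'task.search_metadata_to_elasticsearch.doc_type': 'table',
    'task.search_metadata_to_elasticsearch.client': Elasticsearch('http://localhost:9200'),
    'task.search_metadata_to_elasticsearch.alias': 'table_search_index',
    'task.search_metadata_to_elasticsearch.batch_size': 5000,
})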
def __init__(self) -> None:
    self._record_file_mapping: Dict[Any, DictWriter] = {}
    self._keys: Dict[FrozenSet[str], int] = {}
    self._closer = Closer()
class FSMySQLCSVLoader(Loader):
    """
    Write table record CSV file(s) that can be consumed by MySQLCsvPublisher.
    It assumes that the record it consumes is an instance of TableSerializable.
    """
    # Config keys
    RECORD_DIR_PATH = 'record_dir_path'
    FORCE_CREATE_DIR = 'force_create_directory'
    SHOULD_DELETE_CREATED_DIR = 'delete_created_directories'

    _DEFAULT_CONFIG = ConfigFactory.from_dict({
        SHOULD_DELETE_CREATED_DIR: True,
        FORCE_CREATE_DIR: False
    })

    def __init__(self) -> None:
        self._record_file_mapping: Dict[Any, DictWriter] = {}
        self._keys: Dict[FrozenSet[str], int] = {}
        self._closer = Closer()

    def init(self, conf: ConfigTree) -> None:
        """
        Initializing FsMySQLCSVLoader by creating a directory for record files.
        Note that the directory defined in configuration should not exist.
        :param conf:
        :return:
        """
        conf = conf.with_fallback(FSMySQLCSVLoader._DEFAULT_CONFIG)

        self._record_dir = conf.get_string(FSMySQLCSVLoader.RECORD_DIR_PATH)
        self._delete_created_dir = conf.get_bool(
            FSMySQLCSVLoader.SHOULD_DELETE_CREATED_DIR)
        self._force_create_dir = conf.get_bool(
            FSMySQLCSVLoader.FORCE_CREATE_DIR)
        self._create_directory(self._record_dir)

    def _create_directory(self, path: str) -> None:
        """
        Validate that the directory does not exist, create it, and register
        a deletion function for the created directory with Job.closer.
        :param path:
        :return:
        """
        if os.path.exists(path):
            if self._force_create_dir:
                LOGGER.info(f'Directory exists. Deleting directory {path}')
                shutil.rmtree(path)
            else:
                raise RuntimeError(f'Directory should not exist: {path}')

        os.makedirs(path)

        def _delete_dir() -> None:
            if not self._delete_created_dir:
                LOGGER.warning(f'Skip deleting directory {path}')
                return

            LOGGER.info(f'Deleting directory {path}')
            shutil.rmtree(path)

        # Directory should be deleted after publish is finished
        Job.closer.register(_delete_dir)

    def load(self, csv_serializable: TableSerializable) -> None:
        """
        Writes TableSerializable records into CSV files. This method writes
        multiple CSV files, one per target table.

        Common pattern for table records:
         1. retrieve csv row (a dict where keys represent a header,
            values represent a row)
         2. use this dict to get an appropriate csv writer and write to it
         3. repeat 1 and 2

        :param csv_serializable:
        :return:
        """
        record = csv_serializable.next_record()
        while record:
            record_dict = mysql_serializer.serialize_record(record)
            table_name = record.__tablename__
            key = (table_name, self._make_key(record_dict))
            file_suffix = '{}_{}'.format(*key)
            record_writer = self._get_writer(record_dict,
                                             self._record_file_mapping,
                                             key,
                                             self._record_dir,
                                             file_suffix)
            record_writer.writerow(record_dict)
            record = csv_serializable.next_record()

    def _get_writer(self,
                    csv_record_dict: Dict[str, Any],
                    file_mapping: Dict[Any, DictWriter],
                    key: Any,
                    dir_path: str,
                    file_suffix: str) -> DictWriter:
        """
        Finds a writer based on csv record and key. If the writer does not
        exist, it creates a csv writer and updates the mapping.

        :param csv_record_dict:
        :param file_mapping:
        :param key:
        :param dir_path:
        :param file_suffix:
        :return:
        """
        writer = file_mapping.get(key)
        if writer:
            return writer

        LOGGER.info(f'Creating file for {key}')

        file_out = open(f'{dir_path}/{file_suffix}.csv', 'w', encoding='utf8')
        writer = csv.DictWriter(file_out,
                                fieldnames=csv_record_dict.keys(),
                                quoting=csv.QUOTE_NONNUMERIC)

        def file_out_close() -> None:
            LOGGER.info(f'Closing file IO {file_out}')
            file_out.close()

        self._closer.register(file_out_close)

        writer.writeheader()
        file_mapping[key] = writer

        return writer

    def close(self) -> None:
        """
        Closes any closeable callable registered in _closer.
        :return:
        """
        self._closer.close()

    def get_scope(self) -> str:
        return "loader.mysql_filesystem_csv"

    def _make_key(self, record_dict: Dict[str, Any]) -> int:
        """
        Each unique set of record keys is assigned an increasing numeric key
        """
        return self._keys.setdefault(frozenset(record_dict.keys()), len(self._keys))
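# Small standalone illustration (not from the source) of how _make_key buckets records:
# each distinct set of column names gets its own increasing id via dict.setdefault, so
# records carrying different columns land in different CSV files even for the same table.
_keys: dict = {}

def _make_key_example(record_dict: dict) -> int:
    return _keys.setdefault(frozenset(record_dict.keys()), len(_keys))

print(_make_key_example({'rk': 'db://cluster.schema/table_a', 'name': 'table_a'}))       # 0
print(_make_key_example({'rk': 'db://cluster.schema/table_b', 'description': 'facts'}))  # 1
print(_make_key_example({'name': 'table_c', 'rk': 'db://cluster.schema/table_c'}))       # 0, same column set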
class FsAtlasCSVLoader(Loader):
    """
    Write entity and relationship CSV file(s) that can be consumed by
    AtlasCsvPublisher. It assumes that the record it consumes is an instance
    of AtlasCsvSerializable.
    """
    # Config keys
    ENTITY_DIR_PATH = 'entity_dir_path'
    RELATIONSHIP_DIR_PATH = 'relationship_dir_path'
    FORCE_CREATE_DIR = 'force_create_directory'
    SHOULD_DELETE_CREATED_DIR = 'delete_created_directories'

    _DEFAULT_CONFIG = ConfigFactory.from_dict({
        SHOULD_DELETE_CREATED_DIR: True,
        FORCE_CREATE_DIR: False,
    })

    def __init__(self) -> None:
        self._entity_file_mapping: Dict[Any, DictWriter] = {}
        self._relation_file_mapping: Dict[Any, DictWriter] = {}
        self._keys: Dict[FrozenSet[str], int] = {}
        self._closer = Closer()

    def init(self, conf: ConfigTree) -> None:
        """
        Initializing FsAtlasCSVLoader by creating directories for entity files
        and relationship files. Note that the directories defined in
        configuration should not exist.
        :param conf:
        :return:
        """
        conf = conf.with_fallback(FsAtlasCSVLoader._DEFAULT_CONFIG)

        self._entity_dir = conf.get_string(FsAtlasCSVLoader.ENTITY_DIR_PATH)
        self._relation_dir = \
            conf.get_string(FsAtlasCSVLoader.RELATIONSHIP_DIR_PATH)
        self._delete_created_dir = \
            conf.get_bool(FsAtlasCSVLoader.SHOULD_DELETE_CREATED_DIR)
        self._force_create_dir = conf.get_bool(FsAtlasCSVLoader.FORCE_CREATE_DIR)
        self._create_directory(self._entity_dir)
        self._create_directory(self._relation_dir)

    def _create_directory(self, path: str) -> None:
        """
        Validate that the directory does not exist, create it, and register
        a deletion function for the created directory with Job.closer.
        :param path:
        :return:
        """
        if os.path.exists(path):
            if self._force_create_dir:
                LOGGER.info('Directory exists. Deleting directory %s', path)
                shutil.rmtree(path)
            else:
                raise RuntimeError(f'Directory should not exist: {path}')

        os.makedirs(path)

        def _delete_dir() -> None:
            if not self._delete_created_dir:
                LOGGER.warning('Skip deleting directory %s', path)
                return

            LOGGER.info('Deleting directory %s', path)
            shutil.rmtree(path)

        # Directory should be deleted after publish is finished
        Job.closer.register(_delete_dir)

    def load(self, csv_serializable: AtlasSerializable) -> None:
        """
        Writes AtlasSerializable into CSV files. This method writes multiple
        CSV files because a record can produce not just entities and
        relationships, but different kinds of entities and relationships.

        Common pattern for both entities and relations:
         1. retrieve csv row (a dict where keys represent a header,
            values represent a row)
         2. use this dict to get an appropriate csv writer and write to it
         3. repeat 1 and 2

        :param csv_serializable:
        :return:
        """
        entity = csv_serializable.next_atlas_entity()
        while entity:
            entity_dict = atlas_serializer.serialize_entity(entity)
            key = (self._make_key(entity_dict), entity.typeName)
            file_suffix = '{}_{}'.format(*key)
            entity_writer = self._get_writer(
                entity_dict,
                self._entity_file_mapping,
                key,
                self._entity_dir,
                file_suffix,
            )
            entity_writer.writerow(entity_dict)
            entity = csv_serializable.next_atlas_entity()

        relation = csv_serializable.next_atlas_relation()
        while relation:
            relation_dict = atlas_serializer.serialize_relationship(relation)
            keys = (
                self._make_key(relation_dict),
                relation.entityType1,
                relation.entityType2,
            )
            file_suffix = '{}_{}_{}'.format(*keys)
            relation_writer = self._get_writer(
                relation_dict,
                self._relation_file_mapping,
                keys,
                self._relation_dir,
                file_suffix,
            )
            relation_writer.writerow(relation_dict)
            relation = csv_serializable.next_atlas_relation()

    def _get_writer(
        self,
        csv_record_dict: Dict[str, Any],
        file_mapping: Dict[Any, DictWriter],
        key: Any,
        dir_path: str,
        file_suffix: str,
    ) -> DictWriter:
        """
        Finds a writer based on csv record and key. If the writer does not
        exist, it creates a csv writer and updates the mapping.

        :param csv_record_dict:
        :param file_mapping:
        :param key:
        :param file_suffix:
        :return:
        """
        writer = file_mapping.get(key)
        if writer:
            return writer

        LOGGER.info('Creating file for %s', key)

        file_out = open(f'{dir_path}/{file_suffix}.csv', 'w', encoding='utf8')
        writer = csv.DictWriter(  # type: ignore
            file_out,
            fieldnames=csv_record_dict.keys(),
            quoting=csv.QUOTE_NONNUMERIC,
        )

        def file_out_close() -> None:
            LOGGER.info('Closing file IO %s', file_out)
            file_out.close()

        self._closer.register(file_out_close)

        writer.writeheader()
        file_mapping[key] = writer

        return writer

    def close(self) -> None:
        """
        Closes any closeable callable registered in _closer.
        :return:
        """
        self._closer.close()

    def get_scope(self) -> str:
        return "loader.filesystem_csv_atlas"

    def _make_key(self, record_dict: Dict[str, Any]) -> str:
        """
        Each unique set of record keys is assigned an increasing numeric key,
        returned as a zero-padded three-character string.
        """
        return str(self._keys.setdefault(frozenset(record_dict.keys()), len(self._keys))).rjust(3, '0')
class DefaultTask(Task):
    """
    A default task expecting to extract, transform and load.
    """
    # Determines the frequency of the log on task progress
    PROGRESS_REPORT_FREQUENCY = 'progress_report_frequency'

    def __init__(self,
                 extractor: Extractor,
                 loader: Loader,
                 transformer: Transformer = NoopTransformer()) -> None:
        self.extractor = extractor
        self.transformer = transformer
        self.loader = loader

        self._closer = Closer()
        self._closer.register(self.extractor.close)
        self._closer.register(self.transformer.close)
        self._closer.register(self.loader.close)

    def init(self, conf: ConfigTree) -> None:
        self._progress_report_frequency = \
            conf.get_int(f'{self.get_scope()}.{DefaultTask.PROGRESS_REPORT_FREQUENCY}', 500)

        self.extractor.init(Scoped.get_scoped_conf(conf, self.extractor.get_scope()))
        self.transformer.init(Scoped.get_scoped_conf(conf, self.transformer.get_scope()))
        self.loader.init(Scoped.get_scoped_conf(conf, self.loader.get_scope()))

    def run(self) -> None:
        """
        Runs a task
        """
        LOGGER.info('Running a task')
        try:
            record = self.extractor.extract()
            count = 0
            while record:
                record = self.transformer.transform(record)
                if not record:
                    # Move on if the transformer filtered the record out
                    record = self.extractor.extract()
                    continue

                # Support transformers which return one record, or yield multiple
                results = record if isinstance(record, Iterator) else [record]
                for result in results:
                    if result:
                        self.loader.load(result)
                    count += 1
                    if count > 0 and count % self._progress_report_frequency == 0:
                        LOGGER.info('Extracted %i records so far', count)

                # Prepare the next record
                record = self.extractor.extract()
        finally:
            self._closer.close()
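# Hedged sketch of a transformer that yields several records per input, which is what
# the isinstance(record, Iterator) branch in run() above supports. The class name, the
# 'tags' attribute and the import path of the Transformer base class are assumptions.
from typing import Any, Iterator, Optional

from databuilder.transformer.base_transformer import Transformer  # assumed import path


class ExplodeTagsTransformer(Transformer):
    """Emits one record per tag found on the incoming record (hypothetical model)."""

    def init(self, conf: ConfigTree) -> None:
        pass

    def transform(self, record: Any) -> Optional[Iterator[Any]]:
        tags = getattr(record, 'tags', None)
        if not tags:
            # Returning None makes DefaultTask.run() skip ahead to the next record
            return None
        return iter(tags)

    def get_scope(self) -> str:
        return 'transformer.explode_tags'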