Example #1
    def __init__(self,
                 extractor: Extractor,
                 transformer: Transformer = NoopTransformer()) -> None:
        self.extractor = extractor
        self.transformer = transformer

        self._closer = Closer()
        self._closer.register(self.extractor.close)
        self._closer.register(self.transformer.close)
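
All of the snippets in this section rely on the same Closer helper: cleanup callables are registered while a component is set up and are invoked together when that component closes. Below is a minimal sketch of that contract, included only to make the examples self-contained; the actual databuilder Closer may differ in call order and error handling.

from typing import Callable, List


class Closer:
    """Minimal sketch of the register/close contract assumed by the examples."""

    def __init__(self) -> None:
        self._closeables: List[Callable[[], None]] = []

    def register(self, close_callable: Callable[[], None]) -> None:
        # Remember a zero-argument callable to be invoked later by close().
        self._closeables.append(close_callable)

    def close(self) -> None:
        # Invoke in reverse registration order so resources registered later
        # (which may depend on earlier ones) are closed first.
        for close_callable in reversed(self._closeables):
            close_callable()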
Example #2
    def __init__(self, extractor, loader, transformer=NoopTransformer()):
        # type: (Extractor, Loader, Transformer) -> None
        self.extractor = extractor
        self.transformer = transformer
        self.loader = loader

        self._closer = Closer()
        self._closer.register(self.extractor.close)
        self._closer.register(self.transformer.close)
        self._closer.register(self.loader.close)
Example #3
class DefaultTask(Task):
    """
    A default task expecting to extract, transform and load.

    """

    # Determines the frequency of the log on task progress
    PROGRESS_REPORT_FREQUENCY = 'progress_report_frequency'

    def __init__(
        self,
        extractor: Extractor,
        loader: Loader,
        transformer: Transformer = NoopTransformer()) -> None:
        self.extractor = extractor
        self.transformer = transformer
        self.loader = loader

        self._closer = Closer()
        self._closer.register(self.extractor.close)
        self._closer.register(self.transformer.close)
        self._closer.register(self.loader.close)

    def init(self, conf: ConfigTree) -> None:
        self._progress_report_frequency = \
            conf.get_int('{}.{}'.format(self.get_scope(), DefaultTask.PROGRESS_REPORT_FREQUENCY), 500)

        self.extractor.init(
            Scoped.get_scoped_conf(conf, self.extractor.get_scope()))
        self.transformer.init(
            Scoped.get_scoped_conf(conf, self.transformer.get_scope()))
        self.loader.init(Scoped.get_scoped_conf(conf, self.loader.get_scope()))

    def run(self) -> None:
        """
        Runs a task
        :return:
        """
        LOGGER.info('Running a task')
        try:
            record = self.extractor.extract()
            count = 1
            while record:
                record = self.transformer.transform(record)
                if not record:
                    record = self.extractor.extract()
                    continue
                self.loader.load(record)
                record = self.extractor.extract()
                count += 1
                if count > 0 and count % self._progress_report_frequency == 0:
                    LOGGER.info('Extracted {} records so far'.format(count))

        finally:
            self._closer.close()
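
To show how the pieces fit together, here is a hypothetical end-to-end sketch that drives the DefaultTask above with toy in-memory components. _ListExtractor and _PrintLoader exist only for this sketch (they are not databuilder classes), the base-class import paths are assumed to match databuilder's layout, and the base classes are assumed to provide a default no-op close(); treat it as a sketch rather than a tested recipe.

from typing import Any, Optional

from pyhocon import ConfigFactory, ConfigTree

from databuilder.extractor.base_extractor import Extractor
from databuilder.loader.base_loader import Loader


class _ListExtractor(Extractor):
    """Toy extractor that yields records from an in-memory list."""

    def init(self, conf: ConfigTree) -> None:
        self._records = iter(['a', 'b', 'c'])

    def extract(self) -> Optional[Any]:
        return next(self._records, None)

    def get_scope(self) -> str:
        return 'extractor.toy_list'


class _PrintLoader(Loader):
    """Toy loader that prints each record it receives."""

    def init(self, conf: ConfigTree) -> None:
        pass

    def load(self, record: Any) -> None:
        print(record)

    def get_scope(self) -> str:
        return 'loader.toy_print'


task = DefaultTask(extractor=_ListExtractor(), loader=_PrintLoader())
# Report progress every 1000 loaded records instead of the default 500.
task.init(ConfigFactory.from_dict({
    f'{task.get_scope()}.{DefaultTask.PROGRESS_REPORT_FREQUENCY}': 1000,
}))
task.run()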
Example #4
class DefaultTask(Task):
    """
    A default task expecting to extract, transform and load.

    """
    def __init__(self, extractor, loader, transformer=NoopTransformer()):
        # type: (Extractor, Loader, Transformer) -> None
        self.extractor = extractor
        self.transformer = transformer
        self.loader = loader

        self._closer = Closer()
        self._closer.register(self.extractor.close)
        self._closer.register(self.transformer.close)
        self._closer.register(self.loader.close)

    def init(self, conf):
        # type: (ConfigTree) -> None
        self.extractor.init(
            Scoped.get_scoped_conf(conf, self.extractor.get_scope()))
        self.transformer.init(
            Scoped.get_scoped_conf(conf, self.transformer.get_scope()))
        self.loader.init(Scoped.get_scoped_conf(conf, self.loader.get_scope()))

    def run(self):
        # type: () -> None
        """
        Runs a task
        :return:
        """
        logging.info('Running a task')
        try:
            record = self.extractor.extract()

            while record:
                record = self.transformer.transform(record)
                if not record:
                    # Fetch the next record when the transformer filters one out;
                    # a bare `continue` here would end the loop prematurely.
                    record = self.extractor.extract()
                    continue
                self.loader.load(record)
                record = self.extractor.extract()
        finally:
            self._closer.close()
Example #5
class Job(Scoped):
    """
    A Databuilder job that represents a single unit of work.
    """
    closer = Closer()

    @abc.abstractmethod
    def init(self, conf: ConfigTree) -> None:
        pass

    @abc.abstractmethod
    def launch(self) -> None:
        """
        Launch a job
        :return: None
        """
        pass

    def get_scope(self) -> str:
        return 'job'
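
Since closer is a class-level attribute, every component in the process shares it. The following is a minimal sketch, under assumptions not shown above (a task plus a publisher exposing publish()), of when that shared closer is meant to fire; databuilder's real DefaultJob is more involved.

from typing import Any

from pyhocon import ConfigTree


class SketchJob(Job):
    """Hypothetical job, shown only to illustrate when Job.closer fires."""

    def __init__(self, conf: ConfigTree, task: Any, publisher: Any) -> None:
        self.conf = conf
        self.task = task
        self.publisher = publisher

    def init(self, conf: ConfigTree) -> None:
        pass

    def launch(self) -> None:
        try:
            self.task.init(self.conf)
            self.task.run()
            self.publisher.publish()  # assumes a publisher exposing publish()
        finally:
            # Deferred cleanup (e.g. the _delete_dir callbacks that the CSV
            # loaders below register on Job.closer) runs only after publishing.
            Job.closer.close()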
Example #6
    def __init__(self):
        # type: () -> None
        self._node_file_mapping = {}  # type: Dict[Any, DictWriter]
        self._relation_file_mapping = {}  # type: Dict[Any, DictWriter]
        self._closer = Closer()
Example #7
class FsNeo4jCSVLoader(Loader):
    """
    Write node and relationship CSV file(s) that can be consumed by
    Neo4jCsvPublisher.
    It assumes that the record it consumes is instance of Neo4jCsvSerializable
    """
    # Config keys
    NODE_DIR_PATH = 'node_dir_path'
    RELATION_DIR_PATH = 'relationship_dir_path'
    SHOULD_DELETE_CREATED_DIR = 'delete_created_directories'

    _DEFAULT_CONFIG = ConfigFactory.from_dict(
        {SHOULD_DELETE_CREATED_DIR: True})

    def __init__(self):
        # type: () -> None
        self._node_file_mapping = {}  # type: Dict[Any, DictWriter]
        self._relation_file_mapping = {}  # type: Dict[Any, DictWriter]
        self._closer = Closer()

    def init(self, conf):
        # type: (ConfigTree) -> None
        """
        Initializes FsNeo4jCSVLoader by creating directories for node files
        and relationship files. Note that the directories defined in the
        configuration should not already exist.
        :param conf:
        :return:
        """
        conf = conf.with_fallback(FsNeo4jCSVLoader._DEFAULT_CONFIG)

        self._node_dir = conf.get_string(FsNeo4jCSVLoader.NODE_DIR_PATH)
        self._relation_dir = \
            conf.get_string(FsNeo4jCSVLoader.RELATION_DIR_PATH)

        self._delete_created_dir = \
            conf.get_bool(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR)
        self._create_directory(self._node_dir)
        self._create_directory(self._relation_dir)

    def _create_directory(self, path):
        # type: (str) -> None
        """
        Validates that the directory does not exist, creates it, and registers
        a function that deletes the created directory with Job.closer.
        :param path:
        :return:
        """
        if os.path.exists(path):
            raise RuntimeError('Directory should not exist: {}'.format(path))

        os.makedirs(path)

        def _delete_dir():
            # type: () -> None
            if not self._delete_created_dir:
                LOGGER.warning('Skip Deleting directory {}'.format(path))
                return

            LOGGER.info('Deleting directory {}'.format(path))
            shutil.rmtree(path)

        # Directory should be deleted after publish is finished
        Job.closer.register(_delete_dir)

    def load(self, csv_serializable):
        # type: (Neo4jCsvSerializable) -> None
        """
        Writes Neo4jCsvSerializable into CSV files.
        This method writes multiple CSV files, because the input can contain
        not only nodes and relationships but also different kinds of nodes
        and relationships.

        Common pattern for both nodes and relations:
         1. retrieve a csv row (a dict where keys represent the header and
         values represent a row)
         2. use this dict to get the appropriate csv writer and write to it
         3. repeat 1 and 2

        :param csv_serializable:
        :return:
        """

        node_dict = csv_serializable.next_node()
        while node_dict:
            key = (node_dict[NODE_LABEL], len(node_dict))
            file_suffix = '{}_{}'.format(*key)
            node_writer = self._get_writer(node_dict, self._node_file_mapping,
                                           key, self._node_dir, file_suffix)
            node_writer.writerow(node_dict)
            node_dict = csv_serializable.next_node()

        relation_dict = csv_serializable.next_relation()
        while relation_dict:
            key2 = (relation_dict[RELATION_START_LABEL],
                    relation_dict[RELATION_END_LABEL],
                    relation_dict[RELATION_TYPE], len(relation_dict))

            file_suffix = '{}_{}_{}'.format(key2[0], key2[1], key2[2])
            relation_writer = self._get_writer(relation_dict,
                                               self._relation_file_mapping,
                                               key2, self._relation_dir,
                                               file_suffix)
            relation_writer.writerow(relation_dict)
            relation_dict = csv_serializable.next_relation()

    def _get_writer(
            self,
            csv_record_dict,  # type: Dict[str, Any]
            file_mapping,  # type: Dict[Any, DictWriter]
            key,  # type: Any
            dir_path,  # type: str
            file_suffix  # type: str
    ):
        # type: (...) -> DictWriter
        """
        Finds a writer based on the csv record and key.
        If the writer does not exist, creates a csv writer and updates the
        mapping.

        :param csv_record_dict:
        :param file_mapping:
        :param key:
        :param file_suffix:
        :return:
        """
        writer = file_mapping.get(key)
        if writer:
            return writer

        LOGGER.info('Creating file for {}'.format(key))
        file_out = open('{}/{}.csv'.format(dir_path, file_suffix), 'w')

        def file_out_close():
            # type: () -> None
            LOGGER.info('Closing file IO {}'.format(file_out))
            file_out.close()

        self._closer.register(file_out_close)

        writer = csv.DictWriter(file_out,
                                fieldnames=csv_record_dict.keys(),
                                quoting=csv.QUOTE_NONNUMERIC)
        writer.writeheader()
        file_mapping[key] = writer

        return writer

    def close(self):
        # type: () -> None
        """
        Closes every closeable callable registered in _closer.
        :return:
        """
        self._closer.close()

    def get_scope(self):
        # type: () -> str
        return "loader.filesystem_csv_neo4j"
Example #8
    def __init__(self) -> None:
        self._node_file_mapping: Dict[Any, DictWriter] = {}
        self._relation_file_mapping: Dict[Any, DictWriter] = {}
        self._closer = Closer()
Example #9
class FSNeptuneCSVLoader(Loader):
    """
    Write node and relationship CSV file(s) that can be consumed by
    NeptuneCsvPublisher.
    It assumes that the record it consumes is instance of GraphSerializable
    """
    # Config keys
    NODE_DIR_PATH = 'node_dir_path'
    RELATION_DIR_PATH = 'relationship_dir_path'
    FORCE_CREATE_DIR = 'force_create_directory'
    SHOULD_DELETE_CREATED_DIR = 'delete_created_directories'
    JOB_PUBLISHER_TAG = 'job_publisher_tag'

    _DEFAULT_CONFIG = ConfigFactory.from_dict({
        SHOULD_DELETE_CREATED_DIR: True,
        FORCE_CREATE_DIR: False
    })

    def __init__(self) -> None:
        self._node_file_mapping: Dict[Any, DictWriter] = {}
        self._relation_file_mapping: Dict[Any, DictWriter] = {}
        self._closer = Closer()

    def init(self, conf: ConfigTree) -> None:
        """
        Initializes FSNeptuneCSVLoader by creating directories for node files
        and relationship files. Note that the directories defined in the
        configuration should not already exist.
        """
        conf = conf.with_fallback(FSNeptuneCSVLoader._DEFAULT_CONFIG)

        self._node_dir = conf.get_string(FSNeptuneCSVLoader.NODE_DIR_PATH)
        self._relation_dir = conf.get_string(
            FSNeptuneCSVLoader.RELATION_DIR_PATH)

        self._delete_created_dir = conf.get_bool(
            FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR)
        self._force_create_dir = conf.get_bool(
            FSNeptuneCSVLoader.FORCE_CREATE_DIR)
        self._create_directory(self._node_dir)
        self._create_directory(self._relation_dir)
        self.job_publisher_tag = conf.get_string(
            FSNeptuneCSVLoader.JOB_PUBLISHER_TAG)

    def _create_directory(self, path: str) -> None:
        """
        Validates that the directory does not exist, creates it, and registers
        a function that deletes the created directory with Job.closer.
        """
        if os.path.exists(path):
            if self._force_create_dir:
                LOGGER.info(
                    'Directory exists. Deleting directory {}'.format(path))
                shutil.rmtree(path)
            else:
                raise RuntimeError(
                    'Directory should not exist: {}'.format(path))

        os.makedirs(path)

        def _delete_dir() -> None:
            if not self._delete_created_dir:
                LOGGER.warning('Skip Deleting directory {}'.format(path))
                return

            LOGGER.info('Deleting directory {}'.format(path))
            shutil.rmtree(path)

        # Directory should be deleted after publish is finished
        Job.closer.register(_delete_dir)

    def load(self, csv_serializable: GraphSerializable) -> None:
        """
        Writes GraphSerializable into CSV files.
        This method writes multiple CSV files, because the input can contain
        not only nodes and relationships but also different kinds of nodes
        and relationships.
        Common pattern for both nodes and relations:
         1. retrieve a csv row (a dict where keys represent the header and
         values represent a row)
         2. use this dict to get the appropriate csv writer and write to it
         3. repeat 1 and 2
        :param csv_serializable:
        :return:
        """

        node = csv_serializable.next_node()
        while node:

            node.attributes[
                PUBLISHED_TAG_PROPERTY_NAME] = self.job_publisher_tag
            node_dict = neptune_serializer.convert_node(node)
            if node_dict:
                key = (node.label, len(node_dict))
                file_suffix = '{}_{}'.format(*key)
                node_writer = self._get_writer(node_dict,
                                               self._node_file_mapping, key,
                                               self._node_dir, file_suffix)
                node_writer.writerow(node_dict)
            node = csv_serializable.next_node()

        relation = csv_serializable.next_relation()
        while relation:
            relation.attributes[
                PUBLISHED_TAG_PROPERTY_NAME] = self.job_publisher_tag
            relation_dicts = neptune_serializer.convert_relationship(relation)
            if relation_dicts:
                key2 = (relation.start_label, relation.end_label,
                        relation.type, len(relation_dicts[0]))

                file_suffix = '{}_{}_{}'.format(key2[0], key2[1], key2[2])
                relation_writer = self._get_writer(relation_dicts[0],
                                                   self._relation_file_mapping,
                                                   key2, self._relation_dir,
                                                   file_suffix)
                relation_writer.writerows(relation_dicts)
            relation = csv_serializable.next_relation()

    def _get_writer(self, csv_record_dict: Dict[str, Any],
                    file_mapping: Dict[Any, DictWriter], key: Any,
                    dir_path: str, file_suffix: str) -> DictWriter:
        """
        Finds a writer based on the csv record and key.
        If the writer does not exist, creates a csv writer and updates the
        mapping.
        """
        writer = file_mapping.get(key)
        if writer:
            return writer

        LOGGER.info('Creating file for {}'.format(key))

        file_out = open('{}/{}.csv'.format(dir_path, file_suffix),
                        'w',
                        encoding='utf8')
        writer = csv.DictWriter(file_out,
                                fieldnames=csv_record_dict.keys(),
                                quoting=csv.QUOTE_NONNUMERIC)

        def file_out_close() -> None:
            LOGGER.info('Closing file IO {}'.format(file_out))
            file_out.close()

        self._closer.register(file_out_close)

        writer.writeheader()
        file_mapping[key] = writer

        return writer

    def close(self) -> None:
        """
        Closes every closeable callable registered in _closer.
        """
        self._closer.close()

    def get_scope(self) -> str:
        return "loader.neptune_filesystem_csv"
Example #10
class SearchMetadatatoElasticasearchTask(Task):

    ENTITY_TYPE = 'doc_type'
    ELASTICSEARCH_CLIENT_CONFIG_KEY = 'client'
    MAPPING_CLASS = 'document_mapping'
    ELASTICSEARCH_ALIAS_CONFIG_KEY = 'alias'
    ELASTICSEARCH_NEW_INDEX = 'new_index'
    ELASTICSEARCH_PUBLISHER_BATCH_SIZE = 'batch_size'
    ELASTICSEARCH_TIMEOUT_SEC = 'es_timeout_sec'
    DATE = 'date'

    today = date.today().strftime("%Y%m%d")

    def __init__(self,
                 extractor: Extractor,
                 transformer: Transformer = NoopTransformer()) -> None:
        self.extractor = extractor
        self.transformer = transformer

        self._closer = Closer()
        self._closer.register(self.extractor.close)
        self._closer.register(self.transformer.close)

    def init(self, conf: ConfigTree) -> None:
        # initialize extractor with configuration
        self.extractor.init(Scoped.get_scoped_conf(conf, self.extractor.get_scope()))
        # initialize transformer with configuration
        self.transformer.init(Scoped.get_scoped_conf(conf, self.transformer.get_scope()))

        # task configuration
        conf = Scoped.get_scoped_conf(conf, self.get_scope())
        self.date = conf.get_string(SearchMetadatatoElasticasearchTask.DATE, self.today)
        self.entity = conf.get_string(SearchMetadatatoElasticasearchTask.ENTITY_TYPE).lower()
        self.elasticsearch_client = conf.get(
            SearchMetadatatoElasticasearchTask.ELASTICSEARCH_CLIENT_CONFIG_KEY
        )
        self.elasticsearch_alias = conf.get(
            SearchMetadatatoElasticasearchTask.ELASTICSEARCH_ALIAS_CONFIG_KEY
        )
        self.elasticsearch_new_index = conf.get(
            SearchMetadatatoElasticasearchTask.ELASTICSEARCH_NEW_INDEX,
            self.create_new_index_name())
        self.document_mapping = conf.get(SearchMetadatatoElasticasearchTask.MAPPING_CLASS,
                                         RESOURCE_TO_MAPPING[self.entity])

        LOGGER.info(issubclass(self.document_mapping, SearchableResource))

        if not issubclass(self.document_mapping, SearchableResource):
            msg = "Provided document_mapping should be instance" \
                f" of SearchableResource not {type(self.document_mapping)}"
            LOGGER.error(msg)
            raise TypeError(msg)

        self.elasticsearch_batch_size = conf.get(
            SearchMetadatatoElasticasearchTask.ELASTICSEARCH_PUBLISHER_BATCH_SIZE, 10000
        )
        self.elasticsearch_timeout_sec = conf.get(
            SearchMetadatatoElasticasearchTask.ELASTICSEARCH_TIMEOUT_SEC, 120
        )

    def create_new_index_name(self) -> str:
        hex_string = uuid4().hex
        return f"{self.elasticsearch_alias}_{self.date}_{hex_string}"

    def to_document(self, metadata: Any) -> Document:
        return self.document_mapping(_index=self.elasticsearch_new_index, **metadata)

    def generate_documents(self, record: Any) -> Generator:
        # iterate through records
        while record:
            record = self.transformer.transform(record)
            if not record:
                # Move on if the transformer filtered the record out
                record = self.extractor.extract()
                continue
            document = self.to_document(metadata=record).to_dict(True)
            document['_source']['resource_type'] = self.entity

            yield document
            record = self.extractor.extract()

    def _get_old_index(self, connection: Connections) -> List[str]:
        """
        Retrieve all indices that currently have {elasticsearch_alias} alias
        :return: list of elasticsearch indices
        """
        try:
            indices = connection.indices.get_alias(self.elasticsearch_alias).keys()
            return indices
        except NotFoundError:
            LOGGER.warn("Received index not found error from Elasticsearch. " +
                        "The index doesn't exist for a newly created ES. It's OK on first run.")
            # return empty list on exception
            return []

    def _delete_old_index(self, connection: Connections, document_index: Index) -> None:
        alias_updates = []
        previous_index = self._get_old_index(connection=connection)
        for previous_index_name in previous_index:
            if previous_index_name != document_index._name:
                LOGGER.info(f"Deleting old index {previous_index_name}")
                alias_updates.append({"remove_index": {"index": previous_index_name}})
        alias_updates.append({"add": {
            "index": self.elasticsearch_new_index,
            "alias": self.elasticsearch_alias}})
        connection.indices.update_aliases({"actions": alias_updates})

    def run(self) -> None:
        LOGGER.info('Running search metadata to Elasticsearch task')
        try:
            # extract records from metadata store
            record = self.extractor.extract()

            # create connection
            connections.add_connection('default', self.elasticsearch_client)
            connection = connections.get_connection()

            # health check ES
            health = connection.cluster.health()
            status = health["status"]
            if status not in ("green", "yellow"):
                msg = f"Elasticsearch healthcheck failed: {status}"
                LOGGER.error(msg)
                raise Exception(msg)

            # create index
            LOGGER.info(f"Creating ES index {self.elasticsearch_new_index}")
            index = Index(name=self.elasticsearch_new_index, using=self.elasticsearch_client)
            index.document(self.document_mapping)
            index.create()

            # publish search metadata to ES
            cnt = 0
            for success, info in parallel_bulk(connection,
                                               self.generate_documents(record=record),
                                               raise_on_error=False,
                                               chunk_size=self.elasticsearch_batch_size,
                                               request_timeout=self.elasticsearch_timeout_sec):
                if not success:
                    LOGGER.warning(f"There was an error while indexing a document to ES: {info}")
                else:
                    cnt += 1
                if cnt > 0 and cnt % self.elasticsearch_batch_size == 0:
                    LOGGER.info(f'Published {cnt} records to ES')

            # delete old index
            self._delete_old_index(connection=connection,
                                   document_index=index)

            LOGGER.info("Elasticsearch Indexing completed")
        finally:
            self._closer.close()

    def get_scope(self) -> str:
        return 'task.search_metadata_to_elasticsearch'
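
A hypothetical configuration fragment for this task, limited to the task-scoped keys read in init() above: the Elasticsearch endpoint and alias are placeholders, 'table' is assumed to be a valid key of RESOURCE_TO_MAPPING, and the extractor's own configuration is omitted.

from elasticsearch import Elasticsearch
from pyhocon import ConfigFactory

task_scope = 'task.search_metadata_to_elasticsearch'  # matches get_scope() above
conf = ConfigFactory.from_dict({
    f'{task_scope}.doc_type': 'table',
    f'{task_scope}.client': Elasticsearch('http://localhost:9200'),  # placeholder endpoint
    f'{task_scope}.alias': 'table_search_index',                     # placeholder alias
    f'{task_scope}.batch_size': 1000,
    f'{task_scope}.es_timeout_sec': 60,
})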
Example #11
    def __init__(self) -> None:
        self._record_file_mapping: Dict[Any, DictWriter] = {}
        self._keys: Dict[FrozenSet[str], int] = {}
        self._closer = Closer()
Example #12
class FSMySQLCSVLoader(Loader):
    """
    Write table record CSV file(s) that can be consumed by MySQLCsvPublisher.
    It assumes that the record it consumes is instance of TableSerializable.
    """
    # Config keys
    RECORD_DIR_PATH = 'record_dir_path'
    FORCE_CREATE_DIR = 'force_create_directory'
    SHOULD_DELETE_CREATED_DIR = 'delete_created_directories'

    _DEFAULT_CONFIG = ConfigFactory.from_dict({
        SHOULD_DELETE_CREATED_DIR: True,
        FORCE_CREATE_DIR: False
    })

    def __init__(self) -> None:
        self._record_file_mapping: Dict[Any, DictWriter] = {}
        self._keys: Dict[FrozenSet[str], int] = {}
        self._closer = Closer()

    def init(self, conf: ConfigTree) -> None:
        """
        Initializes FSMySQLCSVLoader by creating a directory for record files.
        Note that the directory defined in the configuration should not already exist.
        :param conf:
        :return:
        """
        conf = conf.with_fallback(FSMySQLCSVLoader._DEFAULT_CONFIG)

        self._record_dir = conf.get_string(FSMySQLCSVLoader.RECORD_DIR_PATH)
        self._delete_created_dir = conf.get_bool(
            FSMySQLCSVLoader.SHOULD_DELETE_CREATED_DIR)
        self._force_create_dir = conf.get_bool(
            FSMySQLCSVLoader.FORCE_CREATE_DIR)
        self._create_directory(self._record_dir)

    def _create_directory(self, path: str) -> None:
        """
        Validates that the directory does not exist, creates it, and registers
        a function that deletes the created directory with Job.closer.
        :param path:
        :return:
        """
        if os.path.exists(path):
            if self._force_create_dir:
                LOGGER.info(f'Directory exists. Deleting directory {path}')
                shutil.rmtree(path)
            else:
                raise RuntimeError(f'Directory should not exist: {path}')

        os.makedirs(path)

        def _delete_dir() -> None:
            if not self._delete_created_dir:
                LOGGER.warning(f'Skip Deleting directory {path}')
                return

            LOGGER.info(f'Deleting directory {path}')
            shutil.rmtree(path)

        # Directory should be deleted after publish is finished
        Job.closer.register(_delete_dir)

    def load(self, csv_serializable: TableSerializable) -> None:
        """
        Writes TableSerializable records into CSV files.
        This method writes multiple CSV files, one per table.

        Common pattern for table records:
         1. retrieve a csv row (a dict where keys represent the header and
         values represent a row)
         2. use this dict to get the appropriate csv writer and write to it
         3. repeat 1 and 2

        :param csv_serializable:
        :return:
        """
        record = csv_serializable.next_record()
        while record:
            record_dict = mysql_serializer.serialize_record(record)
            table_name = record.__tablename__
            key = (table_name, self._make_key(record_dict))
            file_suffix = '{}_{}'.format(*key)
            record_writer = self._get_writer(record_dict,
                                             self._record_file_mapping, key,
                                             self._record_dir, file_suffix)
            record_writer.writerow(record_dict)
            record = csv_serializable.next_record()

    def _get_writer(self, csv_record_dict: Dict[str, Any],
                    file_mapping: Dict[Any, DictWriter], key: Any,
                    dir_path: str, file_suffix: str) -> DictWriter:
        """
        Finds a writer based on the csv record and key.
        If the writer does not exist, creates a csv writer and updates the mapping.

        :param csv_record_dict:
        :param file_mapping:
        :param key:
        :param dir_path:
        :param file_suffix:
        :return:
        """
        writer = file_mapping.get(key)
        if writer:
            return writer

        LOGGER.info(f'Creating file for {key}')

        file_out = open(f'{dir_path}/{file_suffix}.csv', 'w', encoding='utf8')
        writer = csv.DictWriter(file_out,
                                fieldnames=csv_record_dict.keys(),
                                quoting=csv.QUOTE_NONNUMERIC)

        def file_out_close() -> None:
            LOGGER.info(f'Closing file IO {file_out}')
            file_out.close()

        self._closer.register(file_out_close)

        writer.writeheader()
        file_mapping[key] = writer

        return writer

    def close(self) -> None:
        """
        Closes every closeable callable registered in _closer.
        :return:
        """
        self._closer.close()

    def get_scope(self) -> str:
        return "loader.mysql_filesystem_csv"

    def _make_key(self, record_dict: Dict[str, Any]) -> int:
        """ Each unique set of record keys is assigned an increasing numeric key """
        return self._keys.setdefault(frozenset(record_dict.keys()),
                                     len(self._keys))
Example #13
class FsAtlasCSVLoader(Loader):
    """
    Write entity and relationship CSV file(s) that can be consumed by
    AtlasCsvPublisher.
    It assumes that the record it consumes is instance of AtlasCsvSerializable
    """
    # Config keys
    ENTITY_DIR_PATH = 'entity_dir_path'
    RELATIONSHIP_DIR_PATH = 'relationship_dir_path'
    FORCE_CREATE_DIR = 'force_create_directory'
    SHOULD_DELETE_CREATED_DIR = 'delete_created_directories'

    _DEFAULT_CONFIG = ConfigFactory.from_dict({
        SHOULD_DELETE_CREATED_DIR: True,
        FORCE_CREATE_DIR: False,
    })

    def __init__(self) -> None:
        self._entity_file_mapping: Dict[Any, DictWriter] = {}
        self._relation_file_mapping: Dict[Any, DictWriter] = {}
        self._keys: Dict[FrozenSet[str], int] = {}
        self._closer = Closer()

    def init(self, conf: ConfigTree) -> None:
        """
        Initializes FsAtlasCSVLoader by creating directories for entity files
        and relationship files. Note that the directories defined in the
        configuration should not already exist.
        :param conf:
        :return:
        """
        conf = conf.with_fallback(FsAtlasCSVLoader._DEFAULT_CONFIG)

        self._entity_dir = conf.get_string(FsAtlasCSVLoader.ENTITY_DIR_PATH)
        self._relation_dir = \
            conf.get_string(FsAtlasCSVLoader.RELATIONSHIP_DIR_PATH)

        self._delete_created_dir = \
            conf.get_bool(FsAtlasCSVLoader.SHOULD_DELETE_CREATED_DIR)
        self._force_create_dir = conf.get_bool(FsAtlasCSVLoader.FORCE_CREATE_DIR)
        self._create_directory(self._entity_dir)
        self._create_directory(self._relation_dir)

    def _create_directory(self, path: str) -> None:
        """
        Validates that the directory does not exist, creates it, and registers
        a function that deletes the created directory with Job.closer.
        :param path:
        :return:
        """
        if os.path.exists(path):
            if self._force_create_dir:
                LOGGER.info('Directory exists. Deleting directory %s', path)
                shutil.rmtree(path)
            else:
                raise RuntimeError(f'Directory should not exist: {path}')

        os.makedirs(path)

        def _delete_dir() -> None:
            if not self._delete_created_dir:
                LOGGER.warning('Skip Deleting directory %s', path)
                return

            LOGGER.info('Deleting directory %s', path)
            shutil.rmtree(path)

        # Directory should be deleted after publish is finished
        Job.closer.register(_delete_dir)

    def load(self, csv_serializable: AtlasSerializable) -> None:
        """
        Writes AtlasSerializable into CSV files.
        This method writes multiple CSV files, because the input can contain
        not only entities and relationships but also different kinds of
        entities and relationships.

        Common pattern for both entities and relations:
         1. retrieve a csv row (a dict where keys represent the header and
         values represent a row)
         2. use this dict to get the appropriate csv writer and write to it
         3. repeat 1 and 2

        :param csv_serializable:
        :return:
        """

        entity = csv_serializable.next_atlas_entity()
        while entity:
            entity_dict = atlas_serializer.serialize_entity(entity)
            key = (self._make_key(entity_dict), entity.typeName)
            file_suffix = '{}_{}'.format(*key)
            entity_writer = self._get_writer(
                entity_dict,
                self._entity_file_mapping,
                key,
                self._entity_dir,
                file_suffix,
            )
            entity_writer.writerow(entity_dict)
            entity = csv_serializable.next_atlas_entity()

        relation = csv_serializable.next_atlas_relation()
        while relation:
            relation_dict = atlas_serializer.serialize_relationship(relation)
            keys = (
                self._make_key(relation_dict),
                relation.entityType1,
                relation.entityType2,
            )

            file_suffix = '{}_{}_{}'.format(*keys)
            relation_writer = self._get_writer(
                relation_dict,
                self._relation_file_mapping,
                keys,
                self._relation_dir,
                file_suffix,
            )
            relation_writer.writerow(relation_dict)
            relation = csv_serializable.next_atlas_relation()

    def _get_writer(
        self,
        csv_record_dict: Dict[str, Any],
        file_mapping: Dict[Any, DictWriter],
        key: Any,
        dir_path: str,
        file_suffix: str,
    ) -> DictWriter:
        """
        Finds a writer based on the csv record and key.
        If the writer does not exist, creates a csv writer and updates the
        mapping.

        :param csv_record_dict:
        :param file_mapping:
        :param key:
        :param file_suffix:
        :return:
        """
        writer = file_mapping.get(key)
        if writer:
            return writer

        LOGGER.info('Creating file for %s', key)

        file_out = open(f'{dir_path}/{file_suffix}.csv', 'w', encoding='utf8')
        writer = csv.DictWriter(  # type: ignore
            file_out,
            fieldnames=csv_record_dict.keys(),
            quoting=csv.QUOTE_NONNUMERIC,
        )

        def file_out_close() -> None:
            LOGGER.info('Closing file IO %s', file_out)
            file_out.close()

        self._closer.register(file_out_close)

        writer.writeheader()
        file_mapping[key] = writer

        return writer

    def close(self) -> None:
        """
        Closes every closeable callable registered in _closer.
        :return:
        """
        self._closer.close()

    def get_scope(self) -> str:
        return "loader.filesystem_csv_atlas"

    def _make_key(self, record_dict: Dict[str, Any]) -> str:
        """ Each unique set of record keys is assigned an increasing numeric key """
        return str(self._keys.setdefault(frozenset(record_dict.keys()), len(self._keys))).rjust(3, '0')
Example #14
class DefaultTask(Task):
    """
    A default task expecting to extract, transform and load.

    """

    # Determines the frequency of the log on task progress
    PROGRESS_REPORT_FREQUENCY = 'progress_report_frequency'

    def __init__(
        self,
        extractor: Extractor,
        loader: Loader,
        transformer: Transformer = NoopTransformer()) -> None:
        self.extractor = extractor
        self.transformer = transformer
        self.loader = loader

        self._closer = Closer()
        self._closer.register(self.extractor.close)
        self._closer.register(self.transformer.close)
        self._closer.register(self.loader.close)

    def init(self, conf: ConfigTree) -> None:
        self._progress_report_frequency = \
            conf.get_int(f'{self.get_scope()}.{DefaultTask.PROGRESS_REPORT_FREQUENCY}', 500)

        self.extractor.init(
            Scoped.get_scoped_conf(conf, self.extractor.get_scope()))
        self.transformer.init(
            Scoped.get_scoped_conf(conf, self.transformer.get_scope()))
        self.loader.init(Scoped.get_scoped_conf(conf, self.loader.get_scope()))

    def run(self) -> None:
        """
        Runs a task
        """
        LOGGER.info('Running a task')
        try:
            record = self.extractor.extract()
            count = 0
            while record:
                record = self.transformer.transform(record)
                if not record:
                    # Move on if the transformer filtered the record out
                    record = self.extractor.extract()
                    continue

                # Support transformers which return one record, or yield multiple
                results = record if isinstance(record, Iterator) else [record]
                for result in results:
                    if result:
                        self.loader.load(result)
                        count += 1

                if count > 0 and count % self._progress_report_frequency == 0:
                    LOGGER.info('Extracted %i records so far', count)

                # Prepare the next record
                record = self.extractor.extract()
        finally:
            self._closer.close()
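
The Iterator check in this version is what lets a transformer fan a single extracted record out into several loaded records, since a generator is an Iterator. Below is a hypothetical transformer sketch illustrating that; it is not a databuilder class, and the base-class import path is assumed to match databuilder's layout.

from typing import Iterator

from pyhocon import ConfigTree

from databuilder.transformer.base_transformer import Transformer


class SplitCsvLineTransformer(Transformer):
    """Toy transformer that turns one comma-separated string record into
    one record per field, yielded lazily."""

    def init(self, conf: ConfigTree) -> None:
        pass

    def transform(self, record: str) -> Iterator[str]:
        # The returned generator satisfies isinstance(record, Iterator),
        # so the run() loop above loads each yielded item individually.
        return (field.strip() for field in record.split(','))

    def get_scope(self) -> str:
        return 'transformer.split_csv_line'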