def create_last_updated_job():
    """
    Build a DefaultJob that runs EsLastUpdatedExtractor through
    FsNeo4jCSVLoader and publishes the result with Neo4jCsvPublisher.

    The loader and the publisher are configured with the same node and
    relationship directories, so the publisher picks up what the loader wrote.

    :return: the configured DefaultJob instance
    """
    # Staging area shared by the loader (writes) and the publisher (reads).
    staging_root = '/var/tmp/amundsen/last_updated_data'
    nodes_dir = f'{staging_root}/nodes'
    relations_dir = f'{staging_root}/relationships'

    es_extractor = EsLastUpdatedExtractor()
    csv_loader = FsNeo4jCSVLoader()
    pipeline = DefaultTask(extractor=es_extractor, loader=csv_loader)

    settings = {
        'extractor.es_last_updated.model_class':
            'databuilder.models.es_last_updated.ESLastUpdated',

        # loader output locations
        'loader.filesystem_csv_neo4j.node_dir_path': nodes_dir,
        'loader.filesystem_csv_neo4j.relationship_dir_path': relations_dir,

        # publisher input locations and connection settings
        'publisher.neo4j.node_files_directory': nodes_dir,
        'publisher.neo4j.relation_files_directory': relations_dir,
        'publisher.neo4j.neo4j_endpoint': neo4j_endpoint,
        'publisher.neo4j.neo4j_user': neo4j_user,
        'publisher.neo4j.neo4j_password': neo4j_password,
        'publisher.neo4j.neo4j_encrypted': False,
        # should use unique tag here like {ds}
        'publisher.neo4j.job_publish_tag': 'unique_lastupdated_tag',
    }
    conf = ConfigFactory.from_dict(settings)

    neo4j_publisher = Neo4jCsvPublisher()
    return DefaultJob(conf=conf, task=pipeline, publisher=neo4j_publisher)
# Example no. 2
# 0
def create_last_updated_job():
    """
    Build a DefaultJob that runs EsLastUpdatedExtractor through
    FSNeptuneCSVLoader and publishes the result with NeptuneCSVPublisher.

    The loader and the publisher are configured with the same node and
    relationship directories, so the publisher picks up what the loader wrote.

    :return: the configured DefaultJob instance
    """
    # Staging area shared by the loader (writes) and the publisher (reads).
    staging_root = '/var/tmp/amundsen/last_updated_data'
    nodes_dir = f'{staging_root}/nodes'
    relations_dir = f'{staging_root}/relationships'

    loader = FSNeptuneCSVLoader()
    es_extractor = EsLastUpdatedExtractor()
    pipeline = DefaultTask(extractor=es_extractor, loader=loader)

    publisher = NeptuneCSVPublisher()

    # Settings nested under the scope reported by the loader instance.
    loader_scope = loader.get_scope()
    loader_settings = {
        FSNeptuneCSVLoader.NODE_DIR_PATH: nodes_dir,
        FSNeptuneCSVLoader.RELATION_DIR_PATH: relations_dir,
        FSNeptuneCSVLoader.SHOULD_DELETE_CREATED_DIR: True,
        FSNeptuneCSVLoader.JOB_PUBLISHER_TAG: 'unique_tag',
    }

    # Settings nested under the scope reported by the publisher instance.
    publisher_scope = publisher.get_scope()
    publisher_settings = {
        NeptuneCSVPublisher.NODE_FILES_DIR: nodes_dir,
        NeptuneCSVPublisher.RELATION_FILES_DIR: relations_dir,
        NeptuneCSVPublisher.AWS_S3_BUCKET_NAME: S3_BUCKET_NAME,
        NeptuneCSVPublisher.AWS_BASE_S3_DATA_PATH: S3_DATA_PATH,
        NeptuneCSVPublisher.NEPTUNE_HOST: NEPTUNE_ENDPOINT,
        NeptuneCSVPublisher.AWS_IAM_ROLE_NAME: neptune_iam_role_name,
        'job_publish_tag': 'unique_lastupdated_tag',
    }

    conf = ConfigFactory.from_dict({
        'extractor.es_last_updated.model_class':
            'databuilder.models.es_last_updated.ESLastUpdated',
        loader_scope: loader_settings,
        publisher_scope: publisher_settings,
    })

    return DefaultJob(conf=conf, task=pipeline, publisher=publisher)
# Example no. 3
# 0
def create_last_updated_job():
    """
    Build a DefaultJob that runs EsLastUpdatedExtractor through
    FSMySQLCSVLoader and publishes the result with MySQLCSVPublisher.

    The loader and the publisher are configured with the same record
    directory, so the publisher picks up what the loader wrote.

    :return: the configured DefaultJob instance
    """
    # Staging area shared by the loader (writes) and the publisher (reads).
    staging_root = '/var/tmp/amundsen/last_updated_data'
    records_dir = f'{staging_root}/records'

    es_extractor = EsLastUpdatedExtractor()
    csv_loader = FSMySQLCSVLoader()
    pipeline = DefaultTask(extractor=es_extractor, loader=csv_loader)

    settings = {
        'extractor.es_last_updated.model_class':
            'databuilder.models.es_last_updated.ESLastUpdated',

        # loader output location and cleanup flag
        'loader.mysql_filesystem_csv.record_dir_path': records_dir,
        'loader.mysql_filesystem_csv.delete_created_directories': True,

        # publisher input location and connection settings
        'publisher.mysql.record_files_directory': records_dir,
        'publisher.mysql.conn_string': mysql_conn_string,
        'publisher.mysql.job_publish_tag': 'unique_tag',
    }
    conf = ConfigFactory.from_dict(settings)

    mysql_publisher = MySQLCSVPublisher()
    return DefaultJob(conf=conf, task=pipeline, publisher=mysql_publisher)
    def test_extraction_with_model_class(self, mock_time: Any) -> None:
        """
        Extract with the configured model class and check that the returned
        record's timestamp equals the value the time mock returns.
        """
        frozen_epoch = 10000000
        mock_time.return_value = frozen_epoch

        es_extractor = EsLastUpdatedExtractor()
        # Hand the extractor only the part of the config under its own scope.
        scoped_conf = Scoped.get_scoped_conf(conf=self.conf,
                                             scope=es_extractor.get_scope())
        es_extractor.init(scoped_conf)

        record = es_extractor.extract()
        self.assertEqual(record.timestamp, frozen_epoch)