示例#1
0
 def __init__(self, metadata, source, schema, aws_access_key_id,
              aws_secret_access_key, bucket, table):
     """Store the step configuration and build its collaborators.

     Every argument is kept on the instance unchanged. ``metadata``,
     ``schema``, ``bucket`` and ``table`` are handed to the
     ``S3JsonStepRunner``; ``metadata``, ``source``, ``schema`` and
     ``bucket`` are handed to the ``Manifest``; ``table.database`` is
     handed to the ``SqlStep``.
     """
     # AWS credentials, stored exactly as supplied.
     self.aws_access_key_id = aws_access_key_id
     self.aws_secret_access_key = aws_secret_access_key

     # Remaining constructor arguments.
     self.table = table
     self.bucket = bucket
     self.schema = schema
     self.source = source
     self.metadata = metadata

     # Error limit starts at 1; it is not a constructor argument.
     self.max_error_count = 1

     # Collaborators built from the arguments above.
     self.runner = S3JsonStepRunner(metadata, schema, bucket, table)
     self.manifest = Manifest(metadata, source, schema, bucket)
     self.sql = SqlStep(table.database)
示例#2
0
文件: step.py 项目: bpmilne/arbalest
class ManifestCopyFromS3JsonStep(PipelineStep):
    """Pipeline step that runs a manifest-based ``COPY ... JSON`` statement.

    ``run`` stages through the runner, saves the manifest, executes the
    COPY statement and then commits both the manifest journal and the
    runner. ``validate`` performs the same staging and manifest save,
    executes the statement with ``NOLOAD`` appended and rolls the runner
    back.
    """

    def __init__(self, metadata, source, schema, aws_access_key_id,
                 aws_secret_access_key, bucket, table):
        """Store the step configuration and build its collaborators."""
        # AWS credentials, later substituted into the COPY statement.
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key

        # Remaining constructor arguments.
        self.table = table
        self.bucket = bucket
        self.schema = schema
        self.source = source
        self.metadata = metadata

        # Value substituted for the MAXERROR placeholder.
        self.max_error_count = 1

        # Collaborators built from the arguments above.
        self.runner = S3JsonStepRunner(metadata, schema, bucket, table)
        self.manifest = Manifest(metadata, source, schema, bucket)
        self.sql = SqlStep(table.database)

    @property
    def schema_key(self):
        """Return the ``schema_key`` exposed by the runner."""
        runner = self.runner
        return runner.schema_key

    @property
    def schema_url(self):
        """Return the ``schema_url`` exposed by the runner."""
        runner = self.runner
        return runner.schema_url

    @property
    def copy_sql(self):
        """Return the COPY statement template containing six ``%s`` slots.

        In order, the slots are: target table, manifest URL, access key
        id, secret access key, JSON schema URL and the MAXERROR value.
        """
        fragments = (
            "COPY %s FROM '%s' ",
            "CREDENTIALS 'aws_access_key_id=%s;",
            "aws_secret_access_key=%s' ",
            "JSON '%s' ",
            "TIMEFORMAT 'auto' ",
            "MANIFEST ",
            "MAXERROR %s",
        )
        return "".join(fragments)

    @property
    def validate_sql(self):
        """Return the COPY template with the ``NOLOAD`` option appended."""
        noload_option = " NOLOAD"
        return self.copy_sql + noload_option

    def run(self):
        """Stage, save the manifest, execute the COPY and commit."""
        self.runner.stage()
        # ``save`` returns the journal that is committed once the COPY
        # statement has been executed.
        pending_journal = self.manifest.save()
        self.__execute(self.copy_sql)
        self.manifest.commit(pending_journal)
        self.runner.commit()

    def validate(self):
        """Stage, save the manifest, execute the NOLOAD COPY and roll back."""
        self.runner.stage()
        # The journal returned by ``save`` is deliberately not committed.
        self.manifest.save()
        self.__execute(self.validate_sql)
        self.runner.rollback()

    def __execute(self, sql):
        """Prepare the target table, then hand the statement to ``SqlStep``.

        An existing table is dropped and recreated when no journal exists;
        a missing table is created. The statement template and its six
        values are passed to ``self.sql.execute`` as a single tuple.
        """
        target = self.table
        if target.exists() and not self.manifest.journal_exists():
            target.drop()
            target.create()
        elif not target.exists():
            target.create()

        # Values follow the template in the order of its %s placeholders.
        statement = (
            sql,
            self.schema.table,
            self.manifest.manifest_url,
            self.aws_access_key_id,
            self.aws_secret_access_key,
            self.schema_url,
            self.max_error_count,
        )
        self.sql.execute(statement)
示例#3
0
文件: step.py 项目: bpmilne/arbalest
 def __init__(self, metadata, source, schema, aws_access_key_id,
              aws_secret_access_key, bucket, table):
     """Store the step configuration and build its collaborators.

     Every argument is kept on the instance unchanged. ``metadata``,
     ``schema``, ``bucket`` and ``table`` are handed to the
     ``S3JsonStepRunner``; ``metadata``, ``source``, ``schema`` and
     ``bucket`` are handed to the ``Manifest``; ``table.database`` is
     handed to the ``SqlStep``.
     """
     # AWS credentials, stored exactly as supplied.
     self.aws_access_key_id = aws_access_key_id
     self.aws_secret_access_key = aws_secret_access_key

     # Remaining constructor arguments.
     self.table = table
     self.bucket = bucket
     self.schema = schema
     self.source = source
     self.metadata = metadata

     # Error limit starts at 1; it is not a constructor argument.
     self.max_error_count = 1

     # Collaborators built from the arguments above.
     self.runner = S3JsonStepRunner(metadata, schema, bucket, table)
     self.manifest = Manifest(metadata, source, schema, bucket)
     self.sql = SqlStep(table.database)
示例#4
0
class ManifestCopyFromS3JsonStep(PipelineStep):
    """Pipeline step that runs a manifest-based ``COPY ... JSON`` statement.

    ``run`` stages through the runner, saves the manifest, executes the
    COPY statement and then commits both the manifest journal and the
    runner. ``validate`` performs the same staging and manifest save,
    executes the statement with ``NOLOAD`` appended and rolls the runner
    back.
    """

    def __init__(self, metadata, source, schema, aws_access_key_id,
                 aws_secret_access_key, bucket, table):
        """Store the step configuration and build its collaborators."""
        # AWS credentials, later substituted into the COPY statement.
        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key

        # Remaining constructor arguments.
        self.table = table
        self.bucket = bucket
        self.schema = schema
        self.source = source
        self.metadata = metadata

        # Value substituted for the MAXERROR placeholder.
        self.max_error_count = 1

        # Collaborators built from the arguments above.
        self.runner = S3JsonStepRunner(metadata, schema, bucket, table)
        self.manifest = Manifest(metadata, source, schema, bucket)
        self.sql = SqlStep(table.database)

    @property
    def schema_key(self):
        """Return the ``schema_key`` exposed by the runner."""
        runner = self.runner
        return runner.schema_key

    @property
    def schema_url(self):
        """Return the ``schema_url`` exposed by the runner."""
        runner = self.runner
        return runner.schema_url

    @property
    def copy_sql(self):
        """Return the COPY statement template containing six ``%s`` slots.

        In order, the slots are: target table, manifest URL, access key
        id, secret access key, JSON schema URL and the MAXERROR value.
        """
        fragments = (
            "COPY %s FROM '%s' ",
            "CREDENTIALS 'aws_access_key_id=%s;",
            "aws_secret_access_key=%s' ",
            "JSON '%s' ",
            "TIMEFORMAT 'auto' ",
            "MANIFEST ",
            "MAXERROR %s",
        )
        return "".join(fragments)

    @property
    def validate_sql(self):
        """Return the COPY template with the ``NOLOAD`` option appended."""
        noload_option = " NOLOAD"
        return self.copy_sql + noload_option

    def run(self):
        """Stage, save the manifest, execute the COPY and commit."""
        self.runner.stage()
        # ``save`` returns the journal that is committed once the COPY
        # statement has been executed.
        pending_journal = self.manifest.save()
        self.__execute(self.copy_sql)
        self.manifest.commit(pending_journal)
        self.runner.commit()

    def validate(self):
        """Stage, save the manifest, execute the NOLOAD COPY and roll back."""
        self.runner.stage()
        # The journal returned by ``save`` is deliberately not committed.
        self.manifest.save()
        self.__execute(self.validate_sql)
        self.runner.rollback()

    def __execute(self, sql):
        """Prepare the target table, then hand the statement to ``SqlStep``.

        An existing table is dropped and recreated when no journal exists;
        a missing table is created. The statement template and its six
        values are passed to ``self.sql.execute`` as a single tuple.
        """
        target = self.table
        if target.exists() and not self.manifest.journal_exists():
            target.drop()
            target.create()
        elif not target.exists():
            target.create()

        # Values follow the template in the order of its %s placeholders.
        statement = (
            sql,
            self.schema.table,
            self.manifest.manifest_url,
            self.aws_access_key_id,
            self.aws_secret_access_key,
            self.schema_url,
            self.max_error_count,
        )
        self.sql.execute(statement)
示例#5
0
 def setUp(self):
     """Create the schema, a bucket with mocked ``save``/``list``, the
     manifest under test and the manifest content expected from it."""
     self.schema = JsonObject(TABLE_NAME, Property('id', 'VARCHAR(36)'))

     # Bucket whose ``save`` and ``list`` are replaced by mocks.
     self.bucket = Bucket(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
                          BUCKET_NAME, Mock())
     self.bucket.save = Mock()

     self.key_names = [
         'object_path/00c68a1e-85f2-49e5-9d07-6922046dbc5a',
         'object_path/19440481-7766-4061-bd42-4a54fa0aac7c',
         'object_path/2014-09-02/19440481-7766-4061-bd42-4a54fa0aac7c',
         'object_path/282e6063-ecef-4e45-bdfb-9fdfb39840cd',
         'object_path/35cbf09a-b2dc-43f2-96f6-7d7573906268',
         'object_path/80536e83-6bbe-4a42-ade1-533d99321a6c',
         'object_path/cf00b394-3ff3-4418-b244-2ccf104fcc40',
         'object_path/e822e2ae-61f5-4be0-aacd-ca6de70faad1',
     ]
     listed_keys = [self.mock_key(name) for name in self.key_names]
     self.bucket.list = Mock(return_value=listed_keys)

     self.manifest = Manifest(metadata='', source='',
                              schema=self.schema, bucket=self.bucket)

     # One mandatory entry per key, addressed by its s3:// URL.
     url_templates = [
         's3://{0}/object_path/00c68a1e-85f2-49e5-9d07-6922046dbc5a',
         's3://{0}/object_path/19440481-7766-4061-bd42-4a54fa0aac7c',
         's3://{0}/object_path/2014-09-02/19440481-7766-4061-bd42-4a54fa0aac7c',
         's3://{0}/object_path/282e6063-ecef-4e45-bdfb-9fdfb39840cd',
         's3://{0}/object_path/35cbf09a-b2dc-43f2-96f6-7d7573906268',
         's3://{0}/object_path/80536e83-6bbe-4a42-ade1-533d99321a6c',
         's3://{0}/object_path/cf00b394-3ff3-4418-b244-2ccf104fcc40',
         's3://{0}/object_path/e822e2ae-61f5-4be0-aacd-ca6de70faad1',
     ]
     self.expected_manifest = {
         'entries': [
             {'url': template.format(BUCKET_NAME), 'mandatory': True}
             for template in url_templates
         ]
     }
示例#6
0
 def setUp(self):
     """Create the schema, a bucket with mocked ``save``/``list``, the
     manifest under test and the manifest content expected from it."""
     self.schema = JsonObject(TABLE_NAME, Property('id', 'VARCHAR(36)'))

     # Bucket whose ``save`` and ``list`` are replaced by mocks.
     self.bucket = Bucket(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
                          BUCKET_NAME, Mock())
     self.bucket.save = Mock()

     self.key_names = [
         'object_path/00c68a1e-85f2-49e5-9d07-6922046dbc5a',
         'object_path/19440481-7766-4061-bd42-4a54fa0aac7c',
         'object_path/2014-09-02/19440481-7766-4061-bd42-4a54fa0aac7c',
         'object_path/282e6063-ecef-4e45-bdfb-9fdfb39840cd',
         'object_path/35cbf09a-b2dc-43f2-96f6-7d7573906268',
         'object_path/80536e83-6bbe-4a42-ade1-533d99321a6c',
         'object_path/cf00b394-3ff3-4418-b244-2ccf104fcc40',
         'object_path/e822e2ae-61f5-4be0-aacd-ca6de70faad1',
     ]
     listed_keys = [self.mock_key(name) for name in self.key_names]
     self.bucket.list = Mock(return_value=listed_keys)

     self.manifest = Manifest(metadata='', source='',
                              schema=self.schema, bucket=self.bucket)

     # One mandatory entry per key, addressed by its s3:// URL.
     url_templates = [
         's3://{0}/object_path/00c68a1e-85f2-49e5-9d07-6922046dbc5a',
         's3://{0}/object_path/19440481-7766-4061-bd42-4a54fa0aac7c',
         's3://{0}/object_path/2014-09-02/19440481-7766-4061-bd42-4a54fa0aac7c',
         's3://{0}/object_path/282e6063-ecef-4e45-bdfb-9fdfb39840cd',
         's3://{0}/object_path/35cbf09a-b2dc-43f2-96f6-7d7573906268',
         's3://{0}/object_path/80536e83-6bbe-4a42-ade1-533d99321a6c',
         's3://{0}/object_path/cf00b394-3ff3-4418-b244-2ccf104fcc40',
         's3://{0}/object_path/e822e2ae-61f5-4be0-aacd-ca6de70faad1',
     ]
     self.expected_manifest = {
         'entries': [
             {'url': template.format(BUCKET_NAME), 'mandatory': True}
             for template in url_templates
         ]
     }
示例#7
0
class ManifestShould(unittest.TestCase):
    """Tests for ``Manifest`` using a ``Bucket`` whose I/O is mocked."""

    # No limit on the length of diffs shown for failed assertions.
    maxDiff = None

    def setUp(self):
        """Create the schema, a bucket with mocked ``save``/``list``, the
        manifest under test and the manifest content expected from it."""
        self.schema = JsonObject(TABLE_NAME, Property('id', 'VARCHAR(36)'))

        # Bucket whose ``save`` and ``list`` are replaced by mocks.
        self.bucket = Bucket(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
                             BUCKET_NAME, Mock())
        self.bucket.save = Mock()

        self.key_names = [
            'object_path/00c68a1e-85f2-49e5-9d07-6922046dbc5a',
            'object_path/19440481-7766-4061-bd42-4a54fa0aac7c',
            'object_path/2014-09-02/19440481-7766-4061-bd42-4a54fa0aac7c',
            'object_path/282e6063-ecef-4e45-bdfb-9fdfb39840cd',
            'object_path/35cbf09a-b2dc-43f2-96f6-7d7573906268',
            'object_path/80536e83-6bbe-4a42-ade1-533d99321a6c',
            'object_path/cf00b394-3ff3-4418-b244-2ccf104fcc40',
            'object_path/e822e2ae-61f5-4be0-aacd-ca6de70faad1',
        ]
        listed_keys = [self.mock_key(name) for name in self.key_names]
        self.bucket.list = Mock(return_value=listed_keys)

        self.manifest = Manifest(metadata='', source='',
                                 schema=self.schema, bucket=self.bucket)

        # One mandatory entry per key, addressed by its s3:// URL.
        url_templates = [
            's3://{0}/object_path/00c68a1e-85f2-49e5-9d07-6922046dbc5a',
            's3://{0}/object_path/19440481-7766-4061-bd42-4a54fa0aac7c',
            's3://{0}/object_path/2014-09-02/19440481-7766-4061-bd42-4a54fa0aac7c',
            's3://{0}/object_path/282e6063-ecef-4e45-bdfb-9fdfb39840cd',
            's3://{0}/object_path/35cbf09a-b2dc-43f2-96f6-7d7573906268',
            's3://{0}/object_path/80536e83-6bbe-4a42-ade1-533d99321a6c',
            's3://{0}/object_path/cf00b394-3ff3-4418-b244-2ccf104fcc40',
            's3://{0}/object_path/e822e2ae-61f5-4be0-aacd-ca6de70faad1',
        ]
        self.expected_manifest = {
            'entries': [
                {'url': template.format(BUCKET_NAME), 'mandatory': True}
                for template in url_templates
            ]
        }

    @staticmethod
    def _urls(entries):
        """Return the set of ``'url'`` values found in *entries*."""
        return {entry['url'] for entry in entries}

    def mock_key(self, name):
        """Return a ``Key`` named *name* backed by a mock."""
        return Key(Mock(), name)

    def mock_key_exists(self, key_name, exists):
        """Make ``bucket.get`` return a key whose ``exists()`` is *exists*."""
        stub = self.mock_key(key_name)
        stub.exists = Mock(return_value=exists)
        self.bucket.get = Mock(return_value=stub)
        return stub

    def mock_journal(self, exists, key_names=None):
        """Mock the journal key; when it exists, its content is the
        JSON-encoded *key_names*."""
        journal = self.mock_key_exists(self.manifest.journal_key, exists)
        if not exists:
            return
        journal.get_contents_as_string = Mock(
            return_value=json.dumps(key_names))

    def test_have_all_keys(self):
        self.assertEqual(self.key_names, self.manifest.all_keys)

    def test_have_manifest_key(self):
        expected_key = '/event_created_manifest.json'
        self.assertEqual(expected_key, self.manifest.manifest_key)

    def test_have_journal_key(self):
        expected_key = '/event_created_journal.json'
        self.assertEqual(expected_key, self.manifest.journal_key)

    def test_have_manifest_url(self):
        expected_url = 's3://{0}/event_created_manifest.json'.format(
            BUCKET_NAME)
        self.assertEqual(expected_url, self.manifest.manifest_url)

    def test_have_empty_journal(self):
        self.mock_journal(False)

        self.assertEqual([], self.manifest.journal())

    def test_have_journal(self):
        self.mock_journal(True, self.key_names)

        self.assertEqual(self.key_names, self.manifest.journal())

    def test_have_manifest_when_journal_is_empty(self):
        self.mock_journal(False)

        # Compared as sets, so the order of the entries is not checked.
        expected_urls = self._urls(self.expected_manifest['entries'])
        actual_urls = self._urls(self.manifest.get()['manifest']['entries'])

        self.assertEqual(expected_urls, actual_urls)

    def test_update_existing_manifest_when_key_not_in_journal(self):
        # The journal holds a copy of the keys taken before the new one
        # is appended to the bucket listing.
        self.mock_journal(True, list(self.key_names))
        self.key_names.append(
            'object_path/5acd5fb0-be96-451a-be32-b65c4461b3f4')
        relisted_keys = [self.mock_key(name) for name in self.key_names]
        self.bucket.list = Mock(return_value=relisted_keys)

        expected = {
            'entries': [{
                'url': 's3://{0}/object_path/5acd5fb0-be96-451a-be32-b65c4461b3f4'.format(BUCKET_NAME),
                'mandatory': True,
            }]
        }
        self.assertEqual(expected, self.manifest.get()['manifest'])

    def test_save_and_have_updated_journal(self):
        self.mock_journal(False)
        updated_journal = self.manifest.save()

        self.assertEqual(set(self.key_names), set(updated_journal))

        # Positional arguments of the first ``bucket.save`` call.
        first_save_args = self.bucket.save.call_args_list[0][0]

        self.assertEqual(self.manifest.manifest_key, first_save_args[0])

        expected_urls = self._urls(self.expected_manifest['entries'])
        actual_urls = self._urls(json.loads(first_save_args[1])['entries'])

        self.assertEqual(expected_urls, actual_urls)

    def test_commit_and_save_journal(self):
        self.manifest.commit(self.key_names)

        journal_key = self.manifest.journal_key
        journal_body = json.dumps(self.key_names)
        self.bucket.save.assert_called_once_with(journal_key, journal_body)

    def test_exist(self):
        self.mock_key_exists(self.manifest.manifest_key, True)

        self.assertEqual(True, self.manifest.exists())

    def test_not_exist(self):
        self.mock_key_exists(self.manifest.manifest_key, False)

        self.assertEqual(False, self.manifest.exists())

    def test_have_journal_existence(self):
        self.mock_key_exists(self.manifest.journal_key, True)

        self.assertEqual(True, self.manifest.journal_exists())

    def test_not_have_journal_existence(self):
        self.mock_key_exists(self.manifest.journal_key, False)

        self.assertEqual(False, self.manifest.journal_exists())
示例#8
0
class ManifestShould(unittest.TestCase):
    """Tests for ``Manifest`` using a ``Bucket`` whose I/O is mocked."""

    # No limit on the length of diffs shown for failed assertions.
    maxDiff = None

    def setUp(self):
        """Create the schema, a bucket with mocked ``save``/``list``, the
        manifest under test and the manifest content expected from it."""
        self.schema = JsonObject(TABLE_NAME, Property('id', 'VARCHAR(36)'))

        # Bucket whose ``save`` and ``list`` are replaced by mocks.
        self.bucket = Bucket(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY,
                             BUCKET_NAME, Mock())
        self.bucket.save = Mock()

        self.key_names = [
            'object_path/00c68a1e-85f2-49e5-9d07-6922046dbc5a',
            'object_path/19440481-7766-4061-bd42-4a54fa0aac7c',
            'object_path/2014-09-02/19440481-7766-4061-bd42-4a54fa0aac7c',
            'object_path/282e6063-ecef-4e45-bdfb-9fdfb39840cd',
            'object_path/35cbf09a-b2dc-43f2-96f6-7d7573906268',
            'object_path/80536e83-6bbe-4a42-ade1-533d99321a6c',
            'object_path/cf00b394-3ff3-4418-b244-2ccf104fcc40',
            'object_path/e822e2ae-61f5-4be0-aacd-ca6de70faad1',
        ]
        listed_keys = [self.mock_key(name) for name in self.key_names]
        self.bucket.list = Mock(return_value=listed_keys)

        self.manifest = Manifest(metadata='', source='',
                                 schema=self.schema, bucket=self.bucket)

        # One mandatory entry per key, addressed by its s3:// URL.
        url_templates = [
            's3://{0}/object_path/00c68a1e-85f2-49e5-9d07-6922046dbc5a',
            's3://{0}/object_path/19440481-7766-4061-bd42-4a54fa0aac7c',
            's3://{0}/object_path/2014-09-02/19440481-7766-4061-bd42-4a54fa0aac7c',
            's3://{0}/object_path/282e6063-ecef-4e45-bdfb-9fdfb39840cd',
            's3://{0}/object_path/35cbf09a-b2dc-43f2-96f6-7d7573906268',
            's3://{0}/object_path/80536e83-6bbe-4a42-ade1-533d99321a6c',
            's3://{0}/object_path/cf00b394-3ff3-4418-b244-2ccf104fcc40',
            's3://{0}/object_path/e822e2ae-61f5-4be0-aacd-ca6de70faad1',
        ]
        self.expected_manifest = {
            'entries': [
                {'url': template.format(BUCKET_NAME), 'mandatory': True}
                for template in url_templates
            ]
        }

    @staticmethod
    def _urls(entries):
        """Return the set of ``'url'`` values found in *entries*."""
        return {entry['url'] for entry in entries}

    def mock_key(self, name):
        """Return a ``Key`` named *name* backed by a mock."""
        return Key(Mock(), name)

    def mock_key_exists(self, key_name, exists):
        """Make ``bucket.get`` return a key whose ``exists()`` is *exists*."""
        stub = self.mock_key(key_name)
        stub.exists = Mock(return_value=exists)
        self.bucket.get = Mock(return_value=stub)
        return stub

    def mock_journal(self, exists, key_names=None):
        """Mock the journal key; when it exists, its content is the
        JSON-encoded *key_names*."""
        journal = self.mock_key_exists(self.manifest.journal_key, exists)
        if not exists:
            return
        journal.get_contents_as_string = Mock(
            return_value=json.dumps(key_names))

    def test_have_all_keys(self):
        self.assertEqual(self.key_names, self.manifest.all_keys)

    def test_have_manifest_key(self):
        expected_key = '/event_created_manifest.json'
        self.assertEqual(expected_key, self.manifest.manifest_key)

    def test_have_journal_key(self):
        expected_key = '/event_created_journal.json'
        self.assertEqual(expected_key, self.manifest.journal_key)

    def test_have_manifest_url(self):
        expected_url = 's3://{0}/event_created_manifest.json'.format(
            BUCKET_NAME)
        self.assertEqual(expected_url, self.manifest.manifest_url)

    def test_have_empty_journal(self):
        self.mock_journal(False)

        self.assertEqual([], self.manifest.journal())

    def test_have_journal(self):
        self.mock_journal(True, self.key_names)

        self.assertEqual(self.key_names, self.manifest.journal())

    def test_have_manifest_when_journal_is_empty(self):
        self.mock_journal(False)

        # Compared as sets, so the order of the entries is not checked.
        expected_urls = self._urls(self.expected_manifest['entries'])
        actual_urls = self._urls(self.manifest.get()['manifest']['entries'])

        self.assertEqual(expected_urls, actual_urls)

    def test_update_existing_manifest_when_key_not_in_journal(self):
        # The journal holds a copy of the keys taken before the new one
        # is appended to the bucket listing.
        self.mock_journal(True, list(self.key_names))
        self.key_names.append(
            'object_path/5acd5fb0-be96-451a-be32-b65c4461b3f4')
        relisted_keys = [self.mock_key(name) for name in self.key_names]
        self.bucket.list = Mock(return_value=relisted_keys)

        expected = {
            'entries': [{
                'url': 's3://{0}/object_path/5acd5fb0-be96-451a-be32-b65c4461b3f4'.format(BUCKET_NAME),
                'mandatory': True,
            }]
        }
        self.assertEqual(expected, self.manifest.get()['manifest'])

    def test_save_and_have_updated_journal(self):
        self.mock_journal(False)
        updated_journal = self.manifest.save()

        self.assertEqual(set(self.key_names), set(updated_journal))

        # Positional arguments of the first ``bucket.save`` call.
        first_save_args = self.bucket.save.call_args_list[0][0]

        self.assertEqual(self.manifest.manifest_key, first_save_args[0])

        expected_urls = self._urls(self.expected_manifest['entries'])
        actual_urls = self._urls(json.loads(first_save_args[1])['entries'])

        self.assertEqual(expected_urls, actual_urls)

    def test_commit_and_save_journal(self):
        self.manifest.commit(self.key_names)

        journal_key = self.manifest.journal_key
        journal_body = json.dumps(self.key_names)
        self.bucket.save.assert_called_once_with(journal_key, journal_body)

    def test_exist(self):
        self.mock_key_exists(self.manifest.manifest_key, True)

        self.assertEqual(True, self.manifest.exists())

    def test_not_exist(self):
        self.mock_key_exists(self.manifest.manifest_key, False)

        self.assertEqual(False, self.manifest.exists())

    def test_have_journal_existence(self):
        self.mock_key_exists(self.manifest.journal_key, True)

        self.assertEqual(True, self.manifest.journal_exists())

    def test_not_have_journal_existence(self):
        self.mock_key_exists(self.manifest.journal_key, False)

        self.assertEqual(False, self.manifest.journal_exists())