Пример #1
0
 def setUp(self):
     """Create the single-column target table and commit it before each test."""
     id_property = Property('id', 'VARCHAR(36)')
     self.schema = JsonObject(TABLE_NAME, id_property)

     connection = psycopg2.connect(CONNECTION)
     self.database = Database(connection)
     self.database.open()

     # The table must be committed so it is visible once the test reopens
     # the database.
     TargetTable(self.schema, self.database).create()
     self.database.commit()
Пример #2
0
    def __init__(self, metadata, source, schema, bucket, db_connection):
        """Store the collaborators and derive the manifest/journal file names.

        ``db_connection`` may be a ``Database`` instance, which is used as-is,
        or any other object, which is wrapped in a new ``Database``.
        """
        self.metadata, self.source = metadata, source
        self.schema, self.bucket = schema, bucket

        already_wrapped = isinstance(db_connection, Database)
        self.database = (
            db_connection if already_wrapped else Database(db_connection))

        # Both file names are derived from the schema's table name.
        table_name = schema.table
        self.file_name = '{0}_manifest.json'.format(table_name)
        self.journal_file_name = '{0}_journal.db'.format(table_name)
Пример #3
0
 def setUp(self):
     """Open the database and commit a freshly created target table."""
     self.schema = JsonObject(
         TABLE_NAME,
         Property('id', 'VARCHAR(36)'),
     )
     self.database = Database(
         psycopg2.connect(CONNECTION),
     )
     self.database.open()

     target = TargetTable(self.schema, self.database)
     target.create()
     self.database.commit()
Пример #4
0
    def test_stage_update_when_column_name_defined(self):
        """stage_update() issues one CREATE TABLE using the explicit column names."""
        expected_sql = (
            'CREATE TABLE {0}_update (someColumn VARCHAR(10), '
            'anotherColumn TIMESTAMP)'
        ).format(TABLE_NAME)

        # Database.execute is patched so the generated SQL can be inspected
        # without a real connection.
        with patch.object(Database, 'execute') as patched_execute:
            first = Property('property1', 'VARCHAR(10)', 'someColumn')
            second = Property('property2', 'TIMESTAMP', 'anotherColumn')
            schema = JsonObject(TABLE_NAME, first, second)

            TargetTable(schema, Database(Mock())).stage_update()

            patched_execute.assert_called_once_with(expected_sql)
Пример #5
0
    def test_create_when_column_name_not_defined_for_nested_property(self):
        """create() names a nested property's column ``<parent>_<child>``."""
        expected_sql = (
            'CREATE TABLE {0} (property1 VARCHAR(10), '
            'property2_timestamp TIMESTAMP)'
        ).format(TABLE_NAME)

        # Database.execute is patched so the generated SQL can be inspected
        # without a real connection.
        with patch.object(Database, 'execute') as patched_execute:
            flat = Property('property1', 'VARCHAR(10)')
            nested = Property('property2', Property('timestamp', 'TIMESTAMP'))
            schema = JsonObject(TABLE_NAME, flat, nested)

            TargetTable(schema, Database(Mock())).create()

            patched_execute.assert_called_once_with(expected_sql)
Пример #6
0
    def __init__(self, aws_access_key_id, aws_secret_access_key, bucket,
                 db_connection):
        """Store the AWS credentials and normalise the bucket and database.

        ``bucket`` may be a bucket name (a string), which is wrapped in a
        ``Bucket`` built with the given credentials, or any other object,
        which is used as-is.  ``db_connection`` may be a ``Database``, which
        is used as-is, or any other object, which is wrapped in a
        ``Database``.
        """
        super(S3CopyPipeline, self).__init__()

        self.aws_access_key_id = aws_access_key_id
        self.aws_secret_access_key = aws_secret_access_key

        bucket_is_name = isinstance(bucket, basestring)
        self.bucket = (
            Bucket(aws_access_key_id, aws_secret_access_key, bucket)
            if bucket_is_name else bucket)

        already_wrapped = isinstance(db_connection, Database)
        self.database = (
            db_connection if already_wrapped else Database(db_connection))
Пример #7
0
    def sql_manifest_copy(self, metadata, source, schema, max_error_count=1):
        """Register a manifest-driven copy step and return ``self`` for chaining.

        The step is a ``ManifestCopyFromS3JsonStep`` that targets a
        ``TargetTable`` on this pipeline's database.  Its manifest is a
        ``SqlManifest`` whose database is replaced by a sqlite3 database
        opened on the manifest's journal file.
        """
        target = TargetTable(schema, self.database)
        step = ManifestCopyFromS3JsonStep(
            metadata=metadata,
            source=source,
            schema=schema,
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key,
            bucket=self.bucket,
            table=target)

        manifest = SqlManifest(
            metadata, source, schema, self.bucket, self.database)
        # The journal is kept in a local sqlite file, not in the pipeline's
        # own database, so the manifest's database is swapped after creation.
        journal_connection = sqlite3.connect(manifest.journal_file_name)
        manifest.database = Database(journal_connection)
        step.manifest = manifest

        self.__add_copy_step(step, max_error_count)
        return self
Пример #8
0
class SqlStepShould(unittest.TestCase):
    """Integration tests for ``SqlStep`` against the database at ``CONNECTION``."""

    def setUp(self):
        """Create and commit the single-column target table."""
        self.schema = JsonObject(TABLE_NAME, Property('id', 'VARCHAR(36)'))
        self.database = Database(psycopg2.connect(CONNECTION))
        self.database.open()
        TargetTable(self.schema, self.database).create()
        self.database.commit()

    def tearDown(self):
        """Drop the target table so the next test starts from scratch."""
        self.database.open()
        self.database.execute('DROP TABLE IF EXISTS %s', (AsIs(TABLE_NAME), ))
        self.database.commit()

    def _assert_three_rows_inserted(self):
        """Reopen the database and check it holds exactly rows '1', '2', '3'."""
        table = (AsIs(TABLE_NAME), )

        self.database.open()
        self.database.execute("SELECT COUNT(*) FROM %s", table)
        row_count = int(self.database.cursor.fetchone()[0])
        self.assertEqual(3, row_count)

        self.database.execute("SELECT * FROM %s", table)
        rows = list(self.database.cursor.fetchall())
        self.assertEqual([('1', ), ('2', ), ('3', )], rows)

    def test_run(self):
        """run() executes every statement given to the constructor."""
        step = SqlStep(
            self.database,
            ("INSERT INTO %s VALUES('%s')", TABLE_NAME, 1),
            ("INSERT INTO %s VALUES('%s')", TABLE_NAME, 2),
            "INSERT INTO {0} VALUES ('3')".format(TABLE_NAME))
        step.run()

        self._assert_three_rows_inserted()

    def test_execute(self):
        """execute() accepts both (sql, *params) tuples and plain SQL strings."""
        self.database.open()
        step = SqlStep(self.database)
        statements = (
            ("INSERT INTO %s VALUES('%s')", TABLE_NAME, 1),
            ("INSERT INTO %s VALUES('%s')", TABLE_NAME, 2),
            "INSERT INTO {0} VALUES ('3')".format(TABLE_NAME),
        )
        for statement in statements:
            step.execute(statement)
        self.database.commit()

        self._assert_three_rows_inserted()
Пример #9
0
    def test_query_table_with_psycopg2_connection(self):
        """fetchall() yields rows lazily as a generator on a psycopg2 connection."""
        db = Database(psycopg2.connect(CONNECTION))
        db.open()

        statements = (
            'CREATE TABLE SomeTable (id VARCHAR(255))',
            'INSERT INTO SomeTable VALUES (1)',
            'INSERT INTO SomeTable VALUES (2)',
            'SELECT * FROM SomeTable',
        )
        for statement in statements:
            db.execute(statement)

        rows = db.fetchall()
        is_generator = isinstance(rows, types.GeneratorType)
        self.assertEqual(True, is_generator)
        self.assertEqual([('1',), ('2',)], list(rows))

        # Nothing is committed; the rollback discards the table.
        db.rollback()
Пример #10
0
    def test_query_table_with_sqlite3_connection(self):
        """fetchall() yields rows lazily as a generator on a sqlite3 connection.

        The test works on a throwaway database file in the current directory.
        Cleanup runs in a ``finally`` block so that a failing assertion does
        not leave the connection open or the file behind for the next run.
        """
        database_file = 'sometable.db'
        db = Database(sqlite3.connect(database_file))
        db.open()
        try:
            db.execute(
                'CREATE TABLE IF NOT EXISTS SomeTable (id VARCHAR(255))')
            db.execute('INSERT INTO SomeTable VALUES (1)')
            # Exercises sqlite3's qmark parameter style.
            db.execute('INSERT INTO SomeTable VALUES (?)', (2,))
            db.execute('SELECT * FROM SomeTable')

            rows = db.fetchall()
            self.assertEqual(True, isinstance(rows, types.GeneratorType))
            self.assertEqual(['1', '2'], [row[0] for row in rows])
        finally:
            # Same order as before: discard the inserts, release the file
            # handle, then delete the file.
            db.rollback()
            db.close()
            os.remove(database_file)
Пример #11
0
 def setUp(self):
     """Build the single-column schema and open the database connection."""
     id_property = Property('id', 'VARCHAR(36)')
     self.schema = JsonObject(TABLE_NAME, id_property)

     connection = psycopg2.connect(CONNECTION)
     self.database = Database(connection)
     self.database.open()
Пример #12
0
class TargetTableShould(unittest.TestCase):
    """Integration tests for ``TargetTable`` against the database at ``CONNECTION``.

    ``setUp`` opens the database and ``tearDown`` rolls back whatever the
    test executed.
    """

    def setUp(self):
        """Build the single-column schema and open the database."""
        self.schema = JsonObject(TABLE_NAME, Property('id', 'VARCHAR(36)'))
        self.database = Database(psycopg2.connect(CONNECTION))
        self.database.open()

    def tearDown(self):
        """Discard everything the test did."""
        self.database.rollback()

    @staticmethod
    def _schema_with_default_columns():
        """Return a two-property schema with no explicit column names."""
        return JsonObject(TABLE_NAME,
                          Property('property1', 'VARCHAR(10)'),
                          Property('property2', 'TIMESTAMP'))

    @staticmethod
    def _schema_with_named_columns():
        """Return a two-property schema with explicit column names."""
        return JsonObject(TABLE_NAME,
                          Property('property1', 'VARCHAR(10)', 'someColumn'),
                          Property('property2', 'TIMESTAMP', 'anotherColumn'))

    def assertColumns(self, table_name, schema):
        """Check that ``table_name`` has exactly the columns of ``schema``.

        Columns reported by ``information_schema.columns`` are compared with
        ``schema.schema`` position by position: the lower-cased column name
        must match and the reported ``udt_name`` must be a substring of the
        lower-cased declared type.
        """
        self.database.execute(
            "SELECT column_name, udt_name FROM information_schema.columns "
            "WHERE table_name = '%s'",
            (AsIs(table_name),))
        actual_columns = self.database.cursor.fetchall()

        self.assertEqual(len(schema.schema), len(actual_columns))

        # Lengths are equal at this point, so zip pairs every column.
        for declared, (column_name, udt_name) in zip(schema.schema,
                                                     actual_columns):
            self.assertEqual(declared.column_name.lower(), column_name)
            self.assertEqual(True, udt_name in declared.type.lower())

    def test_not_exist(self):
        """exists() is False before the table is created."""
        target = TargetTable(self.schema, self.database)

        self.assertEqual(False, target.exists())

    def test_exist(self):
        """exists() is True once the table is created."""
        target = TargetTable(self.schema, self.database)
        target.create()

        self.assertEqual(True, target.exists())

    def test_create_when_column_name_not_defined(self):
        """create() builds columns from a schema without column names."""
        schema = self._schema_with_default_columns()
        TargetTable(schema, self.database).create()

        self.assertColumns(schema.table, schema)

    def test_create_when_column_name_defined(self):
        """create() uses the explicit column names."""
        schema = self._schema_with_named_columns()
        TargetTable(schema, self.database).create()

        self.assertColumns(schema.table, schema)

    def test_stage_update_when_column_name_not_defined(self):
        """stage_update() builds the update table without column names."""
        schema = self._schema_with_default_columns()
        TargetTable(schema, self.database).stage_update()

        self.assertColumns(schema.update_table, schema)

    def test_stage_update_when_column_name_defined(self):
        """stage_update() uses the explicit column names."""
        schema = self._schema_with_named_columns()
        TargetTable(schema, self.database).stage_update()

        self.assertColumns(schema.update_table, schema)

    def test_promote_update(self):
        """promote_update() makes the staged table available as the target."""
        target = TargetTable(self.schema, self.database)
        target.stage_update()
        target.promote_update()

        self.assertColumns(self.schema.table, self.schema)

    def test_insert_update(self):
        """insert_update() appends the staged rows to the target table."""
        target = TargetTable(self.schema, self.database)
        target.create()
        target.stage_update()

        # One row already in the target, two rows waiting in the update table.
        seed_rows = (
            (TABLE_NAME, 1),
            (self.schema.update_table, 2),
            (self.schema.update_table, 3),
        )
        for destination, value in seed_rows:
            self.database.execute("INSERT INTO %s VALUES('%s')",
                                  (AsIs(destination), AsIs(value),))

        target.insert_update()

        self.database.execute("SELECT * FROM %s", (AsIs(TABLE_NAME),))
        rows = list(self.database.cursor.fetchall())

        self.assertEqual([('1',), ('2',), ('3',)], rows)

    def test_drop(self):
        """exists() is False again after drop()."""
        target = TargetTable(self.schema, self.database)
        target.create()
        target.drop()

        self.assertEqual(False, target.exists())
Пример #13
0
class SqlManifest(object):
    """Manifest of bucket keys that are not yet recorded in a journal.

    The manifest is a JSON document of the form
    ``{"entries": [{"url": "s3://<bucket>/<key>", "mandatory": true}, ...]}``.
    The journal is a ``journal (key TEXT)`` table reached through
    ``self.database``; the file ``journal_file_name`` is downloaded from and
    uploaded to the bucket around its use.
    NOTE(review): this presumes ``self.database`` is backed by
    ``journal_file_name`` -- confirm against the caller.
    """

    def __init__(self, metadata, source, schema, bucket, db_connection):
        """Store the collaborators and derive the manifest/journal file names.

        :param metadata: path under which manifest and journal keys are built.
        :param source: prefix passed to ``bucket.list`` to enumerate keys.
        :param schema: object exposing ``table``; used to name both files.
        :param bucket: object exposing ``name``, ``list(prefix)`` and
            ``get(key)``.
        :param db_connection: a ``Database`` (used as-is) or any other
            object, which is wrapped in a ``Database``.
        """
        self.metadata = metadata
        self.source = source
        self.schema = schema
        self.bucket = bucket

        if isinstance(db_connection, Database):
            self.database = db_connection
        else:
            self.database = Database(db_connection)

        self.file_name = '{0}_manifest.json'.format(schema.table)
        self.journal_file_name = '{0}_journal.db'.format(schema.table)

    @property
    def all_keys(self):
        """Generator of key names under ``source``, skipping names ending in '/'."""
        return (k.name for k in self.bucket.list(self.source)
                if not k.name.endswith('/'))

    @property
    def manifest_key(self):
        """Normalized bucket key of the manifest file."""
        return normalize_path('{0}/{1}'.format(self.metadata, self.file_name))

    @property
    def journal_key(self):
        """Normalized bucket key of the journal file."""
        return normalize_path('{0}/{1}'.format(self.metadata,
                                               self.journal_file_name))

    @property
    def manifest_url(self):
        """``s3://`` URL built from the bucket name and ``manifest_key``."""
        return 's3://{0}{1}'.format(self.bucket.name, self.manifest_key)

    def journal(self):
        """Return the keys recorded in the journal.

        If the journal exists in the bucket it is downloaded to
        ``journal_file_name`` and its keys are returned as a generator; the
        database is left open in that case.  Otherwise an empty ``journal``
        table is created and an empty list is returned.
        """
        journal = self.bucket.get(self.journal_key)
        if journal.exists():
            journal.get_contents_to_filename(self.journal_file_name)
            self.database.open()
            self.database.execute('SELECT key FROM journal')
            return (row[0] for row in self.database.fetchall())
        else:
            self.database.open()
            self.database.execute(
                'CREATE TABLE IF NOT EXISTS journal (key TEXT)')
            self.database.commit()
            self.database.close()
            return []

    def get(self):
        """Return the manifest of new keys together with the full key list.

        :returns: a dict with ``'manifest'`` (``{'entries': <generator of
            entry dicts>}`` for keys not yet in the journal) and
            ``'updated_journal'`` (list of every key currently under
            ``source``).
        """
        # all_keys is a one-shot generator; materialize it so the set
        # difference below does not leave 'updated_journal' exhausted.
        updated_journal = list(self.all_keys)
        journal = self.journal()
        keys = list(set(updated_journal) - set(journal))

        return {
            'manifest': {
                'entries': ({
                    'url': 's3://{0}/{1}'.format(self.bucket.name, key),
                    'mandatory': True
                } for key in keys)
            },
            'updated_journal': updated_journal
        }

    def save(self):
        """Write the manifest to ``file_name`` and upload it to the bucket.

        Entries are streamed one JSON object per line.  The separator is
        written before every entry except the first, so the document is
        valid JSON for any number of entries, including none.
        """
        entries = self.get()['manifest']['entries']

        with open(self.file_name, 'wb') as f:
            def emit(text):
                # The file is binary; encoding keeps the write valid on
                # Python 3 and is a no-op for the ASCII text written here.
                f.write(text.encode('utf-8'))

            emit('{\n"entries": [\n')
            for index, entry in enumerate(entries):
                if index:
                    emit(',\n')
                emit(json.dumps(entry))
            emit('\n]}')

        self.bucket.get(self.manifest_key).set_contents_from_filename(
            self.file_name)

    def commit(self, saved_keys):
        """Replace the journal contents with ``saved_keys`` and upload it."""
        self.database.open()
        self.database.execute('DELETE FROM journal')
        for key in saved_keys:
            self.database.execute('INSERT INTO journal VALUES (?)', (key, ))
        self.database.commit()
        self.database.close()
        self.bucket.get(self.journal_key).set_contents_from_filename(
            self.journal_file_name)

    def exists(self):
        """Return whether the manifest key exists in the bucket."""
        return self.bucket.get(self.manifest_key).exists()

    def journal_exists(self):
        """Return whether the journal key exists in the bucket."""
        return self.bucket.get(self.journal_key).exists()
Пример #14
0
class SqlStepShould(unittest.TestCase):
    """Integration tests for ``SqlStep`` against the database at ``CONNECTION``."""

    def setUp(self):
        """Create and commit the single-column target table."""
        id_property = Property('id', 'VARCHAR(36)')
        self.schema = JsonObject(TABLE_NAME, id_property)
        self.database = Database(psycopg2.connect(CONNECTION))
        self.database.open()
        target = TargetTable(self.schema, self.database)
        target.create()
        self.database.commit()

    def tearDown(self):
        """Drop the target table so the next test starts from scratch."""
        self.database.open()
        self.database.execute('DROP TABLE IF EXISTS %s', (AsIs(TABLE_NAME),))
        self.database.commit()

    def _verify_inserted_rows(self):
        """Reopen the database and check it holds exactly rows '1', '2', '3'."""
        self.database.open()

        self.database.execute("SELECT COUNT(*) FROM %s", (AsIs(TABLE_NAME),))
        first_row = self.database.cursor.fetchone()
        self.assertEqual(3, int(first_row[0]))

        self.database.execute("SELECT * FROM %s", (AsIs(TABLE_NAME),))
        stored = list(self.database.cursor.fetchall())
        self.assertEqual([('1',), ('2',), ('3',)], stored)

    def test_run(self):
        """run() executes every statement given to the constructor."""
        parameterized_one = ("INSERT INTO %s VALUES('%s')", TABLE_NAME, 1)
        parameterized_two = ("INSERT INTO %s VALUES('%s')", TABLE_NAME, 2)
        plain_three = "INSERT INTO {0} VALUES ('3')".format(TABLE_NAME)

        SqlStep(self.database,
                parameterized_one,
                parameterized_two,
                plain_three).run()

        self._verify_inserted_rows()

    def test_execute(self):
        """execute() accepts both (sql, *params) tuples and plain SQL strings."""
        self.database.open()
        sql_step = SqlStep(self.database)
        sql_step.execute(("INSERT INTO %s VALUES('%s')", TABLE_NAME, 1))
        sql_step.execute(("INSERT INTO %s VALUES('%s')", TABLE_NAME, 2))
        sql_step.execute("INSERT INTO {0} VALUES ('3')".format(TABLE_NAME))
        self.database.commit()

        self._verify_inserted_rows()