Example #1
    def persist_csv_rows(self, cur, remote_schema, temp_table_name, columns,
                         csv_rows):
        ## Stage the CSV rows in S3 so Redshift can COPY them in bulk
        key_prefix = temp_table_name + SEPARATOR

        bucket, key = self.s3.persist(csv_rows, key_prefix=key_prefix)

        credentials = self.s3.credentials()

        copy_sql = sql.SQL(
            'COPY {}.{} ({}) FROM {} CREDENTIALS {} FORMAT AS CSV NULL AS {}'
        ).format(
            sql.Identifier(self.postgres_schema),
            sql.Identifier(temp_table_name),
            sql.SQL(', ').join(map(sql.Identifier, columns)),
            sql.Literal('s3://{}/{}'.format(bucket, key)),
            sql.Literal('aws_access_key_id={};aws_secret_access_key={}'.format(
                credentials.get('aws_access_key_id'),
                credentials.get('aws_secret_access_key'))),
            sql.Literal(RESERVED_NULL_DEFAULT))

        cur.execute(copy_sql)

        ## Nested-array level columns serve as additional merge subkeys
        pattern = re.compile(SINGER_LEVEL.format('[0-9]+'))
        subkeys = list(
            filter(lambda header: re.match(pattern, header) is not None,
                   columns))

        update_sql = self._get_update_sql(remote_schema['name'],
                                          temp_table_name,
                                          remote_schema['key_properties'],
                                          columns, subkeys)

        cur.execute(update_sql)
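
For reference, here is a minimal sketch of how a psycopg2 composition like the one above renders into the final Redshift COPY statement. The connection string, table, columns, bucket/key, credentials, and NULL sentinel below are all placeholder values, not the ones the target actually uses:

import psycopg2
from psycopg2 import sql

conn = psycopg2.connect('dbname=example')  # hypothetical DSN

copy_sql = sql.SQL(
    'COPY {}.{} ({}) FROM {} CREDENTIALS {} FORMAT AS CSV NULL AS {}'
).format(
    sql.Identifier('public'),
    sql.Identifier('tmp_users'),
    sql.SQL(', ').join(map(sql.Identifier, ['id', 'name'])),
    sql.Literal('s3://example-bucket/tmp_users__0'),
    sql.Literal('aws_access_key_id=KEY;aws_secret_access_key=SECRET'),
    sql.Literal('NULL_SENTINEL'))

print(copy_sql.as_string(conn))  # as_string() needs a connection for quoting
# COPY "public"."tmp_users" ("id", "name")
#   FROM 's3://example-bucket/tmp_users__0'
#   CREDENTIALS 'aws_access_key_id=KEY;aws_secret_access_key=SECRET'
#   FORMAT AS CSV NULL AS 'NULL_SENTINEL'
# (actual output is a single line; wrapped here for readability)
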
Example #2
def _denest_records(table_path,
                    records,
                    records_map,
                    key_properties,
                    pk_fks=None,
                    level=-1):
    row_index = 0
    """
    [{...} ...] | [[...] ...] | [literal ...]
    """
    for record in records:
        if pk_fks:
            record_pk_fks = pk_fks.copy()
            record_pk_fks[SINGER_LEVEL.format(level)] = row_index

            if not isinstance(record, dict):
                """
                [...] | literal
                """
                record = {SINGER_VALUE: record}

            for key, value in record_pk_fks.items():
                record[key] = value
            row_index += 1
        else:  ## top level
            record_pk_fks = {}
            for key in key_properties:
                record_pk_fks[SINGER_SOURCE_PK_PREFIX + key] = record[key]
            if SINGER_SEQUENCE in record:
                record_pk_fks[SINGER_SEQUENCE] = record[SINGER_SEQUENCE]
        """
        {...}
        """
        _denest_record(table_path, record, records_map, key_properties,
                       record_pk_fks, level)
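
To make the level/index bookkeeping concrete, here is a small self-contained sketch. The _sdc_* constant values are assumptions based on common Singer conventions and are not defined in the snippet above:

# Assumed values for the constants used above (Singer conventions)
SINGER_SOURCE_PK_PREFIX = '_sdc_source_key_'
SINGER_SEQUENCE = '_sdc_sequence'
SINGER_LEVEL = '_sdc_level_{}_id'
SINGER_VALUE = '_sdc_value'

# A top-level record with key_properties=['id'] yields pk_fks like:
record = {'id': 7, 'tags': ['a', 'b']}
pk_fks = {SINGER_SOURCE_PK_PREFIX + 'id': record['id']}

# Denesting the 'tags' array at level=0 gives each element a child row
# carrying the parent keys plus its own array index:
child_rows = []
for row_index, value in enumerate(record['tags']):
    child = dict(pk_fks)                       # copy parent PK/FK columns
    child[SINGER_LEVEL.format(0)] = row_index  # position within the array
    child[SINGER_VALUE] = value                # non-dict elements get wrapped
    child_rows.append(child)

print(child_rows)
# [{'_sdc_source_key_id': 7, '_sdc_level_0_id': 0, '_sdc_value': 'a'},
#  {'_sdc_source_key_id': 7, '_sdc_level_0_id': 1, '_sdc_value': 'b'}]
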
Example #3
def _create_subtable(table_path, table_json_schema, key_prop_schemas,
                     subtables, level):
    if json_schema.is_object(table_json_schema['items']):
        new_properties = table_json_schema['items']['properties']
    else:
        new_properties = {SINGER_VALUE: table_json_schema['items']}

    key_properties = []
    for pk, item_json_schema in key_prop_schemas.items():
        key_properties.append(SINGER_SOURCE_PK_PREFIX + pk)
        new_properties[SINGER_SOURCE_PK_PREFIX + pk] = item_json_schema

    new_properties[SINGER_SEQUENCE] = {'type': ['null', 'integer']}

    for i in range(0, level + 1):
        new_properties[SINGER_LEVEL.format(i)] = {'type': ['integer']}

    new_schema = {
        'type': [json_schema.OBJECT],
        'properties': new_properties,
        'level': level,
        'key_properties': key_properties
    }

    _denest_schema(table_path,
                   new_schema,
                   key_prop_schemas,
                   subtables,
                   level=level)

    subtables[table_path] = new_schema
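
For a concrete picture of the output, consider an array of strings keyed by a parent id column. Assuming the same _sdc_* constant values as in the previous sketch, a call at level=0 would register a subtable schema equivalent to the new_schema literal below (before _denest_schema post-processes it):

table_json_schema = {'type': ['array'], 'items': {'type': ['string']}}
key_prop_schemas = {'id': {'type': ['integer']}}

# With level=0 the registered subtable schema is equivalent to:
new_schema = {
    'type': ['object'],
    'properties': {
        '_sdc_value': {'type': ['string']},           # non-object items wrapped
        '_sdc_source_key_id': {'type': ['integer']},  # parent PK carried down
        '_sdc_sequence': {'type': ['null', 'integer']},
        '_sdc_level_0_id': {'type': ['integer']},     # array index at level 0
    },
    'level': 0,
    'key_properties': ['_sdc_source_key_id'],
}
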
Example #4
    def persist_csv_rows(self, cur, remote_schema, temp_table_name, columns,
                         csv_rows):

        copy = sql.SQL(
            'COPY {}.{} ({}) FROM STDIN WITH CSV NULL AS {}').format(
                sql.Identifier(self.postgres_schema),
                sql.Identifier(temp_table_name),
                sql.SQL(', ').join(map(sql.Identifier, columns)),
                sql.Literal(RESERVED_NULL_DEFAULT))
        cur.copy_expert(copy, csv_rows)

        pattern = re.compile(SINGER_LEVEL.format('[0-9]+'))
        subkeys = list(
            filter(lambda header: re.match(pattern, header) is not None,
                   columns))

        canonicalized_key_properties = [
            self.fetch_column_from_path((key_property, ), remote_schema)[0]
            for key_property in remote_schema['key_properties']
        ]

        update_sql = self._get_update_sql(remote_schema['name'],
                                          temp_table_name,
                                          canonicalized_key_properties,
                                          columns, subkeys)
        cur.execute(update_sql)
Example #5
    def persist_rows(self, cur, target_table_name, temp_table_name,
                     target_table_json_schema, key_properties, records):
        headers = list(target_table_json_schema['properties'].keys())

        datetime_fields = [
            k for k, v in target_table_json_schema['properties'].items()
            if v.get('format') == 'date-time'
        ]

        rows = iter(records)

        def transform():
            try:
                row = next(rows)
                with io.StringIO() as out:
                    ## Serialize datetime to postgres compatible format
                    for prop in datetime_fields:
                        if prop in row:
                            row[prop] = self.get_postgres_datetime(row[prop])
                    writer = csv.DictWriter(out, headers)
                    writer.writerow(row)
                    return out.getvalue()
            except StopIteration:
                return ''

        csv_rows = TransformStream(transform)

        copy = sql.SQL('COPY {}.{} ({}) FROM STDIN CSV').format(
            sql.Identifier(self.postgres_schema),
            sql.Identifier(temp_table_name),
            sql.SQL(', ').join(map(sql.Identifier, headers)))
        cur.copy_expert(copy, csv_rows)

        pattern = re.compile(SINGER_LEVEL.format('[0-9]+'))
        subkeys = list(
            filter(lambda header: re.match(pattern, header) is not None,
                   headers))

        update_sql = self.get_update_sql(target_table_name, temp_table_name,
                                         key_properties, subkeys)
        cur.execute(update_sql)
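
TransformStream itself is not shown in any of these examples. The idea is a file-like wrapper whose read() returns one serialized CSV row per call, so copy_expert can stream records without materializing the whole file in memory. The class below is a sketch of that contract, not the project's actual implementation:

class TransformStream(object):
    """File-like object whose read() returns the next transformed chunk.

    psycopg2's copy_expert() repeatedly calls read() on the object it is
    given and treats an empty string as end-of-stream, which is exactly
    what the transform() closures above return on StopIteration.
    """

    def __init__(self, fun):
        self.fun = fun

    def read(self, size=-1):
        # The requested size is ignored: each call hands back exactly
        # one serialized row (or '' once the record iterator is spent).
        return self.fun()
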
Example #6
    def persist_csv_rows(self, cur, remote_schema, temp_table_name, columns,
                         csv_rows):

        copy = sql.SQL(
            'COPY {}.{} ({}) FROM STDIN WITH (FORMAT CSV, NULL {})').format(
                sql.Identifier(self.postgres_schema),
                sql.Identifier(temp_table_name),
                sql.SQL(', ').join(map(sql.Identifier, columns)),
                sql.Literal(RESERVED_NULL_DEFAULT))
        cur.copy_expert(copy, csv_rows)

        pattern = re.compile(SINGER_LEVEL.format('[0-9]+'))
        subkeys = list(
            filter(lambda header: re.match(pattern, header) is not None,
                   columns))

        update_sql = self.get_update_sql(remote_schema['name'],
                                         temp_table_name,
                                         remote_schema['key_properties'],
                                         subkeys)
        cur.execute(update_sql)
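
The subkeys filter that recurs in these functions works by formatting a character class into the level-column template. A small runnable illustration, again assuming the conventional SINGER_LEVEL value:

import re

SINGER_LEVEL = '_sdc_level_{}_id'  # assumed conventional value

# Formatting a character class into the template yields a regex:
pattern = re.compile(SINGER_LEVEL.format('[0-9]+'))  # _sdc_level_[0-9]+_id

columns = ['id', 'name', '_sdc_level_0_id', '_sdc_level_12_id']
subkeys = [c for c in columns if re.match(pattern, c)]
print(subkeys)  # ['_sdc_level_0_id', '_sdc_level_12_id']
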
Example #7
    def denest_records(self,
                       table_name,
                       records,
                       records_map,
                       key_properties,
                       pk_fks=None,
                       level=-1):
        row_index = 0
        for record in records:
            if pk_fks:
                record_pk_fks = pk_fks.copy()
                record_pk_fks[SINGER_LEVEL.format(level)] = row_index
                for key, value in record_pk_fks.items():
                    record[key] = value
                row_index += 1
            else:  ## top level
                record_pk_fks = {}
                for key in key_properties:
                    record_pk_fks[SINGER_SOURCE_PK_PREFIX + key] = record[key]
                if SINGER_SEQUENCE in record:
                    record_pk_fks[SINGER_SEQUENCE] = record[SINGER_SEQUENCE]
            self.denest_record(table_name, None, record, records_map,
                               key_properties, record_pk_fks, level)
Example #8
    def create_subtable(self, table_name, table_json_schema, key_prop_schemas,
                        subtables, level):
        if json_schema.is_object(table_json_schema['items']):
            new_properties = table_json_schema['items']['properties']
        else:
            new_properties = {'value': table_json_schema['items']}

        for pk, item_json_schema in key_prop_schemas.items():
            new_properties[SINGER_SOURCE_PK_PREFIX + pk] = item_json_schema

        new_properties[SINGER_SEQUENCE] = {'type': ['null', 'integer']}

        for i in range(0, level + 1):
            new_properties[SINGER_LEVEL.format(i)] = {'type': ['integer']}

        new_schema = {'type': ['object'], 'properties': new_properties}

        self.denest_schema(table_name,
                           new_schema,
                           key_prop_schemas,
                           subtables,
                           level=level)

        subtables[table_name] = new_schema
Example #9
    def persist_csv_rows(self, cur, remote_schema, temp_table_name, columns,
                         csv_rows):
        params = []

        if self.s3:
            bucket, key = self.s3.persist(csv_rows,
                                          key_prefix=temp_table_name +
                                          SEPARATOR)
            stage_location = "'s3://{bucket}/{key}' credentials=(AWS_KEY_ID=%s AWS_SECRET_KEY=%s)".format(
                bucket=bucket, key=key)
            params = [
                self.s3.credentials()['aws_access_key_id'],
                self.s3.credentials()['aws_secret_access_key']
            ]
        else:
            stage_location = '@{db}.{schema}.%{table}'.format(
                db=sql.identifier(self.connection.configured_database),
                schema=sql.identifier(self.connection.configured_schema),
                table=sql.identifier(temp_table_name))

            rel_path = '/tmp/target-snowflake/'
            file_name = str(uuid.uuid4()).replace('-', '_')

            # Make tmp folder to hold data file
            os.makedirs(rel_path, exist_ok=True)

            # Write readable csv_rows to file
            with open(rel_path + file_name, 'wb') as file:
                line = csv_rows.read()
                while line:
                    file.write(line.encode('utf-8'))
                    line = csv_rows.read()

            # Upload to internal table stage
            cur.execute('''
                PUT file://{rel_path}{file_name} {stage_location}
            '''.format(rel_path=rel_path,
                       file_name=file_name,
                       stage_location=stage_location))

            # Tidy up and remove tmp staging file
            os.remove(rel_path + file_name)

            stage_location += '/{}'.format(file_name)

        cur.execute('''
            COPY INTO {db}.{schema}.{table} ({cols})
            FROM {stage_location}
            FILE_FORMAT = (TYPE = CSV EMPTY_FIELD_AS_NULL = FALSE)
        '''.format(db=sql.identifier(self.connection.configured_database),
                   schema=sql.identifier(self.connection.configured_schema),
                   table=sql.identifier(temp_table_name),
                   cols=','.join([sql.identifier(x) for x in columns]),
                   stage_location=stage_location),
                    params=params)

        pattern = re.compile(SINGER_LEVEL.upper().format('[0-9]+'))
        subkeys = list(
            filter(lambda header: re.match(pattern, header) is not None,
                   columns))

        canonicalized_key_properties = [
            self.fetch_column_from_path((key_property, ), remote_schema)[0]
            for key_property in remote_schema['key_properties']
        ]

        self.perform_update(cur, remote_schema['name'], temp_table_name,
                            canonicalized_key_properties, columns, subkeys)
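
A note on the internal-stage branch: Snowflake addresses a table's built-in stage as @<db>.<schema>.%<table>, which is why the % appears in stage_location. A tiny illustration with placeholder names:

# Placeholder names; the @db.schema.%table stage syntax is Snowflake's
stage_location = '@MYDB.MYSCHEMA.%TMP_TABLE'
file_name = '3fa85f64_5717_4562_b3fc_2c963f66afa6'

# After PUT, COPY INTO reads the uploaded file back from the stage:
print(stage_location + '/{}'.format(file_name))
# @MYDB.MYSCHEMA.%TMP_TABLE/3fa85f64_5717_4562_b3fc_2c963f66afa6
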
Example #10
    def persist_rows(self, cur, target_table_name, temp_table_name,
                     target_table_json_schema, key_properties, records):
        target_schema = self.get_schema(cur, self.postgres_schema,
                                        target_table_name)

        headers = list(target_schema['schema']['properties'].keys())

        datetime_fields = [
            k for k, v in target_table_json_schema['properties'].items()
            if v.get('format') == 'date-time'
        ]

        default_fields = {
            k: v.get('default')
            for k, v in target_table_json_schema['properties'].items()
            if v.get('default') is not None
        }

        fields = set(
            headers +
            [v['from'] for k, v in target_schema.get('mappings', {}).items()])

        records_iter = iter(records)

        def transform():
            try:
                record = next(records_iter)
                row = {}

                for field in fields:
                    value = record.get(field, None)

                    ## Serialize fields which are not present but have default values set
                    if field in default_fields \
                            and value is None:
                        value = default_fields[field]

                    ## Serialize datetime to postgres compatible format
                    if field in datetime_fields \
                            and value is not None:
                        value = self.get_postgres_datetime(value)

                    ## Serialize NULL default value
                    if value == RESERVED_NULL_DEFAULT:
                        self.logger.warning(
                            'Reserved {} value found in field: {}. Value will be turned into literal null'
                            .format(RESERVED_NULL_DEFAULT, field))

                    if value is None:
                        value = RESERVED_NULL_DEFAULT

                    field_name = field

                    if field in target_table_json_schema['properties']:
                        field_name = self.get_mapping(target_schema,
                                                      field,
                                                      target_table_json_schema['properties'][field]) \
                                     or field

                    if field_name not in row \
                        or row[field_name] is None \
                        or row[field_name] == RESERVED_NULL_DEFAULT:

                        row[field_name] = value

                with io.StringIO() as out:
                    writer = csv.DictWriter(out, headers)
                    writer.writerow(row)
                    return out.getvalue()
            except StopIteration:
                return ''

        csv_rows = TransformStream(transform)

        copy = sql.SQL(
            'COPY {}.{} ({}) FROM STDIN WITH (FORMAT CSV, NULL {})').format(
                sql.Identifier(self.postgres_schema),
                sql.Identifier(temp_table_name),
                sql.SQL(', ').join(map(sql.Identifier, headers)),
                sql.Literal(RESERVED_NULL_DEFAULT))
        cur.copy_expert(copy, csv_rows)

        pattern = re.compile(SINGER_LEVEL.format('[0-9]+'))
        subkeys = list(
            filter(lambda header: re.match(pattern, header) is not None,
                   headers))

        update_sql = self.get_update_sql(target_table_name, temp_table_name,
                                         key_properties, subkeys)
        cur.execute(update_sql)
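
get_postgres_datetime is not shown in these snippets. A plausible sketch of such a helper using dateutil is below; the actual implementation may differ:

from dateutil import parser


def get_postgres_datetime(value):
    # Parse an ISO 8601 timestamp and re-serialize it in a form
    # Postgres' timestamp parser accepts unambiguously.
    return str(parser.parse(value))


print(get_postgres_datetime('2020-01-02T03:04:05.678Z'))
# 2020-01-02 03:04:05.678000+00:00
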