Example #1
def flush_records(stream: str,
                  records: List[Dict],
                  db_sync: DbSync,
                  temp_dir: str = None,
                  no_compression: bool = False) -> None:
    """
    Takes a list of record messages and loads them into the Snowflake target table

    Args:
        stream: Name of the stream
        records: List of dictionaries, each representing one CSV line. Dict keys are the column names,
                 values are the column values
        db_sync: A DbSync object
        temp_dir: Directory where intermediate temporary files will be created. (Default: OS specific temp directory)
        no_compression: If True, do not compress the generated files. (Default: False)

    Returns:
        None
    """
    # Generate file on disk in the required format
    filepath = db_sync.file_format.formatter.records_to_file(
        records,
        db_sync.flatten_schema,
        compression=not no_compression,
        dest_dir=temp_dir,
        data_flattening_max_level=db_sync.data_flattening_max_level)

    # Get file stats
    row_count = len(records)
    size_bytes = os.path.getsize(filepath)

    # Upload to s3 and load into Snowflake
    s3_key = db_sync.put_to_stage(filepath,
                                  stream,
                                  row_count,
                                  temp_dir=temp_dir)
    db_sync.load_file(s3_key, row_count, size_bytes)

    # Delete file from local disk and from s3
    os.remove(filepath)
    db_sync.delete_from_stage(stream, s3_key)
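
# A minimal usage sketch (hypothetical values; db_sync is assumed to be an
# already configured DbSync instance for this stream):
#
#   records = [
#       {'id': 1, 'name': 'alice'},
#       {'id': 2, 'name': 'bob'},
#   ]
#   flush_records('my_schema-my_table', records, db_sync, temp_dir='/tmp')
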
def flush_records(stream: str,
                  records: List[Dict],
                  db_sync: DbSync,
                  temp_dir: str = None,
                  no_compression: bool = False,
                  archive_load_files: Dict = None) -> None:
    """
    Takes a list of record messages and loads them into the Snowflake target table

    Args:
        stream: Name of the stream
        records: List of dictionaries, each representing one CSV line. Dict keys are the column names,
                 values are the column values
        db_sync: A DbSync object
        temp_dir: Directory where intermediate temporary files will be created. (Default: OS specific temp directory)
        no_compression: If True, do not compress the generated files. (Default: False)
        archive_load_files: Dict with data needed for archiving load files; None disables archiving. (Default: None)

    Returns:
        None
    """
    # Generate file on disk in the required format
    filepath = db_sync.file_format.formatter.records_to_file(
        records,
        db_sync.flatten_schema,
        compression=not no_compression,
        dest_dir=temp_dir,
        data_flattening_max_level=db_sync.data_flattening_max_level)

    # Get file stats
    row_count = len(records)
    size_bytes = os.path.getsize(filepath)

    # Upload to s3 and load into Snowflake
    s3_key = db_sync.put_to_stage(filepath, stream, row_count, temp_dir=temp_dir)
    db_sync.load_file(s3_key, row_count, size_bytes)

    # Delete file from local disk
    os.remove(filepath)

    if archive_load_files:
        stream_name_parts = stream_utils.stream_name_to_dict(stream)
        if 'schema_name' not in stream_name_parts or 'table_name' not in stream_name_parts:
            raise Exception("Failed to extract schema and table names from stream '{}'".format(stream))

        archive_schema = stream_name_parts['schema_name']
        archive_table = stream_name_parts['table_name']
        archive_tap = archive_load_files['tap']

        archive_metadata = {
            'tap': archive_tap,
            'schema': archive_schema,
            'table': archive_table,
            'archived-by': 'pipelinewise_target_snowflake'
        }

        if 'column' in archive_load_files:
            archive_metadata.update({
                'incremental-key': archive_load_files['column'],
                'incremental-key-min': str(archive_load_files['min']),
                'incremental-key-max': str(archive_load_files['max'])
            })

        # Use same file name as in import
        archive_file = os.path.basename(s3_key)
        archive_key = "{}/{}/{}".format(archive_tap, archive_table, archive_file)

        db_sync.copy_to_archive(s3_key, archive_key, archive_metadata)

    # Delete file from S3
    db_sync.delete_from_stage(stream, s3_key)
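
# A minimal sketch of the archive_load_files argument (keys inferred from the
# code above; the concrete values are hypothetical):
#
#   archive_load_files = {
#       'tap': 'tap_mysql_mydb',
#       'column': 'updated_at',              # optional incremental key
#       'min': '2021-01-01T00:00:00',
#       'max': '2021-01-31T23:59:59',
#   }
#   flush_records('my_schema-my_table', records, db_sync,
#                 archive_load_files=archive_load_files)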