Example #1
    def generate_all_sequence_rows(
        self, 
        segment_unified_id,
        segment_conjoined_part,
        segment_num
    ):
        """
        a generator to return sequence data for a segment in order
        """
        open_value_files = dict()

        sequence_rows = _all_sequence_rows_for_segment(
            self._connection, 
            segment_unified_id, 
            segment_conjoined_part,
            segment_num
        )

        # first yield is count of sequences
        yield len(sequence_rows)

        for sequence_row in sequence_rows:
            if sequence_row.value_file_id not in open_value_files:
                # open each value file once, in binary mode, and reuse it
                open_value_files[sequence_row.value_file_id] = open(
                    compute_value_file_path(
                        self._repository_path, sequence_row.value_file_id
                    ),
                    "rb"
                )
            value_file = open_value_files[sequence_row.value_file_id]
            value_file.seek(sequence_row.value_file_offset)
            yield sequence_row, value_file.read(sequence_row.size)

        for value_file in open_value_files.values():            
            value_file.close()
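
The generator above yields the sequence count first, then one (sequence_row, data) pair per sequence. A minimal consumer sketch (the `reader` instance and the `read_segment` helper are hypothetical) might look like this:

def read_segment(reader, segment_unified_id, segment_conjoined_part,
                 segment_num):
    rows = reader.generate_all_sequence_rows(segment_unified_id,
                                              segment_conjoined_part,
                                              segment_num)
    sequence_count = next(rows)       # first yield is the number of sequences
    pieces = list()
    for _sequence_row, data in rows:  # remaining yields are (row, data) pairs
        pieces.append(data)
    assert len(pieces) == sequence_count
    return b"".join(pieces)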
Example #2
 def __init__(self,
              connection,
              space_id,
              repository_path,
              expected_size=None):
     self._log = logging.getLogger("OutputValueFile")
     self._connection = connection
     assert space_id is not None
     self._space_id = space_id
     self._value_file_id = _get_next_value_file_id(connection)
     self._value_file_path = compute_value_file_path(
         repository_path, space_id, self._value_file_id)
     self._expected_size = expected_size
     self._log.debug("opening {0} expected size = {1}".format(
         self._value_file_path, self._expected_size))
     value_file_dir = os.path.dirname(self._value_file_path)
     if not os.path.exists(value_file_dir):
         os.makedirs(value_file_dir)
     flags = os.O_WRONLY | os.O_CREAT
     self._value_file_fd = os.open(self._value_file_path, flags)
     self._creation_time = create_timestamp()
     self._size = 0
     self._md5 = hashlib.md5()
     self._segment_sequence_count = 0
     self._min_segment_id = None
     self._max_segment_id = None
     self._collection_ids = set()
Example #3
 def __init__(self, 
              connection, 
              space_id, 
              repository_path, 
              expected_size=None):
     self._log = logging.getLogger("OutputValueFile")
     self._connection = connection
     assert space_id is not None
     self._space_id = space_id
     self._value_file_id = _get_next_value_file_id(connection)
     self._value_file_path = compute_value_file_path(
         repository_path, space_id, self._value_file_id)
     self._expected_size = expected_size
     self._log.debug("opening {0} expected size = {1}".format(
         self._value_file_path, self._expected_size)) 
     value_file_dir = os.path.dirname(self._value_file_path)
     if not os.path.exists(value_file_dir):
         os.makedirs(value_file_dir)
     flags = os.O_WRONLY | os.O_CREAT
     self._value_file_fd = os.open(self._value_file_path, flags)
     self._creation_time = create_timestamp()
     self._size = 0
     self._md5 = hashlib.md5()
     self._segment_sequence_count = 0
     self._min_segment_id = None
     self._max_segment_id = None
     self._collection_ids = set()
Example #4
def unlink_unreachable_value_files(connection, repository_path):
    """
    clean out value files that are unreachable from the database
    """
    log = logging.getLogger("unlink_unreachable_value_files")

    # 1) build up an accumulator of value files seen in the file system.
    # the repository may contain any of the directories 000-999, where the
    # directory name is value_file.id mod 1000. in those directories, we're
    # looking for filenames that are 8-digit numbers representing
    # value_file.id
    filesystem_value_file_ids = set()
    for i in range(1000):
        dir_name = "{0:0>3}".format(i)
        dir_path = os.path.join(repository_path, dir_name)
        if os.path.isdir(dir_path):
            for name in os.listdir(dir_path):
                if len(name) != 8:
                    continue
                try:
                    value_file_id = int(name)
                except ValueError:
                    continue
                filesystem_value_file_ids.add(value_file_id)

    # 2) explicitly rollback the connection's current transaction
    connection.rollback()

    # 3) do the "select id from value_file" and store in a set
    database_value_file_ids = set()
    for (value_file_id, ) in connection.fetch_all_rows(
        "select id from nimbusio_node.value_file", []
    ):  
        database_value_file_ids.add(value_file_id)

    # 4) unlink things from #1 not present in #3.
    unreachable_value_file_ids = filesystem_value_file_ids \
                               - database_value_file_ids

    total_value_file_size = 0
    for value_file_id in list(unreachable_value_file_ids):
        value_file_path = compute_value_file_path(repository_path, 
                                                  value_file_id)
        value_file_size = os.path.getsize(value_file_path)
        log.info("unlinking unreachable value_file {0} size = {1}".format(
            value_file_path, value_file_size
        ))
        total_value_file_size += value_file_size
        try:
            os.unlink(value_file_path)
        except Exception:
            log.exception(value_file_path)

    log.info(
        "found {0:,} unreachable value files; savings={1:,}".format(
            len(unreachable_value_file_ids), total_value_file_size
        )
    )

    return total_value_file_size
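
Every example here calls compute_value_file_path, and the directory-layout comments above (directory name = value_file.id mod 1000, zero-padded to 3 digits; filename = the 8-digit zero-padded id) suggest what the two-argument form might look like. This is a minimal sketch under those assumptions, not the actual implementation; the three-argument form used in later examples (which also takes a space_id) is not covered here.

import os

def compute_value_file_path(repository_path, value_file_id):
    # sketch only: directory 000-999 is value_file_id mod 1000,
    # the filename is the value_file_id zero-padded to 8 digits
    dir_name = "{0:0>3}".format(value_file_id % 1000)
    file_name = "{0:0>8}".format(value_file_id)
    return os.path.join(repository_path, dir_name, file_name)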
Example #5
 def __init__(self, local_connection, repository_path, value_file_row): 
     self._log = logging.getLogger("InputValueFile")
     self._local_connection = local_connection
     self._repository_path = repository_path
     self._value_file_row = value_file_row
     self._value_file_path = compute_value_file_path(
         repository_path, value_file_row.id
     )
     self._value_file = open(self._value_file_path, "rb")
Example #6
def _remove_old_value_files(repository_path, value_file_ids):
    log = logging.getLogger("_remove_old_value_files")
    for value_file_id in value_file_ids:
        value_file_path = \
                compute_value_file_path(repository_path, value_file_id)
        try:
            os.unlink(value_file_path)
        except Exception:
            log.exception(value_file_path)
Example #7
 def __init__(self, local_connection, repository_path, value_file_row):
     self._log = logging.getLogger("InputValueFile")
     self._local_connection = local_connection
     self._repository_path = repository_path
     self._value_file_row = value_file_row
     self._value_file_path = \
             compute_value_file_path(repository_path,
                                     value_file_row.space_id,
                                     value_file_row.id)
     self._value_file = open(self._value_file_path, "rb")
Example #8
def _unlink_value_files(connection, repository_path, unused_value_files):
    log = logging.getLogger("_unlink_value_files")
    for value_file_id, space_id in unused_value_files:
        connection.execute(_delete_value_file_query, [value_file_id, ])
        value_file_path = compute_value_file_path(
            repository_path, space_id, value_file_id
        )
        try:
            os.unlink(value_file_path)
        except Exception:
            log.exception(value_file_path)
Example #9
def _remove_old_value_files(repository_path, value_file_keys):
    log = logging.getLogger("_remove_old_value_files")
    for value_file_id, space_id in value_file_keys:
        value_file_path = \
                compute_value_file_path(repository_path,
                                        space_id,
                                        value_file_id)
        try:
            os.unlink(value_file_path)
        except Exception:
            log.exception(value_file_path)
Example #10
def _unlink_value_files(connection, repository_path, unused_value_files):
    log = logging.getLogger("_unlink_value_files")
    for value_file_id, space_id in unused_value_files:
        connection.execute(_delete_value_file_query, [
            value_file_id,
        ])
        value_file_path = compute_value_file_path(repository_path, space_id,
                                                  value_file_id)
        try:
            os.unlink(value_file_path)
        except Exception:
            log.exception(value_file_path)
Example #11
def _verify_entry_against_value_file(entry):
    log = logging.getLogger("_verify_entry_against_vaue_file")
    value_file_path = compute_value_file_path(_repository_path, 
                                              entry.space_id,
                                              entry.value_file_id)
    md5_sum = hashlib.md5()
    try:
        with open(value_file_path, "rb") as input_file:
            input_file.seek(entry.value_file_offset)
            md5_sum.update(input_file.read(entry.sequence_size))
    except (OSError, IOError) as instance:
        log.error("Error seek/reading {0} {1}".format(value_file_path, 
                                                      instance))
        return False
    return md5_sum.digest() == bytes(entry.sequence_hash)
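
The check above relies on only a handful of fields from the entry row. A usage sketch (the Entry namedtuple and the literal values are hypothetical stand-ins for a real database row) makes those fields explicit:

from collections import namedtuple

# hypothetical row shape; only the fields referenced above are included
Entry = namedtuple("Entry", ["space_id",
                             "value_file_id",
                             "value_file_offset",
                             "sequence_size",
                             "sequence_hash"])

entry = Entry(space_id=1,
              value_file_id=42,
              value_file_offset=0,
              sequence_size=1024,
              sequence_hash=b"\x00" * 16)

if not _verify_entry_against_value_file(entry):
    print("damaged sequence in value_file {0}".format(entry.value_file_id))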
Example #12
 def __init__(self, connection, space_id, repository_path):
     self._space_id = space_id
     self._value_file_id = _insert_value_file_default_row(
         connection, space_id)
     self._log = logging.getLogger("VF%08d" % (self._value_file_id, ))
     self._connection = connection
     self._value_file_path = compute_value_file_path(
         repository_path, space_id, self._value_file_id)
     self._log.info("opening %s" % (self._value_file_path, ))
     self._value_file_fd = _open_value_file(self._value_file_path)
     self._creation_time = create_timestamp()
     self._size = 0
     self._md5 = hashlib.md5()
     self._segment_sequence_count = 0
     self._min_segment_id = None
     self._max_segment_id = None
     self._collection_ids = set()
     self._synced = True  # treat as synced until we write
Example #13
 def __init__(self, connection, space_id, repository_path):
     self._space_id = space_id
     self._value_file_id =  _insert_value_file_default_row(connection,
                                                           space_id)
     self._log = logging.getLogger("VF%08d" % (self._value_file_id, ))
     self._connection = connection
     self._value_file_path = compute_value_file_path(
          repository_path, space_id, self._value_file_id
     )
     self._log.info("opening %s" % (self._value_file_path, ))
     self._value_file_fd = _open_value_file(self._value_file_path)
     self._creation_time = create_timestamp()
     self._size = 0
     self._md5 = hashlib.md5()
     self._segment_sequence_count = 0
     self._min_segment_id = None
     self._max_segment_id = None
     self._collection_ids = set()
     self._synced = True # treat as synced until we write
Example #14
def _process_request(resources):
    """
    Wait for a reply to our last message from the controller.
    """
    log = logging.getLogger("_process_one_transaction")
    log.debug("waiting work request")
    try:
        request = resources.dealer_socket.recv_pyobj()
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error):
            raise InterruptedSystemCall()
        raise

    assert resources.dealer_socket.rcvmore
    control = resources.dealer_socket.recv_pyobj()

    log.debug("user_request_id = {0}; control = {1}".format(
              request["user-request-id"], control))

    assert resources.dealer_socket.rcvmore
    sequence_row = resources.dealer_socket.recv_pyobj()

    value_file_path = compute_value_file_path(_repository_path, 
                                              sequence_row["space_id"], 
                                              sequence_row["value_file_id"]) 

    control["result"] = "success"
    control["error-message"] = ""

    if value_file_path in resources.file_cache:
        value_file, _ = resources.file_cache[value_file_path]
        del resources.file_cache[value_file_path]
    else:
        try:
            value_file = open(value_file_path, "rb")
        except Exception as instance:
            log.exception("user_request_id = {0}, " \
                          "read {1}".format(request["user-request-id"],
                                            value_file_path))
            resources.event_push_client.exception("error_opening_value_file", 
                                                  str(instance))
            control["result"] = "error_opening_value_file"
            control["error-message"] = str(instance)
            
        if control["result"] != "success":
            _send_error_reply(resources, request, control)
            return

    read_offset = \
        sequence_row["value_file_offset"] + \
        (control["left-offset"] * encoded_block_slice_size)


    read_size = \
        sequence_row["size"] - \
        (control["left-offset"] * encoded_block_slice_size) - \
        (control["right-offset"] * encoded_block_slice_size) 

    # Ticket #84 handle a short block
    # the last block in the file may be smaller than encoded_block_slice_size
    # so we might have subtracted too much for the right offset
    if control["right-offset"] > 0:
        block_modulus = sequence_row["size"] % encoded_block_slice_size
        last_block_size = (encoded_block_slice_size if block_modulus == 0 else \
                           block_modulus)
        last_block_delta = encoded_block_slice_size - last_block_size
        read_size += last_block_delta 

    try:
        value_file.seek(read_offset)
        encoded_data = value_file.read(read_size)
    except Exception as instance:
        log.exception("user_request_id = {0}, " \
                      "read {1}".format(request["user-request-id"],
                                        value_file_path))
        resources.event_push_client.exception("error_reading_value_file", 
                                              str(instance))
        control["result"] = "error_reading_vqalue_file"
        control["error-message"] = str(instance)

    if control["result"] != "success":
        value_file.close()
        _send_error_reply(resources, request, control)
        return

    resources.file_cache[value_file_path] = value_file, time.time()

    if len(encoded_data) != read_size:
        error_message = "{0} size mismatch {1} {2}".format(
            request["retrieve-id"],
            len(encoded_data),
            read_size)
        log.error("user_request_id = {0}, {1}".format(request["user-request-id"],
                                                      error_message))
        resources.event_push_client.error("size_mismatch", error_message)
        control["result"] = "size_mismatch"
        control["error-message"] = error_message

    if control["result"] != "success":
        _send_error_reply(resources, request, control)
        return

    encoded_block_list = list(encoded_block_generator(encoded_data))

    segment_size = 0
    segment_adler32 = 0
    segment_md5 = hashlib.md5()
    for encoded_block in encoded_block_list:
        segment_size += len(encoded_block)
        segment_adler32 = zlib.adler32(encoded_block, segment_adler32) 
        segment_md5.update(encoded_block)
    segment_md5_digest = segment_md5.digest()

    reply = {
        "message-type"          : "retrieve-key-reply",
        "user-request-id"       : request["user-request-id"],
        "client-tag"            : request["client-tag"],
        "message-id"            : request["message-id"],
        "retrieve-id"           : request["retrieve-id"],
        "segment-unified-id"    : request["segment-unified-id"],
        "segment-num"           : request["segment-num"],
        "segment-size"          : segment_size,
        "zfec-padding-size"     : sequence_row["zfec_padding_size"],
        "segment-adler32"       : segment_adler32,
        "segment-md5-digest"    : b64encode(segment_md5_digest).decode("utf-8"),
        "sequence-num"          : None,
        "completed"             : control["completed"],
        "result"                : "success",
        "error-message"         : "",
    }

    push_socket = _get_reply_push_socket(resources, request["client-address"])
    push_socket.send_json(reply, zmq.SNDMORE)
    for encoded_block in encoded_block_list[:-1]:
        push_socket.send(encoded_block, zmq.SNDMORE)
    push_socket.send(encoded_block_list[-1])
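
The read_offset / read_size arithmetic above, including the Ticket #84 short-last-block correction, is easier to follow with concrete numbers. Here is a self-contained sketch; the slice size, offsets, and sequence size are made up for illustration:

# illustrative numbers only
encoded_block_slice_size = 10 * 1024                  # assumed slice size
value_file_offset = 500000                            # where the sequence starts
sequence_size = 3 * encoded_block_slice_size + 1000   # last block is short
left_offset, right_offset = 1, 1                      # blocks trimmed from each end

read_offset = value_file_offset + left_offset * encoded_block_slice_size
read_size = (sequence_size
             - left_offset * encoded_block_slice_size
             - right_offset * encoded_block_slice_size)

# Ticket #84: the last block may be shorter than encoded_block_slice_size,
# so the right-offset subtraction above may have removed too much
if right_offset > 0:
    block_modulus = sequence_size % encoded_block_slice_size
    last_block_size = (encoded_block_slice_size if block_modulus == 0
                       else block_modulus)
    read_size += encoded_block_slice_size - last_block_size

# with these numbers the blocks are [10240, 10240, 10240, 1000];
# trimming one block from each end leaves blocks 1..2 = 20480 bytes
assert read_offset == 500000 + 10240
assert read_size == 20480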
Example #15
def _value_file_status(connection, entry):
    log = logging.getLogger("_value_file_status")
    batch_key = make_batch_key(entry)

    value_file_path = compute_value_file_path(_repository_path, 
                                              entry.space_id, 
                                              entry.value_file_id)
    # Always do a stat on the value file. 
    try:
        stat_result = os.stat(value_file_path)
    except OSError as instance:
        # If the value file is missing, consider all of the segment_sequences 
        # to be missing, and handle it as such.
        if instance.errno == errno.ENOENT:
            log.error("value file missing {0} {1}".format(batch_key,
                                                          value_file_path))
            return _value_file_missing
        log.error("Error stat'ing value file {0} {1} {2}".format(
            str(instance), batch_key, value_file_path))
        raise

    # If the value file is still open, consider all data in it undamaged.
    if entry.value_file_close_time is None:
        return _value_file_valid

    # If the value file exists, is closed, has an md5 in the database,
    # has a size in the database, the stat size matches the size in the
    # database, and has a close_time or a last_integrity_check_time that is
    # younger than MAX_TIME_BETWEEN_VALUE_FILE_INTEGRITY_CHECK, consider
    # all records in the file undamaged. (This is the common case.)
    if entry.value_file_hash is None:
        log.info("Value file row has no md5 hash {0} {1}".format(batch_key,
                                                                 entry))
        return _value_file_questionable

    if entry.value_file_size is None:
        log.info("Value file row has no size {0} {1}".format(batch_key,
                                                             entry))
        return _value_file_questionable

    if entry.value_file_size != stat_result.st_size:
        log.info("Value file row size {0} != stat size {1} {2}".format(
            entry.value_file_size, stat_result.st_size, batch_key))
        return _value_file_questionable
    
    current_time = create_timestamp()
    value_file_row_age = current_time - entry.value_file_close_time
    if entry.value_file_last_integrity_check_time is not None:
        value_file_row_age = \
                current_time - entry.value_file_last_integrity_check_time

    if value_file_row_age < _max_value_file_time:
        return _value_file_valid

    value_file_result = _value_file_valid

    # If the value matches all the previous criteria EXCEPT the 
    # MAX_TIME_BETWEEN_VALUE_FILE_INTEGRITY_CHECK, then read the whole file, 
    # and calculate the md5. If it matches, consider the whole file good as 
    # above. Update last_integrity_check_time regardless.

    md5_sum = hashlib.md5()
    try:
        with open(value_file_path, "rb") as input_file:
            while True:
                data = input_file.read(_read_buffer_size)
                if len(data) == 0:
                    break
                md5_sum.update(data)
    except (OSError, IOError) as instance:
        log.error("Error reading {0} {1}".format(value_file_path, 
                                                 instance))
        value_file_result =  _value_file_questionable

    if value_file_result == _value_file_valid and \
       md5_sum.digest() != bytes(entry.value_file_hash):
        log.error(
            "md5 mismatch {0} {1} {2} {3}".format(md5_sum.digest(),
                                                  bytes(entry.value_file_hash),
                                                  batch_key,
                                                  value_file_path))
        value_file_result =  _value_file_questionable


    # we're only supposed to do this after we've also read the file
    # and inserted any damage. not before. otherwise it's a race condition --
    # we may crash before finishing checking the file, and then the file
    # doesn't get checked, but it's marked as checked.

    _update_value_file_last_integrity_check_time(connection,
                                                 entry.value_file_id,
                                                 create_timestamp())

    return value_file_result
Example #16
def rewrite_value_files(options, connection, repository_path, ref_generator):
    log = logging.getLogger("_rewrite_value_files")
    max_sort_mem = options.max_sort_mem * 1024 ** 3

    total_batch_size = 0
    total_output_size = 0
    savings = 0

    batch_size = 0
    refs = list()
    value_file_data = dict()

    while True:

        try:
            ref = next(ref_generator)
        except StopIteration:
            break

        # this should be the start of a partition
        assert ref.value_row_num == 1, ref

        if batch_size + ref.value_file_size > max_sort_mem:
            connection.begin_transaction()
            try:
                output_size = _process_batch(connection, 
                                             repository_path, 
                                             refs, 
                                             value_file_data)
            except Exception:
                connection.rollback()
                raise
            connection.commit()
            _remove_old_value_files(repository_path, value_file_data.keys())

            total_batch_size += batch_size
            total_output_size += output_size
            savings = batch_size - output_size
            log.debug(
                "batch_size={0:,}, output_size={1:,}, savings={2:,}".format(
                    batch_size, output_size, savings
            ))

            batch_size = 0
            refs = list()
            value_file_data = dict()
            
        batch_size += ref.value_file_size

        # get the value file data
        # TODO: we should only store the actual references from the files in
        # memory, not keep the whole files in memory.  Keeping the whole file
        # in memory means we're using memory for parts of the files that are
        # garbage, effectively decreasing the size of our output sort batch.
        # We could end up with very small outputs from each batch if a large
        # portion of the input value files are garbage.

        value_file_key = (ref.value_file_id, ref.space_id, )
        assert value_file_key not in value_file_data
        value_file_path = \
                compute_value_file_path(repository_path, 
                                        ref.space_id, 
                                        ref.value_file_id)
        with open(value_file_path, "rb") as input_file:
            value_file_data[value_file_key] = input_file.read()

        # load up the refs for this partition
        refs.append(ref)
        for _ in range(ref.value_row_count-1):
            refs.append(next(ref_generator)) 

    if len(refs) > 0:
        connection.begin_transaction()
        try:
            output_size = _process_batch(connection, 
                                         repository_path, 
                                         refs, 
                                         value_file_data)
        except Exception:
            connection.rollback()
            raise

        connection.commit()
        _remove_old_value_files(repository_path, value_file_data.keys())

        total_batch_size += batch_size
        total_output_size += output_size
        savings = batch_size - output_size
        log.debug("batch_size={0:,}, output_size={1:,}, savings={2:,}".format(
            batch_size, output_size, savings
        ))

    savings = total_batch_size - total_output_size
    log.info(
        "total_batch_size={0:,} total_output_size={1:,} savings={2:,}".format(
            total_batch_size, total_output_size, savings
    ))

    return savings
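
The loop above implies a small protocol for ref_generator: refs arrive grouped by value file, the first ref of each group carries value_row_num == 1, and its value_row_count gives the group size. A minimal sketch of reading one such group (the helper name is hypothetical):

def read_one_partition(ref_generator):
    # the first ref of a partition has value_row_num == 1;
    # value_row_count tells us how many refs the partition contains
    first_ref = next(ref_generator)  # raises StopIteration when exhausted
    assert first_ref.value_row_num == 1, first_ref
    partition = [first_ref]
    for _ in range(first_ref.value_row_count - 1):
        partition.append(next(ref_generator))
    return partition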
Example #17
def unlink_unreachable_value_files(connection, repository_path):
    """
    clean out value files that are unreachable from the database
    """
    log = logging.getLogger("unlink_unreachable_value_files")

    # 1) build up an accumulator of value files seen in the file system.
    # the repository may contain any of the directories 000-999, where the
    # directory name is value_file.id mod 1000. in those directories, we're
    # looking for filenames that are 8-digit numbers representing
    # value_file.id
    filesystem_value_file_ids = set()
    for i in range(1000):
        dir_name = "{0:0>3}".format(i)
        dir_path = os.path.join(repository_path, dir_name)
        if os.path.isdir(dir_path):
            for name in os.listdir(dir_path):
                if len(name) != 8:
                    continue
                try:
                    value_file_id = int(name)
                except ValueError:
                    continue
                filesystem_value_file_ids.add(value_file_id)

    # 2) explicitly rollback the connection's current transaction
    # not needed because we are in auto-commit mode.
    # connection.rollback()
    assert not connection._in_transaction

    # 3) do the "select id from value_file" and store in a set
    database_value_file_ids = dict()
    for (
            value_file_id,
            space_id,
    ) in connection.fetch_all_rows(
            "select id, space_id from nimbusio_node.value_file", []):
        database_value_file_ids[value_file_id] = space_id

    # 4) unlink things from #1 not present in #3.
    unreachable_value_file_ids = filesystem_value_file_ids \
                               - set(database_value_file_ids)

    total_value_file_size = 0
    for value_file_id in list(unreachable_value_file_ids):
        value_file_path = compute_value_file_path(
            repository_path, database_value_file_ids[value_file_id],
            value_file_id)
        value_file_size = os.path.getsize(value_file_path)
        log.info("unlinking unreachable value_file {0} size = {1}".format(
            value_file_path, value_file_size))
        total_value_file_size += value_file_size
        try:
            os.unlink(value_file_path)
        except Exception:
            log.exception(value_file_path)

    log.info("found {0:,} unreachable value files; savings={1:,}".format(
        len(unreachable_value_file_ids), total_value_file_size))

    return total_value_file_size
Example #18
def _process_request(resources):
    """
    Wait for a reply to our last message from the controller.
    """
    log = logging.getLogger("_process_one_transaction")
    log.debug("waiting work request")
    try:
        request = resources.dealer_socket.recv_pyobj()
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error):
            raise InterruptedSystemCall()
        raise

    assert resources.dealer_socket.rcvmore
    control = resources.dealer_socket.recv_pyobj()

    log.debug("user_request_id = {0}; control = {1}".format(
        request["user-request-id"], control))

    assert resources.dealer_socket.rcvmore
    sequence_row = resources.dealer_socket.recv_pyobj()

    value_file_path = compute_value_file_path(_repository_path,
                                              sequence_row["space_id"],
                                              sequence_row["value_file_id"])

    control["result"] = "success"
    control["error-message"] = ""

    if value_file_path in resources.file_cache:
        value_file, _ = resources.file_cache[value_file_path]
        del resources.file_cache[value_file_path]
    else:
        try:
            value_file = open(value_file_path, "rb")
        except Exception as instance:
            log.exception("user_request_id = {0}, " \
                          "read {1}".format(request["user-request-id"],
                                            value_file_path))
            resources.event_push_client.exception("error_opening_value_file",
                                                  str(instance))
            control["result"] = "error_opening_value_file"
            control["error-message"] = str(instance)

        if control["result"] != "success":
            _send_error_reply(resources, request, control)
            return

    read_offset = \
        sequence_row["value_file_offset"] + \
        (control["left-offset"] * encoded_block_slice_size)


    read_size = \
        sequence_row["size"] - \
        (control["left-offset"] * encoded_block_slice_size) - \
        (control["right-offset"] * encoded_block_slice_size)

    # Ticket #84 handle a short block
    # the last block in the file may be smaller than encoded_block_slice_size
    # so we might have subtracted too much for the right offset
    if control["right-offset"] > 0:
        block_modulus = sequence_row["size"] % encoded_block_slice_size
        last_block_size = (encoded_block_slice_size if block_modulus == 0 else \
                           block_modulus)
        last_block_delta = encoded_block_slice_size - last_block_size
        read_size += last_block_delta

    try:
        value_file.seek(read_offset)
        encoded_data = value_file.read(read_size)
    except Exception as instance:
        log.exception("user_request_id = {0}, " \
                      "read {1}".format(request["user-request-id"],
                                        value_file_path))
        resources.event_push_client.exception("error_reading_value_file",
                                              str(instance))
        control["result"] = "error_reading_vqalue_file"
        control["error-message"] = str(instance)

    if control["result"] != "success":
        value_file.close()
        _send_error_reply(resources, request, control)
        return

    resources.file_cache[value_file_path] = value_file, time.time()

    if len(encoded_data) != read_size:
        error_message = "{0} size mismatch {1} {2}".format(
            request["retrieve-id"], len(encoded_data), read_size)
        log.error("user_request_id = {0}, {1}".format(
            request["user-request-id"], error_message))
        resources.event_push_client.error("size_mismatch", error_message)
        control["result"] = "size_mismatch"
        control["error-message"] = error_message

    if control["result"] != "success":
        _send_error_reply(resources, request, control)
        return

    encoded_block_list = list(encoded_block_generator(encoded_data))

    segment_size = 0
    segment_adler32 = 0
    segment_md5 = hashlib.md5()
    for encoded_block in encoded_block_list:
        segment_size += len(encoded_block)
        segment_adler32 = zlib.adler32(encoded_block, segment_adler32)
        segment_md5.update(encoded_block)
    segment_md5_digest = segment_md5.digest()

    reply = {
        "message-type": "retrieve-key-reply",
        "user-request-id": request["user-request-id"],
        "client-tag": request["client-tag"],
        "message-id": request["message-id"],
        "retrieve-id": request["retrieve-id"],
        "segment-unified-id": request["segment-unified-id"],
        "segment-num": request["segment-num"],
        "segment-size": segment_size,
        "zfec-padding-size": sequence_row["zfec_padding_size"],
        "segment-adler32": segment_adler32,
        "segment-md5-digest": b64encode(segment_md5_digest).decode("utf-8"),
        "sequence-num": None,
        "completed": control["completed"],
        "result": "success",
        "error-message": "",
    }

    push_socket = _get_reply_push_socket(resources, request["client-address"])
    push_socket.send_json(reply, zmq.SNDMORE)
    for encoded_block in encoded_block_list[:-1]:
        push_socket.send(encoded_block, zmq.SNDMORE)
    push_socket.send(encoded_block_list[-1])
Example #19
def rewrite_value_files(options, connection, repository_path, ref_generator):
    log = logging.getLogger("_rewrite_value_files")
    max_sort_mem = options.max_sort_mem * 1024**3

    total_batch_size = 0
    total_output_size = 0
    savings = 0

    batch_size = 0
    refs = list()
    value_file_data = dict()

    while True:

        try:
            ref = next(ref_generator)
        except StopIteration:
            break

        # this should be the start of a partition
        assert ref.value_row_num == 1, ref

        if batch_size + ref.value_file_size > max_sort_mem:
            connection.begin_transaction()
            try:
                output_size = _process_batch(connection, repository_path, refs,
                                             value_file_data)
            except Exception:
                connection.rollback()
                raise
            connection.commit()
            _remove_old_value_files(repository_path, value_file_data.keys())

            total_batch_size += batch_size
            total_output_size += output_size
            savings = batch_size - output_size
            log.debug(
                "batch_size={0:,}, output_size={1:,}, savings={2:,}".format(
                    batch_size, output_size, savings))

            batch_size = 0
            refs = list()
            value_file_data = dict()

        batch_size += ref.value_file_size

        # get the value file data
        # TODO: we should only store the actual references from the files in
        # memory, not keep the whole files in memory.  Keeping the whole file
        # in memory means we're using memory for parts of the files that are
        # garbage, effectively decreasing the size of our output sort batch.
        # We could end up with very small outputs from each batch if a large
        # portion of the input value files are garbage.

        value_file_key = (
            ref.value_file_id,
            ref.space_id,
        )
        assert value_file_key not in value_file_data
        value_file_path = \
                compute_value_file_path(repository_path,
                                        ref.space_id,
                                        ref.value_file_id)
        with open(value_file_path, "rb") as input_file:
            value_file_data[value_file_key] = input_file.read()

        # load up the refs for this partition
        refs.append(ref)
        for _ in range(ref.value_row_count - 1):
            refs.append(next(ref_generator))

    if len(refs) > 0:
        connection.begin_transaction()
        try:
            output_size = _process_batch(connection, repository_path, refs,
                                         value_file_data)
        except Exception:
            connection.rollback()
            raise

        connection.commit()
        _remove_old_value_files(repository_path, value_file_data.keys())

        total_batch_size += batch_size
        total_output_size += output_size
        savings = batch_size - output_size
        log.debug("batch_size={0:,}, output_size={1:,}, savings={2:,}".format(
            batch_size, output_size, savings))

    savings = total_batch_size - total_output_size
    log.info(
        "total_batch_size={0:,} total_output_size={1:,} savings={2:,}".format(
            total_batch_size, total_output_size, savings))

    return savings