def _allocate_output_value_files(connection, repository_path, refs):
    """
    Allocate OutputValueFiles for a batch of references, grouped by
    collection_id.

    Expected sizes are accumulated per collection; a new value file is
    started whenever adding the next reference would push the current
    file past _max_value_file_size.

    Returns a dict mapping collection_id -> list of OutputValueFile,
    one file per computed size bucket.
    """
    file_space_info = load_file_space_info(connection)
    file_space_sanity_check(file_space_info, repository_path)
    space_id = find_least_volume_space_id("storage", file_space_info)

    # accumulate expected file sizes per collection; start a new bucket
    # whenever the current one would overflow the max value file size
    expected_sizes = defaultdict(list)
    for ref in refs:
        buckets = expected_sizes[ref.collection_id]
        if not buckets:
            buckets.append(0)
        if buckets[-1] + ref.data_size > _max_value_file_size:
            buckets.append(0)
        buckets[-1] += ref.data_size

    # open one output value file per computed bucket
    output_value_files = defaultdict(list)
    for collection_id, sizes in expected_sizes.items():
        for expected_size in sizes:
            output_value_files[collection_id].append(
                OutputValueFile(connection,
                                space_id,
                                repository_path,
                                expected_size=expected_size))

    return output_value_files
def store_sequence(
    self,
    collection_id,
    key,
    unified_id,
    timestamp_repr,
    conjoined_part,
    segment_num,
    segment_size,
    zfec_padding_size,
    segment_md5_digest,
    segment_adler32,
    sequence_num,
    data,
    user_request_id
):
    """
    store one piece (sequence) of segment data

    Writes the sequence bytes to the current journal value file (rolling
    over to a new one if the write would exceed _max_value_file_size)
    and records a segment_sequence row pointing at the written offset.
    """
    segment_key = (unified_id, conjoined_part, segment_num, )
    self._log.info("request {0}: " \
        "store_sequence {1} {2} {3} {4} {5}: {6} ({7})".format(
            user_request_id,
            collection_id,
            key,
            unified_id,
            timestamp_repr,
            segment_num,
            sequence_num,
            segment_size))
    segment_entry = self._active_segments[segment_key]

    # roll over to a fresh value file when this write would push the
    # current one past the maximum size
    if self._value_file.size + segment_size > _max_value_file_size:
        self._value_file.close()
        space_id = find_least_volume_space_id("journal",
                                              self._file_space_info)
        self._value_file = OutputValueFile(self._connection,
                                           space_id,
                                           self._repository_path)

    # build the row first: value_file_offset must be the file size
    # *before* the data is written
    sequence_row = segment_sequence_template(
        collection_id=collection_id,
        segment_id=segment_entry["segment-id"],
        zfec_padding_size=zfec_padding_size,
        value_file_id=self._value_file.value_file_id,
        sequence_num=sequence_num,
        value_file_offset=self._value_file.size,
        size=segment_size,
        hash=psycopg2.Binary(segment_md5_digest),
        adler32=segment_adler32,
    )

    self._value_file.write_data_for_one_sequence(
        collection_id, segment_entry["segment-id"], data)

    _insert_segment_sequence_row(self._connection, sequence_row)
def __init__(self, connection, file_space_info, repository_path,
             active_segments, completions):
    """
    Set up the writer's database connection, file-space bookkeeping,
    and shared segment/completion state, then open the initial
    journal value file.
    """
    self._log = logging.getLogger("Writer")
    self._connection = connection
    self._file_space_info = file_space_info
    self._repository_path = repository_path
    self._active_segments = active_segments
    self._completions = completions

    # open a new value file at startup, on the journal space with the
    # most available volume
    space_id = find_least_volume_space_id("journal",
                                          self._file_space_info)
    self._value_file = OutputValueFile(self._connection,
                                       space_id,
                                       self._repository_path)
def _generate_work(connection, file_space_info, value_file_rows):
    """
    Yield (reference, output_value_file) pairs for rewriting value files.

    Iterates the segment-sequence references belonging to
    value_file_rows, opening and rotating OutputValueFiles so that:
     * each handoff node gets at least one distinct value file
     * each collection_id gets at least one distinct value file
     * no output value file grows beyond _max_value_file_size
    """
    log = logging.getLogger("_generate_work")
    prev_handoff_node_id = None
    prev_collection_id = None
    output_value_file = None

    for reference in _query_value_file_references(
        connection, [row.id for row in value_file_rows]
    ):
        if reference.handoff_node_id is not None:
            # at least one distinct value file per handoff node
            if reference.handoff_node_id != prev_handoff_node_id:
                if prev_handoff_node_id is not None:
                    log.debug(
                        "closing output value file handoff node {0}".format(
                            prev_handoff_node_id
                        )
                    )
                    assert output_value_file is not None
                    output_value_file.close()
                    output_value_file = None
                log.debug(
                    "opening value file for handoff node {0}".format(
                        reference.handoff_node_id
                    )
                )
                assert output_value_file is None
                space_id = find_least_volume_space_id("storage",
                                                      file_space_info)
                output_value_file = OutputValueFile(connection,
                                                    space_id,
                                                    _repository_path)
                prev_handoff_node_id = reference.handoff_node_id
        elif reference.collection_id != prev_collection_id:
            if prev_handoff_node_id is not None:
                log.debug(
                    "closing value file for handoff node {0}".format(
                        prev_handoff_node_id
                    )
                )
                assert output_value_file is not None
                output_value_file.close()
                output_value_file = None
                prev_handoff_node_id = None
            # at least one distinct value file per collection_id
            if prev_collection_id is not None:
                log.debug(
                    "closing value file for collection {0}".format(
                        prev_collection_id
                    )
                )
                assert output_value_file is not None
                output_value_file.close()
                output_value_file = None
            log.debug(
                "opening value file for collection {0}".format(
                    reference.collection_id
                )
            )
            assert output_value_file is None
            space_id = find_least_volume_space_id("storage", file_space_info)
            output_value_file = OutputValueFile(
                connection, space_id, _repository_path
            )
            prev_collection_id = reference.collection_id

        assert output_value_file is not None

        # if this write would put us over the max size,
        # start a new output value file
        expected_size = output_value_file.size + reference.sequence_size
        if expected_size > _max_value_file_size:
            log.debug("closing value_file and opening new one due to size")
            output_value_file.close()
            space_id = find_least_volume_space_id("storage", file_space_info)
            output_value_file = OutputValueFile(
                connection, space_id, _repository_path
            )

        yield reference, output_value_file

    if prev_handoff_node_id is not None:
        log.debug(
            "closing final value file for handoff node {0}".format(
                prev_handoff_node_id
            )
        )
    if prev_collection_id is not None:
        log.debug(
            "closing final value file for collection {0}".format(
                prev_collection_id
            )
        )
    # bug fix: the original called output_value_file.close()
    # unconditionally, which raises AttributeError on None when
    # _query_value_file_references yields no references at all
    if output_value_file is not None:
        output_value_file.close()
def _generate_work(connection, file_space_info, value_file_rows):
    """
    Yield (reference, output_value_file) pairs for rewriting value files.

    Iterates the segment-sequence references belonging to
    value_file_rows, opening and rotating OutputValueFiles so that:
     * each handoff node gets at least one distinct value file
     * each collection_id gets at least one distinct value file
     * no output value file grows beyond _max_value_file_size
    """
    log = logging.getLogger("_generate_work")
    prev_handoff_node_id = None
    prev_collection_id = None
    output_value_file = None

    for reference in _query_value_file_references(
        connection, [row.id for row in value_file_rows]):
        if reference.handoff_node_id is not None:
            # at least one distinct value file per handoff node
            if reference.handoff_node_id != prev_handoff_node_id:
                if prev_handoff_node_id is not None:
                    log.debug(
                        "closing output value file handoff node {0}".format(
                            prev_handoff_node_id))
                    assert output_value_file is not None
                    output_value_file.close()
                    output_value_file = None
                log.debug("opening value file for handoff node {0}".format(
                    reference.handoff_node_id))
                assert output_value_file is None
                space_id = find_least_volume_space_id("storage",
                                                      file_space_info)
                output_value_file = OutputValueFile(connection,
                                                    space_id,
                                                    _repository_path)
                prev_handoff_node_id = reference.handoff_node_id
        elif reference.collection_id != prev_collection_id:
            if prev_handoff_node_id is not None:
                log.debug("closing value file for handoff node {0}".format(
                    prev_handoff_node_id))
                assert output_value_file is not None
                output_value_file.close()
                output_value_file = None
                prev_handoff_node_id = None
            # at least one distinct value file per collection_id
            if prev_collection_id is not None:
                log.debug("closing value file for collection {0}".format(
                    prev_collection_id))
                assert output_value_file is not None
                output_value_file.close()
                output_value_file = None
            log.debug("opening value file for collection {0}".format(
                reference.collection_id))
            assert output_value_file is None
            space_id = find_least_volume_space_id("storage", file_space_info)
            output_value_file = OutputValueFile(connection,
                                                space_id,
                                                _repository_path)
            prev_collection_id = reference.collection_id

        assert output_value_file is not None

        # if this write would put us over the max size,
        # start a new output value file
        expected_size = output_value_file.size + reference.sequence_size
        if expected_size > _max_value_file_size:
            log.debug("closing value_file and opening new one due to size")
            output_value_file.close()
            space_id = find_least_volume_space_id("storage", file_space_info)
            output_value_file = OutputValueFile(connection,
                                                space_id,
                                                _repository_path)

        yield reference, output_value_file

    if prev_handoff_node_id is not None:
        log.debug("closing final value file for handoff node {0}".format(
            prev_handoff_node_id))
    if prev_collection_id is not None:
        log.debug("closing final value file for collection {0}".format(
            prev_collection_id))
    # bug fix: the original called output_value_file.close()
    # unconditionally, which raises AttributeError on None when
    # _query_value_file_references yields no references at all
    if output_value_file is not None:
        output_value_file.close()