Example #1
 def make_merge_ops():
     for id_and_count, components in merge_batches:
         merge_kwargs = {}
         if self.order_by == location_value:
             merge_kwargs["results_indexes"] = components[0]
             components = components[1:]
         column_handless, record_ids = components[:2]
         pass_around = tuple(
             a[0] for a in
             components[2:])  # take only the first component of each pass-around group
         record_id = record_ids[0]
         control_deps = []
         if self.log_goodput:
             with tf.control_dependencies(
                 (id_and_count, )
             ):  # take the timestamp once id_and_count is ready, in case the merge itself takes a while
                 ts = gate.unix_timestamp()
             control_deps.append(
                 gate.log_events(item_names=("id", "time"),
                                 directory=self.log_directory,
                                 event_name="merge_head",
                                 components=(slice_id(id_and_count),
                                             ts)))
         with tf.control_dependencies(control_deps):
             yield merge(chunk_group_handles=column_handless,
                         other_components=(id_and_count, record_id) +
                         tuple(pass_around),
                         **merge_kwargs)
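
All of these examples share the same goodput-logging shape: a timestamp op is placed under tf.control_dependencies so it fires only once the upstream tensor (here id_and_count) is available, and the downstream op is then gated on the log op. A minimal sketch of that ordering pattern, assuming TF 1.x graph mode and using tf.timestamp / tf.print as stand-ins for the project-specific gate.unix_timestamp / gate.log_events ops:

import tensorflow as tf  # assumes TF 1.x graph mode, as in the examples above

def gate_with_logging(upstream, event_name):
    # Take the timestamp only after `upstream` has been produced.
    with tf.control_dependencies((upstream,)):
        ts = tf.timestamp(name="{}_timestamp".format(event_name))
    # Stand-in for gate.log_events: print the event instead of persisting it.
    log_op = tf.print(event_name, ts, name="{}_logger".format(event_name))
    # Anything computed from the returned tensor now runs only after the log op.
    with tf.control_dependencies((log_op,)):
        return tf.identity(upstream)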
Example #2
 def gen_written_paths():
     for chunk_file, extension, write_type in zip(
             chunk_files, extensions, write_types):
         full_key = tf.string_join(
             [intermediate_name, extension],
             separator=".",
             name="full_key_join_{ext}_{t}".format(ext=extension,
                                                   t=write_type))
         result = writer(
             path=full_key,
             record_type=write_type,
             resource_handle=chunk_file,
             name="intermediate_ceph_writer_{ext}_{t}".format(
                 ext=extension, t=write_type))
         out_path = result.output_path
         if self.log_goodput:
             timestamp = result.time
             duration = result.duration
             bytes = result.bytes
             log_op = gate.log_events(
                 item_names=("timestamp", "duration", "bytes",
                             "key", "id"),
                 directory=self.log_directory,
                 event_name="sort_ceph_write",
                 name="sort_ceph_write_logger",
                 components=(timestamp, duration, bytes, out_path,
                             slice_id(idc)))
             with tf.control_dependencies((log_op, )):
                 out_path = tf.identity(out_path)
         yield out_path
Example #3
    def make_decomp_stage(self, ready_to_decomp):
        """
        :param args:
        :param ready_to_decomp: generator of (id_and_count, column0, column1, ..., [:rest of input])
        :return: a generator of [ agd_read_handle, num_records, first_ordinal, record_id, id_and_count, {rest of input} ]
        """
        ready_to_decomp = sanitize_generator(ready_to_decomp)
        num_columns = len(self.columns)

        # to_agd_reader = just the columns
        # pass_around_agd_reader = (id_and_count, rest, of, input, ...)
        to_agd_reader, pass_around_agd_reader = zip(*(
            (rtd[1:1+num_columns], (rtd[0],)+tuple(rtd[1+num_columns:])) for rtd in ready_to_decomp
        ))

        def gen_timestamps():
            for group in pass_around_agd_reader:
                with tf.control_dependencies((group[0],)):
                    yield gate.unix_timestamp(name="align_head_timestamp")

        reader_kwargs = {}
        timestamps = []
        if self.log_goodput:
            timestamps.extend(gen_timestamps())
            assert len(timestamps) == len(ready_to_decomp)
            # control dependencies have to be an iterable
            reader_kwargs["control_ops"] = tuple((a,) for a in timestamps)

        # [output_buffer_handles], num_records, first_ordinal, record_id; in order, for each column group in upstream_tensorz
        multi_column_gen = tuple(pipeline.agd_reader_multi_column_pipeline(upstream_tensorz=to_agd_reader, verify=self.deep_verify,
                                                                           name="align_reader", **reader_kwargs))

        # around = num_records, first_ordinal, record_id for each group
        to_assembler, around_assembler = zip(*(
            (a[:2], a[1:]) for a in multi_column_gen
        ))

        assembler_kwargs = {}
        if self.log_goodput:
            log_event_ops = [
                (gate.log_events(  # single-element tuple: each control-deps entry must itself be an iterable
                    item_names=("id","time","ordinal","record_id"),
                    components=(in_id, timestamp, ordinal, record_id),
                    event_name="align_head",
                    directory=self.log_directory,
                    name="align_head_event_logger"
                ),) for in_id, timestamp, ordinal, record_id in zip(
                    (slice_id(a[0]) for a in pass_around_agd_reader),
                    timestamps,
                    (b[2] for b in multi_column_gen),
                    (b[3] for b in multi_column_gen)
                )
            ]
            assembler_kwargs["control_deps"] = log_event_ops

        # each element is an agd_reads handle
        agd_assembled_reads = pipeline.agd_read_assembler(upstream_tensors=to_assembler, include_meta=False, **assembler_kwargs)
        for agd_read, around_assembler_group, around_reader_group in zip(agd_assembled_reads, around_assembler, pass_around_agd_reader):
            yield (agd_read,) + tuple(around_assembler_group) + tuple(around_reader_group)
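
The zip(*(...)) constructions above (for to_agd_reader / pass_around_agd_reader and to_assembler / around_assembler) are plain-Python "unzips": they split a sequence of per-chunk tuples into parallel per-field tuples. A self-contained sketch with made-up rows shaped like ready_to_decomp:

# Hypothetical rows: (id_and_count, column0, column1, rest...)
num_columns = 2
ready_to_decomp = [
    ("idc_0", "base_0", "qual_0", "extra_0"),
    ("idc_1", "base_1", "qual_1", "extra_1"),
]

to_agd_reader, pass_around_agd_reader = zip(*(
    (rtd[1:1 + num_columns], (rtd[0],) + tuple(rtd[1 + num_columns:]))
    for rtd in ready_to_decomp
))

assert to_agd_reader == (("base_0", "qual_0"), ("base_1", "qual_1"))
assert pass_around_agd_reader == (("idc_0", "extra_0"), ("idc_1", "extra_1"))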
Example #4
 def gen_timestamps():
     for group in pass_around_agd_reader:
         idc = group[0]
         with tf.control_dependencies((idc, )):
             ts = gate.unix_timestamp(name="sort_head_timestamp")
         event_log_op = gate.log_events(item_names=("id", "time"),
                                        components=(slice_id(idc), ts),
                                        event_name="sort_head",
                                        directory=self.log_directory,
                                        name="sort_head_event_logger")
         yield event_log_op
Example #5
 def gen_control_deps():
     for item in ready_to_write_items:
         num_records, ordinal, record_id = item[1:4]
         item_id = slice_id(item[4])
         with tf.control_dependencies((item_id,)):
             ts = gate.unix_timestamp(name="align_tail_timestamp")
         yield (gate.log_events(
             item_names=("id", "time", "ordinal", "record_id", "num_records"),
             directory=self.log_directory,
             event_name="align_tail",
             name="align_tail_event_logger",
             components=(item_id, ts, ordinal, record_id, num_records)
         ),)
Example #6
 def gen_buffers(bufs):
     for cb in bufs:
         compressed_buffer = cb.compressed_buffer
         if self.log_goodput:
             timestamp = cb.time
             duration = cb.duration
             original_size = cb.original_size
             compressed_size = cb.compressed_size
             log_op = gate.log_events(
                 item_names=("timestamp", "duration",
                             "original_bytes",
                             "compressed_bytes"),
                 directory=self.log_directory,
                 event_name="merge_compression",
                 name="merge_compression_logger",
                 components=(timestamp, duration,
                             original_size,
                             compressed_size))
             with tf.control_dependencies((log_op, )):
                 compressed_buffer = tf.identity(
                     compressed_buffer)
         yield compressed_buffer
Example #7
    def make_compress_stage(self, to_compress):
        """
        :param to_compress: a generator of (chunk_handles, num_records, first_ordinal, total_chunks, id_and_count, record_id, {pass around})
        :return: a generator of (id_and_count, chunk_file_matrix, first_ordinal, num_records, record_id, {pass around})
        """
        def compress_pipeline(handles):
            with tf.name_scope("merge_compress_results"):
                buffer_pool = persona_ops.buffer_pool(bound=False, size=10)

                compressors = tuple(
                    partial(persona_ops.buffer_pair_compressor,
                            buffer_pool=buffer_pool,
                            pack=False,
                            name="buffer_pair_compressor_{}".format(cname))
                    for cname in self.columns)
                for buffer_pairs in handles:
                    bps_unstacked = tf.unstack(buffer_pairs)
                    compressed_buffers = tuple(
                        compressor(buffer_pair=a)
                        for compressor, a in zip(compressors, bps_unstacked))

                    def gen_buffers(bufs):
                        for cb in bufs:
                            compressed_buffer = cb.compressed_buffer
                            if self.log_goodput:
                                timestamp = cb.time
                                duration = cb.duration
                                original_size = cb.original_size
                                compressed_size = cb.compressed_size
                                log_op = gate.log_events(
                                    item_names=("timestamp", "duration",
                                                "original_bytes",
                                                "compressed_bytes"),
                                    directory=self.log_directory,
                                    event_name="merge_compression",
                                    name="merge_compression_logger",
                                    components=(timestamp, duration,
                                                original_size,
                                                compressed_size))
                                with tf.control_dependencies((log_op, )):
                                    compressed_buffer = tf.identity(
                                        compressed_buffer)
                            yield compressed_buffer

                    yield tf.stack(tuple(gen_buffers(bufs=compressed_buffers)))

        to_compress = sanitize_generator(to_compress)
        for chunk_file_matrix, (num_records, first_ordinal, total_num_chunks, id_and_count, record_id), pass_around in \
            zip(
                compress_pipeline(handles=(a[0] for a in to_compress)),
                (a[1:6] for a in to_compress),
                (a[6:] for a in to_compress)
            ):
            ids_only = tf.unstack(id_and_count,
                                  axis=1,
                                  name="id_only_extractor")[0]
            new_count = tf.fill(ids_only.shape,
                                total_num_chunks,
                                name="new_counts_fill")
            new_id_and_count = tf.stack((ids_only, new_count),
                                        axis=1,
                                        name="new_id_and_count_constructor")
            control_deps = []
            if self.log_goodput:
                with tf.control_dependencies((new_id_and_count, )):
                    ts = gate.unix_timestamp()
                control_deps.append(
                    gate.log_events(item_names=("id", "time", "record_id",
                                                "num_records"),
                                    event_name="merge_tail",
                                    directory=self.log_directory,
                                    components=(ids_only, ts, record_id,
                                                num_records)))

            yield (new_id_and_count, chunk_file_matrix, first_ordinal,
                   num_records, record_id) + tuple(pass_around), control_deps
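
The new_id_and_count construction in the loop above swaps the count column of the [N, 2] id-and-count matrix for total_num_chunks while keeping the ids. A small numpy sketch of the same shape manipulation, with illustrative values (the graph code does it with tf.unstack, tf.fill, and tf.stack):

import numpy as np

id_and_count = np.array([[7, 3],
                         [8, 3]])                      # columns: (id, count)
total_num_chunks = 5                                   # the count after this merge stage

ids_only = id_and_count[:, 0]                          # tf.unstack(id_and_count, axis=1)[0]
new_count = np.full(ids_only.shape, total_num_chunks)  # tf.fill(ids_only.shape, total_num_chunks)
new_id_and_count = np.stack((ids_only, new_count), axis=1)
# new_id_and_count == [[7, 5],
#                      [8, 5]]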
Example #8
    def make_sort_stage(self, ready_to_sort):
        """
        :param ready_to_sort:
        :return: (id_and_count, record_id, intermediate_name, superchunk_num_records, superchunk_matrix) + rest_of_input, log_event
        """
        bpp = persona_ops.buffer_pair_pool(
            size=0, bound=False, name="local_read_sort_buffer_list_pool")

        self.log.info("order by is '{ob}'".format(ob=self.order_by))
        if self.order_by == location_value:
            self.log.info("sorting by location")
            sort_op = partial(persona_ops.agd_sort,
                              buffer_pair_pool=bpp,
                              name="agd_sort_results")
        else:
            # metadata ordering is not supported yet; the sort_op assignment below is unreachable
            raise Exception("not supported")
            sort_op = partial(persona_ops.agd_sort_metadata,
                              buffer_pair_pool=bpp,
                              name="agd_sort_metadata")

        for id_and_count, components in ready_to_sort:
            output_buffer_handless, num_recordss, first_ordinals, record_ids = components[:4]
            rest_of_inputs = components[4:]

            # need to just pick the top things
            rest_of_input = tuple(a[0] for a in rest_of_inputs)
            record_id = record_ids[0]
            first_ordinal = first_ordinals[0]

            first_ordinal_str = tf.as_string(first_ordinal,
                                             name="first_ordinal_conversion")

            # This filename is guaranteed to be unique because of the ordinal (unique within this dataset)
            # and the extension (so it doesn't conflict with existing chunk files). Otherwise, when a request
            # is resubmitted, the cleanup from the merge stage could overlap with the newly created files!
            random_gen = tf.as_string(
                tf.random_uniform(dtype=tf.int32,
                                  maxval=2**20,
                                  shape=(),
                                  name="random_intermediate_name_gen"),
                name="random_intermediate_value_to_string")
            intermediate_name = tf.string_join(
                (record_id, first_ordinal_str, random_gen,
                 intermediate_extension),
                separator="_",
                name="intermediate_filename")

            # TODO not sure if this axis=1 is correct
            unstack_handles = tf.unstack(output_buffer_handless,
                                         axis=1,
                                         name="buffers_unstack")
            key_handles = unstack_handles[0]  # output_buffer_handless[:,0,:]
            other_handles = tf.stack(unstack_handles[1:],
                                     axis=1)  # output_buffer_handless[:,1:,:]

            # first column is always the correct one, due to self.extended_columns order
            superchunk_matrix, superchunk_num_records = sort_op(
                num_records=num_recordss,
                sort_key_handles=key_handles,
                column_handles=other_handles)

            if self.log_goodput:
                with tf.control_dependencies((superchunk_num_records, )):
                    ts = gate.unix_timestamp(name="sort_tail_timestamp")
                log_event = (gate.log_events(
                    item_names=("id", "time", "record_id", "num_records"),
                    directory=self.log_directory,
                    event_name="sort_tail",
                    name="sort_tail_event_logger",
                    components=(slice_id(id_and_count), ts, record_id,
                                superchunk_num_records)), )
            else:
                log_event = ()

            yield (id_and_count, record_id, intermediate_name,
                   superchunk_num_records,
                   superchunk_matrix) + rest_of_input, log_event
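
The intermediate_name built above is just the record id, the stringified first ordinal, a random token drawn from [0, 2**20), and the intermediate extension, joined with underscores; the random token is what keeps a resubmitted request from colliding with files that a concurrent merge-stage cleanup might still touch. A plain-Python sketch of the resulting key, with illustrative values and an assumed value for the module-level intermediate_extension constant:

# Illustrative values only; the real graph builds these with tf.as_string / tf.string_join.
record_id = "dataset_xyz"
first_ordinal = 123456
random_token = 87213                     # drawn uniformly from [0, 2**20) in the graph
intermediate_extension = "intermediate"  # assumed value of the module-level constant

intermediate_name = "_".join(
    (record_id, str(first_ordinal), str(random_token), intermediate_extension))
# -> "dataset_xyz_123456_87213_intermediate"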