def __init__(self, example_joiner_options, raw_data_options,
             data_block_builder_options, kvstore, data_source,
             partition_id):
        """Initialize the universal joiner.

        Sets up key-mapped sliding windows for the leader/follower
        streams, the watermark trigger, the compiled join expression and
        (optionally) the negative-example generator.
        """
        super(UniversalJoiner, self).__init__(
            example_joiner_options, raw_data_options,
            data_block_builder_options, kvstore, data_source, partition_id)
        options = example_joiner_options
        self._min_window_size = options.min_matching_window
        self._max_window_size = options.max_matching_window

        # NOTE(review): the option is named max_conversion_delay but is
        # used here as the watermark-delay bound — confirm intent.
        self._max_watermark_delay = options.max_conversion_delay

        self._key_mapper = create_key_mapper(options.join_key_mapper)
        self._leader_join_window = _SlidingWindow(
            self._min_window_size, self._max_window_size,
            self._key_mapper.leader_mapping)
        self._follower_join_window = _SlidingWindow(
            self._min_window_size, self._max_window_size,
            self._key_mapper.follower_mapping)
        self._leader_restart_index = -1
        self._leader_index_ps = PrioritySet()
        self._dedup_by_follower_index = {}
        self._trigger = _Trigger(self._max_watermark_delay)
        self._expr = expr.Expr(options.join_expr)
        self._joiner = _JoinerImpl(self._expr)

        self._enable_negative_example_generator = \
            options.enable_negative_example_generator
        if self._enable_negative_example_generator:
            sampling_rate = options.negative_sampling_rate
            filter_expr = options.negative_sampling_filter_expr
            self._negative_example_generator = NegativeExampleGenerator(
                sampling_rate, filter_expr)
# Example #2 (separator left over from the code-sharing site this was
# scraped from; the stray "0" was a vote counter)
    def __init__(self, example_joiner_options, raw_data_options,
                 data_block_builder_options, kvstore, data_source,
                 partition_id):
        """Initialize attribution-join state.

        The leader window is effectively unbounded (capacity 2**20)
        while the follower window is capped at max_matching_window;
        conversions are attributed within max_conversion_delay.
        """
        super(AttributionJoiner, self).__init__(
            example_joiner_options, raw_data_options,
            data_block_builder_options, kvstore, data_source, partition_id)
        options = example_joiner_options
        self._min_window_size = options.min_matching_window
        # max_window_size must be lesser than max_conversion_delay
        self._max_window_size = options.max_matching_window
        self._max_conversion_delay = options.max_conversion_delay
        self._leader_join_window = _SlidingWindow(
            self._min_window_size, 2**20)
        self._follower_join_window = _SlidingWindow(
            self._min_window_size, self._max_window_size)
        self._leader_restart_index = -1
        self._sorted_buf_by_leader_index = []
        self._dedup_by_follower_index = {}

        self._trigger = _Trigger(self._max_conversion_delay)
        attributor = _Attributor(self._max_conversion_delay)
        self._acc = _Accumulator(attributor)

        self._enable_negative_example_generator = \
            options.enable_negative_example_generator
        if self._enable_negative_example_generator:
            sampling_rate = options.negative_sampling_rate
            self._negative_example_generator = \
                NegativeExampleGenerator(sampling_rate)
# Example #3 (separator left over from the code-sharing site this was
# scraped from; the stray "0" was a vote counter)
    def __init__(self, example_joiner_options, raw_data_options,
                 data_block_builder_options, kvstore, data_source,
                 partition_id):
        """Initialize streaming-join caches and windows, then reset state."""
        super(StreamExampleJoiner, self).__init__(
            example_joiner_options, raw_data_options,
            data_block_builder_options, kvstore, data_source, partition_id)
        options = example_joiner_options
        self._min_window_size = options.min_matching_window
        self._max_window_size = options.max_matching_window
        # assumes _JoinWindow args are (step, quantile) — TODO confirm
        self._leader_join_window = _JoinWindow(0.05, 0.99)
        self._follower_join_window = _JoinWindow(0.05, 0.90)
        self._joined_cache = {}
        self._leader_unjoined_example_ids = []
        self._follower_example_cache = {}
        self._fill_leader_enough = False
        # must run after the windows exist: it resets both of them
        self._reset_joiner_state(True)

        self._enable_negative_example_generator = \
            options.enable_negative_example_generator
        if self._enable_negative_example_generator:
            sampling_rate = options.negative_sampling_rate
            self._negative_example_generator = \
                NegativeExampleGenerator(sampling_rate)
class UniversalJoiner(ExampleJoiner):
    """Example joiner driven by a user-supplied join expression.

    Leader and follower raw-data streams are buffered in key-mapped
    sliding windows; matched pairs are deduplicated per follower index,
    ordered by leader index in a priority set, and dumped once the
    trigger watermark passes their event time.
    """

    def __init__(self, example_joiner_options, raw_data_options,
                 data_block_builder_options, kvstore, data_source,
                 partition_id):
        super(UniversalJoiner, self).__init__(example_joiner_options,
                                                  raw_data_options,
                                                  data_block_builder_options,
                                                  kvstore, data_source,
                                                  partition_id)
        self._min_window_size = example_joiner_options.min_matching_window
        self._max_window_size = example_joiner_options.max_matching_window

        # NOTE(review): the option is named max_conversion_delay but is
        # used here as the watermark-delay bound for pairing events.
        self._max_watermark_delay = \
                example_joiner_options.max_conversion_delay

        self._key_mapper = create_key_mapper(
            example_joiner_options.join_key_mapper)
        self._leader_join_window = _SlidingWindow(
            self._min_window_size, self._max_window_size,
            self._key_mapper.leader_mapping)
        self._follower_join_window = _SlidingWindow(
            self._min_window_size, self._max_window_size,
            self._key_mapper.follower_mapping)
        self._leader_restart_index = -1
        # candidate pairs ordered by leader index
        self._leader_index_ps = PrioritySet()
        # follower index -> IndexedTime of the best-matching leader event
        self._dedup_by_follower_index = {}
        self._trigger = _Trigger(self._max_watermark_delay)
        self._expr = expr.Expr(example_joiner_options.join_expr)
        self._joiner = _JoinerImpl(self._expr)

        self._enable_negative_example_generator = \
                example_joiner_options.enable_negative_example_generator
        if self._enable_negative_example_generator:
            sf = example_joiner_options.negative_sampling_rate
            fe = example_joiner_options.negative_sampling_filter_expr
            self._negative_example_generator = NegativeExampleGenerator(sf, fe)

    @classmethod
    def name(cls):
        """Return the registry name of this joiner implementation."""
        return 'UNIVERSAL_JOINER'

    def _inner_joiner(self, state_stale):
        """Drive the join loop; yields a data-block meta per finished block.

        Repeatedly fills both windows, matches pairs with the join
        expression, evicts events past the watermark, dumps the joined
        items and forwards the windows until either side is exhausted.
        """
        if self.is_join_finished():
            return
        sync_example_id_finished, raw_data_finished = \
                self._prepare_join(state_stale)
        join_data_finished = False

        while True:
            fill_leader_enough = self._fill_leader_join_window(
                sync_example_id_finished)
            leader_exhausted = sync_example_id_finished and \
                    not self._leader_join_window.is_full()
            follower_exhausted = False
            logging.info('Fill leader_exhausted: %s, sync_example_id_finished '
                         '%s, raw_data_finished %s, leader_win_size %d, '
                         'follower_win_size %d',
                         leader_exhausted, sync_example_id_finished,
                         raw_data_finished, self._leader_join_window.size(),
                         self._follower_join_window.size())
            while self._fill_follower_join_window(raw_data_finished):
                follower_exhausted = raw_data_finished and \
                        not self._follower_join_window.is_full()

                logging.info("Fill: follower_exhausted=%s, "
                             "follower_win_size=%d", follower_exhausted,
                             self._follower_join_window.size())
                #1. find all the matched pairs in current window
                raw_pairs, mismatches = self._joiner.join(
                    self._follower_join_window, self._leader_join_window,
                    self._max_watermark_delay)
                if self._enable_negative_example_generator:
                    self._negative_example_generator.update(mismatches)
                stride = self._trigger.trigger(self._follower_join_window,
                                               self._leader_join_window)
                #2. cache the pairs, evict the leader events which are out of
                # watermark
                watermark = self._trigger.watermark()
                pairs = self._update_matching_pairs(raw_pairs, watermark)
                #3. push the result into builder
                if pairs:
                    for meta in self._dump_joined_items(pairs):
                        yield meta
                    self._leader_restart_index = pairs[-1].li
                    self._follower_restart_index = pairs[-1].fi
                logging.info("Restart index of leader %d, follower %d,"
                             "pair_buf=%d, raw_pairs=%d, pairs=%d",
                             self._leader_restart_index,
                             self._follower_restart_index,
                             self._leader_index_ps.size(), len(raw_pairs),
                             len(pairs))

                #4. update window
                self._follower_join_window.forward(stride[0],
                                                   self._optional_stats)
                self._leader_join_window.forward(stride[1])

                if self._follower_join_window.is_full():
                    if self._leader_join_window.is_full():
                        raise RuntimeError('max_matching_size[%d] is too '
                                           'small, dead looping'%
                                           self._follower_join_window.size())
                    # leader is moving forward
                    break

                if follower_exhausted:
                    break

            if leader_exhausted and self._leader_join_window.et_span(
                self._max_watermark_delay):
                join_data_finished = True
                break
            if follower_exhausted and self._follower_join_window.et_span(
                self._max_watermark_delay):
                join_data_finished = True
                break

            if self._leader_join_window.is_full() or not fill_leader_enough:
                break

        if self._get_data_block_builder(False) is not None and \
                (self._need_finish_data_block_since_interval() or
                    join_data_finished):
            yield self._finish_data_block()
        if join_data_finished:
            self._set_join_finished()
            logging.info("finish join example for partition %d by %s",
                            self._partition_id, self.name())

    def _update_matching_pairs(self, raw_pairs, watermark):
        """
        Push the pairs into an order-by-leader-index priority set, then
        pop and return the pairs whose follower event time has passed
        the watermark, deduplicating by follower index.
        """
        for (cid, sid) in raw_pairs:
            # cid indexes the follower window, sid the leader window
            # (the original assert messages had the roles swapped)
            assert cid < self._follower_join_window.size(), \
                    "Follower index[%d] out of range"%cid
            assert sid < self._leader_join_window.size(), \
                    "Leader index[%d] out of range"%(sid)

            #fi: follower index, fe: follower example
            example_with_index = self._follower_join_window[cid]
            fi, fe = example_with_index.index, example_with_index.item

            #li: leader index, le: leader example
            example_with_index = self._leader_join_window[sid]
            li, le = example_with_index.index, example_with_index.item
            if li <= self._leader_restart_index:
                logging.warning("Leader index should be bigger than restart "
                                "index, %d > %d for follower idx %d",
                                li, self._leader_restart_index, fi)
                continue

            if abs(fcc.time_diff(fe.event_time, le.event_time)) > \
               self._max_watermark_delay:
                ### unreachable branch: the joiner already filters pairs
                ### by max_watermark_delay
                logging.info('Pair %s:%s out-of-delay, leader et %d, '
                             'follower et %d', le.example_id, fe.example_id,
                             le.event_time, fe.event_time)
                continue

            # keep, per follower event, the leader event whose event time
            # is closest to the follower's
            updated = False
            if fi in self._dedup_by_follower_index:
                old_conv_int = fcc.time_diff(fe.event_time, le.event_time)
                new_conv_int = fcc.time_diff(
                    self._dedup_by_follower_index[fi].event_time, le.event_time)
                if abs(old_conv_int) > abs(new_conv_int):
                    self._dedup_by_follower_index[fi] = \
                            IndexedTime(li, le.event_time)
                    updated = True
            else:
                self._dedup_by_follower_index[fi] = \
                        IndexedTime(li, le.event_time)
                updated = True
            # sort by leader index
            if not updated:
                continue
            self._leader_index_ps.put(_IndexedPair(fe, li, fi))

        matches = []
        while not self._leader_index_ps.empty():
            ip = self._leader_index_ps.get()
            if ip.fe.event_time <= watermark:
                if ip.fi not in self._dedup_by_follower_index:
                    logging.info("Ignore the deleted follower index %d", ip.fi)
                    continue
                indexed_time = self._dedup_by_follower_index[ip.fi]
                if indexed_time.li == ip.li:
                    matches.append(ip)
                    del self._dedup_by_follower_index[ip.fi]
                else:
                    logging.info("Example %s matching leader index %s is"
                                 " older than %d", ip.fe.example_id,
                                 ip.li, indexed_time.li)
            else:
                # not yet past the watermark: put it back and stop dumping
                self._leader_index_ps.put(ip)
                logging.info('Break dumping, event time %s, watermark %s',
                             ip.fe.event_time, watermark)
                break
        return matches

    # NOTE(review): marked "useless" upstream — only reached from
    # _prepare_join when the state is stale.
    def _reset_joiner_state(self, state_stale):
        """Clear the leader window; clear the follower window too when
        the persisted state is stale."""
        self._leader_join_window.reset([], state_stale)
        if state_stale:
            self._follower_join_window.reset([], True)

    def _prepare_join(self, state_stale):
        """Reset windows on stale state, then defer to the base class."""
        if state_stale:
            self._reset_joiner_state(True)
        return super(UniversalJoiner, self)._prepare_join(state_stale)

    def _dump_joined_items(self, indexed_pairs):
        """Append joined pairs (and generated negatives) to the data
        block builder, yielding a meta whenever a block fills up."""
        start_tm = time.time()
        for ip in indexed_pairs:
            if self._enable_negative_example_generator:
                for example in \
                    self._negative_example_generator.generate(ip.fe, ip.li):
                    builder = self._get_data_block_builder(True)
                    assert builder is not None, "data block builder must be "\
                                                "not None before dumping"
                    # example: (item, leader_index, follower_index) —
                    # presumably; confirm against NegativeExampleGenerator
                    builder.append_item(example[0], example[1],
                                        example[2], None, True, 0)
                    if builder.check_data_block_full():
                        yield self._finish_data_block()

            builder = self._get_data_block_builder(True)
            assert builder is not None, "data block builder must be "\
                                        "not None before dumping"
            builder.append_item(ip.fe, ip.li, ip.fi, None, True,
                                joined=1)
            if builder.check_data_block_full():
                yield self._finish_data_block()
        metrics.emit_timer(name='universal_joiner_dump_joined_items',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)

    def _fill_leader_join_window(self, sync_example_id_finished):
        """Pull leader items into the window.

        Returns True when enough items were buffered, or when the
        example-id sync has finished (no more items will arrive).
        """
        start_tm = time.time()
        idx = self._leader_join_window.size()
        filled_enough = self._fill_join_windows(self._leader_visitor,
                                       self._leader_join_window)
        filled_enough = filled_enough or sync_example_id_finished
        # report the ids of the newly appended items only
        eids = [(self._leader_join_window[i].index,
                 self._leader_join_window[i].item.example_id)
                for i in range(idx, self._leader_join_window.size())]

        self._joiner_stats.fill_leader_example_ids(eids)
        metrics.emit_timer(name=\
                           'universal_joiner_fill_leader_join_window',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)
        return filled_enough

    def _fill_follower_join_window(self, raw_data_finished):
        """Pull follower items into the window.

        Returns True when enough items were buffered, or when the raw
        data stream has finished.
        """
        start_tm = time.time()
        idx = self._follower_join_window.size()
        filled_enough = self._fill_join_windows(self._follower_visitor,
                                      self._follower_join_window)
        # report the ids of the newly appended items only
        eids = [(self._follower_join_window[i].index,
                 self._follower_join_window[i].item.example_id)
                for i in range(idx, self._follower_join_window.size())]

        self._joiner_stats.fill_follower_example_ids(eids)
        metrics.emit_timer(name=\
                           'universal_joiner_fill_follower_join_window',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)
        return filled_enough or raw_data_finished

    def _fill_join_windows(self, visitor, join_window):
        """Consume the visitor into join_window until it is full or the
        visitor is exhausted."""
        size = join_window.size()
        while not visitor.finished() and not join_window.is_full():
            required_item_count = join_window.reserved_size()
            self._consume_item_until_count(
                    visitor, join_window,
                    required_item_count
                )
        # return True if new elem added or window reaches its capacity
        return join_window.size() > size or size >= self._max_window_size

    def _consume_item_until_count(self, visitor, windows,
                                  required_item_count):
        """Append valid items from visitor into windows until the window
        holds required_item_count items; skips items lacking an example
        id or event time."""
        for (index, item) in visitor:
            if item.example_id == common.InvalidExampleId:
                logging.warning("ignore item indexed as %d from %s since "\
                                "invalid example id", index, visitor.name())
            elif item.event_time == common.InvalidEventTime:
                logging.warning("ignore item indexed as %d from %s since "\
                                "invalid event time", index, visitor.name())
            else:
                windows.append(index, item)
                if windows.size() >= required_item_count:
                    return
        assert visitor.finished(), "visitor should be finished if "\
                                   "required_item is not satisfied"
# Example #5 (separator left over from the code-sharing site this was
# scraped from; the stray "0" was a vote counter)
class StreamExampleJoiner(ExampleJoiner):
    """Streaming example joiner matching leader and follower items by
    example_id.

    Both streams are buffered in _JoinWindow instances; dumping is
    delayed until the follower window's quantile point has caught up
    with the leader's, then joined items are written to data blocks.
    """

    def __init__(self, example_joiner_options, raw_data_options,
                 data_block_builder_options, kvstore, data_source,
                 partition_id):
        super(StreamExampleJoiner, self).__init__(example_joiner_options,
                                                  raw_data_options,
                                                  data_block_builder_options,
                                                  kvstore, data_source,
                                                  partition_id)
        self._min_window_size = example_joiner_options.min_matching_window
        self._max_window_size = example_joiner_options.max_matching_window
        # assumes _JoinWindow args are (step, quantile) — TODO confirm
        self._leader_join_window = _JoinWindow(0.05, 0.99)
        self._follower_join_window = _JoinWindow(0.05, 0.90)
        # example_id -> (follower_index, item) for ids already joined
        self._joined_cache = {}
        self._leader_unjoined_example_ids = []
        # example_id -> (index, item) for buffered follower items
        self._follower_example_cache = {}
        self._fill_leader_enough = False

        self._enable_negative_example_generator = \
                example_joiner_options.enable_negative_example_generator
        if self._enable_negative_example_generator:
            sf = example_joiner_options.negative_sampling_rate
            fe = example_joiner_options.negative_sampling_filter_expr
            self._negative_example_generator = NegativeExampleGenerator(sf, fe)
        self._reset_joiner_state(True)


    @classmethod
    def name(cls):
        """Return the registry name of this joiner implementation."""
        return 'STREAM_JOINER'

    def _inner_joiner(self, state_stale):
        """Drive the streaming join; yields a data-block meta per
        finished block."""
        if self.is_join_finished():
            return
        sync_example_id_finished, raw_data_finished = \
                self._prepare_join(state_stale)
        logging.info("streaming joiner: sync_example_id_finished: %s,"
                     "raw_data_finished: %s", sync_example_id_finished,
                     raw_data_finished)
        join_data_finished = False
        while self._fill_leader_join_window(sync_example_id_finished):
            leader_exhausted = sync_example_id_finished and \
                    self._leader_join_window.size() <= \
                    self._min_window_size / 2
            follower_exhausted = False
            delay_dump = True

            logging.info("before leader window size %d, follower %d, "
                         "follower cache %d, leader unjoined example %d",
                         self._leader_join_window.size(),
                         self._follower_join_window.size(),
                         len(self._follower_example_cache),
                         len(self._leader_unjoined_example_ids))
            while delay_dump and \
                    self._fill_follower_join_window(raw_data_finished):
                follower_exhausted = raw_data_finished and \
                        self._follower_join_window.size() <= \
                        self._min_window_size / 2
                delay_dump = self._need_delay_dump(raw_data_finished)
                if delay_dump:
                    self._update_join_cache()
                else:
                    for meta in self._dump_joined_items():
                        yield meta
                self._evit_stale_follower_cache()
            if not delay_dump:
                self._reset_joiner_state(False)
            if leader_exhausted:
                join_data_finished = not delay_dump
            elif follower_exhausted:
                join_data_finished = True
            logging.info("delay_dump %s, join_data_finished %s, "
                         "leader_exhausted %s, follower_exhausted %s",
                         delay_dump, join_data_finished, leader_exhausted,
                         follower_exhausted)
            logging.info("leader window size %d, follower %d, "
                         "follower cache %d, leader unjoined example %d",
                         self._leader_join_window.size(),
                         self._follower_join_window.size(),
                         len(self._follower_example_cache),
                         len(self._leader_unjoined_example_ids))
            if delay_dump or join_data_finished:
                break
        if self._get_data_block_builder(False) is not None and \
                (self._need_finish_data_block_since_interval() or
                    join_data_finished):
            yield self._finish_data_block()
        if join_data_finished:
            self._set_join_finished()
            logging.warning("finish join example for partition %d by %s",
                            self._partition_id, self.name())

    def _prepare_join(self, state_stale):
        """Reset joiner state on stale state, then defer to the base
        class."""
        if state_stale:
            self._reset_joiner_state(True)
        return super(StreamExampleJoiner, self)._prepare_join(state_stale)

    def _need_delay_dump(self, raw_data_finished):
        """Return True while dumping should wait for more follower data.

        Dumping may proceed once the follower stream is done, or once the
        follower window's quantile point has reached the leader's.
        """
        if self._follower_visitor.finished() and raw_data_finished:
            return False
        leader_qt = self._leader_join_window.qt()
        follower_qt = self._follower_join_window.qt()
        logging.info("delay dump leader %s, follower %s",
                     leader_qt, follower_qt)
        if leader_qt is not None and follower_qt is not None and \
                not follower_qt < leader_qt:
            return False
        return True

    def _update_join_cache(self):
        """Move follower items matching still-unjoined leader ids into
        the joined cache."""
        start_tm = time.time()
        new_unjoined_example_ids = []
        for example_id in self._leader_unjoined_example_ids:
            if example_id in self._follower_example_cache:
                self._joined_cache[example_id] = \
                        self._follower_example_cache[example_id]
            else:
                new_unjoined_example_ids.append(example_id)
        self._leader_unjoined_example_ids = new_unjoined_example_ids
        metrics.emit_timer(name='stream_joiner_update_join_cache',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)

    def _dump_joined_items(self):
        """Walk the leader window and emit every joined example (and
        generated negatives) into the data block builder, yielding a
        meta whenever a block fills up."""
        start_tm = time.time()
        # NOTE(review): _neg_samples is assigned but never read in this
        # class — possibly vestigial; kept for safety.
        self._neg_samples = {}
        for (leader_idx, leader_item) in self._leader_join_window:
            eid = leader_item.example_id
            if (eid not in self._follower_example_cache
               and eid not in self._joined_cache):
                if self._enable_negative_example_generator:
                    self._negative_example_generator.update(
                        {leader_idx:leader_item})
                continue
            if eid not in self._joined_cache:
                self._joined_cache[eid] = self._follower_example_cache[eid]
            follower_example = self._joined_cache[eid]
            if self._enable_negative_example_generator:
                for example in self._negative_example_generator.generate(
                    follower_example[1], leader_idx
                ):
                    builder = self._get_data_block_builder(True)
                    assert builder is not None, "data block builder must not " \
                                                "be None before dumping"
                    builder.append_item(example[0], example[1], example[2],
                                        joined=0)
                    self._optional_stats.update_stats(example[0], kind='fake')
                    if builder.check_data_block_full():
                        yield self._finish_data_block()
            builder = self._get_data_block_builder(True)
            assert builder is not None, "data block builder must not be "\
                                        "None before dumping"
            follower_idx, item = self._joined_cache[eid]
            builder.append_item(item, leader_idx, follower_idx,
                                joined=1)
            self._optional_stats.update_stats(item, kind='joined')
            if builder.check_data_block_full():
                yield self._finish_data_block()
        metrics.emit_timer(name='stream_joiner_dump_joined_items',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)

    def _reset_joiner_state(self, state_stale):
        """Clear leader-side state; clear follower-side state too when
        the persisted state is stale."""
        self._leader_join_window.reset([], state_stale)
        self._fill_leader_enough = False
        self._joined_cache = {}
        self._leader_unjoined_example_ids = []
        if state_stale:
            self._follower_join_window.reset([], True)
            self._follower_example_cache = {}

    def _fill_leader_join_window(self, sync_example_id_finished):
        """Fill the leader window until it is deemed full enough.

        Once enough, snapshots the unjoined example ids. Returns the
        cached _fill_leader_enough flag.
        """
        if not self._fill_leader_enough:
            start_tm = time.time()
            start_pos = self._leader_join_window.size()
            if not self._fill_join_windows(self._leader_visitor,
                                           self._leader_join_window,
                                           None):
                self._fill_leader_enough = sync_example_id_finished
            else:
                self._fill_leader_enough = True
            if self._fill_leader_enough:
                self._leader_unjoined_example_ids = \
                    [item.example_id for _, item in self._leader_join_window]
            end_pos = self._leader_join_window.size()
            eids = [(self._leader_join_window[idx][0],
                     self._leader_join_window[idx][1].example_id)
                    for idx in range(start_pos, end_pos)]
            self._joiner_stats.fill_leader_example_ids(eids)
            metrics.emit_timer(name='stream_joiner_fill_leader_join_window',
                               value=int(time.time()-start_tm),
                               tags=self._metrics_tags)
        return self._fill_leader_enough

    def _fill_follower_join_window(self, raw_data_finished):
        """Fill the follower window (also populating the follower cache);
        return True when enough items were buffered or raw data ended."""
        start_tm = time.time()
        start_pos = self._follower_join_window.size()
        follower_enough = self._fill_join_windows(self._follower_visitor,
                                                  self._follower_join_window,
                                                  self._follower_example_cache)
        end_pos = self._follower_join_window.size()
        eids = [(self._follower_join_window[idx][0],
                 self._follower_join_window[idx][1].example_id)
                for idx in range(start_pos, end_pos)]
        self._joiner_stats.fill_follower_example_ids(eids)
        # fixed: this timer was previously emitted under the *leader*
        # metric name, clobbering that metric
        metrics.emit_timer(name='stream_joiner_fill_follower_join_window',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)
        return follower_enough or raw_data_finished

    def _fill_join_windows(self, visitor, join_window, join_cache):
        """Consume the visitor into join_window (doubling the required
        count once past min size) until the window reaches max size or
        its forward point advances."""
        while not visitor.finished() and \
                join_window.size() < self._max_window_size:
            required_item_count = self._min_window_size
            if join_window.size() >= self._min_window_size:
                required_item_count *= 2
            if required_item_count >= self._max_window_size:
                required_item_count = self._max_window_size
            self._consume_item_until_count(
                    visitor, join_window,
                    required_item_count, join_cache
                )
            if join_window.forward_pt():
                return True
        return join_window.size() >= self._max_window_size

    def _evict_if_useless(self, item):
        """Return True when the follower item is already joined or falls
        before the committed point (counted as unjoined if uncommitted)."""
        outdated = self._leader_join_window.committed_pt() is None \
                   or _CmpCtnt(item) < self._leader_join_window.committed_pt()
        if outdated and item.example_id not in self._joined_cache:
            self._optional_stats.update_stats(item, kind='unjoined')
        return outdated or item.example_id in self._joined_cache

    def _evict_if_force(self, item):
        """Return True when the follower item falls before the leader
        window's quantile point; counts it as unjoined."""
        outdated = self._leader_join_window.qt() is None or \
                _CmpCtnt(item) < self._leader_join_window.qt()
        if outdated:
            self._optional_stats.update_stats(item, kind='unjoined')
        return outdated

    def _evict_impl(self, candidates, filter_fn):
        """Drop candidates matching filter_fn (also removing them from
        the follower cache) and return the survivors."""
        reserved_items = []
        for (index, item) in candidates:
            example_id = item.example_id
            if filter_fn(item):
                self._follower_example_cache.pop(example_id, None)
            else:
                reserved_items.append((index, item))
        return reserved_items

    # NOTE(review): method name misspells "evict"; kept to avoid
    # breaking external references.
    def _evit_stale_follower_cache(self):
        """Shrink the follower window: first drop useless items, then
        force-evict by quantile point if still at capacity."""
        start_tm = time.time()
        tmp_sz = self._follower_join_window.size()
        reserved_items = self._evict_impl(self._follower_join_window,
                                          self._evict_if_useless)
        logging.info("evict_if_useless %d to %d", tmp_sz, len(reserved_items))
        if len(reserved_items) < self._max_window_size:
            self._follower_join_window.reset(reserved_items, False)
            return
        tmp_sz = len(reserved_items)
        reserved_items = self._evict_impl(reserved_items,
                                          self._evict_if_force)
        logging.info("evict_if_force %d to %d", tmp_sz, len(reserved_items))
        self._follower_join_window.reset(reserved_items, False)
        metrics.emit_timer(name='stream_joiner_evit_stale_follower_cache',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)

    def _consume_item_until_count(self, visitor, windows,
                                  required_item_count, cache=None):
        """Append valid items from visitor into windows (and cache, when
        given) until the window holds required_item_count items; skips
        items lacking an example id or event time."""
        for (index, item) in visitor:
            if item.example_id == common.InvalidExampleId:
                logging.warning("ignore item indexed as %d from %s since "\
                                "invalid example id", index, visitor.name())
            elif item.event_time == common.InvalidEventTime:
                logging.warning("ignore item indexed as %d from %s since "\
                                "invalid event time", index, visitor.name())
            else:
                windows.append(index, item)
                if cache is not None:
                    cache[item.example_id] = (index, item)
                if windows.size() >= required_item_count:
                    return
        assert visitor.finished(), "visitor should be finished if "\
                                   "required_item is not satisfied"

    def _finish_data_block(self):
        """Finish the current data block and recompute the follower
        restart index from the window head and joined cache."""
        meta = super(StreamExampleJoiner, self)._finish_data_block()
        self._follower_restart_index = self._follower_visitor.get_index()
        if self._follower_join_window.size() > 0:
            self._follower_restart_index = \
                    self._follower_join_window[0][0]
        for index, _ in self._joined_cache.values():
            if index < self._follower_restart_index:
                self._follower_restart_index = index
        return meta
# Example #6 (separator left over from the code-sharing site this was
# scraped from; the stray "0" was a vote counter)
class AttributionJoiner(ExampleJoiner):
    """Joiner that attributes follower (conversion) events to leader events
    whose conversion delay stays within a configured bound.

    Matched pairs are buffered in leader-index order and flushed once the
    watermark driven by ``_Trigger`` passes their event time.
    """

    def __init__(self, example_joiner_options, raw_data_options,
                 data_block_builder_options, kvstore, data_source,
                 partition_id):
        """Initialize sliding windows, attribution state and options.

        Args:
            example_joiner_options: tuning knobs (matching window sizes,
                max_conversion_delay, negative sampling settings).
            raw_data_options: raw-data parsing options for the base class.
            data_block_builder_options: forwarded to the base class.
            kvstore: key-value store handle used by the base class.
            data_source: data source descriptor used by the base class.
            partition_id: id of the raw-data partition this joiner consumes.
        """
        super(AttributionJoiner,
              self).__init__(example_joiner_options, raw_data_options,
                             data_block_builder_options, kvstore, data_source,
                             partition_id)
        self._min_window_size = example_joiner_options.min_matching_window
        # max_window_size must be less than max_conversion_delay
        self._max_window_size = example_joiner_options.max_matching_window
        self._max_conversion_delay = \
                example_joiner_options.max_conversion_delay
        # Leader window gets a deliberately large cap (2**20); its advance
        # is driven by the watermark trigger, not by this bound.
        self._leader_join_window = _SlidingWindow(self._min_window_size, 2**20)
        self._follower_join_window = _SlidingWindow(self._min_window_size,
                                                    self._max_window_size)
        # Highest leader index already dumped; -1 means nothing dumped yet.
        self._leader_restart_index = -1
        # Matched (example, leader_idx, follower_idx) tuples kept ordered by
        # leader index until the watermark evicts them.
        self._sorted_buf_by_leader_index = []
        # follower index -> (leader index, event time); deduplicates several
        # leader candidates competing for one follower event.
        self._dedup_by_follower_index = {}

        self._trigger = _Trigger(self._max_conversion_delay)
        attri = _Attributor(self._max_conversion_delay)
        self._acc = _Accumulator(attri)

        self._enable_negative_example_generator = \
                example_joiner_options.enable_negative_example_generator
        if self._enable_negative_example_generator:
            sf = example_joiner_options.negative_sampling_rate
            self._negative_example_generator = NegativeExampleGenerator(sf)

    @classmethod
    def name(cls):
        """Return the registry identifier for this joiner implementation."""
        return 'ATTRIBUTION_JOINER'

    def _inner_joiner(self, state_stale):
        """Generator driving one round of the attribution join.

        Repeatedly fills both sliding windows, matches follower events to
        leader events, evicts matches past the watermark, and yields the
        meta of every data block that gets sealed along the way.

        Args:
            state_stale: whether persisted joiner state is stale and the
                windows must be reset before joining.

        Yields:
            data block meta for each finished data block.
        """
        if self.is_join_finished():
            return
        sync_example_id_finished, raw_data_finished = \
                self._prepare_join(state_stale)
        join_data_finished = False

        while True:
            leader_filled = self._fill_leader_join_window()
            # Leader side is exhausted once example-id sync is done and the
            # remaining window spans less than the max conversion delay.
            leader_exhausted = sync_example_id_finished and \
                    self._leader_join_window.et_span() <    \
                    self._max_conversion_delay
            follower_filled = self._fill_follower_join_window()

            logging.info("Fill: leader_filled=%s, leader_exhausted=%s,"\
                         " follower_filled=%s,"\
                         " sync_example_id_finished=%s, raw_data_finished=%s"\
                         " leader_win_size=%d, follower_win_size=%d",\
                        leader_filled, leader_exhausted, \
                        follower_filled, sync_example_id_finished, \
                        raw_data_finished, self._leader_join_window.size(), \
                        self._follower_join_window.size())

            watermark = self._trigger.watermark()
            #1. find all the matched pairs in current window
            raw_pairs, mismatches = self._acc.join(self._follower_join_window,\
                        self._leader_join_window)
            if self._enable_negative_example_generator:
                self._negative_example_generator.update(mismatches)
            #2. cache the pairs, evict the show events which are out of
            # watermark
            pairs = self._sort_and_evict_attri_buf(raw_pairs, watermark)
            #3. push the result into builder
            if len(pairs) > 0:
                for meta in self._dump_joined_items(pairs):
                    yield meta
                # Remember the last dumped pair so a restart resumes after it.
                self._leader_restart_index = pairs[len(pairs) - 1][1]
                self._follower_restart_index = pairs[len(pairs) - 1][2]
            logging.info("Restart index of leader %d, follwer %d, pair_buf=%d,"\
                         " raw_pairs=%d, pairs=%d", self._leader_restart_index,\
                         self._follower_restart_index,
                         len(self._sorted_buf_by_leader_index), len(raw_pairs),
                         len(pairs))

            #4. update the watermark
            stride = self._trigger.trigger(self._follower_join_window,  \
                                           self._leader_join_window)
            self._follower_join_window.forward(stride[0])
            self._leader_join_window.forward(stride[1])

            # Leader ids are still syncing and nothing new arrived: wait for
            # the leader instead of spinning.
            if not leader_filled and                                    \
               not sync_example_id_finished and                         \
               self._leader_join_window.reserved_size() > 0:
                logging.info("Wait for Leader syncing example id...")
                break

            if leader_exhausted:
                join_data_finished = True
                break

            # No progress from the trigger: force the windows forward when
            # the corresponding input stream is known to be finished.
            if stride == (0, 0):
                if raw_data_finished:
                    self._leader_join_window.forward(
                        self._leader_join_window.size())

                if sync_example_id_finished:
                    force_stride = \
                            self._trigger.shrink(self._follower_join_window)
                    self._follower_join_window.forward(force_stride)

        if self._get_data_block_builder(False) is not None and \
                (self._need_finish_data_block_since_interval() or
                    join_data_finished):
            yield self._finish_data_block()
        if join_data_finished:
            self._set_join_finished()
            logging.info("finish join example for partition %d by %s",
                         self._partition_id, self.name())

    def _latest_attri(self, index):
        """Binary-search the insertion position for a leader *index* inside
        ``self._sorted_buf_by_leader_index``, which is ordered by the leader
        index stored at slot 1 of each tuple.

        Returns the first position whose leader index is strictly greater
        than *index* (bisect-right semantics).
        """
        lo, hi = 0, len(self._sorted_buf_by_leader_index)
        while lo < hi:
            mid = (lo + hi) // 2
            if self._sorted_buf_by_leader_index[mid][1] <= index:
                lo = mid + 1
            else:
                hi = mid
        return lo

    def _sort_and_evict_attri_buf(self, raw_matches, watermark):
        """Merge *raw_matches* into the leader-index-ordered buffer, then pop
        every buffered pair whose event time has passed *watermark*.

        Args:
            raw_matches: iterable of (follower_window_pos, leader_window_pos)
                pairs produced by the accumulator for the current windows.
            watermark: event-time bound; only pairs at or before it are
                emitted.

        Returns:
            list of (follower_example, leader_index, follower_index) tuples
            ready to be dumped, in leader-index order.
        """
        for (cid, sid) in raw_matches:
            #fi: follower index, fe: follower example
            # NOTE(review): the assert messages look swapped -- cid indexes
            # the follower window and sid the leader window; confirm.
            assert cid < self._follower_join_window.size(), "Invalid l index"
            assert sid < self._leader_join_window.size(), "Invalid f index"
            (fi, fe) = self._follower_join_window[cid]
            (li, le) = self._leader_join_window[sid]
            assert fe.example_id == le.example_id, "Example id must be equal"
            # Drop matches whose leader index was already dumped; they would
            # otherwise be emitted twice after a restart.
            if li <= self._leader_restart_index:
                logging.warning("Unordered event ignored, leader index should"\
                                " be greater %d > %d for follower idx %d is"  \
                                " false", li, self._leader_restart_index, fi)
                continue

            # cache the latest show event
            # NOTE(review): the '>' comparison actually keeps the leader
            # event with the SMALLEST event time per follower index, which
            # contradicts "latest" above -- confirm which is intended.
            updated = False
            if fi in self._dedup_by_follower_index:
                if self._dedup_by_follower_index[fi][1] > le.event_time:
                    self._dedup_by_follower_index[fi] = (li, le.event_time)
                    updated = True
            else:
                self._dedup_by_follower_index[fi] = (li, le.event_time)
                updated = True
            # sort by leader index
            if not updated:
                continue
            latest_pos = self._latest_attri(li)
            if latest_pos > 0:
                # remove the dups
                latest_item = \
                    self._sorted_buf_by_leader_index[latest_pos - 1]
                if latest_item[1] == li and latest_item[2] == fi:
                    continue
            self._sorted_buf_by_leader_index.insert(latest_pos, \
                                          (fe, li, fi))
        # Emit the ordered prefix of the buffer that the watermark covers.
        # A pair is emitted only if it is still the winning attribution for
        # its follower index in the dedup map.
        matches = []
        idx = 0
        for (fe, li, fi) in self._sorted_buf_by_leader_index:
            if fe.event_time <= watermark:
                assert fi in self._dedup_by_follower_index, "Invalid f index"
                (leader_index, _) = self._dedup_by_follower_index[fi]
                if leader_index == li:
                    matches.append((fe, li, fi))
                    del self._dedup_by_follower_index[fi]
                else:
                    logging.info("Example %s matching leader index %s is"\
                                 " older than %d", fe.example_id, li,    \
                                 leader_index)
            else:
                # FIXME: Assume the unordered range is limited,
                #  or this will bring an out-of-memory crash
                break
            idx += 1
        self._sorted_buf_by_leader_index \
                = self._sorted_buf_by_leader_index[idx:]
        return matches

    # NOTE(review): the original author marked this "useless"; it is only
    # reached from _prepare_join when persisted state is stale.
    def _reset_joiner_state(self, state_stale):
        """Clear the leader window; on stale state clear the follower too."""
        targets = [(self._leader_join_window, state_stale)]
        if state_stale:
            targets.append((self._follower_join_window, True))
        for window, stale in targets:
            window.reset([], stale)

    def _prepare_join(self, state_stale):
        """Reset window state when stale, then defer to the base preparation.

        Returns whatever the base class returns: the
        (sync_example_id_finished, raw_data_finished) pair.
        """
        if state_stale:
            self._reset_joiner_state(True)
        parent = super(AttributionJoiner, self)
        return parent._prepare_join(state_stale)

    def _dump_joined_items(self, matching_list):
        """Append matched (and optionally negative) examples to the data
        block builder, yielding the meta of every block that fills up.

        Args:
            matching_list: list of (follower_example, leader_index,
                follower_index) tuples in ascending leader-index order.

        Yields:
            data block meta each time the current builder becomes full.
        """
        start_tm = time.time()
        prev_leader_idx = self._leader_restart_index + 1
        for item in matching_list:
            (fe, li, fi) = item
            # Fill the leader-index gap [prev_leader_idx, li) with sampled
            # negative examples before dumping the positive one.
            if self._enable_negative_example_generator and li > prev_leader_idx:
                for example in \
                    self._negative_example_generator.generate(
                        fe, prev_leader_idx, li):

                    builder = self._get_data_block_builder(True)
                    assert builder is not None, "data block builder must be "\
                                                "not None if before dummping"
                    builder.append_item(example[0], example[1], example[2],
                                        None, True)
                    if builder.check_data_block_full():
                        yield self._finish_data_block()
            prev_leader_idx = li + 1

            builder = self._get_data_block_builder(True)
            assert builder is not None, "data block builder must be "\
                                        "not None if before dummping"
            builder.append_item(fe, li, fi, None, True)
            if builder.check_data_block_full():
                yield self._finish_data_block()
        metrics.emit_timer(name='attribution_joiner_dump_joined_items',
                           value=int(time.time() - start_tm),
                           tags=self._metrics_tags)

    def _fill_leader_join_window(self):
        """Top up the leader join window from the leader visitor.

        Newly appended (index, example_id) pairs are reported to the joiner
        stats, and the fill duration is emitted as a timer metric.

        Returns:
            True if at least one new example was appended.
        """
        start_tm = time.time()
        old_size = self._leader_join_window.size()
        filled_new_example = self._fill_join_windows(self._leader_visitor,
                                                     self._leader_join_window)
        window = self._leader_join_window
        eids = [(window[pos][0], window[pos][1].example_id)
                for pos in range(old_size, window.size())]
        self._joiner_stats.fill_leader_example_ids(eids)
        metrics.emit_timer(name=\
                           'attribution_joiner_fill_leader_join_window',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)
        return filled_new_example

    def _fill_follower_join_window(self):
        """Top up the follower join window from the follower visitor.

        Newly appended (index, example_id) pairs are reported to the joiner
        stats, and the fill duration is emitted as a timer metric.

        Returns:
            True if at least one new example was appended.
        """
        start_tm = time.time()
        old_size = self._follower_join_window.size()
        filled_new_example = self._fill_join_windows(
            self._follower_visitor, self._follower_join_window)
        window = self._follower_join_window
        eids = [(window[pos][0], window[pos][1].example_id)
                for pos in range(old_size, window.size())]
        self._joiner_stats.fill_follower_example_ids(eids)
        metrics.emit_timer(name=\
                           'attribution_joiner_fill_follower_join_window',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)
        return filled_new_example

    def _fill_join_windows(self, visitor, join_window):
        """Consume from *visitor* until *join_window* is full or the visitor
        is exhausted; return True when the window actually grew."""
        initial_size = join_window.size()
        while True:
            if visitor.finished() or join_window.is_full():
                break
            self._consume_item_until_count(visitor, join_window,
                                           join_window.reserved_size())
        return join_window.size() > initial_size

    def _consume_item_until_count(self, visitor, windows, required_item_count):
        """Consume items from *visitor* into *windows* until the window
        reaches ``required_item_count`` items or the visitor runs dry.

        Items carrying an invalid example id or event time are skipped with
        a warning.
        """
        for (index, item) in visitor:
            # Guard: skip items the joiner cannot key or order.
            if item.example_id == common.InvalidExampleId:
                logging.warning("ignore item indexed as %d from %s since "\
                                "invalid example id", index, visitor.name())
                continue
            if item.event_time == common.InvalidEventTime:
                logging.warning("ignore item indexed as %d from %s since "\
                                "invalid event time", index, visitor.name())
                continue
            windows.append(index, item)
            if windows.size() >= required_item_count:
                return
        assert visitor.finished(), "visitor shoud be finished of "\
                                   "required_item is not satisfied"