def __init__(self, example_joiner_options, raw_data_options,
             data_block_builder_options, kvstore, data_source,
             partition_id):
    """Initialize the universal joiner's windows, trigger and match state.

    Every tunable is read off *example_joiner_options*; all other
    arguments are forwarded unchanged to the base-class initializer.
    """
    super(UniversalJoiner, self).__init__(example_joiner_options,
                                          raw_data_options,
                                          data_block_builder_options,
                                          kvstore, data_source,
                                          partition_id)
    opts = example_joiner_options
    self._min_window_size = opts.min_matching_window
    self._max_window_size = opts.max_matching_window
    # The watermark delay is configured through max_conversion_delay.
    self._max_watermark_delay = opts.max_conversion_delay
    self._key_mapper = create_key_mapper(opts.join_key_mapper)
    # One sliding window per side, each with its own key mapping.
    self._leader_join_window = _SlidingWindow(
        self._min_window_size, self._max_window_size,
        self._key_mapper.leader_mapping)
    self._follower_join_window = _SlidingWindow(
        self._min_window_size, self._max_window_size,
        self._key_mapper.follower_mapping)
    self._leader_restart_index = -1
    self._leader_index_ps = PrioritySet()
    self._dedup_by_follower_index = {}
    self._trigger = _Trigger(self._max_watermark_delay)
    self._expr = expr.Expr(opts.join_expr)
    self._joiner = _JoinerImpl(self._expr)
    self._enable_negative_example_generator = \
        opts.enable_negative_example_generator
    if self._enable_negative_example_generator:
        sampling_rate = opts.negative_sampling_rate
        filter_expr = opts.negative_sampling_filter_expr
        self._negative_example_generator = \
            NegativeExampleGenerator(sampling_rate, filter_expr)
def __init__(self, example_joiner_options, raw_data_options,
             data_block_builder_options, kvstore, data_source,
             partition_id):
    """Initialize the attribution joiner's windows, trigger and buffers.

    All tunables come from *example_joiner_options*; the remaining
    arguments are passed straight through to the base class.
    """
    super(AttributionJoiner, self).__init__(example_joiner_options,
                                            raw_data_options,
                                            data_block_builder_options,
                                            kvstore, data_source,
                                            partition_id)
    opts = example_joiner_options
    self._min_window_size = opts.min_matching_window
    # max_window_size must be lesser than max_conversion_delay
    self._max_window_size = opts.max_matching_window
    self._max_conversion_delay = opts.max_conversion_delay
    # Leader window capacity is a large constant (2**20 slots).
    self._leader_join_window = _SlidingWindow(self._min_window_size, 2**20)
    self._follower_join_window = _SlidingWindow(self._min_window_size,
                                                self._max_window_size)
    self._leader_restart_index = -1
    self._sorted_buf_by_leader_index = []
    self._dedup_by_follower_index = {}
    self._trigger = _Trigger(self._max_conversion_delay)
    self._acc = _Accumulator(_Attributor(self._max_conversion_delay))
    self._enable_negative_example_generator = \
        opts.enable_negative_example_generator
    if self._enable_negative_example_generator:
        sampling_rate = opts.negative_sampling_rate
        self._negative_example_generator = \
            NegativeExampleGenerator(sampling_rate)
def __init__(self, example_joiner_options, raw_data_options,
             data_block_builder_options, kvstore, data_source,
             partition_id):
    """Initialize the stream joiner's quantile windows and caches.

    Window parameters come from *example_joiner_options*; everything
    else is handed to the base-class initializer unchanged.
    """
    super(StreamExampleJoiner, self).__init__(example_joiner_options,
                                              raw_data_options,
                                              data_block_builder_options,
                                              kvstore, data_source,
                                              partition_id)
    opts = example_joiner_options
    self._min_window_size = opts.min_matching_window
    self._max_window_size = opts.max_matching_window
    # Quantile-based join windows with hard-coded quantile bounds.
    self._leader_join_window = _JoinWindow(0.05, 0.99)
    self._follower_join_window = _JoinWindow(0.05, 0.90)
    self._joined_cache = {}
    self._leader_unjoined_example_ids = []
    self._follower_example_cache = {}
    self._fill_leader_enough = False
    # Reset window state before wiring the negative-example generator,
    # matching the original initialization order.
    self._reset_joiner_state(True)
    self._enable_negative_example_generator = \
        opts.enable_negative_example_generator
    if self._enable_negative_example_generator:
        sampling_rate = opts.negative_sampling_rate
        self._negative_example_generator = \
            NegativeExampleGenerator(sampling_rate)
class UniversalJoiner(ExampleJoiner):
    """Joiner that matches leader and follower items over two sliding
    windows using a configurable join expression.

    Matched pairs are buffered in a priority set ordered by leader index
    and flushed into data blocks once their event time falls at or below
    the trigger watermark.
    """

    def __init__(self, example_joiner_options, raw_data_options,
                 data_block_builder_options, kvstore, data_source,
                 partition_id):
        """Read all tunables from example_joiner_options and build the
        per-side sliding windows, trigger and dedup/match state."""
        super(UniversalJoiner, self).__init__(example_joiner_options,
                                              raw_data_options,
                                              data_block_builder_options,
                                              kvstore, data_source,
                                              partition_id)
        self._min_window_size = example_joiner_options.min_matching_window
        self._max_window_size = example_joiner_options.max_matching_window
        # The watermark delay is configured via the max_conversion_delay
        # option.
        self._max_watermark_delay = \
                example_joiner_options.max_conversion_delay
        self._key_mapper = create_key_mapper(
            example_joiner_options.join_key_mapper)
        self._leader_join_window = _SlidingWindow(
            self._min_window_size, self._max_window_size,
            self._key_mapper.leader_mapping)
        self._follower_join_window = _SlidingWindow(
            self._min_window_size, self._max_window_size,
            self._key_mapper.follower_mapping)
        self._leader_restart_index = -1
        # Matched pairs awaiting dump, ordered by leader index.
        self._leader_index_ps = PrioritySet()
        # follower index -> IndexedTime of the leader event currently
        # attributed to that follower item.
        self._dedup_by_follower_index = {}
        self._trigger = _Trigger(self._max_watermark_delay)
        self._expr = expr.Expr(example_joiner_options.join_expr)
        self._joiner = _JoinerImpl(self._expr)
        self._enable_negative_example_generator = \
                example_joiner_options.enable_negative_example_generator
        if self._enable_negative_example_generator:
            sf = example_joiner_options.negative_sampling_rate
            fe = example_joiner_options.negative_sampling_filter_expr
            self._negative_example_generator = NegativeExampleGenerator(sf, fe)

    @classmethod
    def name(cls):
        """Registry name of this joiner implementation."""
        return 'UNIVERSAL_JOINER'

    def _inner_joiner(self, state_stale):
        """Generator driving one join round.

        Repeatedly fills both windows, joins them, dumps matched pairs
        as data blocks (yielding each finished block's meta), and slides
        the windows forward by the trigger's stride, until either side
        is exhausted or no further progress can be made.
        """
        if self.is_join_finished():
            return
        sync_example_id_finished, raw_data_finished = \
                self._prepare_join(state_stale)
        join_data_finished = False
        while True:
            fill_leader_enough = self._fill_leader_join_window(
                sync_example_id_finished)
            # Exhausted: no more ids will arrive and the window could not
            # even be filled to capacity.
            leader_exhausted = sync_example_id_finished and \
                    not self._leader_join_window.is_full()
            follower_exhausted = False
            # NOTE(review): raw_data_finished is passed twice to this log
            # call (as %s and again as the trailing %d) — looks like a
            # leftover from an edit; harmless but redundant.
            logging.info('Fill leader_exhausted: %s, sync_example_id_finished '
                         '%s, raw_data_finished %s, leader_win_size %d, '
                         'follower_win_size %d, raw_data_finished %d',
                         leader_exhausted, sync_example_id_finished,
                         raw_data_finished,
                         self._leader_join_window.size(),
                         self._follower_join_window.size(), raw_data_finished)
            while self._fill_follower_join_window(raw_data_finished):
                follower_exhausted = raw_data_finished and \
                        not self._follower_join_window.is_full()
                logging.info("Fill: follower_exhausted=%s, "
                             "follower_win_size=%d", follower_exhausted,
                             self._follower_join_window.size())
                #1. find all the matched pairs in current window
                raw_pairs, mismatches = self._joiner.join(
                    self._follower_join_window, self._leader_join_window,
                    self._max_watermark_delay)
                if self._enable_negative_example_generator:
                    self._negative_example_generator.update(mismatches)
                # Stride is computed from the pre-forward window state,
                # before the watermark is read.
                stride = self._trigger.trigger(self._follower_join_window,
                                               self._leader_join_window)
                #2. cache the pairs, evict the leader events which are out of
                # watermark
                watermark = self._trigger.watermark()
                pairs = self._update_matching_pairs(raw_pairs, watermark)
                #3. push the result into builder
                if len(pairs) > 0:
                    for meta in self._dump_joined_items(pairs):
                        yield meta
                    self._leader_restart_index = pairs[len(pairs) - 1].li
                    self._follower_restart_index = pairs[len(pairs) - 1].fi
                logging.info("Restart index of leader %d, follwer %d,"
                             "pair_buf=%d, raw_pairs=%d, pairs=%d",
                             self._leader_restart_index,
                             self._follower_restart_index,
                             self._leader_index_ps.size(), len(raw_pairs),
                             len(pairs))
                #4. update window
                self._follower_join_window.forward(stride[0],
                                                   self._optional_stats)
                self._leader_join_window.forward(stride[1])
                if self._follower_join_window.is_full():
                    if self._leader_join_window.is_full():
                        # Neither window can move: the configured window
                        # size cannot make progress.
                        raise RuntimeError('max_matching_size[%d] is too '
                                           'small, dead looping'%
                                           self._follower_join_window.size())
                    # leader is moving forward
                    break
                if follower_exhausted:
                    break
            if leader_exhausted and self._leader_join_window.et_span(
                self._max_watermark_delay):
                join_data_finished = True
                break
            if follower_exhausted and self._follower_join_window.et_span(
                self._max_watermark_delay):
                join_data_finished = True
                break
            if self._leader_join_window.is_full() or not fill_leader_enough:
                break
        if self._get_data_block_builder(False) is not None and \
                (self._need_finish_data_block_since_interval() or
                    join_data_finished):
            yield self._finish_data_block()
        if join_data_finished:
            self._set_join_finished()
            logging.info("finish join example for partition %d by %s",
                         self._partition_id, self.name())

    def _update_matching_pairs(self, raw_pairs, watermark):
        """
        Push the pairs into a order-by-leader-index list, and evict the
        pairs which are out-of-watermark.

        raw_pairs holds (follower_window_pos, leader_window_pos) tuples;
        returns the list of _IndexedPair objects ready to be dumped.
        """
        for (cid, sid) in raw_pairs:
            #fi: follower index, fe: follower example
            # NOTE(review): the messages below look swapped — cid indexes
            # the follower window and sid the leader window.
            assert cid < self._follower_join_window.size(), \
                    "Leader index[%d] out of range"%cid
            assert sid < self._leader_join_window.size(), \
                    "Follower index[%d] out of range"%(sid)
            example_with_index = self._follower_join_window[cid]
            fi, fe = example_with_index.index, example_with_index.item
            example_with_index = self._leader_join_window[sid]
            li, le = example_with_index.index, example_with_index.item
            if li <= self._leader_restart_index:
                # Already dumped up to _leader_restart_index; skip stale
                # leader events.
                logging.warning("Leader index should be bigger than restart "
                                "index, %d > %d for follower idx %d",
                                li, self._leader_restart_index, fi)
                continue
            if abs(fcc.time_diff(fe.event_time, le.event_time)) > \
               self._max_watermark_delay:
                ### unreachable branch — the joiner is handed
                ### _max_watermark_delay above and presumably filters such
                ### pairs already; TODO confirm against _JoinerImpl.join
                logging.info('Pair %s:%s out-of-delay, leader et %d, '
                             'follower et %d', le.example_id, fe.example_id,
                             le.event_time, fe.event_time)
                continue
            # cache the latest leader event
            # NOTE(review): 'old_conv_int' is computed from the incoming
            # pair while 'new_conv_int' mixes the cached leader time with
            # the incoming leader time — the naming/operands look
            # inconsistent; verify the intended "keep closest leader"
            # semantics.
            updated = False
            if fi in self._dedup_by_follower_index:
                old_conv_int = fcc.time_diff(fe.event_time, le.event_time)
                new_conv_int = fcc.time_diff(
                    self._dedup_by_follower_index[fi].event_time,
                    le.event_time)
                if abs(old_conv_int) > abs(new_conv_int):
                    self._dedup_by_follower_index[fi] = \
                            IndexedTime(li, le.event_time)
                    updated = True
            else:
                self._dedup_by_follower_index[fi] = \
                        IndexedTime(li, le.event_time)
                updated = True
            # sort by leader index
            if not updated:
                continue
            self._leader_index_ps.put(_IndexedPair(fe, li, fi))
        # Drain the priority set up to the watermark; entries whose dedup
        # slot was superseded are dropped.
        matches = []
        while not self._leader_index_ps.empty():
            ip = self._leader_index_ps.get()
            if ip.fe.event_time <= watermark:
                if ip.fi not in self._dedup_by_follower_index:
                    logging.info("Ignore the deleted follower index %d",
                                 ip.fi)
                    continue
                indexed_time = self._dedup_by_follower_index[ip.fi]
                if indexed_time.li == ip.li:
                    matches.append(ip)
                    del self._dedup_by_follower_index[ip.fi]
                else:
                    logging.info("Example %s matching leader index %s is" \
                                 " older than %d", ip.fe.example_id, \
                                 ip.li, indexed_time.li)
            else:
                # Not yet past the watermark: push back and stop draining.
                self._leader_index_ps.put(ip)
                logging.info('Break dumping, event time %s, watermark %s',
                             ip.fe.event_time, watermark)
                break
        return matches

    # NOTE(review): earlier marked "useless", but this IS called from
    # _prepare_join when the state is stale.
    def _reset_joiner_state(self, state_stale):
        """Clear the leader window (and follower window on stale state)."""
        self._leader_join_window.reset([], state_stale)
        if state_stale:
            self._follower_join_window.reset([], True)

    def _prepare_join(self, state_stale):
        """Reset windows on stale state, then delegate to the base class."""
        if state_stale:
            self._reset_joiner_state(True)
        return super(UniversalJoiner, self)._prepare_join(state_stale)

    def _dump_joined_items(self, indexed_pairs):
        """Append matched (and optional negative) items to the data block
        builder, yielding each finished block's meta."""
        start_tm = time.time()
        for ip in indexed_pairs:
            if self._enable_negative_example_generator:
                for example in \
                    self._negative_example_generator.generate(ip.fe, ip.li):
                    builder = self._get_data_block_builder(True)
                    assert builder is not None, "data block builder must be "\
                                                "not None if before dummping"
                    # example: (li, fi, item)
                    builder.append_item(example[0], example[1], example[2],
                                        None, True, 0)
                    if builder.check_data_block_full():
                        yield self._finish_data_block()
            builder = self._get_data_block_builder(True)
            assert builder is not None, "data block builder must be "\
                                        "not None if before dummping"
            builder.append_item(ip.fe, ip.li, ip.fi, None, True, joined=1)
            if builder.check_data_block_full():
                yield self._finish_data_block()
        metrics.emit_timer(name='universal_joiner_dump_joined_items',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)

    def _fill_leader_join_window(self, sync_example_id_finished):
        """Fill the leader window; report newly appended example ids to
        the joiner stats. Returns True when the window is filled enough
        (or no more ids will ever arrive)."""
        start_tm = time.time()
        idx = self._leader_join_window.size()
        filled_enough = self._fill_join_windows(self._leader_visitor,
                                                self._leader_join_window)
        if not filled_enough:
            # No more data is coming; treat whatever we have as enough.
            filled_enough = sync_example_id_finished
        eids = []
        while idx < self._leader_join_window.size():
            eids.append((self._leader_join_window[idx].index,
                        self._leader_join_window[idx].item.example_id))
            idx += 1
        self._joiner_stats.fill_leader_example_ids(eids)
        metrics.emit_timer(name='universal_joiner_fill_leader_join_window',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)
        return filled_enough

    def _fill_follower_join_window(self, raw_data_finished):
        """Fill the follower window; returns True when new data arrived
        or the raw data stream has ended."""
        start_tm = time.time()
        idx = self._follower_join_window.size()
        filled_enough = self._fill_join_windows(self._follower_visitor,
                                                self._follower_join_window)
        eids = []
        while idx < self._follower_join_window.size():
            eids.append((self._follower_join_window[idx].index,
                        self._follower_join_window[idx].item.example_id))
            idx += 1
        self._joiner_stats.fill_follower_example_ids(eids)
        metrics.emit_timer(name='universal_joiner_fill_follower_join_window',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)
        return filled_enough or raw_data_finished

    def _fill_join_windows(self, visitor, join_window):
        """Consume from *visitor* until the window is full or the visitor
        is exhausted."""
        size = join_window.size()
        while not visitor.finished() and not join_window.is_full():
            required_item_count = join_window.reserved_size()
            self._consume_item_until_count(
                visitor, join_window, required_item_count
            )
        # return True if new elem added or window reaches its capacity
        return join_window.size() > size or size >= self._max_window_size

    def _consume_item_until_count(self, visitor, windows,
                                  required_item_count):
        """Append valid items from *visitor* into *windows* until it holds
        at least *required_item_count* items; items with invalid example
        id or event time are skipped with a warning."""
        for (index, item) in visitor:
            if item.example_id == common.InvalidExampleId:
                logging.warning("ignore item indexed as %d from %s since "\
                                "invalid example id", index, visitor.name())
            elif item.event_time == common.InvalidEventTime:
                logging.warning("ignore item indexed as %d from %s since "\
                                "invalid event time", index, visitor.name())
            else:
                windows.append(index, item)
                if windows.size() >= required_item_count:
                    return
        assert visitor.finished(), "visitor shoud be finished of "\
                                   "required_item is not satisfied"
class StreamExampleJoiner(ExampleJoiner):
    """Joiner that matches leader and follower streams by example_id
    using quantile-based join windows.

    The follower side is cached by example_id; dumping is delayed (and
    the cache updated instead) while the follower window's quantile lags
    the leader's.
    """

    def __init__(self, example_joiner_options, raw_data_options,
                 data_block_builder_options, kvstore, data_source,
                 partition_id):
        """Build the quantile windows and per-side caches from
        example_joiner_options."""
        super(StreamExampleJoiner, self).__init__(example_joiner_options,
                                                  raw_data_options,
                                                  data_block_builder_options,
                                                  kvstore, data_source,
                                                  partition_id)
        self._min_window_size = example_joiner_options.min_matching_window
        self._max_window_size = example_joiner_options.max_matching_window
        # Quantile windows with hard-coded quantile bounds per side.
        self._leader_join_window = _JoinWindow(0.05, 0.99)
        self._follower_join_window = _JoinWindow(0.05, 0.90)
        # example_id -> (follower_index, item) for pairs already matched.
        self._joined_cache = {}
        self._leader_unjoined_example_ids = []
        # example_id -> (follower_index, item) for follower items seen.
        self._follower_example_cache = {}
        self._fill_leader_enough = False
        self._enable_negative_example_generator = \
                example_joiner_options.enable_negative_example_generator
        if self._enable_negative_example_generator:
            sf = example_joiner_options.negative_sampling_rate
            fe = example_joiner_options.negative_sampling_filter_expr
            self._negative_example_generator = NegativeExampleGenerator(sf, fe)
        self._reset_joiner_state(True)

    @classmethod
    def name(cls):
        """Registry name of this joiner implementation."""
        return 'STREAM_JOINER'

    def _inner_joiner(self, state_stale):
        """Generator driving one join round: fill both windows, either
        delay-dump (update the join cache) or dump joined items as data
        blocks, then evict stale follower cache entries."""
        if self.is_join_finished():
            return
        sync_example_id_finished, raw_data_finished = \
                self._prepare_join(state_stale)
        logging.info("streaming joiner: sync_example_id_finished: %s,"
                     "raw_data_finished: %s", sync_example_id_finished,
                     raw_data_finished)
        join_data_finished = False
        while self._fill_leader_join_window(sync_example_id_finished):
            # A side counts as exhausted when its stream ended and its
            # window is at most half the minimum size.
            leader_exhausted = sync_example_id_finished and \
                    self._leader_join_window.size() <= \
                    self._min_window_size / 2
            follower_exhausted = False
            delay_dump = True
            logging.info("before leader window size %d, follwer %d, "
                         "follower cache %d, leader unjoined example %d",
                         self._leader_join_window.size(),
                         self._follower_join_window.size(),
                         len(self._follower_example_cache),
                         len(self._leader_unjoined_example_ids))
            while delay_dump and \
                    self._fill_follower_join_window(raw_data_finished):
                follower_exhausted = raw_data_finished and \
                        self._follower_join_window.size() <= \
                        self._min_window_size / 2
                delay_dump = self._need_delay_dump(raw_data_finished)
                if delay_dump:
                    self._update_join_cache()
                else:
                    for meta in self._dump_joined_items():
                        yield meta
                self._evit_stale_follower_cache()
            if not delay_dump:
                self._reset_joiner_state(False)
            if leader_exhausted:
                join_data_finished = not delay_dump
            elif follower_exhausted:
                join_data_finished = True
            logging.info("delay_dump %s, join_data_finished %s, "
                         "leader_exhausted %s, follower_exhausted %s",
                         delay_dump, join_data_finished, leader_exhausted,
                         follower_exhausted)
            logging.info("leader window size %d, follwer %d, "
                         "follower cache %d, leader unjoined example %d",
                         self._leader_join_window.size(),
                         self._follower_join_window.size(),
                         len(self._follower_example_cache),
                         len(self._leader_unjoined_example_ids))
            if delay_dump or join_data_finished:
                break
        if self._get_data_block_builder(False) is not None and \
                (self._need_finish_data_block_since_interval() or
                    join_data_finished):
            yield self._finish_data_block()
        if join_data_finished:
            self._set_join_finished()
            logging.warning("finish join example for partition %d by %s",
                            self._partition_id, self.name())

    def _prepare_join(self, state_stale):
        """Reset windows/caches on stale state, then delegate to base."""
        if state_stale:
            self._reset_joiner_state(True)
        return super(StreamExampleJoiner, self)._prepare_join(state_stale)

    def _need_delay_dump(self, raw_data_finished):
        """Return True while dumping should be postponed.

        Dump immediately when the follower input is fully consumed, or
        when the follower window's quantile has caught up with the
        leader's (qt() is presumably a quantile timestamp — confirm in
        _JoinWindow).
        """
        if self._follower_visitor.finished() and raw_data_finished:
            return False
        leader_qt = self._leader_join_window.qt()
        follower_qt = self._follower_join_window.qt()
        logging.info("delay dump leader %s, follower %s",
                     leader_qt, follower_qt)
        if leader_qt is not None and follower_qt is not None and \
                not follower_qt < leader_qt:
            return False
        return True

    def _update_join_cache(self):
        """Move leader ids now present in the follower cache into the
        joined cache; keep the rest as still-unjoined."""
        start_tm = time.time()
        new_unjoined_example_ids = []
        for example_id in self._leader_unjoined_example_ids:
            if example_id in self._follower_example_cache:
                self._joined_cache[example_id] = \
                        self._follower_example_cache[example_id]
            else:
                new_unjoined_example_ids.append(example_id)
        self._leader_unjoined_example_ids = new_unjoined_example_ids
        metrics.emit_timer(name='stream_joiner_update_join_cache',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)

    def _dump_joined_items(self):
        """Walk the leader window, appending joined (and optional
        negative) items to the builder and yielding finished blocks."""
        start_tm = time.time()
        # NOTE(review): _neg_samples is assigned here but never read in
        # this class — possibly vestigial.
        self._neg_samples = {}
        for (leader_idx, leader_item) in self._leader_join_window:
            eid = leader_item.example_id
            if (eid not in self._follower_example_cache and
                    eid not in self._joined_cache):
                # Unmatched leader item: feed it to the negative sampler.
                if self._enable_negative_example_generator:
                    self._negative_example_generator.update(
                        {leader_idx:leader_item})
                continue
            if eid not in self._joined_cache:
                self._joined_cache[eid] = self._follower_example_cache[eid]
            follower_example = self._joined_cache[eid]
            if self._enable_negative_example_generator:
                for example in self._negative_example_generator.generate(
                    follower_example[1], leader_idx
                ):
                    builder = self._get_data_block_builder(True)
                    assert builder is not None, "data block builder must not " \
                                                "be None before dumping"
                    builder.append_item(example[0], example[1], example[2],
                                        joined=0)
                    self._optional_stats.update_stats(example[0], kind='fake')
                    if builder.check_data_block_full():
                        yield self._finish_data_block()
            # NOTE(review): prev_leader_idx is assigned but never read.
            prev_leader_idx = leader_idx
            builder = self._get_data_block_builder(True)
            assert builder is not None, "data block builder must not be "\
                                        "None before dumping"
            follower_idx, item = self._joined_cache[eid]
            builder.append_item(item, leader_idx, follower_idx, joined=1)
            self._optional_stats.update_stats(item, kind='joined')
            if builder.check_data_block_full():
                yield self._finish_data_block()
        metrics.emit_timer(name='stream_joiner_dump_joined_items',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)

    def _reset_joiner_state(self, state_stale):
        """Clear leader-side state; also clear follower-side state when
        the persisted state is stale."""
        self._leader_join_window.reset([], state_stale)
        self._fill_leader_enough = False
        self._joined_cache = {}
        self._leader_unjoined_example_ids = []
        if state_stale:
            self._follower_join_window.reset([], True)
            self._follower_example_cache = {}

    def _fill_leader_join_window(self, sync_example_id_finished):
        """Fill the leader window once per round; when filled enough,
        snapshot all leader example ids as unjoined."""
        if not self._fill_leader_enough:
            start_tm = time.time()
            start_pos = self._leader_join_window.size()
            if not self._fill_join_windows(self._leader_visitor,
                                           self._leader_join_window, None):
                # Window not full, but no more ids will arrive.
                self._fill_leader_enough = sync_example_id_finished
            else:
                self._fill_leader_enough = True
            if self._fill_leader_enough:
                self._leader_unjoined_example_ids = \
                        [item.example_id for _, item in
                         self._leader_join_window]
            end_pos = self._leader_join_window.size()
            eids = [(self._leader_join_window[idx][0],
                     self._leader_join_window[idx][1].example_id)
                    for idx in range(start_pos, end_pos)]
            self._joiner_stats.fill_leader_example_ids(eids)
            metrics.emit_timer(name='stream_joiner_fill_leader_join_window',
                               value=int(time.time()-start_tm),
                               tags=self._metrics_tags)
        return self._fill_leader_enough

    def _fill_follower_join_window(self, raw_data_finished):
        """Fill the follower window (also populating the follower example
        cache); returns True when enough arrived or the stream ended."""
        start_tm = time.time()
        start_pos = self._follower_join_window.size()
        follower_enough = self._fill_join_windows(self._follower_visitor,
                                                  self._follower_join_window,
                                                  self._follower_example_cache)
        end_pos = self._follower_join_window.size()
        eids = [(self._follower_join_window[idx][0],
                 self._follower_join_window[idx][1].example_id)
                for idx in range(start_pos, end_pos)]
        self._joiner_stats.fill_follower_example_ids(eids)
        # FIXME(review): metric name says 'leader' — copy-paste from
        # _fill_leader_join_window; both sides report under the same
        # timer name.
        metrics.emit_timer(name='stream_joiner_fill_leader_join_window',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)
        return follower_enough or raw_data_finished

    def _fill_join_windows(self, visitor, join_window, join_cache):
        """Grow *join_window* in batches until the quantile point can
        advance (forward_pt) or the window hits max size."""
        while not visitor.finished() and \
                join_window.size() < self._max_window_size:
            required_item_count = self._min_window_size
            if join_window.size() >= self._min_window_size:
                required_item_count *= 2
            if required_item_count >= self._max_window_size:
                required_item_count = self._max_window_size
            self._consume_item_until_count(
                visitor, join_window, required_item_count, join_cache
            )
            if join_window.forward_pt():
                return True
        return join_window.size() >= self._max_window_size

    def _evict_if_useless(self, item):
        """Evict items older than the committed point or already joined;
        count a never-joined evicted item as 'unjoined' in the stats."""
        outdated = self._leader_join_window.committed_pt() is None \
                or _CmpCtnt(item) < self._leader_join_window.committed_pt()
        if outdated and item.example_id not in self._joined_cache:
            self._optional_stats.update_stats(item, kind='unjoined')
        return outdated or item.example_id in self._joined_cache

    def _evict_if_force(self, item):
        """Evict items older than the leader window's quantile point."""
        outdated = self._leader_join_window.qt() is None or \
                _CmpCtnt(item) < self._leader_join_window.qt()
        if outdated:
            self._optional_stats.update_stats(item, kind='unjoined')
        return outdated

    def _evict_impl(self, candidates, filter_fn):
        """Drop items matching *filter_fn* (also removing them from the
        follower example cache); return the surviving (index, item)s."""
        reserved_items = []
        for (index, item) in candidates:
            example_id = item.example_id
            if filter_fn(item):
                self._follower_example_cache.pop(example_id, None)
            else:
                reserved_items.append((index, item))
        return reserved_items

    def _evit_stale_follower_cache(self):
        """Shrink the follower window: first drop useless items, then if
        still at capacity force-evict by quantile point."""
        start_tm = time.time()
        tmp_sz = self._follower_join_window.size()
        reserved_items = self._evict_impl(self._follower_join_window,
                                          self._evict_if_useless)
        logging.info("evict_if_useless %d to %d",
                     tmp_sz, len(reserved_items))
        if len(reserved_items) < self._max_window_size:
            # NOTE(review): this early return skips the emit_timer below,
            # so the timer only fires on the force-evict path.
            self._follower_join_window.reset(reserved_items, False)
            return
        tmp_sz = len(reserved_items)
        reserved_items = self._evict_impl(reserved_items,
                                          self._evict_if_force)
        logging.info("evict_if_force %d to %d", tmp_sz, len(reserved_items))
        self._follower_join_window.reset(reserved_items, False)
        metrics.emit_timer(name='stream_joiner_evit_stale_follower_cache',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)

    def _consume_item_until_count(self, visitor, windows,
                                  required_item_count, cache=None):
        """Append valid items from *visitor* into *windows* (and *cache*
        keyed by example_id, when given) until the required count is met;
        invalid items are skipped with a warning."""
        for (index, item) in visitor:
            if item.example_id == common.InvalidExampleId:
                logging.warning("ignore item indexed as %d from %s since "\
                                "invalid example id", index, visitor.name())
            elif item.event_time == common.InvalidEventTime:
                logging.warning("ignore item indexed as %d from %s since "\
                                "invalid event time", index, visitor.name())
            else:
                windows.append(index, item)
                if cache is not None:
                    cache[item.example_id] = (index, item)
                if windows.size() >= required_item_count:
                    return
        assert visitor.finished(), "visitor shoud be finished of "\
                                   "required_item is not satisfied"

    def _finish_data_block(self):
        """Finish the current block and recompute the follower restart
        index as the smallest still-needed follower position."""
        meta = super(StreamExampleJoiner, self)._finish_data_block()
        self._follower_restart_index = self._follower_visitor.get_index()
        if self._follower_join_window.size() > 0:
            self._follower_restart_index = \
                    self._follower_join_window[0][0]
        for index, _ in self._joined_cache.values():
            if index < self._follower_restart_index:
                self._follower_restart_index = index
        return meta
class AttributionJoiner(ExampleJoiner):
    """Joiner that attributes follower (conversion-like) events to leader
    (show-like) events within a maximum conversion delay.

    Matches are kept in a list sorted by leader index and flushed once
    their event time falls at or below the trigger watermark.
    """

    def __init__(self, example_joiner_options, raw_data_options,
                 data_block_builder_options, kvstore, data_source,
                 partition_id):
        """Read all tunables from example_joiner_options and build the
        sliding windows, trigger, attributor and dedup state."""
        super(AttributionJoiner, self).__init__(example_joiner_options,
                                                raw_data_options,
                                                data_block_builder_options,
                                                kvstore, data_source,
                                                partition_id)
        self._min_window_size = example_joiner_options.min_matching_window
        # max_window_size must be lesser than max_conversion_delay
        self._max_window_size = example_joiner_options.max_matching_window
        self._max_conversion_delay = \
                example_joiner_options.max_conversion_delay
        # Leader window capacity is a large constant (2**20 slots).
        self._leader_join_window = _SlidingWindow(self._min_window_size,
                                                  2**20)
        self._follower_join_window = _SlidingWindow(self._min_window_size,
                                                    self._max_window_size)
        self._leader_restart_index = -1
        # Matched (fe, li, fi) triples kept sorted by leader index li.
        self._sorted_buf_by_leader_index = []
        # follower index -> (leader index, leader event time) dedup slot.
        self._dedup_by_follower_index = {}
        self._trigger = _Trigger(self._max_conversion_delay)
        attri = _Attributor(self._max_conversion_delay)
        self._acc = _Accumulator(attri)
        self._enable_negative_example_generator = \
                example_joiner_options.enable_negative_example_generator
        if self._enable_negative_example_generator:
            sf = example_joiner_options.negative_sampling_rate
            self._negative_example_generator = NegativeExampleGenerator(sf)

    @classmethod
    def name(cls):
        """Registry name of this joiner implementation."""
        return 'ATTRIBUTION_JOINER'

    def _inner_joiner(self, state_stale):
        """Generator driving one join round: fill both windows, attribute
        matches, dump watermark-expired pairs as data blocks, then slide
        the windows by the trigger's stride."""
        if self.is_join_finished():
            return
        sync_example_id_finished, raw_data_finished = \
                self._prepare_join(state_stale)
        join_data_finished = False
        while True:
            leader_filled = self._fill_leader_join_window()
            leader_exhausted = sync_example_id_finished and \
                    self._leader_join_window.et_span() < \
                    self._max_conversion_delay
            # follower_filled is only reported in the log line below.
            follower_filled = self._fill_follower_join_window()
            logging.info("Fill: leader_filled=%s, leader_exhausted=%s,"\
                         " follower_filled=%s,"\
                         " sync_example_id_finished=%s, raw_data_finished=%s"\
                         " leader_win_size=%d, follower_win_size=%d",\
                         leader_filled, leader_exhausted, \
                         follower_filled, sync_example_id_finished, \
                         raw_data_finished,
                         self._leader_join_window.size(), \
                         self._follower_join_window.size())
            watermark = self._trigger.watermark()
            #1. find all the matched pairs in current window
            raw_pairs, mismatches = self._acc.join(self._follower_join_window,\
                                            self._leader_join_window)
            if self._enable_negative_example_generator:
                self._negative_example_generator.update(mismatches)
            #2. cache the pairs, evict the show events which are out of
            # watermark
            pairs = self._sort_and_evict_attri_buf(raw_pairs, watermark)
            #3. push the result into builder
            if len(pairs) > 0:
                for meta in self._dump_joined_items(pairs):
                    yield meta
                self._leader_restart_index = pairs[len(pairs) - 1][1]
                self._follower_restart_index = pairs[len(pairs) - 1][2]
            logging.info("Restart index of leader %d, follwer %d, pair_buf=%d,"\
                         " raw_pairs=%d, pairs=%d", self._leader_restart_index,\
                         self._follower_restart_index,
                         len(self._sorted_buf_by_leader_index), len(raw_pairs),
                         len(pairs))
            #4. update the watermark
            stride = self._trigger.trigger(self._follower_join_window, \
                                           self._leader_join_window)
            self._follower_join_window.forward(stride[0])
            self._leader_join_window.forward(stride[1])
            if not leader_filled and \
                    not sync_example_id_finished and \
                    self._leader_join_window.reserved_size() > 0:
                logging.info("Wait for Leader syncing example id...")
                break
            if leader_exhausted:
                join_data_finished = True
                break
            if stride == (0, 0):
                # No progress was made: force the windows forward when
                # their input streams have ended.
                if raw_data_finished:
                    self._leader_join_window.forward(
                        self._leader_join_window.size())
                if sync_example_id_finished:
                    force_stride = \
                            self._trigger.shrink(self._follower_join_window)
                    self._follower_join_window.forward(force_stride)
        if self._get_data_block_builder(False) is not None and \
                (self._need_finish_data_block_since_interval() or
                    join_data_finished):
            yield self._finish_data_block()
        if join_data_finished:
            self._set_join_finished()
            logging.info("finish join example for partition %d by %s",
                         self._partition_id, self.name())

    def _latest_attri(self, index):
        """Binary search: first position whose leader index is greater
        than *index* (hand-rolled bisect_right over buf[i][1])."""
        lf, rt = 0, len(self._sorted_buf_by_leader_index)
        while lf < rt:
            mid = (lf + rt) // 2
            if index < self._sorted_buf_by_leader_index[mid][1]:
                rt = mid
            else:
                lf = mid + 1
        return lf

    def _sort_and_evict_attri_buf(self, raw_matches, watermark):
        """
        Push the matched pairs to order-by-leader-index list, and
        evict the pairs which are out of watermark.

        raw_matches holds (follower_window_pos, leader_window_pos)
        tuples; returns the (fe, li, fi) triples ready to be dumped.
        """
        for (cid, sid) in raw_matches:
            #fi: follower index, fe: follower example
            assert cid < self._follower_join_window.size(), "Invalid l index"
            assert sid < self._leader_join_window.size(), "Invalid f index"
            (fi, fe) = self._follower_join_window[cid]
            (li, le) = self._leader_join_window[sid]
            assert fe.example_id == le.example_id, "Example id must be equal"
            if li <= self._leader_restart_index:
                logging.warning("Unordered event ignored, leader index should"\
                                " be greater %d > %d for follower idx %d is" \
                                " false", li, self._leader_restart_index, fi)
                continue
            # cache the latest show event
            # (keep the leader event with the smallest event time seen
            # so far for this follower index)
            updated = False
            if fi in self._dedup_by_follower_index:
                if self._dedup_by_follower_index[fi][1] > le.event_time:
                    self._dedup_by_follower_index[fi] = (li, le.event_time)
                    updated = True
            else:
                self._dedup_by_follower_index[fi] = (li, le.event_time)
                updated = True
            # sort by leader index
            if not updated:
                continue
            latest_pos = self._latest_attri(li)
            if latest_pos > 0:
                # remove the dups
                latest_item = \
                        self._sorted_buf_by_leader_index[latest_pos - 1]
                if latest_item[1] == li and latest_item[2] == fi:
                    continue
            self._sorted_buf_by_leader_index.insert(latest_pos, \
                                                    (fe, li, fi))
        # Flush the watermark-expired prefix; entries whose dedup slot
        # now points at a different leader index are dropped.
        matches = []
        idx = 0
        for (fe, li, fi) in self._sorted_buf_by_leader_index:
            if fe.event_time <= watermark:
                assert fi in self._dedup_by_follower_index, "Invalid f index"
                (leader_index, _) = self._dedup_by_follower_index[fi]
                if leader_index == li:
                    matches.append((fe, li, fi))
                    del self._dedup_by_follower_index[fi]
                else:
                    logging.info("Example %s matching leader index %s is"\
                                 " older than %d", fe.example_id, li, \
                                 leader_index)
            else:
                # FIXME: Assume the unordered range is limited,
                # or this will bring an out-of-memory crash
                break
            idx += 1
        self._sorted_buf_by_leader_index \
                = self._sorted_buf_by_leader_index[idx:]
        return matches

    # NOTE(review): earlier marked "useless", but this IS called from
    # _prepare_join when the state is stale.
    def _reset_joiner_state(self, state_stale):
        """Clear the leader window (and follower window on stale state)."""
        self._leader_join_window.reset([], state_stale)
        if state_stale:
            self._follower_join_window.reset([], True)

    def _prepare_join(self, state_stale):
        """Reset windows on stale state, then delegate to the base class."""
        if state_stale:
            self._reset_joiner_state(True)
        return super(AttributionJoiner, self)._prepare_join(state_stale)

    def _dump_joined_items(self, matching_list):
        """Append matched (fe, li, fi) triples — plus optional negative
        examples for skipped leader index ranges — to the builder,
        yielding each finished block's meta."""
        start_tm = time.time()
        prev_leader_idx = self._leader_restart_index + 1
        for item in matching_list:
            (fe, li, fi) = item
            if self._enable_negative_example_generator and li > prev_leader_idx:
                for example in \
                    self._negative_example_generator.generate(
                        fe, prev_leader_idx, li):
                    builder = self._get_data_block_builder(True)
                    assert builder is not None, "data block builder must be "\
                                                "not None if before dummping"
                    builder.append_item(example[0], example[1], example[2],
                                        None, True)
                    if builder.check_data_block_full():
                        yield self._finish_data_block()
            prev_leader_idx = li + 1
            builder = self._get_data_block_builder(True)
            assert builder is not None, "data block builder must be "\
                                        "not None if before dummping"
            builder.append_item(fe, li, fi, None, True)
            if builder.check_data_block_full():
                yield self._finish_data_block()
        metrics.emit_timer(name='attribution_joiner_dump_joined_items',
                           value=int(time.time() - start_tm),
                           tags=self._metrics_tags)

    def _fill_leader_join_window(self):
        """Fill the leader window; report new example ids to the joiner
        stats. Returns True when new items were appended."""
        start_tm = time.time()
        idx = self._leader_join_window.size()
        filled_new_example = self._fill_join_windows(self._leader_visitor,
                                                     self._leader_join_window)
        eids = []
        while idx < self._leader_join_window.size():
            eids.append((self._leader_join_window[idx][0],
                        self._leader_join_window[idx][1].example_id))
            idx += 1
        self._joiner_stats.fill_leader_example_ids(eids)
        metrics.emit_timer(name='attribution_joiner_fill_leader_join_window',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)
        return filled_new_example

    def _fill_follower_join_window(self):
        """Fill the follower window; report new example ids to the joiner
        stats. Returns True when new items were appended."""
        start_tm = time.time()
        idx = self._follower_join_window.size()
        filled_new_example = self._fill_join_windows(
            self._follower_visitor, self._follower_join_window)
        eids = []
        while idx < self._follower_join_window.size():
            eids.append((self._follower_join_window[idx][0],
                        self._follower_join_window[idx][1].example_id))
            idx += 1
        self._joiner_stats.fill_follower_example_ids(eids)
        metrics.emit_timer(name='attribution_joiner_fill_follower_join_window',
                           value=int(time.time()-start_tm),
                           tags=self._metrics_tags)
        return filled_new_example

    def _fill_join_windows(self, visitor, join_window):
        """Consume from *visitor* until the window is full or the visitor
        is exhausted; returns True when the window grew."""
        size = join_window.size()
        while not visitor.finished() and not join_window.is_full():
            required_item_count = join_window.reserved_size()
            self._consume_item_until_count(visitor, join_window,
                                           required_item_count)
        return join_window.size() > size

    def _consume_item_until_count(self, visitor, windows,
                                  required_item_count):
        """Append valid items from *visitor* into *windows* until it holds
        at least *required_item_count* items; items with invalid example
        id or event time are skipped with a warning."""
        for (index, item) in visitor:
            if item.example_id == common.InvalidExampleId:
                logging.warning("ignore item indexed as %d from %s since "\
                                "invalid example id", index, visitor.name())
            elif item.event_time == common.InvalidEventTime:
                logging.warning("ignore item indexed as %d from %s since "\
                                "invalid event time", index, visitor.name())
            else:
                windows.append(index, item)
                if windows.size() >= required_item_count:
                    return
        assert visitor.finished(), "visitor shoud be finished of "\
                                   "required_item is not satisfied"