Example #1
 def merge_sort_runs(self, input_fpaths):
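     # Reuse the existing output if this partition's sort runs were already merged.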
     if self._check_merged():
         logging.info("sort runs have been merged for partition %d",
                      self._partition_id)
         return self._list_merged_sort_run_fpath()
     if len(input_fpaths) == 0:
         logging.info("no sort run for partition %d", self._partition_id)
         return []
     dumped_item, next_process_index = self._sync_merged_state()
     readers = self._create_sort_run_readers(input_fpaths)
     pque = queue.PriorityQueue(len(input_fpaths) + 1)
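     # Seed the priority queue with the first eligible item from each unfinished sort run reader.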
     for idx, reader in enumerate(readers):
         if not reader.finished():
             for item in reader:
                 if dumped_item is None or \
                         not self._comparator(item, dumped_item):
                     pque.put(item)
                     break
     writer = self._create_sort_run_merger_writer(next_process_index)
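     # Pop items in sorted order, write them out, and refill from the reader they came from.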
     while pque.qsize() > 0:
         item = pque.get()
         writer.append(item.inner_item)
         assert item.reader_index < len(readers)
         self._replenish_item(readers[item.reader_index], pque)
         if common.get_oom_risk_checker().check_oom_risk(0.85):
             gc_cnt = gc.collect()
             logging.info('actively trigger gc since oom risk, '
                          '%d objects collected', gc_cnt)
     writer.finish()
     return writer.get_merged_fpaths()
Example #2
 def _id_batch_fetch_fn(self):
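     # Fetch id batches starting at the index the rsa signer expects next, waking it up per batch.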
     next_index = self._psi_rsa_signer.get_next_index_to_fetch()
     for batch in self._id_batch_fetcher.make_processor(next_index):
         logging.debug("%s fetch batch begin at %d, len %d. wakeup %s",
                       self._id_batch_fetcher_name(), batch.begin_index,
                       len(batch), self._psi_rsa_signer_name())
         self._wakeup_psi_rsa_signer()
         if get_oom_risk_checker().check_oom_risk(0.80):
             logging.warning('early stop the id fetch '\
                             'since the oom risk')
             break
Example #3
 def _raw_data_batch_fetch_fn(self):
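     # Fetch raw data batches for the partitioner, stopping early when oom risk is detected.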
     next_part_index = self._get_next_part_index()
     fetcher = self._raw_data_batch_fetcher
     for batch in fetcher.make_processor(next_part_index):
         logging.debug("fetch batch begin at %d, len %d. wakeup "\
                       "partitioner", batch.begin_index, len(batch))
         self._wakeup_partitioner()
         if common.get_oom_risk_checker().check_oom_risk(0.80):
             logging.warning('early stop the raw data fetch '\
                             'since the oom risk')
             break
Example #4
 def _raw_data_part_fn(self):
     if self._check_finished_tag():
         logging.warning("raw data has been parttedfor rank id of parti"\
                         "tioner %d", self._options.partitioner_rank_id)
         self._notify_part_finished()
         return
     self._sync_partitioner_state()
     assert self._dumped_process_index is not None
     assert len(self._flying_writers) == 0
     fetcher = self._raw_data_batch_fetcher
     fetch_finished = False
     iter_round = 0
     next_index = self._get_next_part_index()
     hint_index = None
     bp_options = self._options.batch_processor_options
     signal_round_threshold = bp_options.max_flying_item * 3 // \
             5 // bp_options.batch_size + 1
     while not fetch_finished:
         fetch_finished, batch, hint_index = \
                 fetcher.fetch_item_batch_by_index(next_index, hint_index)
         if batch is not None:
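             # Route each item to an output partition by hashing its partition field.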
             for index, item in enumerate(batch):
                 raw_id = getattr(item, self._part_field)
                 partition_id = CityHash32(raw_id) % \
                         self._options.output_partition_num
                 writer = self._get_file_writer(partition_id)
                 writer.append_item(batch.begin_index + index, item)
             next_index += len(batch)
             iter_round += 1
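             # Every few rounds, or under oom risk, flush the writers and persist partition progress.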
             oom_risk = common.get_oom_risk_checker().check_oom_risk(0.70)
             if iter_round % signal_round_threshold == 0 or oom_risk:
                 self._finish_file_writers()
                 self._set_next_part_index(next_index)
                 hint_index = self._evict_staless_batch(
                     hint_index, next_index - 1)
                 logging.info("consumed %d items", next_index - 1)
                 if oom_risk:
                     gc_cnt = gc.collect()
                     logging.warning("earily finish writer partition "\
                                     "writer since oom risk, trigger "\
                                     "gc %d actively", gc_cnt)
                 self._wakeup_raw_data_fetcher()
         elif not fetch_finished:
             with self._cond:
                 self._cond.wait(1)
     self._finish_file_writers()
     self._dump_finished_tag()
     for partition_id, metas in self._dumped_file_metas.items():
         logging.info("part %d output %d files by partitioner",
                      partition_id, len(metas))
         for meta in metas:
             logging.info("%s", meta.encode_meta_to_fname())
         logging.info("-----------------------------------")
     self._notify_part_finished()
Example #5
 def _sort_run_dump_cond(self):
     sort_run_dumper = self._sort_run_dumper
     rsa_signer = self._psi_rsa_signer
     next_index = sort_run_dumper.get_next_index_to_dump()
     max_flying_item = self._options.batch_processor_options.max_flying_item
     dump_finished = sort_run_dumper.is_dump_finished()
     signed_finished = rsa_signer.get_process_finished()
     flying_item_cnt = rsa_signer.get_flying_item_count()
     flying_begin_index = rsa_signer.get_flying_begin_index()
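     # Dump only while dumping is unfinished and either signing is done, or the next index to dump
     # is inside the flying window with a large enough backlog (or oom risk is high).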
     return not dump_finished and \
             (signed_finished or
              (flying_begin_index is not None and
               next_index is not None and
               (flying_begin_index <= next_index <
                flying_begin_index + flying_item_cnt) and
               (flying_item_cnt - (next_index - flying_begin_index) >=
                max_flying_item // 3 or
                get_oom_risk_checker().check_oom_risk(0.70))))
Example #6
 def _id_batch_fetch_cond(self):
     next_index = self._psi_rsa_signer.get_next_index_to_fetch()
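     # Fetch more id batches only while needed and oom risk stays below the 0.80 threshold.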
     return self._id_batch_fetcher.need_process(next_index) and \
             not get_oom_risk_checker().check_oom_risk(0.80)
Example #7
 def _raw_data_batch_fetch_cond(self):
     next_part_index = self._get_next_part_index()
     return self._raw_data_batch_fetcher.need_process(next_part_index) and \
             not common.get_oom_risk_checker().check_oom_risk(0.80)