def _wait_timestamp(self, target_l, target_f):
     """Block until both data sources have reached the target datetimes.

     Each round asks both master clients for the latest raw-data timestamp
     of every partition and keeps the earliest one per side. The call
     returns once the earliest `l` datetime is >= `target_l` and the
     earliest `f` datetime is >= `target_f`; otherwise it sleeps two
     seconds and polls again. There is no timeout.

     Args:
         target_l: datetime the `l` side has to reach on every partition.
         target_f: datetime the `f` side has to reach on every partition.
     """
     caught_up = False
     while not caught_up:
         earliest_l = None
         earliest_f = None
         # The partition count of the `f` data source drives the loop for
         # both sides (presumably both sources share it -- TODO confirm).
         partition_num = self._data_source_f.data_source_meta.partition_num
         for partition_id in range(partition_num):
             request_l = dj_pb.RawDataRequest(
                 partition_id=partition_id,
                 data_source_meta=self._data_source_l.data_source_meta)
             request_f = dj_pb.RawDataRequest(
                 partition_id=partition_id,
                 data_source_meta=self._data_source_f.data_source_meta)
             response_l = self._master_client_l.GetRawDataLatestTimeStamp(
                 request_l)
             response_f = self._master_client_f.GetRawDataLatestTimeStamp(
                 request_f)
             latest_l = common.convert_timestamp_to_datetime(
                 response_l.timestamp)
             latest_f = common.convert_timestamp_to_datetime(
                 response_f.timestamp)
             # Track the slowest partition on each side.
             if earliest_l is None or latest_l < earliest_l:
                 earliest_l = latest_l
             if earliest_f is None or latest_f < earliest_f:
                 earliest_f = latest_f
         caught_up = earliest_l >= target_l and earliest_f >= target_f
         if not caught_up:
             time.sleep(2)
 def _add_raw_data_impl(self, notify_ctx, portal_manifest, ds_pid):
     """Register one batch of hourly portal output files as raw data.

     Scanning starts one hour after the datetime returned by
     `notify_ctx.get_raw_data_updated_datetime(ds_pid)` (but never before
     the hour-trimmed begin timestamp of the manifest) and walks forward
     hour by hour up to the committed hour of the manifest. For every
     hour, the existing output files of the portal partitions
     `ds_pid, ds_pid + ds_ptnum, ...` are collected. The scan stops early
     once more than 32 files have been gathered, so a single call handles
     a bounded batch.

     Args:
         notify_ctx: context exposing the data source and `add_raw_data`.
         portal_manifest: manifest providing begin/committed timestamps,
             the output base dir and the output partition number.
         ds_pid: partition id of the data source being fed.

     Returns:
         True when the scan has reached the committed hour, False when
         the caller needs to call again to continue with the next batch.
     """
     dt = notify_ctx.get_raw_data_updated_datetime(ds_pid) + \
             timedelta(hours=1)
     begin_dt = common.convert_timestamp_to_datetime(
         common.trim_timestamp_by_hourly(portal_manifest.begin_timestamp))
     if dt < begin_dt:
         dt = begin_dt
     # Trim to the hour like every other reader of committed_timestamp:
     # dt is always hour-aligned, so with an unaligned committed timestamp
     # `dt == committed_dt` could never hold and the loop would overshoot,
     # reporting a datetime one hour past the committed hour.
     committed_dt = common.convert_timestamp_to_datetime(
         common.trim_timestamp_by_hourly(
             portal_manifest.committed_timestamp))
     fpaths = []
     timestamps = []
     ds_ptnum = notify_ctx.data_source.data_source_meta.partition_num
     while dt <= committed_dt:
         # Portal partitions are mapped onto data source partitions by
         # striding through them with the data source partition count.
         for pt_pid in range(ds_pid, portal_manifest.output_partition_num,
                             ds_ptnum):
             fpath = common.encode_portal_hourly_fpath(
                 portal_manifest.output_data_base_dir, dt, pt_pid)
             if gfile.Exists(fpath):
                 fpaths.append(fpath)
                 timestamps.append(common.convert_datetime_to_timestamp(dt))
         # Stop without advancing so that dt names the last scanned hour.
         if len(fpaths) > 32 or dt == committed_dt:
             break
         dt += timedelta(hours=1)
     # NOTE(review): when nothing is left to scan, dt is already past
     # committed_dt here and is still handed to add_raw_data -- confirm
     # that add_raw_data ignores the datetime for an empty file list.
     notify_ctx.add_raw_data(ds_pid, fpaths, timestamps, dt)
     logging.info("add %d raw data file for partition %d of data "\
                  "source %s. latest updated datetime %s",
                   len(fpaths), ds_pid,
                   notify_ctx.data_source.data_source_meta.name, dt)
     return dt >= committed_dt
# Example #3
# 0
 def _get_required_datetime(cls, portal_manifest):
     """Return the next hour that has to be handled for the manifest.

     That is the hour right after the hour-trimmed committed timestamp,
     unless the hour-trimmed begin timestamp is at or beyond that hour,
     in which case the begin hour is returned.
     """
     committed_hour_ts = common.trim_timestamp_by_hourly(
         portal_manifest.committed_timestamp)
     next_uncommitted = common.convert_timestamp_to_datetime(
         committed_hour_ts) + timedelta(hours=1)
     begin_hour_ts = common.trim_timestamp_by_hourly(
         portal_manifest.begin_timestamp)
     begin_hour = common.convert_timestamp_to_datetime(begin_hour_ts)
     # max() returns its first argument on a tie, i.e. the begin hour.
     return max(begin_hour, next_uncommitted)
    def setUp(self):
        """Prepare the fixture: storage, input data for both sides, services.

        Hourly portal input is generated for the `l` side (4 hours) and the
        `f` side (5 hours), 1 << 13 items per hour. One hour per side is
        deliberately left out (index 1 for `l`, index 2 for `f`); its
        datetime, start index and item count are remembered in the
        `_missing_*` attributes. Masters, workers and portals are launched
        last.
        """
        self._setUpEtcd()
        self._setUpDataSource()
        self._setUpPortalManifest()
        self._remove_existed_dir()
        items_per_hour = 1 << 13

        # `l` side: four hourly slots, slot 1 is skipped.
        self._item_num_l = 0
        self._event_time_filter_l = lambda x: x % 877 == 0
        self._dt_l = common.convert_timestamp_to_datetime(
            self._portal_manifest_l.begin_timestamp)
        for hour_idx in range(4):
            if hour_idx != 1:
                self._generate_portal_input_data(self._dt_l,
                                                 self._event_time_filter_l,
                                                 self._item_num_l,
                                                 items_per_hour,
                                                 self._portal_manifest_l)
            else:
                self._missing_datetime_l = self._dt_l
                self._missing_start_index_l = self._item_num_l
                self._missing_item_cnt_l = items_per_hour
            # The item index advances even for the skipped hour.
            self._item_num_l += items_per_hour
            self._dt_l += timedelta(hours=1)

        # `f` side: five hourly slots, slot 2 is skipped.
        self._item_num_f = 0
        self._event_time_filter_f = lambda x: x % 907 == 0
        self._dt_f = common.convert_timestamp_to_datetime(
            self._portal_manifest_f.begin_timestamp)
        for hour_idx in range(5):
            if hour_idx != 2:
                self._generate_portal_input_data(self._dt_f,
                                                 self._event_time_filter_f,
                                                 self._item_num_f,
                                                 items_per_hour,
                                                 self._portal_manifest_f)
            else:
                self._missing_datetime_f = self._dt_f
                self._missing_start_index_f = self._item_num_f
                self._missing_item_cnt_f = items_per_hour
            self._item_num_f += items_per_hour
            self._dt_f += timedelta(hours=1)

        self._launch_masters()
        self._launch_workers()
        self._launch_portals()
 def get_raw_data_updated_datetime(self, partition_id):
     """Return the cached "raw data updated" datetime of a partition.

     On a cache miss the latest raw-data timestamp of the partition is
     fetched from the controller, moved back by one hour (clamped at
     zero seconds), trimmed to the hour, converted to a datetime and
     stored in the cache before being returned.
     """
     if partition_id in self._raw_data_updated_datetime:
         return self._raw_data_updated_datetime[partition_id]
     latest_ts = self._raw_date_controller.get_raw_data_latest_timestamp(
         partition_id)
     # Step back one hour without letting the seconds go negative.
     latest_ts.seconds = max(latest_ts.seconds - 3600, 0)
     hourly_ts = common.trim_timestamp_by_hourly(latest_ts)
     self._raw_data_updated_datetime[partition_id] = \
             common.convert_timestamp_to_datetime(hourly_ts)
     return self._raw_data_updated_datetime[partition_id]
# Example #6
# 0
 def _update_portal_commited_timestamp(self, new_committed_datetime):
     """Commit a copy of the portal manifest with a newer committed hour.

     Under the lock, the new datetime is asserted to be later than the
     hour-trimmed committed timestamp of the current manifest, and the
     current manifest is copied. The copy then receives the hour-trimmed
     new committed timestamp and is committed through
     `common.commit_portal_manifest`. `self._portal_manifest` itself is
     not reassigned here.

     Returns:
         The newly committed manifest.
     """
     updated_manifest = None
     with self._lock:
         current_ts = common.trim_timestamp_by_hourly(
             self._portal_manifest.committed_timestamp)
         current_committed = common.convert_timestamp_to_datetime(
             current_ts)
         assert new_committed_datetime > current_committed
         updated_manifest = common_pb.DataJoinPortalManifest()
         updated_manifest.MergeFrom(self._portal_manifest)
     assert updated_manifest is not None
     new_ts = common.trim_timestamp_by_hourly(
         common.convert_datetime_to_timestamp(new_committed_datetime))
     updated_manifest.committed_timestamp.MergeFrom(new_ts)
     common.commit_portal_manifest(self._etcd, updated_manifest)
     return updated_manifest
# Example #7
# 0
 def _check_datetime_stale(self, date_time):
     """Tell whether `date_time` is already known and needs no handling.

     Returns True when `date_time` is not later than the hour-trimmed
     committed timestamp of the manifest, or when it is already present
     in `_input_ready_datetime` or `_output_finished_datetime`. Both
     lists are searched with bisect and are therefore expected to be
     kept sorted. Returns False otherwise.
     """
     def _sorted_contains(sorted_items, value):
         # Binary-search membership test on an ascending list.
         pos = bisect.bisect_left(sorted_items, value)
         return pos < len(sorted_items) and sorted_items[pos] == value

     with self._lock:
         committed_ts = common.trim_timestamp_by_hourly(
             self._portal_manifest.committed_timestamp)
         if date_time <= common.convert_timestamp_to_datetime(committed_ts):
             return True
         return (_sorted_contains(self._input_ready_datetime, date_time) or
                 _sorted_contains(self._output_finished_datetime,
                                  date_time))
# Example #8
# 0
 def _prepare_test(self):
     """Build the manifest, portal options and base datetime for a test.

     Sets `_portal_manifest` (4 input / 8 output partitions, local
     input/output dirs), `_portal_options` (event-time validating example
     validator, TF_RECORD raw data iterator, mock etcd) and `_date_time`
     (the current time trimmed to the hour), then generates input data.
     """
     self._portal_manifest = common_pb.DataJoinPortalManifest(
         name='test_portal',
         input_partition_num=4,
         output_partition_num=8,
         input_data_base_dir='./portal_input',
         output_data_base_dir='./portal_output')
     validator_options = dj_pb.ExampleValidatorOptions(
         example_validator='EXAMPLE_VALIDATOR',
         validate_event_time=True,
     )
     raw_data_options = dj_pb.RawDataOptions(raw_data_iter='TF_RECORD')
     self._portal_options = dj_pb.DataJoinPotralOptions(
         example_validator=validator_options,
         reducer_buffer_size=128,
         raw_data_options=raw_data_options,
         use_mock_etcd=True)
     # Round-trip through a timestamp to cut "now" down to the hour.
     now_ts = common.convert_datetime_to_timestamp(datetime.now())
     self._date_time = common.convert_timestamp_to_datetime(
         common.trim_timestamp_by_hourly(now_ts))
     self._generate_input_data()