def _wait_timestamp(self, target_l, target_f):
    """Block until the raw data on both sides has reached the targets.

    Each round asks both master clients for the latest raw data
    timestamp of every partition. The method returns once the oldest
    timestamp on the `_l` side is at or past `target_l` and the oldest
    one on the `_f` side is at or past `target_f`; otherwise it sleeps
    two seconds and polls again.

    Args:
        target_l: datetime the `_l` data source has to reach.
        target_f: datetime the `_f` data source has to reach.
    """
    while True:
        seen_l = []
        seen_f = []
        # The partition count of the `_f` data source drives both sides.
        partition_num = self._data_source_f.data_source_meta.partition_num
        for partition_id in range(partition_num):
            request_l = dj_pb.RawDataRequest(
                partition_id=partition_id,
                data_source_meta=self._data_source_l.data_source_meta)
            request_f = dj_pb.RawDataRequest(
                partition_id=partition_id,
                data_source_meta=self._data_source_f.data_source_meta)
            response_l = \
                self._master_client_l.GetRawDataLatestTimeStamp(request_l)
            response_f = \
                self._master_client_f.GetRawDataLatestTimeStamp(request_f)
            seen_l.append(
                common.convert_timestamp_to_datetime(response_l.timestamp))
            seen_f.append(
                common.convert_timestamp_to_datetime(response_f.timestamp))
        # The slowest partition decides whether a side has caught up.
        oldest_l = min(seen_l, default=None)
        oldest_f = min(seen_f, default=None)
        if oldest_l >= target_l and oldest_f >= target_f:
            return
        time.sleep(2)
def _add_raw_data_impl(self, notify_ctx, portal_manifest, ds_pid):
    """Register the next batch of hourly portal output files as raw data.

    Scanning starts one hour after the datetime `notify_ctx` reports as
    already updated for `ds_pid` (but never before the hour-trimmed
    begin timestamp of the manifest) and walks forward hour by hour up
    to the committed timestamp. For each hour, the portal output
    partitions ds_pid, ds_pid + N, ds_pid + 2N, ... (N being the data
    source partition number) are probed and every existing file is
    collected. The scan stops early once more than 32 files have been
    gathered.

    Args:
        notify_ctx: context that supplies the updated datetime and
            receives the collected files through `add_raw_data`.
        portal_manifest: manifest providing the begin/committed
            timestamps, the output directory and output partition number.
        ds_pid: data source partition id to feed.

    Returns:
        True if the scan reached the committed datetime, False if it
        stopped early and has to be called again.
    """
    one_hour = timedelta(hours=1)
    cursor = notify_ctx.get_raw_data_updated_datetime(ds_pid) + one_hour
    earliest = common.convert_timestamp_to_datetime(
        common.trim_timestamp_by_hourly(portal_manifest.begin_timestamp))
    cursor = max(cursor, earliest)
    committed_dt = common.convert_timestamp_to_datetime(
        portal_manifest.committed_timestamp)
    found_fpaths = []
    found_timestamps = []
    stride = notify_ctx.data_source.data_source_meta.partition_num
    while cursor <= committed_dt:
        portal_pids = range(ds_pid,
                            portal_manifest.output_partition_num,
                            stride)
        for portal_pid in portal_pids:
            candidate = common.encode_portal_hourly_fpath(
                portal_manifest.output_data_base_dir, cursor, portal_pid)
            if not gfile.Exists(candidate):
                continue
            found_fpaths.append(candidate)
            found_timestamps.append(
                common.convert_datetime_to_timestamp(cursor))
        # Stop without advancing so that `cursor` names the last hour
        # that was actually scanned.
        if len(found_fpaths) > 32 or cursor == committed_dt:
            break
        cursor += one_hour
    notify_ctx.add_raw_data(ds_pid, found_fpaths, found_timestamps, cursor)
    logging.info("add %d raw data file for partition %d of data "
                 "source %s. latest updated datetime %s",
                 len(found_fpaths), ds_pid,
                 notify_ctx.data_source.data_source_meta.name, cursor)
    return cursor >= committed_dt
def _get_required_datetime(cls, portal_manifest):
    """Return the next hour that is required for `portal_manifest`.

    That is the hour following the hour-trimmed committed timestamp,
    unless the hour-trimmed begin timestamp is at or after it, in which
    case the begin datetime is returned.

    Args:
        portal_manifest: manifest carrying `begin_timestamp` and
            `committed_timestamp`.

    Returns:
        The later of the begin datetime and committed datetime + 1 hour.
    """
    committed_hour = common.trim_timestamp_by_hourly(
        portal_manifest.committed_timestamp)
    after_committed = \
        common.convert_timestamp_to_datetime(committed_hour) + \
        timedelta(hours=1)
    begin_hour = common.trim_timestamp_by_hourly(
        portal_manifest.begin_timestamp)
    begin_datetime = common.convert_timestamp_to_datetime(begin_hour)
    # max() keeps its first argument on a tie, so the begin datetime wins
    # whenever it is >= the hour after the committed one.
    return max(begin_datetime, after_committed)
def setUp(self):
    """Prepare etcd, data sources, manifests and portal input data.

    Generates hourly input data for the `_l` and the `_f` portal
    manifests, deliberately leaving one hour out on each side and
    remembering what is missing, then launches masters, workers and
    portals.
    """
    self._setUpEtcd()
    self._setUpDataSource()
    self._setUpPortalManifest()
    self._remove_existed_dir()
    # `_l` side: four hours starting at the manifest begin timestamp,
    # 1 << 13 (8192) items per hour.
    self._item_num_l = 0
    self._event_time_filter_l = lambda x: x % 877 == 0
    self._dt_l = common.convert_timestamp_to_datetime(
        self._portal_manifest_l.begin_timestamp)
    for i in range(4):
        if i == 1:
            # Skip the second hour: record its datetime and index range
            # instead of generating it, but still reserve the indices.
            self._missing_datetime_l = self._dt_l
            self._missing_start_index_l = self._item_num_l
            self._missing_item_cnt_l = 1 << 13
            self._item_num_l += self._missing_item_cnt_l
        else:
            self._generate_portal_input_data(self._dt_l,
                                             self._event_time_filter_l,
                                             self._item_num_l, 1 << 13,
                                             self._portal_manifest_l)
            self._item_num_l += 1 << 13
        self._dt_l += timedelta(hours=1)
    # `_f` side: five hours, with the third hour left out.
    self._item_num_f = 0
    self._event_time_filter_f = lambda x: x % 907 == 0
    self._dt_f = common.convert_timestamp_to_datetime(
        self._portal_manifest_f.begin_timestamp)
    for i in range(5):
        if i == 2:
            self._missing_datetime_f = self._dt_f
            self._missing_start_index_f = self._item_num_f
            self._missing_item_cnt_f = 1 << 13
        else:
            self._generate_portal_input_data(self._dt_f,
                                             self._event_time_filter_f,
                                             self._item_num_f, 1 << 13,
                                             self._portal_manifest_f)
        # NOTE(review): the counter is advanced on every iteration,
        # including the skipped hour, so that the missing hour keeps its
        # index range reserved like on the `_l` side -- confirm this
        # nesting is the intended one.
        self._item_num_f += 1 << 13
        self._dt_f += timedelta(hours=1)
    self._launch_masters()
    self._launch_workers()
    self._launch_portals()
def get_raw_data_updated_datetime(self, partition_id):
    """Return the cached raw data updated datetime of a partition.

    On the first request for a partition, the latest raw data timestamp
    is fetched from the raw data controller, moved back by 3600 seconds
    (or set to 0 when it holds 3600 seconds or fewer), trimmed to the
    hour, converted to a datetime and cached. Later requests are served
    from the cache.

    Args:
        partition_id: id of the partition to look up.

    Returns:
        The datetime stored for `partition_id`.
    """
    cache = self._raw_data_updated_datetime
    if partition_id in cache:
        return cache[partition_id]
    latest_ts = self._raw_date_controller.get_raw_data_latest_timestamp(
        partition_id)
    # The timestamp object handed back by the controller is adjusted in
    # place.
    latest_ts.seconds = \
        latest_ts.seconds - 3600 if latest_ts.seconds > 3600 else 0
    hourly_ts = common.trim_timestamp_by_hourly(latest_ts)
    cache[partition_id] = common.convert_timestamp_to_datetime(hourly_ts)
    return cache[partition_id]
def _update_portal_commited_timestamp(self, new_committed_datetime):
    """Commit a copy of the portal manifest with a newer committed time.

    The current manifest is copied while holding the lock, after
    asserting that `new_committed_datetime` is strictly later than the
    hour-trimmed committed datetime it currently holds. The copy then
    gets the hour-trimmed new timestamp merged in and is committed
    through `common.commit_portal_manifest`.

    Args:
        new_committed_datetime: datetime to record as committed.

    Returns:
        The newly committed manifest copy.
    """
    new_manifest = None
    with self._lock:
        current_hour = common.trim_timestamp_by_hourly(
            self._portal_manifest.committed_timestamp)
        old_committed_datetime = \
            common.convert_timestamp_to_datetime(current_hour)
        assert new_committed_datetime > old_committed_datetime
        new_manifest = common_pb.DataJoinPortalManifest()
        new_manifest.MergeFrom(self._portal_manifest)
    assert new_manifest is not None
    new_committed_ts = common.trim_timestamp_by_hourly(
        common.convert_datetime_to_timestamp(new_committed_datetime))
    new_manifest.committed_timestamp.MergeFrom(new_committed_ts)
    common.commit_portal_manifest(self._etcd, new_manifest)
    return new_manifest
def _check_datetime_stale(self, date_time):
    """Tell whether `date_time` is already covered and must be skipped.

    A datetime is reported as stale when it is not later than the
    hour-trimmed committed datetime of the portal manifest, or when it
    is already present in the input-ready list or the output-finished
    list (both searched with bisect).

    Args:
        date_time: datetime to check.

    Returns:
        True if the datetime is stale, False otherwise.
    """
    def _is_listed(sorted_datetimes):
        # Binary search for an exact match.
        pos = bisect.bisect_left(sorted_datetimes, date_time)
        return pos < len(sorted_datetimes) and \
            sorted_datetimes[pos] == date_time

    with self._lock:
        committed_hour = common.trim_timestamp_by_hourly(
            self._portal_manifest.committed_timestamp)
        if date_time <= common.convert_timestamp_to_datetime(committed_hour):
            return True
        return _is_listed(self._input_ready_datetime) or \
            _is_listed(self._output_finished_datetime)
def _prepare_test(self):
    """Create the manifest, options and base datetime, then the input.

    Stores a portal manifest with 4 input and 8 output partitions,
    portal options that use a mock etcd, and the current time trimmed to
    the hour, and finally generates the input data.
    """
    self._portal_manifest = common_pb.DataJoinPortalManifest(
        name='test_portal',
        input_partition_num=4,
        output_partition_num=8,
        input_data_base_dir='./portal_input',
        output_data_base_dir='./portal_output')

    validator_options = dj_pb.ExampleValidatorOptions(
        example_validator='EXAMPLE_VALIDATOR',
        validate_event_time=True,
    )
    raw_data_options = dj_pb.RawDataOptions(raw_data_iter='TF_RECORD')
    self._portal_options = dj_pb.DataJoinPotralOptions(
        example_validator=validator_options,
        reducer_buffer_size=128,
        raw_data_options=raw_data_options,
        use_mock_etcd=True)

    # Round-trip through the timestamp helpers to get "now" trimmed to
    # the hour.
    now_ts = common.convert_datetime_to_timestamp(datetime.now())
    hourly_ts = common.trim_timestamp_by_hourly(now_ts)
    self._date_time = common.convert_timestamp_to_datetime(hourly_ts)

    self._generate_input_data()