def _get_required_datetime(cls, portal_manifest):
    """Return the next datetime that has to be finished before a commit.

    Both the manifest's ``committed_timestamp`` and ``begin_timestamp`` are
    passed through ``common.trim_timestamp_by_hourly`` and converted to
    datetimes. The result is the later of the begin datetime and one hour
    past the committed datetime.

    Args:
        portal_manifest: manifest exposing ``committed_timestamp`` and
            ``begin_timestamp``.

    Returns:
        The datetime described above.
    """
    next_after_committed = common.convert_timestamp_to_datetime(
        common.trim_timestamp_by_hourly(
            portal_manifest.committed_timestamp)) + timedelta(hours=1)
    begin_datetime = common.convert_timestamp_to_datetime(
        common.trim_timestamp_by_hourly(portal_manifest.begin_timestamp))
    # max() keeps its first argument on a tie, matching the original `>=`.
    return max(begin_datetime, next_after_committed)
def _update_portal_commited_timestamp(self, new_committed_datetime):
    """Persist a copy of the portal manifest with a newer committed timestamp.

    Under ``self._lock`` the current manifest is copied into a fresh
    ``DataJoinPortalManifest`` after asserting that the new datetime is
    strictly later than the currently committed one. The copy then receives
    the new committed timestamp and is written through
    ``common.commit_portal_manifest``. ``self._portal_manifest`` itself is
    not reassigned here.

    Args:
        new_committed_datetime: datetime to record as committed.

    Returns:
        The newly committed manifest copy.

    Raises:
        AssertionError: if ``new_committed_datetime`` is not later than the
            committed datetime stored in the current manifest.
    """
    with self._lock:
        current_manifest = self._portal_manifest
        previous_committed = common.convert_timestamp_to_datetime(
            common.trim_timestamp_by_hourly(
                current_manifest.committed_timestamp))
        assert new_committed_datetime > previous_committed
        manifest_copy = common_pb.DataJoinPortalManifest()
        manifest_copy.MergeFrom(current_manifest)
    # The copy is updated and committed outside the lock.
    committed_ts = common.trim_timestamp_by_hourly(
        common.convert_datetime_to_timestamp(new_committed_datetime))
    manifest_copy.committed_timestamp.MergeFrom(committed_ts)
    common.commit_portal_manifest(self._etcd, manifest_copy)
    return manifest_copy
def _add_raw_data_impl(self, notify_ctx, portal_manifest, ds_pid):
    """Hand existing portal output files of one partition to ``notify_ctx``.

    Scanning starts one hour after the datetime reported by
    ``notify_ctx.get_raw_data_updated_datetime(ds_pid)`` (but not before the
    manifest's begin datetime) and moves forward hour by hour up to the
    manifest's committed datetime. For every hour, the portal partitions
    ``ds_pid, ds_pid + ds_ptnum, ...`` below ``output_partition_num`` are
    probed with ``gfile.Exists``; existing paths are collected together with
    the timestamp of their hour. Scanning stops once more than 32 files were
    collected or the committed datetime was processed.

    Args:
        notify_ctx: context providing the data source and ``add_raw_data``.
        portal_manifest: manifest describing the portal output.
        ds_pid: partition id of the data source.

    Returns:
        True when the last datetime passed to ``add_raw_data`` is not earlier
        than the committed datetime, False otherwise.
    """
    one_hour = timedelta(hours=1)
    cursor = notify_ctx.get_raw_data_updated_datetime(ds_pid) + one_hour
    first_dt = common.convert_timestamp_to_datetime(
        common.trim_timestamp_by_hourly(portal_manifest.begin_timestamp))
    cursor = max(cursor, first_dt)
    last_dt = common.convert_timestamp_to_datetime(
        portal_manifest.committed_timestamp)
    found_paths = []
    found_timestamps = []
    stride = notify_ctx.data_source.data_source_meta.partition_num
    # NOTE(review): when cursor already exceeds last_dt the loop is skipped
    # and add_raw_data is still called with that later datetime -- confirm
    # this is what add_raw_data expects.
    while cursor <= last_dt:
        for portal_pid in range(ds_pid,
                                portal_manifest.output_partition_num,
                                stride):
            candidate = common.encode_portal_hourly_fpath(
                portal_manifest.output_data_base_dir, cursor, portal_pid)
            if not gfile.Exists(candidate):
                continue
            found_paths.append(candidate)
            found_timestamps.append(
                common.convert_datetime_to_timestamp(cursor))
        # Stop without stepping so that cursor names the last scanned hour.
        if cursor == last_dt or len(found_paths) > 32:
            break
        cursor += one_hour
    notify_ctx.add_raw_data(ds_pid, found_paths, found_timestamps, cursor)
    logging.info(
        "add %d raw data file for partition %d of data "
        "source %s. latest updated datetime %s",
        len(found_paths), ds_pid,
        notify_ctx.data_source.data_source_meta.name, cursor)
    return cursor >= last_dt
def get_raw_data_updated_datetime(self, partition_id):
    """Return the cached raw-data updated datetime of a partition.

    On the first request for ``partition_id`` the latest raw data timestamp
    is fetched from ``self._raw_date_controller``, moved back by 3600 seconds
    (clamped at 0), trimmed through ``common.trim_timestamp_by_hourly``,
    converted to a datetime and stored in
    ``self._raw_data_updated_datetime``. Later requests return the stored
    value.

    Args:
        partition_id: id of the partition to look up.

    Returns:
        The datetime stored for ``partition_id``.
    """
    cache = self._raw_data_updated_datetime
    if partition_id in cache:
        return cache[partition_id]
    latest_ts = self._raw_date_controller.get_raw_data_latest_timestamp(
        partition_id)
    # NOTE(review): this rewrites the timestamp object handed out by the
    # controller in place -- confirm the controller returns a private copy.
    latest_ts.seconds = max(latest_ts.seconds - 3600, 0)
    cache[partition_id] = common.convert_timestamp_to_datetime(
        common.trim_timestamp_by_hourly(latest_ts))
    return cache[partition_id]
def _setUpPortalManifest(self):
    """Create and commit the two portal manifests used by the test.

    Entries under the portal name are first deleted from both
    ``self._etcd_l`` and ``self._etcd_f``. Two manifests named
    ``'test_portal'`` are then built (the ``_l`` one with 4 input
    partitions, the ``_f`` one with 2, both with 2 output partitions), each
    with a begin timestamp derived from ``datetime.now()``, and committed
    to their respective etcd clients.
    """
    portal_name = 'test_portal'
    self._portal_name = portal_name
    for etcd in (self._etcd_l, self._etcd_f):
        etcd.delete_prefix(portal_name)

    # Each manifest takes its own reading of the current time.
    begin_ts_l = common.trim_timestamp_by_hourly(
        common.convert_datetime_to_timestamp(datetime.now()))
    self._portal_manifest_l = common_pb.DataJoinPortalManifest(
        name=portal_name,
        input_partition_num=4,
        output_partition_num=2,
        input_data_base_dir='./portal_input_l',
        output_data_base_dir='./portal_output_l',
        begin_timestamp=begin_ts_l)

    begin_ts_f = common.trim_timestamp_by_hourly(
        common.convert_datetime_to_timestamp(datetime.now()))
    self._portal_manifest_f = common_pb.DataJoinPortalManifest(
        name=portal_name,
        input_partition_num=2,
        output_partition_num=2,
        input_data_base_dir='./portal_input_f',
        output_data_base_dir='./portal_output_f',
        begin_timestamp=begin_ts_f)

    common.commit_portal_manifest(self._etcd_l, self._portal_manifest_l)
    common.commit_portal_manifest(self._etcd_f, self._portal_manifest_f)
def _check_datetime_stale(self, date_time):
    """Tell whether ``date_time`` is already committed or already tracked.

    Args:
        date_time: datetime to check.

    Returns:
        True when ``date_time`` is not later than the manifest's committed
        datetime, or when it is already present in
        ``self._input_ready_datetime`` or ``self._output_finished_datetime``;
        False otherwise.
    """
    def _listed(sorted_datetimes):
        # Membership test by binary search; the lists are searched with
        # bisect, so they are expected to be kept in ascending order.
        pos = bisect.bisect_left(sorted_datetimes, date_time)
        return (pos < len(sorted_datetimes) and
                sorted_datetimes[pos] == date_time)

    with self._lock:
        committed = common.convert_timestamp_to_datetime(
            common.trim_timestamp_by_hourly(
                self._portal_manifest.committed_timestamp))
        if date_time <= committed:
            return True
        return (_listed(self._input_ready_datetime) or
                _listed(self._output_finished_datetime))
def _committed_datetime_forward_fn(self):
    """Move the committed datetime forward over finished output hours.

    Starting at the datetime returned by ``_get_required_datetime``, the
    entries of ``self._output_finished_datetime`` are consumed for as long
    as each one equals the next required datetime (which advances by one
    hour per consumed entry). For every consumed datetime the output file
    path of each output partition is collected together with its timestamp.

    If at least one datetime was consumed, the collected files are
    published per partition through ``self._publisher``, the manifest is
    re-committed with the last consumed datetime, consumed entries are
    dropped from ``self._output_finished_datetime`` and
    ``_wakeup_input_data_ready_sniffer`` is called.

    The earlier code overwrote ``required_datetime`` with the current entry
    right before comparing the two, so the comparison could never fail and
    entries following a gap were committed as well. Datetimes inside the
    gap then fell at or below the committed datetime. The overwrite is
    removed so that only a gap-free run is committed.
    """
    one_hour = timedelta(hours=1)
    new_committed_datetime = None
    pub_finfos = {}
    with self._lock:
        required_datetime = self._get_required_datetime(
            self._portal_manifest)
        start = bisect.bisect_left(self._output_finished_datetime,
                                   required_datetime)
        partition_num = self._portal_manifest.output_partition_num
        output_dir = self._portal_manifest.output_data_base_dir
        for date_time in self._output_finished_datetime[start:]:
            # Stop at the first missing hour: everything committed must
            # form an unbroken run from the required datetime.
            if date_time != required_datetime:
                break
            ts = common.trim_timestamp_by_hourly(
                common.convert_datetime_to_timestamp(date_time))
            for partition_id in range(partition_num):
                fpath = common.encode_portal_hourly_fpath(
                    output_dir, date_time, partition_id)
                fpaths, timestamps = pub_finfos.setdefault(
                    partition_id, ([], []))
                fpaths.append(fpath)
                timestamps.append(ts)
            new_committed_datetime = required_datetime
            required_datetime += one_hour
    # new_committed_datetime is set exactly when an entry was consumed.
    if new_committed_datetime is not None:
        for partition_id, (fpaths, timestamps) in pub_finfos.items():
            self._publisher.publish_raw_data(partition_id, fpaths,
                                             timestamps)
        updated_manifest = self._update_portal_commited_timestamp(
            new_committed_datetime)
        with self._lock:
            self._portal_manifest = updated_manifest
            skip_cnt = sum(
                1 for finished in self._output_finished_datetime
                if finished <= new_committed_datetime)
            self._output_finished_datetime = \
                self._output_finished_datetime[skip_cnt:]
        self._wakeup_input_data_ready_sniffer()
def _prepare_test(self):
    """Build the manifest, portal options and reference datetime for a test.

    Sets ``self._portal_manifest`` (4 input / 8 output partitions),
    ``self._portal_options`` (mock etcd, ``TF_RECORD`` raw data iterator,
    example validation with event-time checking) and ``self._date_time``
    (derived from ``datetime.now()`` after
    ``common.trim_timestamp_by_hourly``), then calls
    ``self._generate_input_data()``.
    """
    self._portal_manifest = common_pb.DataJoinPortalManifest(
        name='test_portal',
        input_partition_num=4,
        output_partition_num=8,
        input_data_base_dir='./portal_input',
        output_data_base_dir='./portal_output')

    validator_options = dj_pb.ExampleValidatorOptions(
        example_validator='EXAMPLE_VALIDATOR',
        validate_event_time=True,
    )
    raw_data_options = dj_pb.RawDataOptions(raw_data_iter='TF_RECORD')
    self._portal_options = dj_pb.DataJoinPotralOptions(
        example_validator=validator_options,
        reducer_buffer_size=128,
        raw_data_options=raw_data_options,
        use_mock_etcd=True)

    now_ts = common.convert_datetime_to_timestamp(datetime.now())
    self._date_time = common.convert_timestamp_to_datetime(
        common.trim_timestamp_by_hourly(now_ts))
    self._generate_input_data()