Exemplo n.º 1
0
 def _get_required_datetime(cls, portal_manifest):
     """Return the next hourly datetime that has to be committed.

     Both the committed and the begin timestamps of ``portal_manifest``
     are trimmed with ``common.trim_timestamp_by_hourly`` and converted
     to datetimes. The result is the hour following the committed one,
     unless the begin datetime is at or after that hour, in which case
     the begin datetime is returned instead.
     """
     next_after_committed = common.convert_timestamp_to_datetime(
         common.trim_timestamp_by_hourly(
             portal_manifest.committed_timestamp)) + timedelta(hours=1)
     begin_hour = common.convert_timestamp_to_datetime(
         common.trim_timestamp_by_hourly(portal_manifest.begin_timestamp))
     return (begin_hour if begin_hour >= next_after_committed
             else next_after_committed)
Exemplo n.º 2
0
 def _update_portal_commited_timestamp(self, new_committed_datetime):
     """Commit a copy of the portal manifest with a newer committed hour.

     Under ``self._lock`` the current manifest is copied after asserting
     that ``new_committed_datetime`` is strictly later than the currently
     committed (hour-trimmed) datetime. The copy then receives the
     hour-trimmed new timestamp and is written through
     ``common.commit_portal_manifest`` using ``self._etcd``.

     Returns the committed copy; ``self._portal_manifest`` itself is not
     reassigned here.
     """
     manifest_copy = None
     with self._lock:
         current_trimmed_ts = common.trim_timestamp_by_hourly(
             self._portal_manifest.committed_timestamp)
         current_committed = common.convert_timestamp_to_datetime(
             current_trimmed_ts)
         assert new_committed_datetime > current_committed
         manifest_copy = common_pb.DataJoinPortalManifest()
         manifest_copy.MergeFrom(self._portal_manifest)
     assert manifest_copy is not None
     new_trimmed_ts = common.trim_timestamp_by_hourly(
         common.convert_datetime_to_timestamp(new_committed_datetime))
     manifest_copy.committed_timestamp.MergeFrom(new_trimmed_ts)
     common.commit_portal_manifest(self._etcd, manifest_copy)
     return manifest_copy
 def _add_raw_data_impl(self, notify_ctx, portal_manifest, ds_pid):
     """Register existing hourly portal output files as raw data.

     Scanning starts one hour after the datetime reported by
     ``notify_ctx.get_raw_data_updated_datetime(ds_pid)``, but never
     before the hour-trimmed begin datetime of ``portal_manifest``. For
     every scanned hour, the portal partitions ``ds_pid``,
     ``ds_pid + ds_ptnum``, ... below ``output_partition_num`` are probed
     and the files that exist are collected together with the timestamp
     of their hour. Scanning stops once more than 32 files have been
     collected or the committed datetime has been reached.

     Returns True when the last scanned datetime is at or past the
     committed datetime of the manifest.
     """
     one_hour = timedelta(hours=1)
     cursor = notify_ctx.get_raw_data_updated_datetime(ds_pid) + one_hour
     first_hour = common.convert_timestamp_to_datetime(
         common.trim_timestamp_by_hourly(portal_manifest.begin_timestamp))
     if cursor < first_hour:
         cursor = first_hour
     committed_dt = common.convert_timestamp_to_datetime(
         portal_manifest.committed_timestamp)
     found_paths, found_timestamps = [], []
     stride = notify_ctx.data_source.data_source_meta.partition_num
     while cursor <= committed_dt:
         portal_partitions = range(
             ds_pid, portal_manifest.output_partition_num, stride)
         for portal_pid in portal_partitions:
             candidate = common.encode_portal_hourly_fpath(
                 portal_manifest.output_data_base_dir, cursor, portal_pid)
             if not gfile.Exists(candidate):
                 continue
             found_paths.append(candidate)
             found_timestamps.append(
                 common.convert_datetime_to_timestamp(cursor))
         # Stop without advancing so that the reported datetime is the
         # last hour that was actually scanned.
         if len(found_paths) > 32 or cursor == committed_dt:
             break
         cursor += one_hour
     notify_ctx.add_raw_data(ds_pid, found_paths, found_timestamps, cursor)
     logging.info("add %d raw data file for partition %d of data "
                  "source %s. latest updated datetime %s",
                  len(found_paths), ds_pid,
                  notify_ctx.data_source.data_source_meta.name, cursor)
     return cursor >= committed_dt
 def get_raw_data_updated_datetime(self, partition_id):
     """Return the cached raw-data updated datetime of a partition.

     On a cache miss the latest raw-data timestamp of the partition is
     fetched from ``self._raw_date_controller``, moved back by 3600
     seconds (clamped at 0), trimmed to the hour and converted to a
     datetime, which is then stored in the cache.
     """
     cache = self._raw_data_updated_datetime
     if partition_id in cache:
         return cache[partition_id]
     latest_ts = self._raw_date_controller.get_raw_data_latest_timestamp(
         partition_id)
     # Step one hour back; the seconds field is never driven below zero.
     latest_ts.seconds = \
         latest_ts.seconds - 3600 if latest_ts.seconds > 3600 else 0
     hourly_ts = common.trim_timestamp_by_hourly(latest_ts)
     cache[partition_id] = common.convert_timestamp_to_datetime(hourly_ts)
     return cache[partition_id]
Exemplo n.º 5
0
 def _setUpPortalManifest(self):
     """Create and commit the two test portal manifests.

     Clears everything stored under the portal name in ``self._etcd_l``
     and ``self._etcd_f``, builds one manifest for each of the ``_l`` and
     ``_f`` sides (4 vs. 2 input partitions, separate input/output
     directories) and commits each manifest to its own etcd client.
     """
     self._portal_name = 'test_portal'
     self._etcd_l.delete_prefix(self._portal_name)
     self._etcd_f.delete_prefix(self._portal_name)
     # Read the clock only once: two separate datetime.now() calls could
     # straddle an hour boundary and hand the two manifests different
     # begin timestamps. Passing the same timestamp message to both
     # constructors relies on protobuf copying message-typed keyword
     # arguments into the new message.
     begin_timestamp = common.trim_timestamp_by_hourly(
         common.convert_datetime_to_timestamp(datetime.now()))
     self._portal_manifest_l = common_pb.DataJoinPortalManifest(
         name=self._portal_name,
         input_partition_num=4,
         output_partition_num=2,
         input_data_base_dir='./portal_input_l',
         output_data_base_dir='./portal_output_l',
         begin_timestamp=begin_timestamp)
     self._portal_manifest_f = common_pb.DataJoinPortalManifest(
         name=self._portal_name,
         input_partition_num=2,
         output_partition_num=2,
         input_data_base_dir='./portal_input_f',
         output_data_base_dir='./portal_output_f',
         begin_timestamp=begin_timestamp)
     common.commit_portal_manifest(self._etcd_l, self._portal_manifest_l)
     common.commit_portal_manifest(self._etcd_f, self._portal_manifest_f)
Exemplo n.º 6
0
 def _check_datetime_stale(self, date_time):
     """Tell whether ``date_time`` is already known to the portal.

     Returns True when ``date_time`` is not later than the hour-trimmed
     committed datetime of the manifest, or when it is present in either
     ``self._input_ready_datetime`` or ``self._output_finished_datetime``
     (both are searched with bisect, i.e. treated as sorted lists).
     Returns False otherwise. The whole check runs under ``self._lock``.
     """
     with self._lock:
         committed_hour = common.convert_timestamp_to_datetime(
             common.trim_timestamp_by_hourly(
                 self._portal_manifest.committed_timestamp))
         if date_time <= committed_hour:
             return True
         tracked_lists = (self._input_ready_datetime,
                          self._output_finished_datetime)
         for sorted_hours in tracked_lists:
             pos = bisect.bisect_left(sorted_hours, date_time)
             if pos < len(sorted_hours) and sorted_hours[pos] == date_time:
                 return True
         return False
 def _committed_datetime_forward_fn(self):
     """Advance the committed hour over consecutive finished output hours.

     Starting at the datetime returned by ``self._get_required_datetime``,
     walks ``self._output_finished_datetime`` (treated as a sorted list)
     for as long as every next entry is exactly the next required hour.
     For each such hour the output file path of every output partition is
     collected together with the hour's timestamp.

     If at least one hour qualified, the collected files are published
     per partition through ``self._publisher``, the manifest is committed
     with the new committed datetime, the hours up to and including it
     are dropped from ``self._output_finished_datetime`` and the input
     data ready sniffer is woken up. Otherwise nothing is changed.
     """
     new_committed_datetime = None
     updated = False
     pub_finfos = {}
     with self._lock:
         required_datetime = self._get_required_datetime(
             self._portal_manifest)
         idx = bisect.bisect_left(self._output_finished_datetime,
                                  required_datetime)
         partition_num = self._portal_manifest.output_partition_num
         for date_time in self._output_finished_datetime[idx:]:
             # Only a gap-free run of hours may be committed. The required
             # hour must not be overwritten with the candidate before this
             # comparison, otherwise the check can never fail and a
             # missing hour would be skipped over.
             if date_time != required_datetime:
                 break
             ts = common.trim_timestamp_by_hourly(
                 common.convert_datetime_to_timestamp(date_time))
             for partition_id in range(partition_num):
                 fpath = common.encode_portal_hourly_fpath(
                     self._portal_manifest.output_data_base_dir, date_time,
                     partition_id)
                 fpaths, timestamps = pub_finfos.setdefault(
                     partition_id, ([], []))
                 fpaths.append(fpath)
                 timestamps.append(ts)
             new_committed_datetime = required_datetime
             required_datetime += timedelta(hours=1)
             updated = True
     if updated:
         # Publishing and committing are done outside the lock.
         for partition_id, (fpaths, timestamps) in pub_finfos.items():
             self._publisher.publish_raw_data(partition_id, fpaths,
                                              timestamps)
         assert new_committed_datetime is not None
         updated_manifest = \
                 self._update_portal_commited_timestamp(
                         new_committed_datetime
                     )
         with self._lock:
             self._portal_manifest = updated_manifest
             # The list is sorted, so the entries that are now committed
             # form a prefix of it.
             skip_cnt = bisect.bisect_right(self._output_finished_datetime,
                                            new_committed_datetime)
             self._output_finished_datetime = \
                     self._output_finished_datetime[skip_cnt:]
         self._wakeup_input_data_ready_sniffer()
Exemplo n.º 8
0
 def _prepare_test(self):
     """Build the manifest, portal options and base hour for the test.

     Stores a 4-input / 8-output partition manifest, portal options that
     use a mock etcd, and the current time trimmed to the hour, then
     calls ``self._generate_input_data()``.
     """
     self._portal_manifest = common_pb.DataJoinPortalManifest(
         name='test_portal',
         input_partition_num=4,
         output_partition_num=8,
         input_data_base_dir='./portal_input',
         output_data_base_dir='./portal_output')
     validator_options = dj_pb.ExampleValidatorOptions(
         example_validator='EXAMPLE_VALIDATOR',
         validate_event_time=True,
     )
     raw_data_options = dj_pb.RawDataOptions(raw_data_iter='TF_RECORD')
     self._portal_options = dj_pb.DataJoinPotralOptions(
         example_validator=validator_options,
         reducer_buffer_size=128,
         raw_data_options=raw_data_options,
         use_mock_etcd=True)
     current_hour_ts = common.trim_timestamp_by_hourly(
         common.convert_datetime_to_timestamp(datetime.now()))
     self._date_time = common.convert_timestamp_to_datetime(current_hour_ts)
     self._generate_input_data()