def publish_raw_data(self, partition_id, fpaths, timestamps=None):
    """Publish one raw-data entry per file path for the given partition.

    Args:
        partition_id: id of the partition the files belong to.
        fpaths: file paths to publish; each must exist according to
            ``gfile.Exists``. An empty/None value is a no-op.
        timestamps: optional sequence, parallel to ``fpaths``, of timestamp
            messages merged into each entry's ``raw_data_meta.timestamp``.

    Raises:
        RuntimeError: ``timestamps`` is given but its length differs from
            ``fpaths``.
        ValueError: one of the paths does not exist.
    """
    if not fpaths:
        logging.warning("no raw data will be published")
        return
    if timestamps is not None and len(fpaths) != len(timestamps):
        raise RuntimeError("the number of raw data file "
                           "and timestamp mismatch")

    # Build (and validate) every entry before touching the store, so a bad
    # path aborts the call before anything is published.
    new_raw_data_pubs = []
    for index, fpath in enumerate(fpaths):
        if not gfile.Exists(fpath):
            raise ValueError('{} is not existed'.format(fpath))
        raw_data_pub = dj_pb.RawDatePub(
            raw_data_meta=dj_pb.RawDataMeta(file_path=fpath, start_index=-1)
        )
        if timestamps is not None:
            raw_data_pub.raw_data_meta.timestamp.MergeFrom(timestamps[index])
        new_raw_data_pubs.append(raw_data_pub)

    next_pub_index = None
    item_index = 0
    data = text_format.MessageToString(new_raw_data_pubs[item_index])
    while item_index < len(new_raw_data_pubs):
        next_pub_index = self._forward_pub_index(partition_id,
                                                 next_pub_index)
        etcd_key = common.raw_data_pub_etcd_key(self._raw_data_pub_dir,
                                                partition_id,
                                                next_pub_index)
        # A falsy cas() result is treated as "slot not claimed": the same
        # item is retried on the next iteration with a re-forwarded index.
        # NOTE(review): presumably cas(key, None, data) only succeeds when
        # the key is absent -- confirm against the etcd client wrapper.
        if self._etcd.cas(etcd_key, None, data):
            logging.info("Success publish %s at index %d for partition "
                         "%d", data, next_pub_index, partition_id)
            next_pub_index += 1
            item_index += 1
            if item_index < len(new_raw_data_pubs):
                raw_data_pub = new_raw_data_pubs[item_index]
                data = text_format.MessageToString(raw_data_pub)
def _try_to_sub_raw_data(self, partition_id):
    """Consume newly published raw-data entries for one partition.

    Starting at the manifest's ``next_raw_data_sub_index``, reads published
    entries until a key is missing or a finish entry is met, stores the
    collected metas, and writes the advanced index / finished flag back to
    the manifest. Does nothing when the manifest is already finished.
    The whole operation runs while holding ``self._lock``.

    Args:
        partition_id: id of the partition to subscribe for.
    """
    with self._lock:
        manifest = self._sync_manifest(partition_id)
        if manifest.finished:
            return
        next_sub_index = manifest.next_raw_data_sub_index
        add_candidates = []
        raw_data_finished = False
        while True:
            etcd_key = common.raw_data_pub_etcd_key(
                self._raw_data_sub_dir, partition_id, next_sub_index
            )
            pub_data = self._etcd.get_data(etcd_key)
            if pub_data is None:
                # Nothing published at this index yet.
                break
            raw_data_pub = text_format.Parse(pub_data, dj_pb.RawDatePub())
            if raw_data_pub.HasField('raw_data_meta'):
                add_candidates.append(raw_data_pub.raw_data_meta)
                next_sub_index += 1
            elif raw_data_pub.HasField('raw_data_finished'):
                logging.warning("meet finish pub at pub index %d for "
                                "partition %d", next_sub_index, partition_id)
                raw_data_finished = True
                break
            else:
                # Neither field is set: without this branch the loop would
                # re-read the same key forever while holding the lock.
                # Stop here without advancing so the entry is not skipped.
                logging.error("unrecognized pub at pub index %d for "
                              "partition %d", next_sub_index, partition_id)
                break
        self._store_raw_data_metas(partition_id, add_candidates)
        # Re-sync before writing so the update is applied to whatever
        # _sync_manifest currently returns for this partition.
        new_manifest = self._sync_manifest(partition_id)
        new_manifest.next_raw_data_sub_index = next_sub_index
        new_manifest.finished = raw_data_finished
        self._update_manifest(new_manifest)
def _try_to_sub_raw_data(self, partition_id):
    """Consume newly published raw-data entries for one partition.

    Starting at the manifest's ``next_raw_data_sub_index``, reads published
    entries until a key is missing or a finish entry is met, stores the
    collected metas, and writes the advanced index / finished flag back to
    the manifest.

    Args:
        partition_id: id of the partition to subscribe for.

    Returns:
        The number of raw-data entries consumed by this call; 0 when the
        manifest is already finished or no subscription dir is configured.
    """
    manifest = self._sync_manifest(partition_id)
    if manifest.finished or not self._raw_data_sub_dir:
        return 0
    # Captured up front: the manifest object may be updated further down.
    prev_next_sub_index = manifest.next_raw_data_sub_index
    next_sub_index = prev_next_sub_index
    add_candidates = []
    raw_data_finished = False
    while True:
        kvstore_key = common.raw_data_pub_kvstore_key(
            self._raw_data_sub_dir, partition_id, next_sub_index
        )
        pub_data = self._kvstore.get_data(kvstore_key)
        if pub_data is None:
            # Nothing published at this index yet.
            break
        raw_data_pub = text_format.Parse(pub_data, dj_pb.RawDatePub(),
                                         allow_unknown_field=True)
        if raw_data_pub.HasField('raw_data_meta'):
            add_candidates.append(raw_data_pub.raw_data_meta)
            next_sub_index += 1
        elif raw_data_pub.HasField('raw_data_finished'):
            logging.warning("meet finish pub at pub index %d for "
                            "partition %d", next_sub_index, partition_id)
            raw_data_finished = True
            break
        else:
            # Neither field is set (possible since unknown fields are
            # tolerated while parsing): without this branch the loop would
            # re-read the same key forever. Stop without advancing so the
            # entry is not silently skipped.
            logging.error("unrecognized pub at pub index %d for "
                          "partition %d", next_sub_index, partition_id)
            break
    self._store_raw_data_metas(partition_id, add_candidates)
    # Re-sync before writing so the update is applied to whatever
    # _sync_manifest currently returns for this partition.
    new_manifest = self._sync_manifest(partition_id)
    new_manifest.next_raw_data_sub_index = next_sub_index
    new_manifest.finished = raw_data_finished
    self._update_manifest(new_manifest)
    return next_sub_index - prev_next_sub_index
def _check_finish_tag(self, partition_id, last_index):
    """Tell whether the entry published at ``last_index`` is a finish tag.

    Args:
        partition_id: id of the partition to inspect.
        last_index: publish index to look at; negative means "no entry".

    Returns:
        True only when an entry exists at ``last_index`` and it has the
        ``raw_data_finished`` field set; False otherwise.
    """
    if last_index < 0:
        return False
    tag_key = common.raw_data_pub_etcd_key(
        self._raw_data_pub_dir, partition_id, last_index
    )
    stored = self._etcd.get_data(tag_key)
    if stored is None:
        return False
    parsed = text_format.Parse(stored, dj_pb.RawDatePub())
    return parsed.HasField('raw_data_finished')
def _check_finish_tag(self, partition_id, last_index):
    """Tell whether the entry published at ``last_index`` is a finish tag.

    Args:
        partition_id: id of the partition to inspect.
        last_index: publish index to look at; negative means "no entry".

    Returns:
        True only when an entry exists at ``last_index`` and it has the
        ``raw_data_finished`` field set; False otherwise.
    """
    if last_index < 0:
        return False
    tag_key = common.raw_data_pub_kvstore_key(
        self._raw_data_pub_dir, partition_id, last_index
    )
    stored = self._kvstore.get_data(tag_key)
    if stored is None:
        return False
    # Unknown fields are tolerated while parsing the stored text.
    parsed = text_format.Parse(stored, dj_pb.RawDatePub(),
                               allow_unknown_field=True)
    return parsed.HasField('raw_data_finished')
def finish_raw_data(self, partition_id):
    """Publish a finish entry for the given partition.

    Serializes a ``RawDatePub`` whose ``raw_data_finished`` field is set and
    keeps trying to write it at the forwarded publish index until the
    store's ``cas`` call reports success.

    Args:
        partition_id: id of the partition to mark as finished.
    """
    data = text_format.MessageToString(
        dj_pb.RawDatePub(raw_data_finished=empty_pb2.Empty())
    )
    next_pub_index = None
    while True:
        next_pub_index = self._forward_pub_index(partition_id,
                                                 next_pub_index)
        etcd_key = common.raw_data_pub_etcd_key(self._raw_data_pub_dir,
                                                partition_id,
                                                next_pub_index)
        # A falsy cas() result means the slot was not claimed; loop and
        # retry with a re-forwarded index.
        # NOTE(review): presumably cas(key, None, data) only succeeds when
        # the key is absent -- confirm against the etcd client wrapper.
        if self._etcd.cas(etcd_key, None, data):
            logging.info("Success finish raw data for partition "
                         "%d", partition_id)
            break
def finish_raw_data(self, partition_id):
    """Publish a finish entry for the given partition, at most once.

    Serializes a ``RawDatePub`` whose ``raw_data_finished`` field is set and
    keeps trying to write it at the forwarded publish index until the
    store's ``cas`` call reports success. If the entry just before the
    forwarded index is already a finish tag, nothing is written.

    Args:
        partition_id: id of the partition to mark as finished.
    """
    data = text_format.MessageToString(
        dj_pb.RawDatePub(raw_data_finished=empty_pb2.Empty())
    )
    next_pub_index = None
    while True:
        next_pub_index = self._forward_pub_index(partition_id,
                                                 next_pub_index)
        # Do not append a second finish tag after an existing one.
        if self._check_finish_tag(partition_id, next_pub_index - 1):
            logging.warning("partition %d has been published finish tag "
                            "at index %d", partition_id, next_pub_index - 1)
            break
        kvstore_key = common.raw_data_pub_kvstore_key(
            self._raw_data_pub_dir, partition_id, next_pub_index
        )
        # A falsy cas() result means the slot was not claimed; loop and
        # retry with a re-forwarded index.
        # NOTE(review): presumably cas(key, None, data) only succeeds when
        # the key is absent -- confirm against the kvstore wrapper.
        if self._kvstore.cas(kvstore_key, None, data):
            logging.info("Success finish raw data for partition "
                         "%d", partition_id)
            break
def publish_raw_data(self, partition_id, fpaths, timestamps=None):
    """Publish one raw-data entry per file path for the given partition.

    Publishing stops early when the entry just before the forwarded publish
    index is a finish tag; every file that was not published is then
    listed in warning logs.

    Args:
        partition_id: id of the partition the files belong to.
        fpaths: file paths to publish; each must exist according to
            ``gfile.Exists``. An empty/None value is a no-op.
        timestamps: optional sequence, parallel to ``fpaths``, of timestamp
            messages merged into each entry's ``raw_data_meta.timestamp``.

    Raises:
        RuntimeError: ``timestamps`` is given but its length differs from
            ``fpaths``.
        ValueError: one of the paths does not exist.
    """
    if not fpaths:
        logging.warning("no raw data will be published")
        return
    if timestamps is not None and len(fpaths) != len(timestamps):
        raise RuntimeError("the number of raw data file "
                           "and timestamp mismatch")

    # Build (and validate) every entry before touching the store, so a bad
    # path aborts the call before anything is published.
    new_raw_data_pubs = []
    for index, fpath in enumerate(fpaths):
        if not gfile.Exists(fpath):
            raise ValueError('{} is not existed'.format(fpath))
        raw_data_pub = dj_pb.RawDatePub(
            raw_data_meta=dj_pb.RawDataMeta(file_path=fpath, start_index=-1)
        )
        if timestamps is not None:
            raw_data_pub.raw_data_meta.timestamp.MergeFrom(timestamps[index])
        new_raw_data_pubs.append(raw_data_pub)

    total = len(new_raw_data_pubs)
    next_pub_index = None
    item_index = 0
    data = text_format.MessageToString(new_raw_data_pubs[item_index])
    while item_index < total:
        next_pub_index = self._forward_pub_index(partition_id,
                                                 next_pub_index)
        if self._check_finish_tag(partition_id, next_pub_index - 1):
            logging.warning("partition %d has been published finish tag "
                            "at index %d", partition_id, next_pub_index - 1)
            break
        kvstore_key = common.raw_data_pub_kvstore_key(
            self._raw_data_pub_dir, partition_id, next_pub_index
        )
        # A falsy cas() result is treated as "slot not claimed": the same
        # item is retried on the next iteration with a re-forwarded index.
        # NOTE(review): presumably cas(key, None, data) only succeeds when
        # the key is absent -- confirm against the kvstore wrapper.
        if self._kvstore.cas(kvstore_key, None, data):
            logging.info("Success publish %s at index %d for partition "
                         "%d", data, next_pub_index, partition_id)
            next_pub_index += 1
            item_index += 1
            if item_index < total:
                raw_data_pub = new_raw_data_pubs[item_index]
                data = text_format.MessageToString(raw_data_pub)

    # The loop only ends with item_index < total when it broke out on a
    # finish tag. Compare against total (not total - 1) so that a single
    # leftover file is reported as well.
    if item_index < total:
        logging.warning("%d files are not published since meet finish "
                        "tag for partition %d. list following",
                        total - item_index, partition_id)
        for idx, pub in enumerate(new_raw_data_pubs[item_index:]):
            logging.warning("%d. %s", idx, pub.raw_data_meta.file_path)