class DataBlockVisitor(object):
    """Read-only view over the data blocks of one data source.

    Looks up data-block files on the filesystem (via ``gfile``) and the
    per-partition raw-data manifests in the kvstore, and exposes them as
    ``DataBlockRep`` objects, hiding blocks that are not yet visible to
    this side of the federation (see ``_filter_by_visible``).
    """

    def __init__(self, data_source_name, kvstore_type, use_mock_etcd=False):
        """Connect to the kvstore and load the data source definition.

        Args:
            data_source_name: name used to locate the data source record
                in the kvstore.
            kvstore_type: backend type passed through to ``DBClient``.
            use_mock_etcd: when True, ``DBClient`` uses a mock etcd
                (intended for tests).
        """
        self._kvstore = DBClient(kvstore_type, use_mock_etcd)
        self._data_source = retrieve_data_source(self._kvstore,
                                                 data_source_name)

    def LoadDataBlockRepByTimeFrame(self, start_time=None, end_time=None):
        """Collect visible data blocks whose end_time lies in a time frame.

        A block is kept only when all of the following hold:
          * its meta file parses into a ``DataBlockRep``;
          * ``start_time < rep.end_time <= end_time`` (each bound is only
            applied when the corresponding argument is not None);
          * ``_filter_by_visible`` does not hide it.
        Skipped blocks are logged at debug level with the reason.

        Returns:
            dict mapping ``block_id`` -> ``DataBlockRep``.
        """
        partition_num = self._data_source.data_source_meta.partition_num
        data_block_fnames = {}
        for partition_id in range(0, partition_num):
            data_block_fnames[partition_id] = \
                    self._list_data_block(partition_id)
        data_block_reps = {}
        for partition_id, fnames in data_block_fnames.items():
            manifest = self._sync_raw_data_manifest(partition_id)
            for idx, fname in enumerate(fnames):
                # Only the last listed file gets an existence check;
                # presumably earlier blocks are assumed immutable once
                # written -- TODO(review): confirm against DataBlockRep.
                check_existed = (idx == len(fnames) - 1)
                rep = self._make_data_block_rep(partition_id,
                                                fname, check_existed)
                filtered = True
                reason = ''
                if rep is None:
                    reason = 'failed to create data block rep'
                elif end_time is not None and rep.end_time > end_time:
                    reason = 'excess time frame'
                elif start_time is not None and rep.end_time <= start_time:
                    reason = 'less time frame'
                elif self._filter_by_visible(rep.data_block_index, manifest):
                    reason = 'data block visible'
                else:
                    data_block_reps[rep.block_id] = rep
                    filtered = False
                if filtered:
                    logging.debug('skip %s since %s', fname, reason)
        return data_block_reps

    def LoadDataBlockReqByIndex(self, partition_id, data_block_index):
        """Load one data block by (partition_id, data_block_index).

        Returns a ``DataBlockRep`` when the meta file exists and the block
        passes the visibility filter, otherwise None.

        Raises:
            IndexError: if partition_id is outside [0, partition_num).
        """
        partition_num = self._data_source.data_source_meta.partition_num
        if partition_id < 0 or partition_id >= partition_num:
            raise IndexError("partition {} out range".format(partition_id))
        dirpath = self._partition_data_block_dir(partition_id)
        meta_fname = encode_data_block_meta_fname(self._data_source_name(),
                                                  partition_id,
                                                  data_block_index)
        meta_fpath = os.path.join(dirpath, meta_fname)
        # load_data_block_meta returns None when the meta file is absent
        # or unreadable -- TODO(review): confirm helper contract.
        meta = load_data_block_meta(meta_fpath)
        manifest = self._sync_raw_data_manifest(partition_id)
        if meta is not None and \
                not self._filter_by_visible(meta.data_block_index, manifest):
            fname = encode_data_block_fname(self._data_source_name(), meta)
            return DataBlockRep(self._data_source_name(), fname,
                                partition_id, dirpath)
        return None

    def LoadDataBlockRepByBlockId(self, block_id):
        """Load one data block by its block_id string.

        Decodes partition id and block index out of ``block_id``, delegates
        to ``LoadDataBlockReqByIndex``, and sanity-checks that the loaded
        block's id matches the requested one.
        """
        block_info = decode_block_id(block_id)
        dbr = self.LoadDataBlockReqByIndex(block_info['partition_id'],
                                           block_info['data_block_index'])
        if dbr:
            assert dbr.block_id == block_id, \
                    "Invalid datablock, expected %s, but got %s), please "\
                    "check datasource!"%(block_id, dbr.block_id)
        return dbr

    def _list_data_block(self, partition_id):
        # List data-block file names (files ending with DataBlockSuffix)
        # in the partition directory; empty list if the dir is missing.
        dirpath = self._partition_data_block_dir(partition_id)
        if gfile.Exists(dirpath) and gfile.IsDirectory(dirpath):
            return [f for f in gfile.ListDirectory(dirpath)
                    if f.endswith(DataBlockSuffix)]
        return []

    def _partition_data_block_dir(self, partition_id):
        # <data source data block dir>/<partition repr>
        return os.path.join(data_source_data_block_dir(self._data_source),
                            partition_repr(partition_id))

    def _make_data_block_rep(self, partition_id,
                             data_block_fname, check_existed):
        # Best-effort construction: any failure is logged and mapped to
        # None so one bad file does not abort a whole scan.
        try:
            rep = DataBlockRep(self._data_source.data_source_meta.name,
                               data_block_fname, partition_id,
                               self._partition_data_block_dir(partition_id),
                               check_existed)
        except Exception as e: # pylint: disable=broad-except
            logging.warning("Failed to create data block rep for %s in"\
                            "partition %d reason %s", data_block_fname,
                            partition_id, e)
            return None
        return rep

    def _data_source_name(self):
        return self._data_source.data_source_meta.name

    def _sync_raw_data_manifest(self, partition_id):
        # Fetch and parse the partition's RawDataManifest from the kvstore;
        # a missing manifest indicates a corrupt data source, hence assert.
        kvstore_key = partition_manifest_kvstore_key(self._data_source_name(),
                                                     partition_id)
        data = self._kvstore.get_data(kvstore_key)
        assert data is not None, "raw data manifest of partition "\
                                 "{} must be existed".format(partition_id)
        return text_format.Parse(data, dj_pb.RawDataManifest())

    def _filter_by_visible(self, index, manifest):
        # Returns True when the block should be HIDDEN: on the Follower
        # side, while the partition's join-example state is not yet
        # Joined, blocks beyond peer_dumped_index are not visible
        # (presumably not yet dumped by the peer -- TODO confirm).
        join_state = manifest.join_example_rep.state
        if self._data_source.role == common_pb.FLRole.Follower and \
                join_state != dj_pb.JoinExampleState.Joined:
            return index > manifest.peer_dumped_index
        return False
parser.add_argument('--raw_data_sub_dir', type=str, required=True,
                    help='the mysql base dir to subscribe new raw data')
args = parser.parse_args()

# Build the DataSource proto from the CLI arguments.
data_source = common_pb.DataSource()
data_source.data_source_meta.name = args.data_source_name
data_source.data_source_meta.partition_num = args.partition_num
data_source.data_source_meta.start_time = args.start_time
data_source.data_source_meta.end_time = args.end_time
data_source.data_source_meta.negative_sampling_rate = \
        args.negative_sampling_rate
# FIX: role was validated with a bare `assert`, which is stripped under
# `python -O` and would silently fall through to Follower for any bogus
# value; validate explicitly and raise instead. Also hoist the repeated
# args.role.upper() call.
role = args.role.upper()
if role == 'LEADER':
    data_source.role = common_pb.FLRole.Leader
elif role == 'FOLLOWER':
    data_source.role = common_pb.FLRole.Follower
else:
    raise ValueError(
        "--role must be LEADER or FOLLOWER, got %r" % args.role)
data_source.output_base_dir = args.output_base_dir
data_source.raw_data_sub_dir = args.raw_data_sub_dir
data_source.state = common_pb.DataSourceState.Init

# Commit the data source only if no record exists under its kvstore key;
# an existing record is left untouched.
kvstore = DBClient(args.kvstore_type)
master_kvstore_key = common.data_source_kvstore_base_dir(
        data_source.data_source_meta.name)
raw_data = kvstore.get_data(master_kvstore_key)
if raw_data is None:
    logging.info("data source %s is not existed", args.data_source_name)
    common.commit_data_source(kvstore, data_source)
    logging.info("apply new data source %s", args.data_source_name)
else:
    logging.info("data source %s has been existed", args.data_source_name)
                    help='Max number of files in a job')
parser.add_argument('--start_date', type=str, default=None,
                    help='Start date of input data, format %Y%m%d')
parser.add_argument('--end_date', type=str, default=None,
                    help='End date of input data, format %Y%m%d')
args = parser.parse_args()
set_logger()
# A 'mock' kvstore type switches DBClient to its mock etcd backend.
use_mock_etcd = (args.kvstore_type == 'mock')
kvstore = DBClient(args.kvstore_type, use_mock_etcd)
kvstore_key = common.portal_kvstore_base_dir(args.data_portal_name)
portal_manifest = kvstore.get_data(kvstore_key)
data_portal_type = dp_pb.DataPortalType.PSI if \
        args.data_portal_type == 'PSI' else dp_pb.DataPortalType.Streaming
if portal_manifest is None:
    # First run for this portal name: seed a fresh manifest in the
    # kvstore, serialized as protobuf text format. processing_job_id=-1
    # presumably marks "no job processed yet" -- TODO(review): confirm.
    portal_manifest = dp_pb.DataPortalManifest(
        name=args.data_portal_name,
        data_portal_type=data_portal_type,
        output_partition_num=args.output_partition_num,
        input_file_wildcard=args.input_file_wildcard,
        input_base_dir=args.input_base_dir,
        output_base_dir=args.output_base_dir,
        raw_data_publish_dir=args.raw_data_publish_dir,
        processing_job_id=-1)
    kvstore.set_data(kvstore_key, text_format.\
                     MessageToString(portal_manifest))
else:
    # validation parameter consistency
# Remaining CLI options for the data portal master entry point.
parser.add_argument('--output_base_dir', type=str, required=True,
                    help='the base dir of output directory')
parser.add_argument('--raw_data_publish_dir', type=str, required=True,
                    help='the raw data publish dir in mysql')
parser.add_argument('--long_running', action='store_true',
                    help='make the data portal long running')
parser.add_argument('--check_success_tag', action='store_true',
                    help='Check that a _SUCCESS file exists before '
                         'processing files in a subfolder')
args = parser.parse_args()
set_logger()

# Kvstore type 'mock' selects the in-memory etcd backend.
use_mock_etcd = (args.kvstore_type == 'mock')
kvstore = DBClient(args.kvstore_type, use_mock_etcd)
kvstore_key = common.portal_kvstore_base_dir(args.data_portal_name)

# Seed the portal manifest only when none has been committed yet;
# an existing manifest is left untouched.
if kvstore.get_data(kvstore_key) is None:
    if args.data_portal_type == 'PSI':
        portal_type = dp_pb.DataPortalType.PSI
    else:
        portal_type = dp_pb.DataPortalType.Streaming
    portal_manifest = dp_pb.DataPortalManifest(
        name=args.data_portal_name,
        data_portal_type=portal_type,
        output_partition_num=args.output_partition_num,
        input_file_wildcard=args.input_file_wildcard,
        input_base_dir=args.input_base_dir,
        output_base_dir=args.output_base_dir,
        raw_data_publish_dir=args.raw_data_publish_dir,
        processing_job_id=-1)
    serialized = text_format.MessageToString(portal_manifest)
    kvstore.set_data(kvstore_key, serialized)