def upload_file(self, input_file, head, job_id=None, input_feature_count=None, table=None, without_block=True):
    """Stream a delimited local file into a storage table, chunk by chunk.

    :param input_file: path of the local file to upload
    :param head: if True, the first line is a header; it is consumed and used
        to update the table meta via ``self.update_table_meta``
    :param job_id: job id included in progress updates pushed to the driver
    :param input_feature_count: total number of data lines, used to compute
        the progress percentage (decremented by one when a header is present)
    :param table: destination storage table; falls back to ``self.table``
    :param without_block: when True, report progress to the driver and push
        ``part_of_data`` meta for the first chunk while uploading
    :return: None (returns when the input file is exhausted)
    """
    if not table:
        table = self.table
    with open(input_file, "r") as fin:
        lines_count = 0
        if head is True:
            data_head = fin.readline()
            input_feature_count -= 1
            self.update_table_meta(data_head)
        n = 0
        # One uuid per upload; get_line() is hoisted out of the loop since it
        # returns the line-parsing callable used for every record.
        fate_uuid = uuid.uuid1().hex
        get_line = self.get_line()
        while True:
            data = list()
            # readlines(sizehint) yields roughly upload_max_bytes per chunk.
            lines = fin.readlines(JobDefaultConfig.upload_max_bytes)
            line_index = 0
            if lines:
                for line in lines:
                    values = line.rstrip().split(
                        self.parameters["id_delimiter"])
                    k, v = get_line(
                        values=values,
                        line_index=line_index,
                        extend_sid=self.parameters["extend_sid"],
                        auto_increasing_sid=self.parameters["auto_increasing_sid"],
                        id_delimiter=self.parameters["id_delimiter"],
                        fate_uuid=fate_uuid,
                    )
                    data.append((k, v))
                    line_index += 1
                if without_block:
                    lines_count += len(data)
                    # max(..., 1) guards against ZeroDivisionError for a
                    # header-only file (input_feature_count decremented to 0).
                    save_progress = lines_count / max(input_feature_count, 1) * 100 // 1
                    job_info = {
                        "progress": save_progress,
                        "job_id": job_id,
                        "role": self.parameters["local"]["role"],
                        "party_id": self.parameters["local"]["party_id"],
                    }
                    ControllerClient.update_job(job_info=job_info)
                table.put_all(data)
                # Only the first chunk is recorded as sample data in the meta.
                if n == 0 and without_block:
                    table.meta.update_metas(part_of_data=data)
            else:
                return
            n += 1
def save_data_table(self, job_id, dst_table_name, dst_table_namespace, head=True):
    """Read the configured input file and persist it into ``self.table``.

    :param job_id: job id included in progress updates pushed to the driver
    :param dst_table_name: destination table name recorded via ``save_meta``
    :param dst_table_namespace: destination table namespace for ``save_meta``
    :param head: if True, the first line is a header used to build the schema
    :return: total number of rows stored in the table
    """
    input_file = self.parameters["file"]
    input_feature_count = self.get_count(input_file)
    with open(input_file, 'r') as fin:
        lines_count = 0
        if head is True:
            data_head = fin.readline()
            input_feature_count -= 1
            _, meta = self.table.get_meta().update_metas(
                schema=data_utils.get_header_schema(
                    header_line=data_head,
                    id_delimiter=self.parameters["id_delimiter"]))
            self.table.set_meta(meta)
        n = 0
        while True:
            data = list()
            # readlines(sizehint) yields roughly MAX_BYTES per chunk.
            lines = fin.readlines(self.MAX_BYTES)
            if lines:
                for line in lines:
                    values = line.rstrip().split(
                        self.parameters["id_delimiter"])
                    data.append((
                        values[0],
                        data_utils.list_to_str(
                            values[1:],
                            id_delimiter=self.parameters["id_delimiter"])))
                lines_count += len(data)
                # max(..., 1) guards against ZeroDivisionError for a
                # header-only file (input_feature_count decremented to 0).
                save_progress = lines_count / max(input_feature_count, 1) * 100 // 1
                job_info = {
                    'progress': save_progress,
                    "job_id": job_id,
                    "role": self.parameters["local"]['role'],
                    "party_id": self.parameters["local"]['party_id']
                }
                ControllerClient.update_job(job_info=job_info)
                self.table.put_all(data)
                # Only the first chunk is recorded as sample data in the meta.
                if n == 0:
                    self.table.get_meta().update_metas(part_of_data=data)
            else:
                # Input exhausted: finalize meta with the real count/partitions
                # and record the destination table before returning.
                table_count = self.table.count()
                self.table.get_meta().update_metas(
                    count=table_count,
                    partitions=self.parameters["partition"])
                self.save_meta(dst_table_namespace=dst_table_namespace,
                               dst_table_name=dst_table_name,
                               table_count=table_count)
                return table_count
            n += 1
def report_task_update_to_driver(cls, task_info):
    """Push a task status update to the FATE Flow server.

    :param task_info: dict carrying at least task_id, task_version,
        role and party_id
    :return: None
    """
    keys = ("task_id", "task_version", "role", "party_id")
    schedule_logger().info(
        "report task {} {} {} {} to driver".format(
            *(task_info[key] for key in keys)
        )
    )
    ControllerClient.report_task(task_info=task_info)
def run(self, component_parameters=None, args=None):
    """Export a storage table to a local text file, one ``key<delimiter>value``
    line per record, reporting progress to the driver along the way.

    :param component_parameters: dict providing "DownloadParam", "role"
        and "local" entries used to populate ``self.parameters``
    :param args: unused here
    """
    self.parameters = component_parameters["DownloadParam"]
    self.parameters["role"] = component_parameters["role"]
    self.parameters["local"] = component_parameters["local"]
    name, namespace = self.parameters.get("name"), self.parameters.get(
        "namespace")
    with open(os.path.abspath(self.parameters["output_path"]), "w") as fout:
        # Storage session is scoped by the context manager, so it is torn
        # down automatically when the export finishes or fails.
        with storage.Session.build(
                session_id=job_utils.generate_session_id(
                    self.tracker.task_id,
                    self.tracker.task_version,
                    self.tracker.role,
                    self.tracker.party_id,
                    suffix="storage",
                    random_end=True),
                name=name,
                namespace=namespace) as storage_session:
            data_table = storage_session.get_table()
            # Total row count, used as the denominator for progress updates.
            count = data_table.count()
            LOGGER.info('===== begin to export data =====')
            lines = 0
            job_info = {}
            job_info["job_id"] = self.tracker.job_id
            job_info["role"] = self.tracker.role
            job_info["party_id"] = self.tracker.party_id
            for key, value in data_table.collect():
                # Records with a falsy value are written as the key alone.
                if not value:
                    fout.write(key + "\n")
                else:
                    fout.write(key + self.parameters.get("delimiter", ",") +
                               str(value) + "\n")
                lines += 1
                if lines % 2000 == 0:
                    LOGGER.info(
                        "===== export {} lines =====".format(lines))
                # Progress is pushed to the driver every 10000 lines only,
                # to limit update traffic.
                if lines % 10000 == 0:
                    job_info["progress"] = lines / count * 100 // 1
                    ControllerClient.update_job(job_info=job_info)
            # Final update: mark the export as fully complete.
            job_info["progress"] = 100
            ControllerClient.update_job(job_info=job_info)
            self.callback_metric(
                metric_name='data_access',
                metric_namespace='download',
                metric_data=[Metric("count", data_table.count())])
            LOGGER.info("===== export {} lines totally =====".format(lines))
            LOGGER.info('===== export data finish =====')
            LOGGER.info('===== export data file path:{} ====='.format(
                os.path.abspath(self.parameters["output_path"])))
def _run(self, cpn_input: ComponentInputProtocol):
    """Export a storage table to a local text file, one ``key<delimiter>value``
    line per record, reporting progress to the driver along the way.

    :param cpn_input: component input carrying ``parameters`` and ``roles``
    :raises Exception: if no table is found for the configured name/namespace
    """
    self.parameters = cpn_input.parameters
    self.parameters["role"] = cpn_input.roles["role"]
    self.parameters["local"] = cpn_input.roles["local"]
    name, namespace = self.parameters.get("name"), self.parameters.get(
        "namespace")
    with open(os.path.abspath(self.parameters["output_path"]), "w") as fw:
        # NOTE(review): unlike the storage.Session.build variant, this Session
        # is never explicitly closed/used as a context manager here — confirm
        # whether Session manages its own lifecycle or this leaks a session.
        session = Session(
            job_utils.generate_session_id(
                self.tracker.task_id,
                self.tracker.task_version,
                self.tracker.role,
                self.tracker.party_id,
            ))
        data_table = session.get_table(name=name, namespace=namespace)
        if not data_table:
            raise Exception(f"no found table {name} {namespace}")
        # Total row count, used as the denominator for progress updates.
        count = data_table.count()
        LOGGER.info("===== begin to export data =====")
        lines = 0
        job_info = {}
        job_info["job_id"] = self.tracker.job_id
        job_info["role"] = self.tracker.role
        job_info["party_id"] = self.tracker.party_id
        for key, value in data_table.collect():
            # Records with a falsy value are written as the key alone.
            if not value:
                fw.write(key + "\n")
            else:
                fw.write(key + self.parameters.get("delimiter", ",") +
                         str(value) + "\n")
            lines += 1
            if lines % 2000 == 0:
                LOGGER.info("===== export {} lines =====".format(lines))
            # Progress is pushed to the driver every 10000 lines only,
            # to limit update traffic.
            if lines % 10000 == 0:
                job_info["progress"] = lines / count * 100 // 1
                ControllerClient.update_job(job_info=job_info)
        # Final update: mark the export as fully complete.
        job_info["progress"] = 100
        ControllerClient.update_job(job_info=job_info)
        self.callback_metric(
            metric_name="data_access",
            metric_namespace="download",
            metric_data=[Metric("count", data_table.count())],
        )
        LOGGER.info("===== export {} lines totally =====".format(lines))
        LOGGER.info("===== export data finish =====")
        LOGGER.info("===== export data file path:{} =====".format(
            os.path.abspath(self.parameters["output_path"])))
def report_task_info_to_driver(self):
    """Log the accumulated ``report_info`` and forward it to the FATE Flow
    server via ControllerClient."""
    info = self.report_info
    LOGGER.info(
        "report {} {} {} {} {} to driver:\n{}".format(
            type(self).__name__,
            info["task_id"],
            info["task_version"],
            info["role"],
            info["party_id"],
            info,
        )
    )
    ControllerClient.report_task(info)