def save(self, address, partitions, schema, **kwargs):
    from fate_arch.common.address import HDFSAddress

    if isinstance(address, HDFSAddress):
        # serialize each (k, v) pair to one line of text and write the RDD out as text files
        self._rdd.map(lambda x: hdfs_utils.serialize(x[0], x[1])) \
            .repartition(partitions) \
            .saveAsTextFile(f"{address.name_node}/{address.path}")
        schema.update(self.schema)
        return

    from fate_arch.common.address import HiveAddress, LinkisHiveAddress

    if isinstance(address, (HiveAddress, LinkisHiveAddress)):
        LOGGER.debug(f"partitions: {partitions}")
        # convert each (k, v) pair to a Row, then persist the DataFrame as a Hive table
        _repartition = self._rdd.map(lambda x: hive_utils.to_row(x[0], x[1])) \
            .repartition(partitions)
        _repartition.toDF().write.saveAsTable(f"{address.database}.{address.name}")
        schema.update(self.schema)
        return

    raise NotImplementedError(f"address type {type(address)} not supported with spark backend")
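# A minimal, hedged usage sketch (not part of the class above). The `table` and
# `schema` names and the address values are illustrative assumptions only.
#
#     from fate_arch.common.address import HDFSAddress
#
#     address = HDFSAddress(name_node="hdfs://namenode:9000", path="/data/my_table")  # hypothetical values
#     schema = {}
#     table.save(address, partitions=4, schema=schema)  # `table`: an instance of this class
#     # after save(), `schema` has been updated in place with the table's own schema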
def save(self, address, partitions, schema, **kwargs):
    from fate_arch.common.address import HDFSAddress

    if isinstance(address, HDFSAddress):
        self._rdd.map(lambda x: hdfs_utils.serialize(x[0], x[1])) \
            .repartition(partitions) \
            .saveAsTextFile(f"{address.name_node}/{address.path}")
        schema.update(self.schema)
        return

    raise NotImplementedError(f"address type {type(address)} not supported with spark backend")
def put_all(self, kv_list: Iterable, append=True, assume_file_exist=False, **kwargs):
    LOGGER.info(f"put in hdfs file: {self._path}")
    appending = append and (assume_file_exist or self._exist())
    if appending:
        stream = self._hdfs_client.open_append_stream(path=self._path, compression=None)
    else:
        stream = self._hdfs_client.open_output_stream(path=self._path, compression=None)

    # when appending, start from the stored row count so the meta count stays accurate;
    # a fresh file starts from zero
    counter = (self._meta.get_count() or 0) if appending else 0
    with io.TextIOWrapper(stream) as writer:
        for k, v in kv_list:
            writer.write(hdfs_utils.serialize(k, v))
            writer.write(hdfs_utils.NEWLINE)
            counter += 1
    self._meta.update_metas(count=counter)
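# Hedged usage sketch for put_all (the `table` name and the sample pairs are
# assumptions): appending serialized key/value lines to the backing HDFS file.
#
#     table.put_all([("k1", "v1"), ("k2", "v2")], append=True)
#     # each pair becomes one hdfs_utils-serialized line, so readers can split
#     # on hdfs_utils.NEWLINE to recover the records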
def _put_all(self, kv_list: Iterable, append=True, assume_file_exist=False, **kwargs):
    LOGGER.info(f"put in file: {self.path}")
    # always create the parent directory first; otherwise creating the file below fails
    self._local_fs_client.create_dir("/".join(self.path.split("/")[:-1]))
    appending = append and (assume_file_exist or self._exist())
    if appending:
        stream = self._local_fs_client.open_append_stream(path=self.path, compression=None)
    else:
        stream = self._local_fs_client.open_output_stream(path=self.path, compression=None)

    # start from the stored row count only when appending; overwriting resets the count
    counter = (self._meta.get_count() or 0) if appending else 0
    with io.TextIOWrapper(stream) as writer:
        for k, v in kv_list:
            writer.write(hdfs_utils.serialize(k, v))
            writer.write(hdfs_utils.NEWLINE)
            counter += 1
    self._meta.update_metas(count=counter)
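# Note on the parent-directory derivation above: for "/"-separated paths,
# "/".join(path.split("/")[:-1]) is equivalent to os.path.dirname. A quick check
# (the sample path is illustrative only):
#
#     import os
#     path = "/tmp/fate/output/part-0"
#     assert "/".join(path.split("/")[:-1]) == os.path.dirname(path)  # both give "/tmp/fate/output"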