Example #1
    def save(self, address, partitions, schema, **kwargs):
        from fate_arch.common.address import HDFSAddress

        if isinstance(address, HDFSAddress):
            # serialize each (k, v) pair to one line of text and write it to HDFS
            self._rdd.map(lambda x: hdfs_utils.serialize(x[0], x[1])
                          ).repartition(partitions).saveAsTextFile(
                              f"{address.name_node}/{address.path}")
            schema.update(self.schema)
            return

        from fate_arch.common.address import HiveAddress, LinkisHiveAddress

        if isinstance(address, (HiveAddress, LinkisHiveAddress)):
            # Hive / Linkis Hive: convert each (k, v) pair to a Row and save it as a table
            LOGGER.debug(f"partitions: {partitions}")
            _repartition = self._rdd.map(lambda x: hive_utils.to_row(
                x[0], x[1])).repartition(partitions)
            _repartition.toDF().write.saveAsTable(
                f"{address.database}.{address.name}")
            schema.update(self.schema)
            return
        raise NotImplementedError(
            f"address type {type(address)} not supported with spark backend")
Example #2
    def save(self, address, partitions, schema, **kwargs):
        from fate_arch.common.address import HDFSAddress
        if isinstance(address, HDFSAddress):
            self._rdd.map(lambda x: hdfs_utils.serialize(x[0], x[1])) \
                .repartition(partitions) \
                .saveAsTextFile(f"{address.name_node}/{address.path}")
            schema.update(self.schema)
            return
        raise NotImplementedError(
            f"address type {type(address)} not supported with spark backend")
Example #3
    def put_all(self, kv_list: Iterable, append=True, assume_file_exist=False, **kwargs):
        LOGGER.info(f"put in hdfs file: {self._path}")
        # append to an existing file when possible, otherwise open a fresh output stream
        if append and (assume_file_exist or self._exist()):
            stream = self._hdfs_client.open_append_stream(path=self._path, compression=None)
        else:
            stream = self._hdfs_client.open_output_stream(path=self._path, compression=None)

        counter = 0
        # write one serialized (k, v) pair per line, then record the row count in the table meta
        with io.TextIOWrapper(stream) as writer:
            for k, v in kv_list:
                writer.write(hdfs_utils.serialize(k, v))
                writer.write(hdfs_utils.NEWLINE)
                counter = counter + 1
        self._meta.update_metas(count=counter)
Example #4
    def _put_all(self,
                 kv_list: Iterable,
                 append=True,
                 assume_file_exist=False,
                 **kwargs):
        LOGGER.info(f"put in file: {self.path}")

        # always create the parent directory first, otherwise creating the file below will fail
        self._local_fs_client.create_dir("/".join(self.path.split("/")[:-1]))

        if append and (assume_file_exist or self._exist()):
            stream = self._local_fs_client.open_append_stream(path=self.path,
                                                              compression=None)
        else:
            stream = self._local_fs_client.open_output_stream(path=self.path,
                                                              compression=None)

        # start from the row count already recorded in the table meta
        counter = self._meta.get_count() or 0
        with io.TextIOWrapper(stream) as writer:
            for k, v in kv_list:
                writer.write(hdfs_utils.serialize(k, v))
                writer.write(hdfs_utils.NEWLINE)
                counter = counter + 1
        self._meta.update_metas(count=counter)
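
The append-or-create pattern in Examples #3 and #4 can also be exercised outside these table classes. Below is a minimal standalone sketch using pyarrow's LocalFileSystem, which matches the create_dir / open_append_stream / open_output_stream calls above; the write_lines helper and the plain-string lines are illustrative assumptions rather than FATE API.

    import io
    import os
    from pyarrow import fs

    def write_lines(path: str, lines, append: bool = True) -> int:
        local_fs = fs.LocalFileSystem()
        # create the parent directory first, as the local-FS example above does
        parent = os.path.dirname(path)
        if parent:
            local_fs.create_dir(parent)
        # append when the file already exists, otherwise create it
        exists = local_fs.get_file_info(path).type != fs.FileType.NotFound
        if append and exists:
            stream = local_fs.open_append_stream(path, compression=None)
        else:
            stream = local_fs.open_output_stream(path, compression=None)
        count = 0
        with io.TextIOWrapper(stream) as writer:
            for line in lines:
                writer.write(line)
                writer.write("\n")
                count += 1
        return count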