Пример #1
0
    def read(self):
        """Read whatever ``self.path`` points at and return the reader's result.

        The reader is chosen from two facts:

        * whether an HDFS client is configured (``self.hdfs_client``), and
        * whether ``self.path`` is a directory on the selected filesystem.

        Returns:
            The value produced by the selected private reader
            (directory or single-file variant, local or HDFS).
        """
        if self.hdfs_client is None:
            # No HDFS client configured: check the path with os.path.
            if os.path.isdir(self.path):
                return self.__read_directory_localfs()
            return self.__read_file_path_localfs(filename=self.path)

        # An HDFS client is available: ask it whether the path is a directory.
        if is_hdfs_directory(self.hdfs_client, self.path):
            return self.__read_directory_hdfs()
        return self.__read_file_path_hdfs(filename=self.path)
Пример #2
0
    def __read_files_hdfs(self, dataset, read_docs=None):
        """Read ``.ann.json`` documents from HDFS into ``dataset``.

        If ``self.directory`` is an HDFS directory, it is walked and every file
        whose name ends with ``.ann.json`` is read. Otherwise ``self.directory``
        itself is treated as the single file to read.

        Args:
            dataset: Passed through to the annotation reader for each file.
            read_docs: Optional set collecting document ids. It is updated in
                place; a fresh set is created when omitted.

        Returns:
            The set of document ids: ``read_docs`` itself when one was given,
            extended with the ids read by this call.
        """
        seen_ids = set() if read_docs is None else read_docs

        if is_hdfs_directory(self.hdfs_client, self.directory):
            paths = walk_hdfs_directory(
                self.hdfs_client,
                self.directory,
                lambda name: name.endswith(".ann.json"),
            )
        else:
            # Not a directory: the configured path is the only file to read.
            paths = [self.directory]

        for path in paths:
            with self.hdfs_client.read(path, encoding="utf-8") as stream:
                seen_ids.add(self.__read_annjson(stream, path, dataset))

        return seen_ids