Python Client.read 예제들, hdfs.client.Client.read Python 예제들

예제 #1

0

파일 보기

    def get(self, request):
        """Return a preview of a delimited text file stored on HDFS.

        Reads the ``hdfsName`` query parameter (falling back to a hard-coded
        default), lists ``/datahoop/<hdfsName>``, takes the second entry of
        that listing and reads up to 2000 bytes of it, trying GBK first and
        UTF-8 when GBK decoding fails.  The text is split into lines and every
        line into comma-separated fields.

        Returns a 200 response carrying ``{"data": rows}``, or a 400 response
        carrying an ``error`` message when an ``HdfsError`` is raised.
        """
        _hdfsName = request.GET.get("hdfsName",
                                    "46eccfa2-1c56-11e8-a752-1008b1983d21")
        _hdfsPath = os.path.join("/datahoop/", _hdfsName)

        try:
            # Connect to HDFS and read the file.
            cli = Client(settings.HDFS_HOST)
            fileName = cli.list(_hdfsPath)[1]
            _hdfsPath = os.path.join(_hdfsPath + "/", fileName)
            try:
                with cli.read(_hdfsPath, length=2000, encoding="gbk") as f:
                    datas = f.read()
            except UnicodeDecodeError:
                with cli.read(_hdfsPath, length=2000, encoding="utf8") as f:
                    datas = f.read()
            # Normalise Windows line endings.  Strings are immutable, so the
            # result has to be assigned back; otherwise every row would keep
            # a trailing "\r" in its last field.
            datas = datas.replace("\r\n", "\n")
            logger.debug(datas)
            # Convert the text into a list of rows.
            datas = datas.strip('"').split('\n')
            content = [i.strip('"').split(",") for i in datas]
        except HdfsError:
            return Response(data={"error": "文件未找到或文件编码格式不符合"},
                            status=status.HTTP_400_BAD_REQUEST)

        return Response(data={"data": content}, status=status.HTTP_200_OK)

예제 #2

0

파일 보기

    def get(self, request):
        """Send a file stored under ``/datahoop/`` on HDFS back as an attachment.

        The ``hdfsName`` query parameter (or a hard-coded default) selects
        both the HDFS file and the ``DataSource`` row whose ``file_name`` is
        used as the download name.  The file is decoded as GBK, falling back
        to UTF-8 on a decoding error.  An ``HdfsError`` yields a 400 response.
        """
        _hdfsName = request.GET.get(
            "hdfsName", "46eccfa2-1c56-11e8-a752-1008b1983d21")
        _hdfsPath = os.path.join("/datahoop/", _hdfsName)
        obj = DataSource.objects.get(format_filename=_hdfsName)

        def _read_all(encoding):
            # Read the complete file using the given text encoding.
            with cli.read(_hdfsPath, encoding=encoding) as f:
                return f.read()

        try:
            # Connect to HDFS and read the file.
            cli = Client(settings.HDFS_HOST)
            try:
                datas = _read_all("gbk")
            except UnicodeDecodeError:
                datas = _read_all("utf8")
        except HdfsError:
            return Response(data={"error": "文件未找到或文件编码格式不符合"},
                            status=status.HTTP_400_BAD_REQUEST)

        response = HttpResponse(content_type='csv/plain')
        disposition = 'attachment; filename={0}'.format(obj.file_name)
        response['Content-Disposition'] = disposition
        response.write(datas)
        return response

예제 #3

0

파일 보기

파일: connect_hdfs.py 프로젝트: NanZhang1991/statistical-analysis-and-machine-learning-algorithm

def get_data(file_path):
    """Read a UTF-8 text file from HDFS and split each line on whitespace.

    Prints the first two rows and returns the full list of rows, where each
    row is the list of whitespace-separated tokens of one line.
    """
    hdfs_url = "http://192.168.0.201:50070"
    hdfs = Client(hdfs_url, root='/')
    rows = []
    with hdfs.read(file_path, buffer_size=1024, delimiter='\n',
                   encoding='utf-8') as reader:
        for raw_line in reader:
            rows.append(raw_line.strip().split())
        print("data", rows[0:2])
    return rows

예제 #4

0

파일 보기

파일: hdfs_wrapper.py 프로젝트: mayi140611/mayiutils_n1

def download_parquet_from_hdfs_dir(parquet_dir,
                                   local_dir,
                                   hdfs_ip,
                                   hdfs_port=50070):
    """
    Download every ``*.parquet`` entry of an HDFS directory into ``local_dir``.

    :param parquet_dir: HDFS directory holding the parquet files,
        e.g. '/data/a.parquet'
    :param local_dir: local directory the files are written into; it is
        created when a file is about to be written and it does not exist yet
    :param hdfs_ip: host name or IP address used to build the client URL
    :param hdfs_port: port used to build the client URL
    :return: None
    """
    import os
    import posixpath
    from hdfs.client import Client
    client = Client(f'http://{hdfs_ip}:{hdfs_port}')
    parquet_list = client.list(parquet_dir)
    print(parquet_list)
    for p in parquet_list:
        if not p.endswith('.parquet'):
            continue
        # HDFS paths always use '/', so they are joined with posixpath;
        # os.path.join would produce '\\' separators on Windows.
        remote_path = posixpath.join(parquet_dir, p)
        print(f'downloading {remote_path}')
        with client.read(remote_path) as reader:
            data = reader.read()
        # exist_ok avoids the check-then-create race of the explicit test.
        os.makedirs(local_dir, exist_ok=True)
        with open(os.path.join(local_dir, p), 'wb') as f:
            f.write(data)

예제 #5

0

파일 보기

파일: normal_test.py 프로젝트: NanZhang1991/statistical-analysis-and-machine-learning-algorithm

def get_data_hdfs(file_path):
    """Load a comma-separated UTF-8 text file from HDFS into a DataFrame.

    Every line is stripped and split on ','.  The first row supplies the
    column names and the remaining rows the data.  The first five rows are
    printed before the frame is built.
    """
    hdfs_url = "http://192.168.0.201:50070"
    hdfs = Client(hdfs_url, root='/')
    rows = []
    with hdfs.read(file_path, buffer_size=1024, delimiter='\n',
                   encoding='utf-8') as reader:
        for raw_line in reader:
            rows.append(raw_line.strip().split(','))
        print("data", rows[0:5])
    header, body = rows[0], rows[1:]
    return pd.DataFrame(body, columns=header)

예제 #6

0

파일 보기

파일: querier_parallel.py 프로젝트: Rena7ssance/search-trajectory-kBCT

 def hdfs_file2points(path):
     """Read tab-separated coordinate pairs from an HDFS file.

     Each line is split on tabs and its first two fields are converted to
     floats.  Returns a list of ``[first, second]`` float pairs.
     """
     client = Client(QuerierParallel.master_hdfs_path,
                     root="/",
                     timeout=100,
                     session=False)
     points = []
     # An encoding is requested so the reader yields text lines; without it
     # the raw byte stream is returned and the str-based strip/split below
     # fails on Python 3.
     # NOTE(review): UTF-8 is assumed for the input files - confirm.
     with client.read(path, encoding="utf-8") as f:
         for line in f:
             info = line.strip('\n').split('\t')
             points.append([float(info[0]), float(info[1])])
     # The with block releases the reader, so no explicit close is needed.
     return points

예제 #7

0

파일 보기

파일: 从hdfs上取数据.py 프로젝트: Postan-W/modelservice

def do():
    """Copy a CSV file found under the global ``csv_path`` to a local file.

    Lists ``csv_path``, appends the name of every ``.csv`` entry to the
    global ``csv_path`` and then writes the lines of the resulting HDFS path
    to ``./异常临界值local.csv``, both sides using GB2312.
    """
    global csv_path
    hdfs_client = Client(hdfshost)
    entries = hdfs_client.list(csv_path)
    print(entries)
    for entry in entries:
        if not entry.endswith(".csv"):
            continue
        # NOTE(review): with more than one .csv entry the names are all
        # concatenated onto the same path - confirm a single file is expected.
        csv_path += entry
    # Read the CSV and write it to the local file.
    with open("./异常临界值local.csv", 'w', encoding='GB2312') as local, \
            hdfs_client.read(csv_path, encoding='GB2312') as remote:
        # NOTE(review): '\n' is stripped from every line before writing, so
        # the local copy has no '\n' line breaks - confirm this is intended.
        local.writelines(line.strip('\n') for line in remote)

예제 #8

0

파일 보기

파일: offline.py 프로젝트: Liangzhuoxuan/PoemRS

 def read_accesslog_from_hdfs(self):
     """Concatenate the lines of every file under ``/hadoop_file``.

     Lines starting with ``filed1`` are skipped.
     NOTE(review): the accumulated text is not returned or stored in the
     visible code - the excerpt may be truncated.
     """
     # The real-time log stream is persisted once per 5 click records.
     hdfs = Client("http://localhost:50070")
     ss = ""
     for name in hdfs.list("/hadoop_file"):
         with hdfs.read("/hadoop_file/" + name, encoding="utf-8") as reader:
             for line in reader:
                 # Drop test data.
                 if not line.startswith("filed1"):
                     ss += line

예제 #9

0

파일 보기

def read(dir_path, header):
    """Collect ``key=value&key=value`` log lines from HDFS into a DataFrame.

    Walks two directory levels (``dir_path/<date_dir>/<log_file>``).  For
    every non-empty line the text after the first '=' of each '&'-separated
    field is kept.  ``header`` supplies the column names of the result.
    """
    client = Client("http://127.0.0.1:50070")
    log_data = []
    for date_dir in client.list(dir_path):
        date_path = dir_path + '/' + date_dir
        for log_file in client.list(date_path):
            # An encoding is requested so lines arrive as text; the raw byte
            # stream cannot be split with the str separators used below on
            # Python 3.
            # NOTE(review): UTF-8 is assumed for the log files - confirm.
            with client.read(date_path + '/' + log_file,
                             encoding='utf-8') as fs:
                for line in fs:
                    row = line.strip().split('&')
                    if row != ['']:
                        log_data.append(
                            [field.split('=')[1] for field in row])
    return pd.DataFrame(log_data, columns=header)

예제 #10

0

파일 보기

def read_corpus():
    """Load the question/answer corpus from ``/corpus/q_a.csv`` on HDFS.

    Returns ``(qList_kw, qList, aList)``: the segmented keywords of every
    question (via ``seg.cut``), the questions and the answers.

    NOTE(review): assumes column 0 is the question and column 1 the answer -
    confirm against the corpus file.
    """
    import csv

    qList = []
    # Keyword lists of the questions.
    qList_kw = []
    aList = []
    client = Client("http://localhost:50070")
    with client.read("/corpus/q_a.csv", encoding='utf-8') as reader:
        # Parse each line into columns.  Indexing the raw line string, as
        # before, only yields its first and second character.
        for row in csv.reader(reader):
            if len(row) < 2:
                # Blank or incomplete line: nothing to pair up.
                continue
            question, answer = row[0].strip(), row[1].strip()
            qList.append(question)
            qList_kw.append(seg.cut(question))
            aList.append(answer)
    return qList_kw, qList, aList

예제 #11

0

파일 보기

class HdfsClient(object):
    """Small wrapper exposing file-system style helpers on an hdfs ``Client``."""

    def __init__(self, url=None):
        """Create the underlying client for ``url``."""
        self.url = url
        self.client = Client(url=url)

    def ls(self, path):
        """Return the result of ``Client.list`` for ``path``."""
        return self.client.list(path)

    def isFile(self, path):
        """Return True when ``path`` has a status whose type equals ``FILE``."""
        # strict=False makes status() return a falsy value instead of raising
        # when the path is missing.
        result = self.client.status(path, strict=False)
        if result:
            return result[TYPE] == FILE
        else:
            return False

    def mkdir(self, path):
        """Create ``path`` (and missing parents) with permission 777."""
        self.client.makedirs(path, permission=777)

    def isDirectory(self, path):
        """Return True when ``path`` has a status whose type equals ``DIRECTORY``."""
        result = self.client.status(path, strict=False)
        if result:
            return result[TYPE] == DIRECTORY
        else:
            return False

    def upload(self, localSourcePath, remoteDistPath):
        """Upload a local path to HDFS, overwriting an existing target."""
        self.client.upload(remoteDistPath, localSourcePath, overwrite=True)

    def dowload(self, remoteSourcePath, localDistPath):
        """Download an HDFS path to the local disk, overwriting the target."""
        self.client.download(remoteSourcePath, localDistPath, overwrite=True)

    # Correctly spelled alias; ``dowload`` is kept for existing callers.
    download = dowload

    def put(self, localSourcePath, remoteDistPath):
        """Stream a local file to HDFS in blocks of ``FILE_SIZE`` bytes."""
        # Binary mode keeps the bytes unchanged (no decoding and no newline
        # translation), so non-text files can be uploaded as well.
        with open(localSourcePath,
                  "rb") as reader, self.client.write(remoteDistPath) as writer:
            data = reader.read(FILE_SIZE)
            while data:
                writer.write(data)
                data = reader.read(FILE_SIZE)

    def get(self, remoteSourcePath, localDistPath):
        """Stream an HDFS file to a local file in chunks of ``FILE_SIZE`` bytes."""
        # No encoding is requested, so the chunks are raw bytes and the local
        # file must be opened in binary mode.  "wb" replaces an existing file
        # instead of appending to it, so repeated downloads do not duplicate
        # the content.
        with self.client.read(remoteSourcePath,
                              chunk_size=FILE_SIZE) as reader, open(
                                  localDistPath, "wb") as writer:
            for chunk in reader:
                writer.write(chunk)

예제 #12

0

파일 보기

파일: hdfs_wrapper.py 프로젝트: mayi140611/mayiutils_n1

def download_parquet_from_hdfs(parquet_path,
                               local_path,
                               hdfs_ip,
                               hdfs_port=50070):
    """
    Copy one parquet file from HDFS to the local disk.

    The complete file is read into memory and then written to ``local_path``.

    :param parquet_path: HDFS path of the file, e.g. '/data/a.parquet'
    :param local_path: local target file, e.g. '/data_gen/b.parquet'
    :param hdfs_ip: host name or IP address used to build the client URL
    :param hdfs_port: port used to build the client URL
    :return: None
    """
    from hdfs.client import Client
    endpoint = f'http://{hdfs_ip}:{hdfs_port}'
    hdfs = Client(endpoint)
    with hdfs.read(parquet_path) as remote:
        payload = remote.read()
    with open(local_path, 'wb') as sink:
        sink.write(payload)

예제 #13

0

파일 보기

import pandas as pd

# Export position records stored under /lr/pos on HDFS to one Excel file per
# sub-directory.
client = Client("http://10.103.0.11:9870", root='/')

files = []
path = '/lr/pos'
dirt = []
# Collect the directory list of every level visited by walk(); `root` and
# `files` only keep the values of the last iteration.
for a, b, c in client.walk(path):
    root = a
    dirt.append(b)
    files = c
# Column names of the exported table.
col = [
    'id', 'mmsi', 'latitude', 'longitude', 'course', 'speed', 'lasttm', 'day'
]
# dirt[0] is the directory list of the first tuple yielded by walk()
# (presumably the direct children of `path` - confirm walk order).
for file in dirt[0]:
    print(file)
    res = []
    with client.read(path + '/' + file + '/' + 'part-00000') as read:
        for line in read:
            # Keep the text between the first '[' and the next ']' of the
            # line's str() form, then split it into comma-separated fields.
            data = str(line).split('[')[1].split(']')[0]
            l1 = data.split(',')
            # Fields 2-5 become floats, field 6 an int; the rest stay text.
            l1[2] = float(l1[2])
            l1[3] = float(l1[3])
            l1[4] = float(l1[4])
            l1[5] = float(l1[5])
            l1[6] = int(l1[6])
            res.append(l1)
    df = pd.DataFrame(res, columns=col)
    # The part of the directory name before the first '.' names the workbook.
    name = str(file).split('.')[0]
    df.to_excel('../posData/' + name + '.xlsx')

예제 #14

0

파일 보기

파일: photo.py 프로젝트: licosun/python_code

'''

import hdfs
import os
import json
import json, time, re
from hdfs.client import Client

# Scan today's nginx log on HDFS; every line is parsed as one JSON document.
client = Client("http://IP:50070")
# Today's date (YYYY-MM-DD) selects the log file.
data = time.strftime("%Y-%m-%d")
filepath = ("/nginx_log/www_test1-%s.log" % data)
#print(filepath)
dirname = "/nginx_log/"

TARGETPATH = r'D:\targeFile'
with client.read(filepath, encoding='utf-8') as f:
    for l in f:
        d = json.loads(l)
        # lines.append(l.strip())
        # d = json.loads(l)
        # Values gathered from the nested dict/list fields of this record.
        # NOTE(review): `lt` is rebuilt per line and not used afterwards in
        # the visible code - the excerpt may be truncated.
        lt = []
        s = d.get('status')
        # Only records whose status is the string "200" are processed.
        if s == "200":
            #print(d.values())
            for y in d:
                if type(d[y]) == dict:
                    # Flatten the values of a nested object.
                    for k in d[y]:
                        lt.append(d[y][k])
                elif type(d[y]) == list:
                    # Flatten the items of a nested array.
                    for i in d[y]:
                        lt.append(i)

예제 #15

0

파일 보기

 def test_hdfs(self):
     """Smoke-check the HDFS connection: print a listing and read 10 bytes."""
     hdfs = Client('http://172.16.2.41:50070', proxy='hive', root='/')
     listing = hdfs.list('/tmp/jiajie')
     print(listing)
     # Only the first 10 bytes of the file are requested.
     with hdfs.read('/tmp/jiajie/birth_names.txt', length=10) as reader:
         data = reader.read()

예제 #16

0

파일 보기

파일: hdfs_util.py 프로젝트: xiangpingbu/speedup

from hdfs.client import Client
import time
# Module-level client for the given WebHDFS endpoint.
client = Client("http://10.10.10.103:50070")

# Read the whole file into `content`; no encoding is passed to read().
with client.read('/user/lifeng/test/honeybee/hello') as fs:
    content = fs.read()

예제 #17

0

파일 보기

파일: hdfs_client.py 프로젝트: xingangzhang/HDFSManage

class HDFSClient:
    """Wrapper that forwards its calls to an hdfs ``Client``/``InsecureClient``."""

    def __init__(self,
                 url,
                 root=None,
                 user=None,
                 proxy=None,
                 timeout=None,
                 session=None):
        """ Connect to HDFS.
        url: host name or IP address and port of the HDFS name node
        root: root path, used as prefix for every HDFS path passed to the client
        user: when given, an InsecureClient is created for that user;
              otherwise a plain Client is used (default user dr.who)
        proxy: user to proxy as
        timeout: connection timeout, forwarded to the request handler
        session: requests.Session instance used to issue all requests
        """
        if user:
            # Forward the remaining options as well so that root/proxy/
            # timeout/session are honoured for both client types.  Unset
            # options are left out so InsecureClient applies its own defaults.
            options = {
                name: value
                for name, value in (("root", root), ("proxy", proxy),
                                    ("timeout", timeout),
                                    ("session", session))
                if value is not None
            }
            self.client = InsecureClient(url, user=user, **options)
        else:
            self.client = Client(url,
                                 root=root,
                                 proxy=proxy,
                                 timeout=timeout,
                                 session=session)

    def list_hdfs_file(self, hdfs_path, status=False):
        """ Return the entries of a directory.
        status: also return the attributes (FileStatus) of every file or directory
        return: a list; with status enabled each item is a tuple of the
                directory/file name and its attributes
        """
        return self.client.list(hdfs_path, status=status)

    def walk_hdfs_file(self,
                       hdfs_path,
                       depth=0,
                       status=False,
                       ignore_missing=False,
                       allow_dir_changes=False):
        """ Depth-first walk of the remote file system.
        hdfs_path: starting path. An HdfsError is raised when it does not
                   exist. When it points to a file the generator is empty
        depth: maximum depth to explore. 0 means no limit
        status: also return the FileStatus of every file or folder
        ignore_missing: ignore missing nested folders instead of raising
        allow_dir_changes: allow changes to the directory list to affect the walk
        return: a generator; values follow Python's walk function
        """
        return self.client.walk(hdfs_path,
                                depth=depth,
                                status=status,
                                ignore_missing=ignore_missing,
                                allow_dir_changes=allow_dir_changes)

    def delete_hdfs_file(self, hdfs_path, recursive=False, skip_trash=False):
        """ Delete a file or directory.
        recursive: delete recursively. By default deleting a non-empty
                   directory raises an HdfsError
        skip_trash: when False, deleted paths are moved to the appropriate
                    trash folder instead of being removed
        return: True when the deletion succeeded, False when no file or
                directory existed at hdfs_path
        """
        return self.client.delete(hdfs_path,
                                  recursive=recursive,
                                  skip_trash=skip_trash)

    def download_hdfs_file(self,
                           hdfs_path,
                           local_path,
                           overwrite=True,
                           n_threads=1,
                           temp_dir=None,
                           **kwargs):
        """ Download a file or folder.
        hdfs_path: HDFS path of the file or folder to download. For a folder
                   all files below it are downloaded
        local_path: local path. When it exists and is a directory the files
                    are downloaded into it
        overwrite: overwrite any existing file or directory
        n_threads: number of threads used for parallelisation. 0 (or a
                   negative value) uses as many threads as there are files
        temp_dir: directory the files are first downloaded to when
                  overwrite=True and the final target already exists; it is
                  swapped in once the download has completed
        **kwargs: keyword arguments forwarded to read(). Without a chunk_size
                  argument a default of 64 kB is used
        return: the local download path
        """
        # Return the path reported by the client; previously it was assigned
        # to a local variable and dropped, so callers always received None.
        return self.client.download(hdfs_path,
                                    local_path,
                                    overwrite=overwrite,
                                    n_threads=n_threads,
                                    temp_dir=temp_dir,
                                    **kwargs)

    def upload_hdfs_file(self,
                         hdfs_path,
                         local_path,
                         n_threads=1,
                         temp_dir=None,
                         chunk_size=65536,
                         progress=None,
                         cleanup=True,
                         **kwargs):
        """ Upload a file or folder (always with overwrite=True).
        hdfs_path: target HDFS path. When it exists and is a directory the
                   files are uploaded into it
        local_path: local path of a file or folder. For a folder all files in
                    it are uploaded (folders without files are therefore not
                    created remotely)
        cleanup: delete all uploaded files when an error occurs during upload
        **kwargs: accepted for signature compatibility; not forwarded
        return: tuple (status code, remote upload path, error message);
                (0, path, '') on success and (1, '', message) on HdfsError
        """
        try:
            res = self.client.upload(hdfs_path,
                                     local_path,
                                     n_threads=n_threads,
                                     temp_dir=temp_dir,
                                     chunk_size=chunk_size,
                                     progress=progress,
                                     cleanup=cleanup,
                                     overwrite=True)
            return 0, res, ''
        except HdfsError as e:
            return 1, '', str(e)

    def makedirs(self, hdfs_path, permission=None):
        """ Create a directory, recursively if needed.
        permission: octal permission set on newly created directories; only
                    applied to directories that do not exist yet
        return: None
        """
        self.client.makedirs(hdfs_path, permission=permission)

    def parts(self, hdfs_path, parts=None, status=False):
        """ Return the part files of a path.
        hdfs_path: remote path. The directory should hold at most one part
                   file per partition (otherwise one is picked arbitrarily)
        parts: list of part-file numbers or total number of part files to
               select. For a number, that many partitions are picked at
               random. By default all part files are returned. An HdfsError
               is raised when parts is a list and one of the parts is not
               found, or when too many samples are requested
        status: also return the FileStatus of the files
        return: dictionary of the part files that belong to the path
        """
        return self.client.parts(hdfs_path, parts=parts, status=status)

    def read_hdfs_file(self, **kwds):
        """ Read file contents. The returned object must be used in a with
        block so that the connection is closed:
        >>> with client.read_hdfs_file(hdfs_path='foo') as reader:
        >>>     content = reader.read()
        hdfs_path: HDFS path
        offset: starting byte position
        length: number of bytes to process. None reads the whole file
        buffer_size: size in bytes of the buffer used to transfer the data.
                     Defaults to the value set in the HDFS configuration
        encoding: encoding used to decode the request. By default the raw
                  data is returned
        chunk_size: when set to a positive number the context manager returns
                    a generator yielding chunk_size bytes at a time instead of
                    a file-like object (unless a delimiter is also set)
        delimiter: when set, the context manager returns a generator that
                   yields each time the delimiter is met. Requires an encoding
        progress: callback used to track progress, called every chunk_size
                  bytes (not available without a chunk size). It receives the
                  path of the file and the number of bytes transferred so far;
                  on completion it is called once with -1 as second argument
        """
        # Hand the context manager back to the caller; without the return
        # the read could never be used.
        return self.client.read(**kwds)

    def write_hdfs_file(self,
                        hdfs_path,
                        data=None,
                        overwrite=False,
                        permission=None,
                        blocksize=None,
                        replication=None,
                        buffersize=None,
                        append=False,
                        encoding=None):
        """ Create a file on HDFS.
        data: content to write. A string, a generator or a file object. The
              last two allow streaming uploads (the whole content does not
              have to be loaded into memory). When None, a file-like object
              is returned that should be used in a with block (see below)
        permission: octal permission set on the newly created file
        append: append to the file instead of creating a new one
        encoding: encoding used to serialise the written data
        >>> from json import dump, dumps
        >>> records = [
        >>>     {'name': 'foo', 'weight': 1},
        >>>     {'name': 'bar', 'weight': 2},
        >>> ]
        >>> # As a context manager:
        >>> with client.write('data/records.jsonl', encoding='utf-8') as writer:
        >>>     dump(records, writer)
        >>> Or, passing in a generator directly:
        >>> client.write('data/records.jsonl', data=dumps(records), encoding='utf-8')
        """
        # The result must be returned: with data=None it is the writer object
        # the caller has to use in a with block.
        return self.client.write(hdfs_path,
                                 data=data,
                                 overwrite=overwrite,
                                 permission=permission,
                                 blocksize=blocksize,
                                 replication=replication,
                                 buffersize=buffersize,
                                 append=append,
                                 encoding=encoding)

    def rename_or_move(self, hdfs_src_path, hdfs_dst_path):
        """ Move a file or directory.
        hdfs_src_path: source path
        hdfs_dst_path: target path. When it exists and is a directory the
                source is moved into it. An HdfsError is raised when it exists
                and is a file, or when the parent target directory is missing
        """
        self.client.rename(hdfs_src_path, hdfs_dst_path)

    def set_owner(self, hdfs_path, owner=None, group=None):
        """ Change the owner of a file; at least one of owner and group is required.
        owner: optional, new owner of the file
        group: optional, new owning group of the file
        """
        self.client.set_owner(hdfs_path, owner=owner, group=group)

    def set_permission(self, hdfs_path, permission):
        """ Change the permissions of a file.
        permission: new octal permission string of the file
        """
        self.client.set_permission(hdfs_path, permission)

    def set_replication(self, hdfs_path, replication):
        """ Set the replication of a file.
        replication: number of replicas
        """
        self.client.set_replication(hdfs_path, replication)

    def set_times(self, hdfs_path, access_time=None, modification_time=None):
        """ Change the timestamps of a file.
        """
        self.client.set_times(hdfs_path,
                              access_time=access_time,
                              modification_time=modification_time)

    def status_hdfs_file(self, hdfs_path, strict=True):
        """ Return the FileStatus of a file.
        strict: when False, None is returned instead of raising when the
                path does not exist
        """
        # Return the status; previously the result was discarded.
        return self.client.status(hdfs_path, strict=strict)

예제 #18

0

파일 보기

파일: hdfsClient.py 프로젝트: clonegod/bigdata

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from hdfs.client import Client


# Read an HDFS file and return its lines as a list.
def read_hdfs_file(client, filename):
    """Return every line of ``filename`` with surrounding whitespace stripped.

    The file is read through ``client.read`` as UTF-8 text split on '\\n'.
    """
    with client.read(filename, encoding='utf-8', delimiter='\n') as reader:
        return [raw.strip() for raw in reader]


# Script entry point: list /user and print the start of one Hive table file.
if __name__ == '__main__':
    client = Client("http://127.0.0.1:50070/")
    # status=True asks list() to include the status of every entry.
    print("hdfs中的目录为:", client.list(hdfs_path="/user", status=True))
    # Only the first 200 bytes are requested, decoded as UTF-8.
    with client.read("/user/hive/warehouse/test.db/t_tmp1/tmp1.txt", length=200, encoding='utf-8') as obj:
        # Print whatever the reader yields on iteration (presumably one line
        # at a time - confirm).
        for i in obj:
            print(i)

예제 #19

0

파일 보기

    def Convert_file(self, p_srcfileType, p_srcfilename, p_dstfileType, p_dstfilename):
        """Copy a file between in-memory, local and HDFS file systems.

        p_srcfileType / p_dstfileType: "MEM", "FS" or "HDFS" (case-insensitive).
        p_srcfilename / p_dstfilename: file name inside the file system.  For
            HDFS the full URL ``<protocol>://<host:port>/<dir>/<file>`` is
            expected; it is split into client URL, root directory and file name.

        Raises SQLCliException when either type is not supported, or when an
        HdfsError occurs (only the first line of its message is kept unless
        SQLCLI_DEBUG is set in the environment).
        """

        def _resolve(p_fileType, p_filename):
            # Map a (type, name) pair to (fs type, file system handle, file name).
            m_FSType = p_fileType.upper()
            if m_FSType == "MEM":
                # The in-memory file system is created once and then shared.
                if self.g_MemoryFSHandler is None:
                    self.g_MemoryFSHandler = fs.open_fs('mem://')
                return m_FSType, self.g_MemoryFSHandler, p_filename
            if m_FSType == "FS":
                return m_FSType, fs.open_fs('./'), p_filename
            if m_FSType == "HDFS":
                m_Protocal = p_filename.split("://")[0]
                m_NodePort = p_filename[len(m_Protocal) + 3:].split("/")[0]
                m_WebFSURL = m_Protocal + "://" + m_NodePort
                m_WebFSDir, m_FileName = os.path.split(p_filename[len(m_WebFSURL):])
                return m_FSType, Client(m_WebFSURL, m_WebFSDir, proxy=None, session=None), m_FileName
            # Source and destination share this marker so the check below
            # catches both; the destination used to get a different string
            # and therefore slipped through without any error.
            return "Not Supported", None, None

        try:
            m_srcFSType, m_srcFS, m_srcFileName = _resolve(p_srcfileType, p_srcfilename)
            m_dstFSType, m_dstFS, m_dstFileName = _resolve(p_dstfileType, p_dstfilename)

            if m_srcFSType == "Not Supported" or m_dstFSType == "Not Supported":
                raise SQLCliException("Not supported convert. From [" + p_srcfileType + "] to [" + p_dstfileType + "]")

            if m_srcFSType == "HDFS":
                # Client.read takes no mode argument; a second positional
                # value would be interpreted as the byte offset.
                m_srcContext = m_srcFS.read(m_srcFileName)
            else:
                m_srcContext = m_srcFS.openbin(m_srcFileName, "r")

            with m_srcContext as m_reader:
                if m_dstFSType == "HDFS":
                    # HDFS writes are committed every 80M to avoid running
                    # out of memory: the first batch creates/overwrites the
                    # file and later batches are appended.
                    bHeaderWrite = True
                    while True:
                        m_Contents = m_reader.read(8192 * 10240)
                        if len(m_Contents) == 0:
                            break
                        if bHeaderWrite:
                            with m_dstFS.write(m_dstFileName, overwrite=True) as m_writer:
                                m_writer.write(m_Contents)
                            bHeaderWrite = False
                        else:
                            with m_dstFS.write(m_dstFileName, append=True) as m_writer:
                                m_writer.write(m_Contents)
                else:
                    with m_dstFS.openbin(m_dstFileName, "w") as m_writer:
                        while True:
                            m_Contents = m_reader.read(8192)
                            if len(m_Contents) == 0:
                                break
                            m_writer.write(m_Contents)
        except HdfsError as he:
            # HDFS reports the whole stack trace, so by default only the
            # first line of the message is passed on.
            if "SQLCLI_DEBUG" in os.environ:
                raise SQLCliException(he.message)
            else:
                raise SQLCliException(he.message.split('\n')[0])

예제 #20

0

파일 보기

# Accumulators for the plots built by this script.
lines1 = []
lines2 = []
lines3 = []
lines4 = []
client = Client("http://222.27.166.215:50070")
# Running totals; the visible code only updates g, a and b.
a = 0
b = 0
c = 0
d = 0
e = 0
f = 0
g = 0
h = 0

############   bar chart ("tiaoxingtu")   ###############
with client.read("/home/spark-test/picture_data/part-00000",
                 encoding='utf-8') as reader:
    for line in reader:
        # Strip the quotes and parentheses around each record, then split
        # it into comma-separated fields.
        line = line.replace("'", "")
        line = line.replace("(", "")
        line = line.replace(")", "")
        # NOTE(review): `lines` is not defined in the visible code (only
        # lines1..lines4 are) - confirm it is created elsewhere.
        lines.append(line.split(","))
df = pd.DataFrame(data=lines)
plt.figure(figsize=(10, 15))
# Column 1 is converted to int; column 0 is compared as float below.
df[1] = df[1].astype(int)
# Sum column 1 per range of column 0: g for < 4, a for [4, 5), b for [5, 6).
for i in range(0, len(df)):
    if float(df.iloc[i][0]) < 4:
        g = g + df.iloc[i][1]
    if float(df.iloc[i][0]) >= 4 and float(df.iloc[i][0]) < 5:
        a = a + df.iloc[i][1]
    if float(df.iloc[i][0]) >= 5 and float(df.iloc[i][0]) < 6:
        b = b + int(df.iloc[i][1])

예제 #21

0

파일 보기

파일: Test002.py 프로젝트: liupei0210/component

from component.spark.ProcessDriver import ProcessDriver
from hdfs.client import Client
import xml.etree.ElementTree as ET
# Load an XML template from HDFS and start a ProcessDriver with its settings.
client = Client("http://172.18.130.100:50070")
with client.read("/liupei/test/template.xml") as fs:
    # NOTE(review): `list` and `map` shadow the builtins, and `pd` is the
    # usual pandas alias - consider renaming.
    list = []
    key = ""
    value = ""
    #tree=ET.parse("/home/liupei/test/template.xml")
    # Parse straight from the HDFS reader.
    tree = ET.parse(fs)
    root = tree.getroot()
    appName = root.attrib["appName"]
    #print(appName)
    # Each child of the root becomes one dict built from its <key>/<value>
    # elements; a <value> is stored under the most recently seen <key>.
    for childs in root:
        map = {}
        for child in childs:
            if child.tag == "key":
                key = child.text
            elif child.tag == "value":
                value = child.text
                map[key] = value
        list.append(map)
    #print(list)
    pd = ProcessDriver(appName, list)
    pd.start()

예제 #22

0

파일 보기

from ProcessDriver import ProcessDriver
from hdfs.client import Client
import xml.etree.ElementTree as ET
import sys
# Usage: argv[1] is the client URL, argv[2] the HDFS path of the XML template.
client = Client(sys.argv[1])
with client.read(sys.argv[2]) as fs:
    # NOTE(review): `list`, `map` and `type` shadow the builtins - consider
    # renaming.
    list = []
    key = ""
    value = ""
    # Parse straight from the HDFS reader.
    tree = ET.parse(fs)
    root = tree.getroot()
    appName = root.attrib["appName"]
    # Each child of the root becomes one dict built from its <key>/<value>
    # elements; a <value> is stored under the most recently seen <key>.
    for childs in root:
        map = {}
        for child in childs:
            if child.tag == "key":
                key = child.text
            elif child.tag == "value":
                value = child.text
                map[key] = value
        list.append(map)
    # The "type" entry of the first dict selects which start method is called;
    # any other value starts nothing.
    type = list[0]["type"]
    pd = ProcessDriver(appName, list)
    if (type == "core"):
        pd.startCore()
    elif (type == "sql"):
        pd.startSQL()
    elif (type == "gpsql"):
        pd.startgpSQL()

예제 #23

0

파일 보기

파일: HDFS.py 프로젝트: hm631379593/Hadoop

from hdfs.client import Client
client = Client(
    "http://host6.cloud.sinocbd.com:50070/")  # 50070: Hadoop default namenode port
# Has no effect in a script; dir() only shows the attributes when run
# interactively.
dir(client)
# Methods used here:
# walk() is similar to os.walk; it also returns (path, directory names,
#        file names) elements, one per level of the iteration.
# read() is similar to file.read; according to the official documentation
#        client.read must be used inside a with block:
# path=[]
# for i in client.walk('/tempfiles/temp',depth=1):
#     for item in i:
#      path.append(item)
#      print(item)
# print(path)
# Read the whole CSV decoded as GBK and print it.
with client.read('/tempfiles/1.csv', encoding='gbk') as fs:
    content = fs.read()
    print(content)

예제 #24

0

파일 보기

# list() lists the entries of the given HDFS path; it takes two parameters
# (the path and whether to include each entry's status).
print("hdfs中的目录为:", client.list(hdfs_path="/test1", status=True))

# read()
# Reads file contents, similar to `hdfs dfs -cat hdfs_path`. Parameters:
# hdfs_path    HDFS path
# offset       starting byte position
# length       number of bytes to read
# buffer_size  size in bytes of the transfer buffer; defaults to the value
#              set in the HDFS configuration
# encoding     encoding used to decode the data
# chunk_size   when positive, a generator yielding chunk_size bytes at a time
#              is returned instead of a file-like object
# delimiter    yield each time the delimiter is met; requires an encoding
# progress     progress callback, called once per chunk_size bytes read

# read() is a context manager and has to be used in a with block; printing
# the call itself only shows the context-manager object, not the contents.
with client.read(
        "/test1/part-00000-15b6e708-1025-408b-a6f2-1f37a7fe7064-c000.csv"
) as reader:
    print(reader.read())

# Download
print(
    "下载文件结果part.csv:",
    client.download(
        hdfs_path=
        "/test1/part-00000-15b6e708-1025-408b-a6f2-1f37a7fe7064-c000.csv",
        local_path="/home/yyj2020/test",
        overwrite=True))

# Pack the downloaded files into a gzip-compressed tar archive.  The with
# block closes the archive, so no explicit close() is needed.
with tarfile.open("/home/yyj2020/test/tartest.tar.gz", "w:gz") as tar:
    tar.add("/home/yyj2020/test/files",
            arcname=os.path.basename("/home/yyj2020/test/files"))

예제 #25

0

파일 보기

 def test_something(self):
     """Read the XML template from HDFS and print its raw content."""
     hdfs = Client("http://172.18.130.100:50070")
     with hdfs.read("/liupei/test/template.xml") as remote:
         payload = remote.read()
         print(payload)