Example #1
def get_hdfs_client(env="local"):
    master, slave = get_env(env)
    try:
        client = Client(master)
        client.list("/")
    except HdfsError:
        client = Client(slave)
    return client
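A minimal usage sketch of the function above, assuming get_env returns the WebHDFS URLs of the active and standby NameNodes:

# Hypothetical call: falls back to the standby NameNode when the first one raises HdfsError.
client = get_hdfs_client(env="local")
print(client.list("/"))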
Example #2
def test_hdfs_files():
    project = utils.get_test_project()
    head_ip = project.cluster.head.ip
    hdfs = Client('http://%s:50070' % head_ip)

    root_dirs = hdfs.list('/')
    assert 'spark' in root_dirs

    spark_dirs = hdfs.list('/spark')
    assert 'spark-1.4.1-bin-hadoop2.6.tgz' in spark_dirs
Example #4
def test_hdfs_dirs():
    project = utils.get_test_project()
    head_ip = project.cluster.head.ip
    hdfs = Client('http://%s:50070' % head_ip)

    users_dirs = hdfs.list('/user')
    assert 'hive' in users_dirs
    assert 'impala' in users_dirs

    users_dirs = hdfs.list('/user/hive')
    assert 'warehouse' in users_dirs
Example #6
def test_hdfs_dirs():
    project = utils.get_test_project()
    nn_ip = project.cluster.head.ip

    hdfs = Client("http://%s:50070" % nn_ip)
    assert hdfs

    root_dirs = hdfs.list("/")
    assert "tmp" in root_dirs
    assert "user" in root_dirs

    users_dirs = hdfs.list("/user")
    assert project.settings["USERNAME"] in users_dirs
Example #7
def test_hdfs_dirs():
    project = utils.get_test_project()
    nn_ip = project.cluster.head.ip

    hdfs = Client('http://%s:50070' % nn_ip)
    assert hdfs

    root_dirs = hdfs.list('/')
    assert 'tmp' in root_dirs
    assert 'user' in root_dirs

    users_dirs = hdfs.list('/user')
    assert project.settings['USERNAME'] in users_dirs
Example #8
def read(dir_path, header):
    client = Client("http://127.0.0.1:50070")
    log_data = []
    for date_dir in client.list(dir_path):
        for log_file in client.list(dir_path+'/'+date_dir):
            with client.read(dir_path+'/'+date_dir+'/'+log_file) as fs:
                for line in fs:
                    row = line.strip().split('&')
                    if row != ['']:
                        tmp = []
                        for field in row:
                            tmp.append(field.split('=')[1])
                        log_data.append(tmp)
    return pd.DataFrame(log_data, columns=header)
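The loop above assumes each log line is a set of '&'-separated key=value pairs; a small local sketch of the same parsing step, using a made-up sample line and column names:

import pandas as pd

# Hypothetical sample line in the expected format (field names are invented).
sample = "uid=42&page=home&ts=1650000000"
row = [field.split('=')[1] for field in sample.strip().split('&')]
print(pd.DataFrame([row], columns=["uid", "page", "ts"]))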
Example #9
    def get(self, request):
        _hdfsName = request.GET.get("hdfsName",
                                    "46eccfa2-1c56-11e8-a752-1008b1983d21")
        _hdfsPath = os.path.join("/datahoop/", _hdfsName)
        # print(_hdfsPath)

        try:
            # Connect to HDFS and read the file
            cli = Client(settings.HDFS_HOST)
            fileName = cli.list(_hdfsPath)[1]
            # print("filename:", fileName)
            _hdfsPath = os.path.join(_hdfsPath + "/", fileName)
            # print(_hdfsPath)
            try:
                with cli.read(_hdfsPath, length=2000, encoding="gbk") as f:
                    datas = f.read()
            except UnicodeDecodeError:
                with cli.read(_hdfsPath, length=2000, encoding="utf8") as f:
                    datas = f.read()
            # Normalize line endings and split the text into rows
            datas = re.sub("\r\n", "\n", datas)
            logger.debug(datas)
            datas = datas.strip('"').split('\n')
            content = []
            for i in datas:
                content.append(i.strip('"').split(","))
        except HdfsError:
            return Response(data={"error": "文件未找到或文件编码格式不符合"},
                            status=status.HTTP_400_BAD_REQUEST)

        return Response(data={"data": content}, status=status.HTTP_200_OK)
Example #10
def download_parquet_from_hdfs_dir(parquet_dir,
                                   local_dir,
                                   hdfs_ip,
                                   hdfs_port=50070):
    """
    从hdfs批量下载parquet文件到local_path
    :param parquet_dir: parquet文件所在的文件'/data/a.parquet'
    :param local_path: '/data_gen/b.parquet'
    :param hdfs_ip:
    :param hdfs_port:
    :return:
    """
    import os
    from hdfs.client import Client
    client = Client(f'http://{hdfs_ip}:{hdfs_port}')
    parquet_list = client.list(parquet_dir)
    print(parquet_list)
    for p in parquet_list:
        if p.endswith('.parquet'):
            print(f'downloading {os.path.join(parquet_dir, p)}')
            with client.read(os.path.join(parquet_dir, p)) as reader:
                data = reader.read()
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            with open(os.path.join(local_dir, p), 'wb') as f:
                f.write(data)
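A possible way to call it and load the result with pandas (assuming a parquet engine such as pyarrow is installed; the address and directories are placeholders):

import os
import pandas as pd

# Placeholders: adjust the HDFS directory, local directory and NameNode address.
download_parquet_from_hdfs_dir('/data/events', './events', hdfs_ip='10.0.0.1', hdfs_port=50070)
df = pd.concat(pd.read_parquet(os.path.join('./events', name))
               for name in os.listdir('./events') if name.endswith('.parquet'))
print(df.shape)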
Example #11
def run_hdfs_test(conf: ConfigData):
    # the_date = conf.test_date()  # "20181101"
    client = Client(conf.hdfs_ip())  # "http://10.2.201.197:50070"
    # root_path = conf.unzip_dir(is_baoli)     # 'D:/DATA/UNZIP/'
    # dest_dir = conf.hdfs_dir_syb(is_baoli)

    # file_pre = conf.file_pre1()  # "t1_trxrecord_"
    # file_ext = conf.file_ext2()  # "_V2.csv"

    #    client.upload('/shouyinbao/', "/home/testFolder/logflow/bl_shouyinbao/UTF8/20181101/9999100000/t1_trxrecord_20181101_V2.csv", cleanup=True)
    dat = client.list('/', status=False)
    print(dat)
Example #12
 def read_accesslog_from_hdfs(self):
     # The real-time log stream is flushed to storage once every 5 click records
     client = Client("http://localhost:50070")
     file_names = client.list("/hadoop_file")
     ss = ""
     for file_name in file_names:
         with client.read("/hadoop_file/" + file_name,
                          encoding="utf-8") as reader:
             for line in reader:
                 # Skip test data rows
                 if line.startswith("filed1"):
                     continue
                 ss += line
Example #13
def do():
    global csv_path
    client = Client(hdfshost)
    file_list = client.list(csv_path)
    print(file_list)
    for file in file_list:
        if file.endswith(".csv"):
            csv_path = csv_path + file
    # Read the CSV from HDFS and write it locally under the same name
    with open("./异常临界值local.csv", 'w', encoding='GB2312') as local:
        with client.read(csv_path, encoding='GB2312') as hdfs:
            for line in hdfs:
                local.write(line.strip('\n'))
Example #14
 def get_child(client: Client, path: str, f_type: int=3):  # 1 file, 2 dir, 3 any
     a_list = []
     # path = str(pathlib.PosixPath(path).expanduser())
     is_dir = MyHdfsFile.isdir(client, path)
     if is_dir:
         names = client.list(path)
         for a_name in names:
             a_file = str(pathlib.PurePosixPath(path).joinpath(a_name))
             if f_type == 3:
                 a_list.append(a_file)
             elif MyHdfsFile.is_exist(client, a_file, f_type=f_type):
                 a_list.append(a_file)
     return a_list
Example #15
 def generate_files(
     date,
     path="user/hadoop/trajectory/sim_trajectory_per_day/shanghai/%s-%s/%s",
 ):
     year, month, day = date.split('-')
     if date in QuerierParallel.files:
         return
     else:
         client = Client(QuerierParallel.master_hdfs_path,
                         root="/",
                         timeout=100,
                         session=False)
         QuerierParallel.files.update(
             {date: client.list(path % (year, month, day))})
Example #16
def mv_local_to_hdfs(filename):
    '''
    Move the finished file to HDFS.
    '''
    now_time=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
    file_index=int(now_time[11:13])
    if file_index==0:
        file_path_all=getYesterday()
    else:
        file_path_all=now_time[0:10]
    client=Client("http://master:50070")
    if file_path_all not in client.list('/traffLog'):
        os.system('/home/hadoop/hadoop-2.5.2/bin/hadoop fs -mkdir /traffLog/'+file_path_all)
    local_path=get_path_or_buf(filename)
    os.system('/home/hadoop/hadoop-2.5.2/bin/hadoop fs -put '+local_path+' /traffLog/'+file_path_all)
Example #17
def mv_local_to_hdfs():
    '''
    Move the finished file to HDFS.
    '''
    now_time=time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))
    file_index=int(now_time[11:13])
    if file_index==0:
        file_path_all=getYesterday()
    else:
        file_path_all=now_time[0:10]
    client=Client("http://master:50070")
    if file_path_all not in client.list('/traffFile'):
        os.system('/home/hadoop/hadoop-2.5.2/bin/hadoop fs -mkdir /traffFile/'+file_path_all)
    local_path='/usr/local/bro/spool/worker-1/extract_files/*'
    os.system('/home/hadoop/hadoop-2.5.2/bin/hadoop fs -put '+local_path+' /traffFile/'+file_path_all)
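The same mkdir/put steps could also go through the hdfs client API instead of shelling out to hadoop fs; a sketch under that assumption (the helper name is invented):

import glob
from hdfs.client import Client

def mv_local_to_hdfs_via_client(file_path_all):
    # Sketch: client.makedirs/client.upload replace the `hadoop fs -mkdir` and `-put`
    # calls above. upload() takes a concrete file or directory, so the shell glob
    # is expanded with glob.glob here.
    client = Client("http://master:50070")
    if file_path_all not in client.list('/traffFile'):
        client.makedirs('/traffFile/' + file_path_all)
    for path in glob.glob('/usr/local/bro/spool/worker-1/extract_files/*'):
        client.upload('/traffFile/' + file_path_all, path)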
Example #18
class HdfsClient(object):
    def __init__(self, url=None):
        self.url = url
        self.client = Client(url=url)

    def ls(self, path):
        return self.client.list(path)

    def isFile(self, path):
        result = self.client.status(path, strict=False)
        if result:
            return result[TYPE] == FILE
        else:
            return False

    def mkdir(self, path):
        self.client.makedirs(path, permission=777)

    def isDirectory(self, path):
        result = self.client.status(path, strict=False)
        if result:
            return result[TYPE] == DIRECTORY
        else:
            return False

    def upload(self, localSourcePath, remoteDistPath):
        self.client.upload(remoteDistPath, localSourcePath, overwrite=True)

    def download(self, remoteSourcePath, localDistPath):
        self.client.download(remoteSourcePath, localDistPath, overwrite=True)

    def put(self, localSourcePath, remoteDistPath):
        with open(localSourcePath,
                  "r") as reader, self.client.write(remoteDistPath) as writer:
            data = reader.read(FILE_SIZE)
            while data != "":
                writer.write(data)
                data = reader.read(FILE_SIZE)

    def get(self, remoteSourcePath, localDistPath):
        with self.client.read(remoteSourcePath,
                              chunk_size=FILE_SIZE) as reader, open(
                                  localDistPath, "a+") as writer:
            for chunk in reader:
                writer.write(chunk)
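The class above relies on a few module-level names that are not shown; one plausible set of definitions (the exact values are assumptions, not part of the original):

# Assumed constants for the HdfsClient wrapper above: WebHDFS FileStatus reports
# its "type" field as "FILE" or "DIRECTORY", and FILE_SIZE is a transfer chunk size.
TYPE = "type"
FILE = "FILE"
DIRECTORY = "DIRECTORY"
FILE_SIZE = 64 * 1024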
Example #19
def list_files_in_flume_directory(hdfs_address):
    '''
    Description: Lists the files in the /flume HDFS directory.
    Parameters: hdfs_address: the Hadoop master node address (host:port)
    Returns: the name of the single file in the directory, UTF-8 encoded;
             prints a warning instead if more than one file is present.
    '''
    # connect to hdfs
    client = Client('http://' + hdfs_address)

    # list all folders
    files = client.list('/flume')
    files_utf8_encode = [name.encode('utf-8') for name in files]
    if len(files_utf8_encode) > 1:
        print(
            'Please check the data pipeline, the number of files should be 1')
    else:
        print(files_utf8_encode[0])
        return files_utf8_encode[0]
Example #20
    def get(self, request):
        """
         计算结果下载hdfs 文件
        :param request:
        :return:
        """
        hdfsPath = request.GET.get("hdfsPath")
        logger.debug("请求文件:{0}".format(hdfsPath))
        localPath = os.path.join(settings.BASE_DIR, 'media', 'hdfsFile')
        logger.debug("本地存储路径:{0}".format(localPath))
        # 链接HDFS下载文件
        cli = Client(settings.HDFS_HOST)
        logger.debug("HDFS连接{0}".format(cli))
        try:
            fileName = cli.list(hdfsPath)[1]
            # print("filename:", fileName)
            path = os.path.join(hdfsPath, fileName)
            logger.debug("{0} -> {1}".format(path, localPath))
            cli.download(hdfs_path=path, local_path=localPath, overwrite=True)
        except HdfsError:
            return Response(data={"error": "文件未找到"},
                            status=status.HTTP_404_NOT_FOUND)

        return Response(data={"fileName": fileName}, status=status.HTTP_200_OK)
Example #21
 def get_hadoop_connection(cls, host):
     try:
         client = Client(host, root='/', timeout=10000)
         client.list('/')
     except Exception as e:
         try:
             log_handler.log.info('get query data error from hadoop 01 -----{}'.format(e))
             host = host.replace('01', '02')
             client = Client(host, root='/', timeout=10000)
             client.list('/')
         except Exception as e:
             try:
                 log_handler.log.info('get query data error from hadoop 02 -----{}'.format(e))
                 host = host.replace('02', '03')
                 client = Client(host, root='/', timeout=10000)
                 client.list('/')
             except Exception as e:
                 client = None
                 log_handler.log.info('get query data error from hadoop -----{}'.format(e))
     return client
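A usage sketch; HadoopHelper stands in for whatever class defines this classmethod, and the URL is a placeholder that follows the '01'/'02'/'03' naming the code rewrites on failure:

# Hypothetical call: the method retries the 02 and 03 hosts if 01 is unreachable.
client = HadoopHelper.get_hadoop_connection('http://hadoop01:50070')
if client is not None:
    print(client.list('/'))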
Example #22
"""

' a study.__init__.py module '

__author__ = 'steven'

import os
import time

from hdfs.client import Client

client = Client("http://127.0.0.1:50070", root="/", timeout=100)

print(client.makedirs("/test/"))
print(client.status("/test/"))
print(client.list("/test/"))
print(client.delete("/test/", True))
upload_filename = client.upload(
    "/test/" + str(int(round(time.time() * 1000))) + ".pdf", "test.pdf")
print(upload_filename)
download_path = os.path.join(os.path.abspath('.'), 'download/hdfs/')
if not os.path.exists(download_path):
    os.makedirs(download_path, exist_ok=True)
else:
    print(download_path, 'already exists.')
print(
    client.download(
        upload_filename,
        download_path + str(int(round(time.time() * 1000))) + ".pdf"))
print(client.delete(upload_filename))
print(client.delete(upload_filename))
Example #23
def resutlApp(dict_parameters):
    # Step 1: get the dynamic parameters
    print('hello world')
    #    arguments = parse_arguments(sys.argv[1:])
    #    width = arguments['width'][0]
    #    height = arguments['height'][0]
    #    model_path = arguments['model_path'][0]
    #    label_path = arguments['label_path'][0]
    #    testDataset = arguments['testDataset'][0]

    print("接受参数中..................")
    print(type(dict_parameters), dict_parameters)
    print("\n" * 3)

    network = dict_parameters.get('network')
    hdfs_label_ip = dict_parameters.get('hdfslabel')
    img_src = dict_parameters.get('img_src')
    width = dict_parameters.get("width")
    height = dict_parameters.get("height")
    testDataset = dict_parameters.get("testDataset")
    model_path = dict_parameters.get("model_path")
    label_path = dict_parameters.get("label_path")
    print(network, img_src, width, height, model_path, label_path, testDataset)
    print("model path is ", model_path)

    #
    # Step 2: preprocess the test data

    X, ImgFiles = preprocessImageFolder(testDataset, width, height)

    # Step 3: get the predicted class label names
    print('hdfs web interface address is :', hdfs_label_ip)
    client = Client(hdfs_label_ip)
    # client = hdfs.Client("http://172.10.236.21:50070")

    types = client.list(label_path)

    # Step 4: load the model
    tf.logging.info("Loading model...")
    print(" model loading .....", model_path)
    keras.backend.clear_session()
    model = load_model(model_path)
    print(model.summary())
    np.set_printoptions(precision=2, suppress=True)

    # Step 5: run predictions with the model
    print("model predicting ......")

    if 'fcn' in network:
        try:
            print(" test in FCN network structure")
            classes = np.argmax(np.squeeze(model.predict(X)), axis=1)
        except Exception as e:
            print("un expected error").format(e)
    else:
        try:
            print(" test in Dense network structure")
            classes = np.argmax(model.predict(X), axis=1)
        except Exception as e:
            print("un expected error").format(e)
    # print('classes: ', classes)

    # Step 6: return the test results
    back_testResult = {}
    print('types:  ', types)

    print("predicted result in your kerboard floder is: \n")

    for i, index in enumerate(classes):
        key = str(ImgFiles[i])
        value = str(types[index])
        back_testResult.setdefault(key, value)
    # plt.imshow(ImgFiles[i])
    # plt.title(str(types[index]))
    # plt.show()
    print('call bck type ', type(back_testResult))
    # return 'abc'
    return str(back_testResult)
Example #24
 def test_hdfs(self):
     client = Client('http://172.16.2.41:50070', proxy='hive', root='/')
     print(client.list('/tmp/jiajie'))
     with client.read('/tmp/jiajie/birth_names.txt', length=10) as reader:
         data = reader.read()
Example #25
# Set up the connection
client = Client("http://192.168.56.20:50070", root="/", session=False)

# list: get the children of the given path
# print(client.list("/"))

# status: get detailed information about a path
# print(client.status("/", strict=True))

# makedirs: create a directory
# print(client.makedirs("/hello"))

# rename: rename or move
# print(client.rename("/hello","/helloWorld"))

# delete: delete
# print(client.delete("helloWorld"))

# upload: upload data
# client.upload("/",r"C:\Users\Administrator\Desktop\斗破\斗破苍穹.txt")

# download: download
# client.download("/斗破苍穹.txt",r"C:\Users\Administrator\Desktop")

# read: read a file
# with client.read("/斗破苍穹.txt",encoding='GBK') as f:
#     print(f.read())

print(client.list("/"))
Example #26
    loss = percent(nnDF)
    print('loss: ', loss)
    #coefficient of determination ---- 1.0 is the best
    score = model.score(testD, testL)
    print('score: ', score)


if __name__ == '__main__':

    filepath = '/sdbadmin/hadoop/input'
    try:
        client = Client('http://192.168.111.130:50070')
    except Exception as e:
        print(e)

    dirs = client.list(filepath)
    # Pull the HDFS files down to the local machine
    print('there are %d shares' % (len(dirs)))
    '''
    try:
        for i in range(len(dirs)):
            client.download(filepath+'/'+dirs[i],'/opt/share_code_data/'+dirs[i])
    except Exception as e:
        print(e)
    '''
    min_max_scaler = preprocessing.MinMaxScaler()
    DD = pd.DataFrame([])
    for i in range(len(dirs)):
        df = pd.read_csv('/opt/share_code_data/' + dirs[i], index_col=0)
        if len(DD) == len(df) or len(DD) == 0 and len(df) != 0:
            trun = min_max_scaler.fit_transform(
Example #27
def yesterday():
    return today() - datetime.timedelta(days=1)


# Main entry point
if __name__ == '__main__':
    print "监控HDFS......"
    yesterday_datetime_format = yesterday()
    for table in CHECK_TABLE:
        is_success = False
        has_data = False
        content = ""
        try:
            path = ROOT_DIR + table + "/" + str(yesterday_datetime_format)
            client_list = client.list(path, True)
            for i in range(0, len(client_list)):
                if (client_list[i][0].startswith('part-')) and (int(
                        client_list[i][1].get("length")) > 0):
                    has_data = True
                elif client_list[i][0].__eq__("_SUCCESS"):
                    is_success = True

        except Exception, e:
            content = "异常信息:" + str(e) + "<br>" + \
                      str("HDFS路径:") + path

        if (content == "") and (not is_success):
            content = "异常信息:" + table + "相关job运行失败" + "<br>" + \
                      str("HDFS路径:") + path
Example #28
# command line : pip install hdfs
#Permission denied: '/usr/local/anaconda3/lib/python3.7/site-packages/docopt.py'

#try another way like below  and it works
# pip install hdfs --target=/users/home/xzh216/pythonPackage

import sys;
sys.path.append("/users/home/xzh216/pythonPackage/") 
import hdfs
from hdfs.client import Client
client = Client("http://node0:9870/")    
# echo $HADOOP_HOME    
# cat /opt/hadoop/hadoop/etc/hadoop/core-site.xml     --->node0
# node0 is the NameNode hostname; WebHDFS uses port 50070 in Hadoop 2.x and 9870 in 3.x
#print("hdfs:", client.list(hdfs_path="/",status=True))
fileList=client.list("/data/ghcnd/daily/")

#create empty rdd
result = spark.createDataFrame(sc.emptyRDD(),schema_Daily)
path_pre = "hdfs:///data/ghcnd/daily/"
for file in fileList:
    daily_temp= (
        spark.read.format("com.databricks.spark.csv")
        .option("header", "false")
        .option("inferSchema", "false")
        .schema(schema_Daily)
        .load(path_pre+file)
    )  
    result=result.union(daily_temp)

result.rdd.getNumPartitions()    #258
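Since every file shares schema_Daily, a possible simplification is to let Spark read the whole directory (or a glob) in one call instead of unioning file by file; a sketch under that assumption:

# Sketch: Spark's load() accepts a directory or glob path, so the per-file union
# loop above can usually be collapsed into a single read.
result = (
    spark.read.format("com.databricks.spark.csv")
    .option("header", "false")
    .option("inferSchema", "false")
    .schema(schema_Daily)
    .load("hdfs:///data/ghcnd/daily/*")
)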
Example #29
from hdfs.client import Client

client = Client("http://192.168.1.197:50070",
                root="/",
                timeout=100,
                session=False)
print client.list("/topics")
# with client.read("/topics") as reader:
#     print reader.read()
Example #30
#!/usr/bin/python3
#  -*- coding: utf-8 -*-

# hdfs = HDFileSystem(host='hdfs-bizaistca.corp.microsoft.com', port=8020, user='******')
from hdfs.client import Client

hdfs_path = '/user/hadoop/fanyuguang/input/'
local_path = '.'

client = Client("hdfs-bizaistca.corp.microsoft.com:8020/",
                root="/",
                timeout=10000,
                session=False)
result = client.list(hdfs_path, status=False)
print(result)
# client.download(hdfs_path, local_path, overwrite=False)
Example #31
class HDFSClient:
    def __init__(self,
                 url,
                 root=None,
                 user=None,
                 proxy=None,
                 timeout=None,
                 session=None):
        """ 连接hdfs
        url: HDFS名称节点的主机名或IP地址及端口号
        root: 根路径,此路径将作为传递给客户端的所有HDFS路径的前缀
        user: 使用InsecureClient(Base Client),指定访问hdfs的用户;Client使用默认用户dr.who
        proxy: 代理的用户
        timeout: 连接超时,转发到请求处理程序
        session: request.Session实例,用于发出所有请求
        """
        if user:
            self.client = InsecureClient(url, user=user)
        else:
            self.client = Client(url,
                                 root=root,
                                 proxy=proxy,
                                 timeout=timeout,
                                 session=session)

    def list_hdfs_file(self, hdfs_path, status=False):
        """ 返回目录下的文件
        status: 每个文件或目录的属性信息(FileStatus)
        return: 列表中包含元组,每个元组是目录名或文件名和属性信息构成
        """
        return self.client.list(hdfs_path, status=status)

    def walk_hdfs_file(self,
                       hdfs_path,
                       depth=0,
                       status=False,
                       ignore_missing=False,
                       allow_dir_changes=False):
        """ 深度遍历远程文件系统
        hdfs_path: 起始路径。如果该路径不存在,则会引发HdfsError。如果指向文件,则返回的生成器将为空
        depth: 探索的最大深度。0为无限制
        status: 同时返回每个文件或文件夹的相应FileStatus
        ignore_missing: 忽略缺少的嵌套文件夹,而不是引发异常
        allow_dir_changes: 允许更改目录列表以影响遍历
        return: 生成器,返回值参考python的walk函数
        """
        return self.client.walk(hdfs_path,
                                depth=depth,
                                status=status,
                                ignore_missing=ignore_missing,
                                allow_dir_changes=allow_dir_changes)

    def delete_hdfs_file(self, hdfs_path, recursive=False, skip_trash=False):
        """ 删除文件
        recursive: 递归删除文件或目录,默认情况下,如果尝试删除非空目录,此方法将引发HdfsError
        skip_trash: 设置为false时,已删除的路径将被移动到适当的垃圾回收文件夹,而不是被删除
        return: 如果删除成功,则此函数返回True;如果hdfs_path之前不存在文件或目录,则返回False
        """
        return self.client.delete(hdfs_path,
                                  recursive=recursive,
                                  skip_trash=skip_trash)

    def download_hdfs_file(self,
                           hdfs_path,
                           local_path,
                           overwrite=True,
                           n_threads=1,
                           temp_dir=None,
                           **kwargs):
        """ 下载文件
        hdfs_file: HDFS上要下载的文件或文件夹的路径。如果是文件夹,则将下载该文件夹下的所有文件
        local_file: 本地路径。如果它已经存在并且是目录,则文件将在其中下载
        overwrite: 覆盖任何现有文件或目录
        n_threads: 用于并行化的线程数。值为0(或负数)将使用与文件一样多的线程
        temp_dir: 当overwrite = True并且最终目标路径已经存在时,将首先在其下下载文件的目录。下载成功完成后,它将被交换
        **kwargs: 关键字参数转发给read()。如果未传递chunk_size参数,则将使用默认值64 kB
        return: 方法执行成功,将返回本地下载路径
        """
        res = self.client.download(hdfs_path,
                                   local_path,
                                   overwrite=overwrite,
                                   n_threads=n_threads,
                                   temp_dir=temp_dir,
                                   **kwargs)
        return res

    def upload_hdfs_file(self,
                         hdfs_path,
                         local_path,
                         n_threads=1,
                         temp_dir=None,
                         chunk_size=65536,
                         progress=None,
                         cleanup=True,
                         **kwargs):
        """ 上传文件
        hdfs_path: 目标HDFS路径。如果它已经存在并且是目录,则文件将在其中上传
        local_path: 文件或文件夹的本地路径。如果是文件夹,则将上载其中的所有
            文件(请注意,这意味着没有文件的文件夹将不会远程创建)
        cleanup: 如果上传过程中发生错误,删除所有上传的文件
        return: 方法执行成功,将返回状态码,远程上传目录,错误信息
        """
        try:
            res = self.client.upload(hdfs_path,
                                     local_path,
                                     n_threads=n_threads,
                                     temp_dir=temp_dir,
                                     chunk_size=chunk_size,
                                     progress=progress,
                                     cleanup=cleanup,
                                     overwrite=True)
            return 0, res, ''
        except HdfsError as e:
            return 1, '', str(e)

    def makedirs(self, hdfs_path, permission=None):
        """ 创建目录,可以递归
        permission: 在新创建的目录上设置的八进制权限,这些权限将仅在尚不存在的目录上设置
        return: None
        """
        self.client.makedirs(hdfs_path, permission=permission)

    def parts(self, hdfs_path, parts=None, status=False):
        """
        hdfs_path: 远程路径。该目录每个分区最多应包含一个零件文件(否则将任意选择一个文件)
        parts: 零件文件编号列表或要选择的零件文件总数。如果是数字,那么将随机选择那么多分区。 
               默认情况下,将返回所有零件文件。如果部件是列表,但未找到部件之一或需要太多样本,则会引发HdfsError
        status: 返回文件的FileStatus
        return: 返回对应于路径的零件文件的字典
        """
        return self.client.parts(hdfs_path, parts=parts, status=status)

    def read_hdfs_file(self, **kwds):
        """ 读取文件内容,这个方法必须在一个with块中使用,以便关闭连接
        >>> with client.read('foo') as reader:
        >>>     content = reader.read()
        hdfs_path: HDFS路径
        offset: 起始字节位置
        length: 要处理的字节数。设置为None时会读取整个文件
        buffer_size: 用于传输数据的缓冲区大小(以字节为单位)。默认为在HDFS配置中设置的值
        encoding: 用于解码请求的编码。默认情况下,返回原始数据
        chunk_size: 如果设置为正数,则上下文管理器将返回一个生成器,该生成器生成每个chunk_size字节,
                而不是类似文件的对象(除非还设置了定界符)
        delimiter: 如果设置,上下文管理器将在每次遇到定界符时返回生成器。此参数要求指定编码
        progress: 回调函数,用于跟踪进度,称为每个chunk_size字节(如果未指定块大小,则不可用)。
                它将传递两个参数,即要上传的文件的路径和到目前为止已传输的字节数。
                完成后,将以-1作为第二个参数调用一次
        """
        return self.client.read(**kwds)

    def write_hdfs_file(self,
                        hdfs_path,
                        data=None,
                        overwrite=False,
                        permission=None,
                        blocksize=None,
                        replication=None,
                        buffersize=None,
                        append=False,
                        encoding=None):
        """ 在HDFS上创建文件
        data: 要写入的文件内容。 可以是字符串,生成器或文件对象。 最后两个选项将允许流式上传(即无需
              将全部内容加载到内存中)。 如果为None,则此方法将返回类似文件的对象,应使用with块调用
              它(请参见下面的示例)
        permission: 在新创建的文件上设置的八进制权限
        append: 附加到文件而不是创建新文件
        encoding: 用于序列化写入数据的编码
        >>> from json import dump, dumps
        >>> records = [
        >>>     {'name': 'foo', 'weight': 1},
        >>>     {'name': 'bar', 'weight': 2},
        >>> ]
        >>> # As a context manager:
        >>> with client.write('data/records.jsonl', encoding='utf-8') as writer:
        >>>     dump(records, writer)
        >>> Or, passing in a generator directly:
        >>> client.write('data/records.jsonl', data=dumps(records), encoding='utf-8')
        """
        self.client.write(hdfs_path,
                          data=data,
                          overwrite=overwrite,
                          permission=permission,
                          blocksize=blocksize,
                          replication=replication,
                          buffersize=buffersize,
                          append=append,
                          encoding=encoding)

    def rename_or_move(self, hdfs_src_path, hdfs_dst_path):
        """ 移动文件或目录
        hdfs_src_path: 源路径
        hdfs_dst_path: 目标路径,如果路径已经存在并且是目录,则源将移入其中。
                如果路径存在并且是文件,或者缺少父目标目录,则此方法将引发HdfsError
        """
        self.client.rename(hdfs_src_path, hdfs_dst_path)

    def set_owner(self, hdfs_path, owner=None, group=None):
        """ 更改文件的所有者,必须至少指定所有者和组之一
        owner: 可选,文件的新所有者
        group: 可选,文件的新所有组
        """
        self.client.set_owner(hdfs_path, owner=owner, group=group)

    def set_permission(self, hdfs_path, permission):
        """ 更改文件权限
        permission: 文件的新八进制权限字符串
        """
        self.client.set_permission(hdfs_path, permission)

    def set_replication(self, hdfs_path, replication):
        """ 设置文件副本
        replication: 副本数
        """
        self.client.set_replication(hdfs_path, replication)

    def set_times(self, hdfs_path, access_time=None, modification_time=None):
        """ 更改文件的时间戳
        """
        self.client.set_times(hdfs_path,
                              access_time=access_time,
                              modification_time=modification_time)

    def status_hdfs_file(self, hdfs_path, strict=True):
        """ 获取文件的FileStatus
        strict: 如果为False,则返回None,而不是如果路径不存在则引发异常
        """
        self.client.status(hdfs_path, strict=strict)
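A short usage sketch of the wrapper above; the NameNode URL, user and paths are placeholders:

# Hypothetical usage of HDFSClient; adjust the URL, user and paths for your cluster.
hdfs = HDFSClient("http://namenode:50070", user="hadoop")
print(hdfs.list_hdfs_file("/tmp"))
code, remote_path, err = hdfs.upload_hdfs_file("/tmp/demo.csv", "./demo.csv")
if code == 0:
    hdfs.download_hdfs_file("/tmp/demo.csv", "./demo_copy.csv")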
Example #32
from hdfs.client import Client


def con(host):
    conn = Client(host)
    return conn


def ls(host, remote_path):
    conn = con(host)
    list_path = conn.list(remote_path)
    return list_path


def down(host, remote_path, local_path):
    conn = con(host)
    conn.download(remote_path, local_path)


if __name__ == "__main__":
    hdfs_host = "http://192.168.175.231:9870/"
    client = Client(hdfs_host)
    print(client.list("/test/parquet/ns=37082600014020001005/d=20200801"))
    print(len(client.list("/test/parquet/ns=37082600014020001005/d=20200801")))
    # client.download('/test/parquet/ns=37082600014020001005',
    #                 'D:\\PycharmWorkSpace\\demo\\hdfs')
    # local_file_path = "D:\\PycharmWorkSpace\\demo\\hdfs\\ns=37082600014020001005"
    # columns = ['id', 't', 'v', 'vt', 'c', 'u', 's', 'dqsj']
    # df = pa.read_parquet(local_file_path, engine='auto', columns=columns)
    # print(df['dqsj'])
Example #33
from hdfs.client import Client
import os, tarfile

client = Client("http://10.239.1.57:50070")

# list() returns information on all files under the given HDFS path; it takes two parameters
print("Contents of /test1 in HDFS:", client.list(hdfs_path="/test1", status=True))

# read()
# Reads file contents, similar to hdfs dfs -cat hdfs_path. Parameters:
# hdfs_path: HDFS path
# offset: position to start reading from
# length: number of bytes to read
# buffer_size: transfer buffer size; if unset, the HDFS default (100 MB) is used; for large files a big enough buffer makes sort and shuffle faster
# encoding: encoding to use
# chunk_size: yield chunks of chunk_size bytes as a generator; must be used together with encoding
# delimiter: delimiter to split on; must be set together with encoding
# progress: progress callback, invoked once per chunk_size read

with client.read(
        "/test1/part-00000-15b6e708-1025-408b-a6f2-1f37a7fe7064-c000.csv") as reader:
    print(reader.read())

# Download
print(
    "Download result for part.csv:",
    client.download(
        hdfs_path=
        "/test1/part-00000-15b6e708-1025-408b-a6f2-1f37a7fe7064-c000.csv",
        local_path="/home/yyj2020/test",
        overwrite=True))
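A small sketch of the chunked/delimited read mode described in the comments above (same file path as in the example; a delimiter requires an encoding):

# With encoding and delimiter set, the context manager yields one piece of text
# per delimiter hit instead of returning a file-like object.
with client.read(
        "/test1/part-00000-15b6e708-1025-408b-a6f2-1f37a7fe7064-c000.csv",
        encoding="utf-8",
        delimiter="\n") as reader:
    for line in reader:
        print(line)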
Example #34
 def test_request(self):
     client = Client(url="http://10.0.137.24:50070")
     print client.list("/user/cabbage", status=True)
     print client.status("/user/cabbage")
                        "--hdfsManager",
                        help="HDFS manager",
                        required=True)
    args = parser.parse_args()

    sc = SparkContext(appName="EFFECT-LOAD-TO-ES")
    conf = SparkConf()
    hdfs_client = Client(args.hdfsManager)
    hdfsRelativeFilname = args.input
    if hdfsRelativeFilname.startswith("hdfs://"):
        idx = hdfsRelativeFilname.find("/", 8)
        if idx != -1:
            hdfsRelativeFilname = hdfsRelativeFilname[idx:]

    if args.doctype is None:
        document_types = hdfs_client.list(args.input, False)
    else:
        document_types = args.doctype.split(",")

    create_index = True

    for doc_type in document_types:
        doc_type = doc_type.strip()
        input_rdd = sc.sequenceFile(args.input + "/" +
                                    doc_type)  #.partitionBy(args.partitions)

        if doc_type == 'topic' or doc_type == 'post':
            es_write_conf = {
                "es.nodes": args.host,
                "es.port": args.port,
                "es.nodes.discover": "false",