Example #1
def get_hdfs_client(env="local"):
    master, slave = get_env(env)
    try:
        client = Client(master)
        client.list("/")
    except HdfsError:
        client = Client(slave)
    return client

def get_rtree(date,
              hdfs_rtree_path="/user/hadoop/rtree/shanghai/%s-%s/%s.%s",
              local_path="/home/hadoop/data/downloaded/rtree/%s-%s/"):

    year, month, day = date.split('-')
    # make sure the local directory exists on the worker nodes
    path = local_path % (year, month)
    if not os.path.exists(path):
        os.makedirs(path)

    client = Client(QuerierParallel.master_hdfs_path,
                    root="/",
                    timeout=100,
                    session=False)
    client.download(hdfs_rtree_path % (year, month, day, "data"),
                    path,
                    overwrite=True)
    client.download(hdfs_rtree_path % (year, month, day, "index"),
                    path,
                    overwrite=True)

    # construct the rtree
    rtree_properties = index.Property()
    rtree_properties.dat_extension = 'data'
    rtree_properties.idx_extension = 'index'
    rtree = index.Index(path + day, properties=rtree_properties)

    return rtree
Example #3
 def generate_temp_files(need_certificate=NEED_CERTIFICATE):
     if need_certificate:
         with krbcontext(using_keytab=True,
                         keytab_file=KEYTAB_PATH,
                         principal=PRINCIPAL):
             for node in HDFS.NODES:
                 try:
                     hdfs_client = KerberosClient(node)
                     hdfs_client.download(HDFS.REMOTE_PATH,
                                          HDFS.LOCAL_PATH,
                                          n_threads=HDFS.THREAD_NUM)
                 except Exception as err:
                     logging.info(err)
                 else:
                     return
             logging.error("Failed to download remote HDFS file.")
             raise Exception("Failed to download remote HDFS file.")
     else:
         for node in HDFS.NODES:
             try:
                 hdfs_client = Client(node)
                 hdfs_client.download(HDFS.REMOTE_PATH,
                                      HDFS.LOCAL_PATH,
                                      n_threads=HDFS.THREAD_NUM)
             except Exception as err:
                 logging.info(err)
             else:
                 return
         logging.error("Failed to download remote HDFS file.")
         raise Exception("Failed to download remote HDFS file.")
Example #4
    def get(self, request):
        _hdfsName = request.GET.get("hdfsName",
                                    "46eccfa2-1c56-11e8-a752-1008b1983d21")
        _hdfsPath = os.path.join("/datahoop/", _hdfsName)
        # print(_hdfsPath)

        try:
            # connect to HDFS and read the file
            cli = Client(settings.HDFS_HOST)
            fileName = cli.list(_hdfsPath)[1]
            # print("filename:", fileName)
            _hdfsPath = os.path.join(_hdfsPath + "/", fileName)
            # print(_hdfsPath)
            try:
                with cli.read(_hdfsPath, length=2000, encoding="gbk") as f:
                    datas = f.read()
            except UnicodeDecodeError:
                with cli.read(_hdfsPath, length=2000, encoding="utf8") as f:
                    datas = f.read()
            # normalize line endings and convert the text to a list
            datas = re.sub("\r\n", "\n", datas)
            logger.debug(datas)
            datas = datas.strip('"').split('\n')
            content = []
            for i in datas:
                content.append(i.strip('"').split(","))
        except HdfsError:
            return Response(data={"error": "文件未找到或文件编码格式不符合"},
                            status=status.HTTP_400_BAD_REQUEST)

        return Response(data={"data": content}, status=status.HTTP_200_OK)
Example #5
def put_to_hdfs(result_file):
    client = Client("http://192.168.53.30:50070")
    if client.status('/tmp/result.csv', strict=False):
        client.delete('/tmp/result.csv')
    client.upload('/tmp', result_file)
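
The delete-then-upload pattern above can also be collapsed into a single call, since the client's upload method accepts an overwrite flag. A minimal sketch reusing the same WebHDFS address; the function name is just illustrative:

from hdfs.client import Client

def put_to_hdfs_overwrite(result_file):
    # upload into /tmp and replace any existing file with the same name
    client = Client("http://192.168.53.30:50070")
    client.upload('/tmp', result_file, overwrite=True)
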
Example #6
    def get(self, request):
        _hdfsName = request.GET.get("hdfsName",
                                    "46eccfa2-1c56-11e8-a752-1008b1983d21")
        _hdfsPath = os.path.join("/datahoop/", _hdfsName)
        obj = DataSource.objects.get(format_filename=_hdfsName)
        # print(_hdfsPath)
        try:
            # connect to HDFS and read the file
            cli = Client(settings.HDFS_HOST)
            try:
                with cli.read(_hdfsPath, encoding="gbk") as f:
                    datas = f.read()
            except UnicodeDecodeError:
                with cli.read(_hdfsPath, encoding="utf8") as f:
                    datas = f.read()
        except HdfsError:
            return Response(data={"error": "文件未找到或文件编码格式不符合"},
                            status=status.HTTP_400_BAD_REQUEST)

        response = HttpResponse(content_type='text/csv')
        response['Content-Disposition'] = 'attachment; filename={0}'.format(
            obj.file_name)
        response.write(datas)

        return response
Example #7
def download_parquet_from_hdfs_dir(parquet_dir,
                                   local_dir,
                                   hdfs_ip,
                                   hdfs_port=50070):
    """
    从hdfs批量下载parquet文件到local_path
    :param parquet_dir: parquet文件所在的文件'/data/a.parquet'
    :param local_path: '/data_gen/b.parquet'
    :param hdfs_ip:
    :param hdfs_port:
    :return:
    """
    import os
    from hdfs.client import Client
    client = Client(f'http://{hdfs_ip}:{hdfs_port}')
    parquet_list = client.list(parquet_dir)
    print(parquet_list)
    for p in parquet_list:
        if p.endswith('.parquet'):
            print(f'downloading {os.path.join(parquet_dir, p)}')
            with client.read(os.path.join(parquet_dir, p)) as reader:
                data = reader.read()
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            with open(os.path.join(local_dir, p), 'wb') as f:
                f.write(data)
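
A quick usage sketch for the helper above; the NameNode address and the directory names are placeholders rather than values from the original project:

# download every *.parquet file under /data/events into ./events_local
download_parquet_from_hdfs_dir(parquet_dir='/data/events',
                               local_dir='./events_local',
                               hdfs_ip='192.168.0.10',
                               hdfs_port=50070)
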
def run_hive(conf: ConfigData, the_date: str):
    a_client = Client(conf.hdfs_ip())  # "http://10.2.201.197:50070"
    conn = connect(host=conf.hive_ip(),
                   port=conf.hive_port(),
                   auth_mechanism=conf.hive_auth(),
                   user=conf.hive_user())
    cur = conn.cursor()

    print("Start\n")

    the_date = StrTool.get_the_date_str(the_date)  # "20181101"
    # hdfs_dir_bl
    root_path = str(
        pathlib.PurePosixPath(conf.get_hdfs_path()).joinpath(the_date))
    file_name = str(
        pathlib.PurePosixPath(root_path).joinpath(
            conf.get_file_name(the_date)))
    # "/data/posflow/allinpay_utf8_zc/20181101/"
    # 20181101_loginfo_rsp_bl_new.csv
    # 20181101_rsp_agt_bl_new.del
    # 20181101_rxinfo_rsp_bl.txt

    table_name = conf.get_table_name()

    if MyHdfsFile.isfile(a_client, file_name):
        sql = 'LOAD DATA INPATH \'' + file_name + '\' INTO TABLE ' + table_name  # 'test.t1_trxrecprd_v2_zc'
        # '\' OVERWRITE INTO TABLE test.t1_trxrecprd_v2_bl2'
        print("OK" + "  " + sql + "\n")
        cur.execute(sql)  # , async=True)

    cur.close()
    conn.close()
def run_hive(conf: ConfigData, the_date: str):
    client = Client(conf.hdfs_ip())  # "http://10.2.201.197:50070"
    conn = connect(host=conf.hive_ip(),
                   port=conf.hive_port(),
                   auth_mechanism=conf.hive_auth(),
                   user=conf.hive_user())
    cur = conn.cursor()

    the_date = StrTool.get_the_date_str(the_date)  # "20181101"
    root_path = conf.get_data("hdfs_dir_zc")  # "/data/posflow/allinpay_utf8_zc/"
    file_ext3 = conf.get_data("file_ext3")  # _loginfo_rsp.txt          # 20181101_loginfo_rsp.txt
    file_ext4 = conf.get_data("file_ext4")  # _loginfo_rsp_agt.txt      # 20181101_loginfo_rsp_agt.txt
    file_ext5 = conf.get_data("file_ext5")  # _rxinfo_rsp.txt           # 20181101_rxinfo_rsp.txt
    file_ext6 = conf.get_data("file_ext6")  # _rxinfo_rsp_agt.txt       # 20181101_rxinfo_rsp_agt.txt

    print("Start\n")

    file3 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext3))
    file4 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext4))
    file5 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext5))
    file6 = str(pathlib.PurePosixPath(root_path).joinpath(the_date + file_ext6))

    f_list = [file3, file4, file5, file6]
    t_list = ["hive_table3", "hive_table4", "hive_table5", "hive_table6"]

    for n in range(4):
        if MyHdfsFile.isfile(client, f_list[n]):
            sql = 'LOAD DATA INPATH \'' + f_list[n] + '\' INTO TABLE ' + conf.get_data(t_list[n])  # 'test.t1_trxrecprd_v2_zc'
            # '\' OVERWRITE INTO TABLE test.t1_trxrecprd_v2_bl2'
            print("OK" + "  " + sql+"\n")
            cur.execute(sql)  # , async=True)

    cur.close()
    conn.close()
Example #10
def get_client(host, use_kerberos):
    if use_kerberos:
        from hdfs.ext.kerberos import KerberosClient
        return KerberosClient(host)
    else:
        from hdfs.client import Client
        return Client(host)
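
A short usage sketch for this factory; the WebHDFS address is a placeholder, and the Kerberos branch assumes a valid ticket is already available in the environment:

client = get_client('http://namenode:50070', use_kerberos=False)
print(client.list('/'))
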
def get_data(file_path):    
    HDFSUrl = "http://192.168.0.201:50070"
    client = Client(HDFSUrl, root='/')
    with client.read(file_path, buffer_size=1024, delimiter='\n', encoding='utf-8') as reader:
        data = [line.strip().split() for line in reader]
        print("data",data[0:2])
    return data
 def HDFS_cd(self, hdfs_path):
     """ 切换当前目录, 其实就是重新连接了 """
     m_NewDirectory = Path(os.path.join(self.__m_HDFS_WebFSDir__,
                                        hdfs_path)).as_posix()
     self.__m_HDFS_WebFSDir__ = m_NewDirectory
     self.__m_HDFS_Handler__ = Client(self.__m_HDFS_WebFSURL__,
                                      self.__m_HDFS_WebFSDir__,
                                      session=None)
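
The method above emulates cd simply by rebuilding the client with a new root; relative paths passed to an hdfs Client are resolved against that root. A minimal sketch of the idea, with a placeholder address and directory:

from hdfs.client import Client

# subsequent relative paths are resolved under /user/demo
client = Client('http://namenode:50070', root='/user/demo')
print(client.list('data'))  # lists the contents of /user/demo/data
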
Example #13
 def connect(self, host, port):
     conn_url = "http://{}:{}".format(host, port)
     try:
         self.client = Client(conn_url)
         return True
     except Exception as e:
         print("Connect Failed:{}-{}".format(Exception, e))
         return False
Example #14
 def _get_client(self, addr, port):
     if not CACHE or addr not in self._clients:
         cli = Client('http://%s:%s' % (str(addr), str(port)))
         if CACHE:
             self._clients.update({addr: cli})
     else:
         cli = self._clients.get(addr)
     return cli
Example #15
def make_directory(hdfs_address, directory_path, directory_name):
    '''
    Description: This function helps users create a directory in HDFS.
    Parameters: - hdfs_address: Hadoop master node IP address
                - directory_path: the path under which the directory is created
                - directory_name: the directory name
    Returns: None
    '''
    client = Client('http://' + hdfs_address)
    client.makedirs(directory_path + directory_name)
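
A usage sketch for make_directory; the address and names are placeholders. Note that the helper concatenates its two arguments directly, so directory_path needs a trailing slash:

# creates /user/demo/reports on the cluster behind the given NameNode address
make_directory('192.168.0.10:50070', '/user/demo/', 'reports')
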
Example #16
def dataframe_write_to_hdfs(hdfs_path, dataframe):
    """
    :param client:
    :param hdfs_path:
    :param dataframe:
    :return:
    """
    HDFSUrl = "http://192.168.0.201:50070"
    client = Client(HDFSUrl, root='/')    
    client.write(hdfs_path, dataframe.to_csv(header=False,index=False,sep="\t"), encoding='utf-8',overwrite=True)
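
A usage sketch for the writer above; the DataFrame and the target path are made up for illustration:

import pandas as pd

df = pd.DataFrame({'id': [1, 2], 'name': ['a', 'b']})
# rows are written tab-separated, without header or index, as in the helper above
dataframe_write_to_hdfs('/tmp/demo.tsv', df)
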
Example #17
def test_hdfs_files():
    project = utils.get_test_project()
    head_ip = project.cluster.head.ip
    hdfs = Client('http://%s:50070' % head_ip)

    root_dirs = hdfs.list('/')
    assert 'spark' in root_dirs

    spark_dirs = hdfs.list('/spark')
    assert 'spark-1.4.1-bin-hadoop2.6.tgz' in spark_dirs
 def get_hadoop_connection(cls, host):
     try:
         client = Client(host, root='/', timeout=10000)
         client.list('/')
     except Exception as e:
         try:
             log_handler.log.info('get query data error from hadoop 01 -----{}'.format(e))
             host = host.replace('01', '02')
             client = Client(host, root='/', timeout=10000)
             client.list('/')
         except Exception as e:
             try:
                 log_handler.log.info('get query data error from hadoop 02 -----{}'.format(e))
                 host = host.replace('02', '03')
                 client = Client(host, root='/', timeout=10000)
                 client.list('/')
             except Exception as e:
                 client = None
                 log_handler.log.info('get query data error from hadoop -----{}'.format(e))
     return client
Example #19
def test_hdfs_dirs():
    project = utils.get_test_project()
    head_ip = project.cluster.head.ip
    hdfs = Client('http://%s:50070' % head_ip)

    users_dirs = hdfs.list('/user')
    assert 'hive' in users_dirs
    assert 'impala' in users_dirs

    users_dirs = hdfs.list('/user/hive')
    assert 'warehouse' in users_dirs
def save_page_hdfs(ipPort, file_path, contents):
    """保存网页源码到hdfs

    :param ipPort: hdfs连接地址
    :param file_path: 文件路径
    :param contents: 网页内容
    :return: None
    """
    client = Client(ipPort)
    with client.write(file_path) as writer:
        writer.write(bytes(contents, encoding='utf8'))
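
A usage sketch for save_page_hdfs; the address, path, and page content are placeholders:

html = "<html><body>hello</body></html>"
# the page source is encoded as UTF-8 bytes and written to the given HDFS path
save_page_hdfs('http://namenode:50070', '/crawler/pages/demo.html', html)
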
def get_data_hdfs(file_path):
    HDFSUrl = "http://192.168.0.201:50070"
    client = Client(HDFSUrl, root='/')
    with client.read(file_path,
                     buffer_size=1024,
                     delimiter='\n',
                     encoding='utf-8') as reader:
        data = [line.strip().split(',') for line in reader]
        print("data", data[0:5])
    df = pd.DataFrame(data[1:], columns=data[0])
    return df
 def hdfs_file2points(path):
     client = Client(QuerierParallel.master_hdfs_path,
                     root="/",
                     timeout=100,
                     session=False)
     points = []
     with client.read(path) as f:
         for line in f:
             info = line.strip('\n').split('\t')
             points.append([float(info[0]), float(info[1])])
     return points
Example #23
def run_hdfs_test(conf: ConfigData):
    # the_date = conf.test_date()  # "20181101"
    client = Client(conf.hdfs_ip())  # "http://10.2.201.197:50070"
    # root_path = conf.unzip_dir(is_baoli)     # 'D:/DATA/UNZIP/'
    # dest_dir = conf.hdfs_dir_syb(is_baoli)

    # file_pre = conf.file_pre1()  # "t1_trxrecord_"
    # file_ext = conf.file_ext2()  # "_V2.csv"

    #    client.upload('/shouyinbao/', "/home/testFolder/logflow/bl_shouyinbao/UTF8/20181101/9999100000/t1_trxrecord_20181101_V2.csv", cleanup=True)
    dat = client.list('/', status=False)
    print(dat)
Example #24
 def read_accesslog_from_hdfs(self):
     # the real-time log stream is written to storage once every 5 click records
     client = Client("http://localhost:50070")
     file_names = client.list("/hadoop_file")
     ss = ""
     for file_name in file_names:
         with client.read("/hadoop_file/" + file_name,
                          encoding="utf-8") as reader:
             for line in reader:
                 # skip test records
                 if line.startswith("filed1"):
                     continue
                 ss += line
def do():
    global csv_path
    client = Client(hdfshost)
    file_list = client.list(csv_path)
    print(file_list)
    for file in file_list:
        if file.endswith(".csv"):
            csv_path = csv_path + file
    # read the CSV from HDFS and write a local copy with the same name
    with open("./异常临界值local.csv", 'w', encoding='GB2312') as local:
        with client.read(csv_path, encoding='GB2312') as hdfs:
            for line in hdfs:
                local.write(line.strip('\n'))
Example #26
def test_hdfs_dirs():
    project = utils.get_test_project()
    nn_ip = project.cluster.head.ip

    hdfs = Client('http://%s:50070' % nn_ip)
    assert hdfs

    root_dirs = hdfs.list('/')
    assert 'tmp' in root_dirs
    assert 'user' in root_dirs

    users_dirs = hdfs.list('/user')
    assert project.settings['USERNAME'] in users_dirs
 def generate_files(
     date,
     path="user/hadoop/trajectory/sim_trajectory_per_day/shanghai/%s-%s/%s",
 ):
     year, month, day = date.split('-')
     if date in QuerierParallel.files:
         return
     else:
         client = Client(QuerierParallel.master_hdfs_path,
                         root="/",
                         timeout=100,
                         session=False)
         QuerierParallel.files.update(
             {date: client.list(path % (year, month, day))})
Example #28
def read(dir_path, header):
    client = Client("http://127.0.0.1:50070")
    log_data = []
    for date_dir in client.list(dir_path):
        for log_file in client.list(dir_path+'/'+date_dir):
            with client.read(dir_path+'/'+date_dir+'/'+log_file) as fs:
                for line in fs:
                    row = line.strip().split('&')
                    if row != ['']:
                        tmp = []
                        for field in row:
                            tmp.append(field.split('=')[1])
                        log_data.append(tmp)
    return pd.DataFrame(log_data, columns=header)
Example #29
def read_corpus():
    qList = []
    # keyword list for each question
    qList_kw = []
    aList = []
    lines = []
    client = Client("http://localhost:50070")
    with client.read("/corpus/q_a.csv", encoding='utf-8') as reader:
        for line in reader:
            lines.append(line.strip())
    for t in lines:
        # each CSV line is assumed to hold a question and an answer, comma-separated
        q, a = t.split(',', 1)
        qList.append(q)
        qList_kw.append(seg.cut(q))
        aList.append(a)
    return qList_kw, qList, aList
Example #30
def hdfs_file_compressor(file_path, local_path, model_id, dir_path):

    try:
        file_path_dict = file_path.split('/', 3)[-1]
        # connect to HDFS with the Python hdfs client
        client = Client(hdfs_url, root="/", timeout=10000, session=False)
        # fetch the remote HDFS files
        get_from_hdfs(client=client,
                      hdfs_path=file_path_dict,
                      local_path=local_path)
        file_list = file_name(local_path)

        for file in file_list:

            separated_file_name = file.split('/')

            if separated_file_name[-2] == 'model_file%s' % model_id:
                continue

            file_name_local = separated_file_name[-1]
            local_path_file = local_path + "/" + file_name_local

            model = compress(load_model(file),
                             acceptable_error=0.001)  # compress the .h5 file
            model.save(local_path_file)  # write out the compressed file

            local_path_folder = local_path + "/" + separated_file_name[-2]
            shutil.rmtree(local_path_folder)
            # upload the compressed file
            put_to_hdfs(client=client,
                        local_path=local_path,
                        hdfs_path=file_path_dict)
            compress_file_path = file_path + '/model_file%s' % model_id

            sql = update_sql % (compress_file_path, model_id)
            compress_file_path_update = mysql.query(dir_path,
                                                    sql,
                                                    work_path='')  # update the database record

    except Exception as e:

        print('output->failed', flush=True)
        print(e)
        # delete the local files and stop here so a failure is not reported as success
        shutil.rmtree(local_path)
        return

    shutil.rmtree(local_path)

    print('output->success', flush=True)