Example #1
def generate_temp_files(need_certificate=NEED_CERTIFICATE):
    if need_certificate:
        with krbcontext(using_keytab=True,
                        keytab_file=KEYTAB_PATH,
                        principal=PRINCIPAL):
            for node in HDFS.NODES:
                try:
                    hdfs_client = KerberosClient(node)
                    hdfs_client.download(HDFS.REMOTE_PATH,
                                         HDFS.LOCAL_PATH,
                                         n_threads=HDFS.THREAD_NUM)
                except Exception as err:
                    logging.info(err)
                else:
                    return
            logging.error("Failed to download remote HDFS file.")
            raise Exception("Failed to download remote HDFS file.")
    else:
        for node in HDFS.NODES:
            try:
                hdfs_client = Client(node)
                hdfs_client.download(HDFS.REMOTE_PATH,
                                     HDFS.LOCAL_PATH,
                                     n_threads=HDFS.THREAD_NUM)
            except Exception as err:
                logging.info(err)
            else:
                return
        logging.error("Failed to download remote HDFS file.")
        raise Exception("Failed to download remote HDFS file.")
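
The function above relies on a few module-level settings and imports that the snippet does not show. Below is a minimal sketch of what they might look like; the names are taken from how the function uses them, and every value is purely illustrative.

import logging

from hdfs import Client                       # plain (insecure) WebHDFS client
from hdfs.ext.kerberos import KerberosClient  # Kerberos-authenticated client
from krbcontext import krbcontext             # obtains a ticket from a keytab

NEED_CERTIFICATE = True                             # whether Kerberos is required
KEYTAB_PATH = "/etc/security/keytabs/user.keytab"   # hypothetical keytab path
PRINCIPAL = "user@EXAMPLE.COM"                      # hypothetical principal


class HDFS:
    # WebHDFS endpoints of the namenodes, tried in order until one succeeds.
    NODES = ["http://namenode1:50070", "http://namenode2:50070"]
    REMOTE_PATH = "/data/remote/file"   # hypothetical HDFS source path
    LOCAL_PATH = "/tmp/file"            # hypothetical local destination
    THREAD_NUM = 4                      # parallel download threads
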
Example #2

# Imports assumed for this fragment: pyhive for HiveServer2, impyla's DB-API
# client, the Kerberos WebHDFS client, and krbcontext for ticket handling.
from pyhive import hive
from impala import dbapi
from hdfs.ext.kerberos import KerberosClient
from krbcontext import krbcontext

# Acquire a Kerberos ticket from the keytab before talking to Hive and HDFS.
with krbcontext(using_keytab=True,
                principal=config['kerberos_principal'],
                keytab_file=config['keytab_file'],
                ccache_file=config['kerberos_cache_file']):
    # hive.Connection()
    con = hive.connect(host='uatnd02.csdntest.com.local',
                       port=10000,
                       auth='KERBEROS',
                       kerberos_service_name="hive"
                       )  # host is the node running HiveServer2; port defaults to 10000, the HS2 port
    cursor = con.cursor()
    cursor.execute('select * from dl_nccp.account limit 5')  # no trailing semicolon!
    # cursor.execute('desc dl_nccp.account')  # no trailing semicolon!
    datas = cursor.fetchall()
    print(datas)
    cursor.close()
    con.close()

    conn = dbapi.connect(host='uatnd02.csdntest.com.local',
                         port=10000,
                         auth_mechanism='GSSAPI',
                         kerberos_service_name="hive")
    cursor = conn.cursor()

    # HDFS over Kerberos: the same ticket also authenticates the WebHDFS client.
    client = KerberosClient('http://hdfs_ip:50070', hostname_override="hdfs hostname")
    client.list('/')         # list the root directory
    client.makedirs('test')  # create a directory
    # Other common calls, each of which needs at least an HDFS path:
    # client.delete(hdfs_path), client.upload(hdfs_path, local_path),
    # client.download(hdfs_path, local_path), client._list_status(hdfs_path)
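
The krbcontext call at the top of this example reads its Kerberos settings from a config mapping defined elsewhere. A plausible shape for it, with placeholder values only:

# Illustrative only: the keys come from the snippet above, the values are placeholders.
config = {
    'kerberos_principal': 'hive_user@EXAMPLE.COM',
    'keytab_file': '/etc/security/keytabs/hive_user.keytab',
    'kerberos_cache_file': '/tmp/krb5cc_hive_user',
}
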
Example #3

def check_shapefiles(path_to_zip, zipID):
    import os
    import uuid
    import shutil
    import zipfile
    import numpy
    import hdfs
    import subprocess
    from hdfs.ext.kerberos import KerberosClient
    import pandas as pd
    import geopandas
    import fiona
    from shapely.geometry import mapping, shape

    try:
        subprocess.call([
            "kinit", "*****@*****.**", "-k", "-t",
            "/home/[email protected]/danr.keytab"
        ])
        hdfs_host = "https://m1a.geo.sciclone.wm.edu:14000"

        client_hdfs = KerberosClient(hdfs_host)

        # Initialize both keys up front so the error handlers below can always
        # report them, even if the download itself fails.
        ret_obj = {"error": "", "warning": ""}

        t_id = str(uuid.uuid4())
        try:
            os.mkdir("./temp")
        except OSError:
            pass  # the ./temp directory already exists

        localDest = "./temp/" + t_id + ".zip"
        client_hdfs.download(path_to_zip + "/" + zipID,
                             localDest,
                             overwrite=True)
        ret_obj["error"] = ""
        ret_obj["warning"] = ""

        if (zipfile.is_zipfile(localDest)):

            with zipfile.ZipFile(localDest) as zipObj:
                zipObj.extractall("./tempSparks/" + t_id + "/")
        else:
            ret_obj["error"] = "The zip file failed to extract."
            return [ret_obj["error"], ret_obj["warning"]]

    except Exception:
        ret_obj[
            "error"] = "Something went really, really wrong here.  Probably a corrupt zip, but could be lots of things... |"
        return [ret_obj["error"], ret_obj["warning"]]

    #Ok!  File is now extracted and available at path locally:
    #"./tempSparks/" + t_id + "/"

    #Now let's load the metadata for this file in...:

    #Let's actually do some file checks, recording any errors we find to the appropriate dataframe row.
    try:
        #Load into geopands
        shp = geopandas.read_file("./tempSparks/" + t_id + "/")

        if (not (shp.crs["init"] == "epsg:4326")):
            ret_obj["error"] = ret_obj[
                "error"] + 'Projection was incorrect.  EPSG must be 4326. | '
        tol = 1e-12
        xmin = shp.bounds["minx"].values[0]
        ymin = shp.bounds["miny"].values[0]
        xmax = shp.bounds["maxx"].values[0]
        ymax = shp.bounds["maxy"].values[0]

        valid = ((xmin >= -180 - tol) and (xmax <= 180 + tol)
                 and (ymin >= -90 - tol) and (ymax <= 90 + tol))
        if not valid:
            ret_obj["error"] = ret_obj[
                "error"] + "Bounds appear to be in another castle. xmin: {0}, xmax: {1}, ymin: {2}, ymax: {3} | ".format(
                    xmin, xmax, ymin, ymax)

        #Fiona checks
        shp = fiona.open("./tempSparks/" + t_id + "/")
        valid = True
        error = None
        fixed = []
        for feature in shp:
            raw_shape = shape(feature['geometry'])
            valid = raw_shape.is_valid
            if valid:
                fixed.append(feature)
            if not valid:
                fixed_shape = raw_shape.buffer(0)
                fix_valid = fixed_shape.is_valid
                if fix_valid and error is None:
                    ret_obj[
                        "warning"] = "There is a minor issue with this boundary - i.e., a river might cross somewhere it should not.  We can fix it automatically by using a buffer of 0 in shapely, but this message indicates you should look carefully at the file sometime soon."
                    feature["geometry"] = mapping(fixed_shape)
                    fixed.append(feature)
                elif not fix_valid:
                    if error is not None:
                        ret_obj["error"] = ret_obj[
                            "error"] + "An error in the geometry of the file exists that we could not automatically fix. | "
                    else:
                        ret_obj["error"] = ret_obj[
                            "error"] + "A really bad error in the geometry of the file exists that we could not automatically fix. | "
                    break

        #Clean up temp files
        shutil.rmtree("./tempSparks/" + t_id + "/", ignore_errors=True)
        os.remove("./temp/" + t_id + ".zip")

        return [ret_obj["error"], ret_obj["warning"]]
    except Exception:
        ret_obj[
            "error"] = "Something bad happened while we were trying to unpack the shapefile.  This error indicates it wasn't a specific issue, but rather the entire shapefile seems corrupted (or something equally bad!)"
        return [ret_obj["error"], ret_obj["warning"]]
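
A call to the function above might look like the following sketch; the HDFS directory and zip name are hypothetical.

# Hypothetical paths: the function downloads <path_to_zip>/<zipID> from HDFS,
# validates the extracted shapefile, and returns [error_message, warning_message].
error, warning = check_shapefiles("/user/geo/uploads", "boundary_upload.zip")
if error:
    print("Rejected:", error)
elif warning:
    print("Accepted with a warning:", warning)
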
Example #4
from hdfs.ext.kerberos import KerberosClient


class OperateHDFS:
    def __init__(self, url):
        '''
        :param url: hostname or IP address of the HDFS namenode, prefixed with
            the protocol and followed by the namenode's WebHDFS port; multiple
            URLs separated by semicolons may also be given for high-availability
            support.
        '''
        # Instantiate an HDFS web client using Kerberos authentication
        self.client = KerberosClient(url)

    def file_list(self, file_path):
        '''
        :param file_path: remote HDFS directory path
        :return: a list of all files contained in the remote directory
        '''
        file_detail = self.client.list(hdfs_path=file_path)
        return file_detail

    def file_read(self, file_path):
        '''
        Read a file from HDFS
        :param file_path: remote HDFS file path
        :return: list of the file's lines, stripped of surrounding whitespace
        '''
        lines = []
        with self.client.read(hdfs_path=file_path,
                              encoding='utf-8',
                              delimiter='\n') as reader:
            # content = reader.read()
            # print(content)
            for item in reader:
                lines.append(item.strip())
        return lines

    def file_create_write(self, file_path, data_write):
        '''
        Create a new file in HDFS and write content to it
        :param file_path: remote HDFS file path
        :param data_write: data to write to the file
        :return:
        '''
        self.client.write(hdfs_path=file_path,
                          data=data_write,
                          encoding='utf-8')

    def file_append_write(self, file_path, data_append):
        '''
        Append content to an existing file in HDFS; the file must already exist
        :param file_path: remote HDFS file path
        :param data_append: data to append to the file
        :return:
        '''
        self.client.write(hdfs_path=file_path,
                          data=data_append,
                          encoding='utf-8',
                          append=True)

    def file_rename(self, src_file_path, dst_file_path):
        '''
        Rename/move a file or folder
        :param src_file_path: source file path
        :param dst_file_path: destination file path
        :return:
        '''
        self.client.rename(hdfs_src_path=src_file_path,
                           hdfs_dst_path=dst_file_path)

    def mkdir(self, file_path):
        '''
        Create a remote directory in HDFS, recursively if necessary
        :param file_path: path (including the name) of the directory to create
        :return:
        '''
        self.client.makedirs(hdfs_path=file_path)

    def upload_files(self, file_path, local_path):
        '''
        Upload a file or directory to HDFS
        :param file_path: target HDFS path. If it already exists and is a
            directory, the files will be uploaded into it.
        :param local_path: local path of the file or folder. If it is a folder,
            all of its files will be uploaded (note that this means a folder
            containing no files will not be created remotely).
        :return: hdfs_path_return: on success, this method returns the remote
            upload path.
        '''
        hdfs_path_return = self.client.upload(hdfs_path=file_path,
                                              local_path=local_path)
        return hdfs_path_return

    def download_files(self, file_path, local_path):
        '''
        Download a file or folder from HDFS and save it locally
        :param file_path: path of the file or folder to download from HDFS.
            If it is a folder, all files under it will be downloaded.
        :param local_path: local path. If it already exists and is a directory,
            the files will be downloaded into it.
        :return: local_path_return: on success, this method returns the local
            download path.
        '''
        local_path_return = self.client.download(hdfs_path=file_path,
                                                 local_path=local_path)
        return local_path_return

    def delete_files(self, file_path):
        '''
        Delete a file or directory from HDFS
        :param file_path: path of the file or directory to delete in HDFS
        :return: `True` if the deletion succeeded, `False` if no file or
            directory previously existed at `hdfs_path`.
        '''
        # recursive: recursively delete files and directories. By default this
        #   method raises an HdfsError when asked to delete a non-empty directory.
        # skip_trash: when set to False, deleted paths are moved to the
        #   corresponding trash folder rather than removed; this requires
        #   Hadoop 2.9+ with trash enabled on the cluster.
        return self.client.delete(hdfs_path=file_path,
                                  recursive=False,
                                  skip_trash=True)

    def set_files_permission(self, file_path, permission=None):
        '''
        Change the permissions of a file
        :param file_path: path of the file whose permissions should be changed
        :param permission: new octal permission string for the file
        :return:
        '''
        self.client.set_permission(hdfs_path=file_path, permission=permission)
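
A minimal usage sketch for the class above; the WebHDFS URL and paths are placeholders.

# Placeholders throughout: point the client at your namenode's WebHDFS endpoint.
hdfs_ops = OperateHDFS('http://namenode:50070')
print(hdfs_ops.file_list('/user/demo'))              # list a directory
hdfs_ops.mkdir('/user/demo/new_dir')                 # create a directory
hdfs_ops.file_create_write('/user/demo/hello.txt', 'hello\n')
print(hdfs_ops.file_read('/user/demo/hello.txt'))    # read it back line by line
hdfs_ops.delete_files('/user/demo/hello.txt')
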
Example #5
"""
*******************
*Copyright 2017, MapleLabs, All Rights Reserved.
*
********************
"""

import sys
from hdfs.ext.kerberos import KerberosClient
from hdfs.client import InsecureClient
from requests import Session
from requests_kerberos import HTTPKerberosAuth, DISABLED

session = Session()
session.verify = False
kerberos_auth = HTTPKerberosAuth(mutual_authentication=DISABLED, force_preemptive=True, principal='')
session.auth = kerberos_auth
client = KerberosClient("", session=session)
#client = InsecureClient("", session=session)
file = sys.argv[1]
destfile = sys.argv[2]

print(client.list('/mr-history/done'))

client.download(file, destfile, overwrite=True)
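
The script takes the HDFS source path and the local destination as its two command-line arguments, e.g. python hdfs_download.py /mr-history/done/somefile ./somefile (the script name and paths here are illustrative); the empty principal and client URL above still need to be filled in for a real cluster.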