示例#1
0
文件: hdfs.py 项目: zkkxu/determined
class HDFSTensorboardManager(base.TensorboardManager):
    """
    Upload tfevents files to HDFS through an ``InsecureClient``.
    """
    @util.preserve_random_state
    def __init__(
        self,
        hdfs_url: str,
        hdfs_path: str,
        user: Optional[str] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Create the HDFS client and make sure the sync directory exists.

        :param hdfs_url: URL handed to ``InsecureClient``.
        :param hdfs_path: passed to the client as its ``root``.
        :param user: user name passed to the client; may be ``None``.
        :param args: forwarded to the base class constructor.
        :param kwargs: forwarded to the base class constructor.
        """
        super().__init__(*args, **kwargs)
        self.hdfs_url, self.hdfs_path, self.user = hdfs_url, hdfs_path, user

        hdfs_client = InsecureClient(hdfs_url, root=hdfs_path, user=user)
        self.client = hdfs_client
        hdfs_client.makedirs(str(self.sync_path))

    @util.preserve_random_state
    def sync(self) -> None:
        """
        Upload every path yielded by ``to_sync()`` into the sync directory
        and record the size of each uploaded file.
        """
        for path in self.to_sync():
            # Only the file's base name is kept on the remote side.
            remote_target = self.sync_path.joinpath(path.name)
            logging.debug(f"Uploading {path} to {self.hdfs_path}")
            self.client.upload(str(remote_target), str(path))
            self._synced_event_sizes[path] = path.stat().st_size
示例#2
0
def run_conv_file_local_to_hdfs(configData: ConfigData):
    """
    Process the daily file of every branch directory and upload it to HDFS.

    For each branch directory under ``<data_path>/<date>`` that passes
    ``MyLocalFile.check_branch``, every file accepted by
    ``MyLocalFile.check_file`` is passed through
    ``MyLocalFile.conv_file_local`` into ``<utf8_path>/<date>/<branch>/`` and
    then uploaded to ``<hdfs_path>/<date>/<branch>/`` unless a remote entry
    already exists there.

    :param configData: configuration object supplying the date string, the
        local/HDFS base paths, the expected file name and the HDFS address.
    :return: None
    """
    f_date_str = configData.get_f_date()  # e.g. "20181101"

    # WebHDFS defaults to the user dr.who and cannot impersonate other users;
    # this can be changed in the configuration via
    # hadoop.http.staticuser.user=dr.who
    # https://www.cnblogs.com/peizhe123/p/5540845.html
    a_client = InsecureClient(configData.hdfs_ip(), user="******")   # e.g. "http://10.2.201.197:50070"
    root_path = os.path.join(configData.get_data_path(), f_date_str)
    dest_dir1 = os.path.join(configData.get_utf8_path(), f_date_str)
    dest_dir2 = str(pathlib.PurePosixPath(configData.get_hdfs_path()).joinpath(f_date_str))

    print("Start\n")

    f_name = configData.get_file_name(f_date_str)

    for branch_dir in MyLocalFile.get_child_dir(root_path):
        if not MyLocalFile.check_branch(branch_dir):
            continue
        branch_files = MyLocalFile.get_child_file(branch_dir)
        branch_name = os.path.basename(branch_dir)
        for src_file in branch_files:
            if not MyLocalFile.check_file(src_file, f_name):
                continue
            # Local target uses the platform path flavour, HDFS target is always POSIX.
            local_target = str(pathlib.PurePath(dest_dir1).joinpath(branch_name, f_name))
            hdfs_target = str(pathlib.PurePosixPath(dest_dir2).joinpath(branch_name, f_name))
            head_value = configData.get_hive_add_date(branch_name)
            tail_value = configData.get_hive_add_date("789")
            keep_first_line = not configData.get_hive_head()
            MyLocalFile.conv_file_local(
                src_file,
                local_target,
                need_first_line=keep_first_line,
                p_add_head=head_value,
                p_add_tail=tail_value,
                quoting="",
            )
            MyHdfsFile.safe_make_dir(a_client, hdfs_target)
            # strict=False: a missing remote path yields None instead of raising.
            remote_status = a_client.status(hdfs_target, strict=False)
            if remote_status is None:
                a_client.upload(hdfs_target, local_target)
                a_client.set_permission(hdfs_target, 777)
            elif remote_status['type'].lower() == 'file':
                # Already uploaded earlier: only (re)apply the permission.
                a_client.set_permission(hdfs_target, 777)
示例#3
0
class HDFSStorageManager(StorageManager):
    """
    Store and load checkpoints from HDFS.

    Every HDFS operation goes through an ``InsecureClient`` whose ``root`` is
    ``hdfs_path``; checkpoints are addressed on HDFS by their storage id.
    """
    def __init__(
        self,
        hdfs_url: str,
        hdfs_path: str,
        user: Optional[str] = None,
        temp_dir: Optional[str] = None,
    ) -> None:
        """
        :param hdfs_url: URL handed to ``InsecureClient``.
        :param hdfs_path: passed to the client as its ``root``.
        :param user: user name passed to the client; may be ``None``.
        :param temp_dir: directory handed to the base class; defaults to
            ``tempfile.gettempdir()`` when ``None``.
        """
        super().__init__(
            temp_dir if temp_dir is not None else tempfile.gettempdir())

        self.hdfs_url = hdfs_url
        self.hdfs_path = hdfs_path
        self.user = user

        self.client = InsecureClient(self.hdfs_url,
                                     root=self.hdfs_path,
                                     user=self.user)

    def post_store_path(self, storage_id: str, storage_dir: str,
                        metadata: StorageMetadata) -> None:
        """post_store_path uploads the checkpoint to hdfs and deletes the original files."""
        try:
            logging.info("Uploading storage {} to HDFS".format(storage_id))
            # Upload under the storage id, the same key restore_path() and
            # delete() use. The metadata object itself is not an HDFS path.
            result = self.client.upload(metadata.storage_id, storage_dir)

            logging.info("Uploaded storage {} to HDFS path {}".format(
                storage_id, result))
        finally:
            # The local copy is removed whether or not the upload succeeded.
            self._remove_checkpoint_directory(metadata.storage_id)

    @contextlib.contextmanager
    def restore_path(self, metadata: StorageMetadata) -> Iterator[str]:
        """
        Download the checkpoint into the local base path and yield its
        directory; the local copy is removed when the context exits.
        """
        logging.info("Downloading storage {} from HDFS".format(
            metadata.storage_id))

        self.client.download(metadata.storage_id,
                             self._base_path,
                             overwrite=True)

        try:
            yield os.path.join(self._base_path, metadata.storage_id)
        finally:
            self._remove_checkpoint_directory(metadata.storage_id)

    def delete(self, metadata: StorageMetadata) -> None:
        """Recursively delete the checkpoint identified by ``metadata`` from HDFS."""
        logging.info("Deleting storage {} from HDFS".format(
            metadata.storage_id))
        self.client.delete(metadata.storage_id, recursive=True)
def run_remove_files(configData: ConfigData):
    """
    Remove the dated working directories locally and on HDFS.

    Deletes ``<data_path>/<date>`` and ``<utf8_path>/<date>`` from the local
    file system (errors ignored) and ``<hdfs_path>/<date>`` from HDFS.
    All three removals are best-effort.

    :param configData: configuration object supplying the date string, the
        base paths and the HDFS address.
    """
    f_date_str = configData.get_f_date()  # e.g. "20181101"
    data_path = os.path.join(configData.get_data_path(), f_date_str)
    utf8_path = os.path.join(configData.get_utf8_path(), f_date_str)
    # HDFS paths are always POSIX-style, regardless of the local platform.
    hdfs_path = str(pathlib.PurePosixPath(configData.get_hdfs_path()).joinpath(f_date_str))

    a_client = InsecureClient(configData.hdfs_ip(), user="******")  # e.g. "http://10.2.201.197:50070"

    shutil.rmtree(data_path, ignore_errors=True)
    shutil.rmtree(utf8_path, ignore_errors=True)
    try:
        a_client.delete(hdfs_path, recursive=True)
    except Exception:
        # Best-effort cleanup: a failed remote delete must not abort the run.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        pass
示例#5
0
    def __init__(
        self,
        hdfs_url: str,
        hdfs_path: str,
        user: Optional[str] = None,
        temp_dir: Optional[str] = None,
    ) -> None:
        """
        Initialise the base class with a local directory and build the HDFS client.

        :param hdfs_url: URL handed to ``InsecureClient``.
        :param hdfs_path: passed to the client as its ``root``.
        :param user: user name passed to the client; may be ``None``.
        :param temp_dir: directory handed to the base class; defaults to
            ``tempfile.gettempdir()`` when ``None``.
        """
        if temp_dir is None:
            temp_dir = tempfile.gettempdir()
        super().__init__(temp_dir)

        self.hdfs_url, self.hdfs_path, self.user = hdfs_url, hdfs_path, user
        self.client = InsecureClient(hdfs_url, root=hdfs_path, user=user)
示例#6
0
    def __init__(
        self,
        hdfs_url: str,
        hdfs_path: str,
        user: Optional[str] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Build the HDFS client and create the sync directory on HDFS.

        :param hdfs_url: URL handed to ``InsecureClient``.
        :param hdfs_path: passed to the client as its ``root``.
        :param user: user name passed to the client; may be ``None``.
        :param args: forwarded to the base class constructor.
        :param kwargs: forwarded to the base class constructor.
        """
        super().__init__(*args, **kwargs)
        self.hdfs_url, self.hdfs_path, self.user = hdfs_url, hdfs_path, user

        hdfs_client = InsecureClient(hdfs_url, root=hdfs_path, user=user)
        self.client = hdfs_client
        hdfs_client.makedirs(str(self.sync_path))
def run_hive(configData: ConfigData):
    """
    Load the dated HDFS file into the configured Hive table.

    Builds ``<hdfs_path>/<date>/<file_name>`` and, when
    ``MyHdfsFile.isfile`` accepts it, issues a ``LOAD DATA INPATH``
    statement, with a ``p_date`` partition clause when the configuration
    says the table is partitioned.

    The cursor and connection are closed even if the check or the
    statement raises.

    :param configData: configuration object supplying HDFS/Hive connection
        settings, dates, paths and the table name.
    """
    a_client = InsecureClient(url=configData.hdfs_ip(), user="******")  # e.g. "http://10.2.201.197:50070"
    conn = connect(host=configData.hive_ip(), port=configData.hive_port(), auth_mechanism=configData.hive_auth(), user=configData.hive_user())
    try:
        cur = conn.cursor()
        try:
            f_date_str = configData.get_f_date()  # e.g. "20181101"
            p_date_str = configData.get_p_date()  # e.g. "2018-11-01"

            # HDFS paths are always POSIX-style.
            root_path = str(pathlib.PurePosixPath(configData.get_hdfs_path()).joinpath(f_date_str))
            file_name = str(pathlib.PurePosixPath(root_path).joinpath(configData.get_file_name(f_date_str)))

            table_name = configData.get_table_name()

            print("Start\n")

            if MyHdfsFile.isfile(a_client, file_name):
                if not configData.get_has_partition():
                    sql = "LOAD DATA INPATH '{}' INTO TABLE {}".format(file_name, table_name)
                else:
                    sql = "LOAD DATA INPATH '{}' INTO TABLE {} PARTITION ( p_date='{}' )".format(file_name, table_name, p_date_str)
                print("OK" + "  " + sql+"\n")
                cur.execute(sql)
        finally:
            cur.close()
    finally:
        conn.close()
示例#8
0
def run_hive(configData: ConfigData):
    """
    Load every branch's dated HDFS file into the configured Hive table.

    Walks ``<hdfs_path>/<date>/<branch>/`` on HDFS and, for each file accepted
    by ``MyHdfsFile.check_file``, issues a ``LOAD DATA INPATH`` statement,
    with a ``p_date`` partition clause when the configuration says the table
    is partitioned.

    The cursor and connection are closed even if a listing call or a
    statement raises.

    :param configData: configuration object supplying HDFS/Hive connection
        settings, dates, paths and the table name.
    """
    a_client = InsecureClient(url=configData.hdfs_ip(), user="******")  # e.g. "http://10.2.201.197:50070"
    conn = connect(host=configData.hive_ip(), port=configData.hive_port(), auth_mechanism=configData.hive_auth(), user=configData.hive_user())
    try:
        cur = conn.cursor()
        try:
            f_date_str = configData.get_f_date()  # e.g. "20181101"
            p_date_str = configData.get_p_date()  # e.g. "2018-11-01"

            root_path = configData.get_hdfs_path()  # e.g. "/shouyinbao/bl_shouyinbao/UTF8/"
            file_name = configData.get_file_name(f_date_str)
            table_name = configData.get_table_name()

            print("Start\n")

            idn = 0  # running count of issued statements, used only for output
            branches = MyHdfsFile.get_child(a_client, str(pathlib.PurePosixPath(root_path).joinpath(f_date_str)))
            for aBranch in branches:
                if MyHdfsFile.check_branch(a_client, aBranch):
                    files = MyHdfsFile.get_child(a_client, aBranch)
                    f_a_branch = MyHdfsFile.get_name(aBranch)
                    for aFile in files:
                        if MyHdfsFile.check_file(a_client, aFile, file_name):
                            # e.g. '/shouyinbao/bl_shouyinbao/UTF8/20181101/9999997900/t1_trxrecord_20181101_V2.csv'
                            to_file2 = str(pathlib.PurePosixPath(root_path).joinpath(f_date_str, f_a_branch, file_name))
                            if not configData.get_has_partition():
                                sql = "LOAD DATA INPATH '{}' INTO TABLE {}".format(to_file2, table_name)
                            else:
                                sql = "LOAD DATA INPATH '{}' INTO TABLE {} PARTITION ( p_date='{}' )".format(to_file2, table_name, p_date_str)
                            idn += 1
                            print(str(idn) + "  " + sql + "\n")
                            cur.execute(sql)
        finally:
            cur.close()
    finally:
        conn.close()
示例#9
0
文件: tasks.py 项目: sundw2015/841
def upload_and_delete_file(file_path, upload_path):
    """
    Upload every entry of a local directory to HDFS, deleting each local
    file once its upload returned a truthy remote path.

    Both arguments are used as plain string prefixes: the entry name is
    appended directly, so each is expected to end with a path separator.
    Any error for one entry is logged and the loop moves on to the next.

    :param file_path: local directory prefix to list and upload from.
    :param upload_path: HDFS directory prefix to upload into.
    """
    for entry in os.listdir(file_path):
        try:
            hdfs = InsecureClient(url=HDFS_HOSTS, user=HDFS_USER)
            local_file = file_path + entry
            uploaded = hdfs.upload(upload_path + entry, local_file, cleanup=True)
            if not uploaded:
                continue
            os.remove(local_file)
        except Exception as err:
            logger.error("upload_and_delete_file  error = %s", str(err))
示例#10
0
文件: __init__.py 项目: tvial/ibis
def hdfs_connect(host='localhost',
                 port=50070,
                 protocol='webhdfs',
                 use_https='default',
                 auth_mechanism='NOSASL',
                 verify=True,
                 **kwds):
    """
    Connect to HDFS

    Parameters
    ----------
    host : string, Host name of the HDFS NameNode
    port : int, NameNode's WebHDFS port (default 50070)
    protocol : {'webhdfs'}
    use_https : boolean, default 'default'
        Connect to WebHDFS with HTTPS, otherwise plain HTTP. With
        'default', HTTPS is used for GSSAPI/LDAP and HTTP otherwise.
    auth_mechanism : string, Set to NOSASL or PLAIN for non-secure clusters.
        Set to GSSAPI or LDAP for Kerberos-secured clusters.
    verify : boolean, Set to False to turn off verifying SSL certificates.
        (default True)

    Other keywords are forwarded to hdfs library classes

    Returns
    -------
    client : WebHDFS
    """
    import requests
    # A caller-supplied session is reused; otherwise a fresh one is stored in kwds.
    session = kwds.setdefault('session', requests.Session())
    session.verify = verify

    kerberized = auth_mechanism in ['GSSAPI', 'LDAP']
    if use_https == 'default':
        prefix = 'https' if kerberized else 'http'
    elif use_https:
        prefix = 'https'
    else:
        prefix = 'http'

    if not kerberized:
        from hdfs.client import InsecureClient
        url = '{0}://{1}:{2}'.format(prefix, host, port)
        return WebHDFS(InsecureClient(url, **kwds))

    try:
        import requests_kerberos
    except ImportError:
        raise IbisError(
            "Unable to import requests-kerberos, which is required for "
            "Kerberos HDFS support. Install it by executing `pip install "
            "requests-kerberos` or `pip install hdfs[kerberos]`.")
    from hdfs.ext.kerberos import KerberosClient
    url = '{0}://{1}:{2}'.format(prefix, host, port)
    kwds.setdefault('mutual_auth', 'OPTIONAL')
    return WebHDFS(KerberosClient(url, **kwds))
示例#11
0
def initialize_hdfs_client(url):
    """
    Create the module-level HDFS ``client`` once; later calls are no-ops
    while ``client`` is truthy.

    The session is created with certificate verification turned off. A
    ``KerberosClient`` is built when ``kerberos['enabled']`` is truthy,
    otherwise an ``InsecureClient`` for ``hdfs['user']``.

    :param url: URL handed to the client constructor.
    """
    global client
    if client:
        return
    http_session = Session()
    http_session.verify = False
    client = (
        KerberosClient(url, session=http_session)
        if kerberos['enabled']
        else InsecureClient(url, user=hdfs['user'], session=http_session)
    )
示例#12
0
def get_pd_DF(cli: InsecureClient, file_path, header):
    """
    Read a CSV file stored on HDFS and return it as a pandas DataFrame.

    :param cli: the HDFS InsecureClient
    :param file_path: HDFS file path, relative to the root path configured
        on the InsecureClient
    :param header: forwarded to ``pd.read_csv`` as its ``header`` argument
    :return: the DataFrame produced by ``pd.read_csv``
    """
    with cli.read(file_path) as hdfs_stream:
        return pd.read_csv(hdfs_stream, header=header)
示例#13
0
 def hdfs_client(self):
     """
     Build a WebHDFS client for the configured NameNode host and port.

     Returns a ``KerberosClient`` (mutual auth required) when
     ``self._kerberized`` is truthy, otherwise an ``InsecureClient`` for
     ``self._hdfs_user``.
     """
     endpoint = 'http://{nn_host}:{webhdfs_port}'.format(
         nn_host=self._nn_host, webhdfs_port=self._webhdfs_port)
     if not self._kerberized:
         from hdfs.client import InsecureClient
         return InsecureClient(endpoint, user=self._hdfs_user)
     from hdfs.ext.kerberos import KerberosClient
     return KerberosClient(endpoint, mutual_auth='REQUIRED')
示例#14
0
def save_pd_DF(df_pd: pd.DataFrame, cli: InsecureClient, file_path):
    """
    Write a pandas DataFrame to HDFS as a CSV file, overwriting any
    existing file at that path.

    :param df_pd: the pandas DataFrame to write
    :param cli: the HDFS InsecureClient
    :param file_path: HDFS file path, relative to the root path configured
        on the InsecureClient
    """
    hdfs_writer = cli.write(hdfs_path=file_path, encoding='utf-8', overwrite=True)
    with hdfs_writer as sink:
        df_pd.to_csv(sink)
  def __init__(self, host, port=50070, use_https=False, secure=False, **kwds):
    """
    Create ``self.client`` for the WebHDFS endpoint at ``host:port``.

    host: host name used in the endpoint URL
    port: port used in the endpoint URL (default 50070)
    use_https: use the ``https`` scheme when truthy, ``http`` otherwise
    secure: build a ``KerberosClient`` when truthy, else an ``InsecureClient``
    kwds: forwarded to the client class; a ``session`` entry is added when
      missing, and certificate verification is always switched on for it
    """
    import requests

    kwds.setdefault('session', requests.Session()).verify = True

    if use_https:
      prefix = 'https'
    else:
      prefix = 'http'
    url = '{0}://{1}:{2}'.format(prefix, host, port)

    if not secure:
      from hdfs.client import InsecureClient
      self.client = InsecureClient(url, **kwds)
      return

    # Not referenced below; presumably imported so a missing
    # requests-kerberos fails here -- TODO confirm.
    import requests_kerberos
    from hdfs.ext.kerberos import KerberosClient
    kwds.setdefault('mutual_auth', 'OPTIONAL')
    self.client = KerberosClient(url, **kwds)
def save_driver_data(spark, broadcast_variable, train_x, train_y, valid_x, valid_y, save_method):
    """
    Make the driver's train/validation data available either as Spark
    broadcast variables or as files on HDFS.

    :param spark: object whose ``sparkContext.broadcast`` is used for broadcasting
    :param broadcast_variable: truthy to broadcast, falsy to save to HDFS
    :param train_x: training features, or None
    :param train_y: training labels, or None
    :param valid_x: validation features, or None
    :param valid_y: validation labels, or None
    :param save_method: forwarded to ``save_train_valid_to_hdfs`` as ``method``
    :return: ``(train_x, train_y, valid_x, valid_y, hdfs_path_dic)``. When
        broadcasting, the four data items are broadcast handles (None stays
        None) and ``hdfs_path_dic`` is None. Otherwise the four data items
        are None and ``hdfs_path_dic`` is the result of
        ``save_train_valid_to_hdfs``.
    """
    if not broadcast_variable:
        hdfs = InsecureClient(WEB_HDFS_URL, user=WEB_USER)
        hdfs_path_dic = save_train_valid_to_hdfs(
            hdfs=hdfs,
            method=save_method,
            tmp_hdfs_path=TEMP_HDFS_PATH,
            train_x=train_x,
            train_y=train_y,
            valid_x=valid_x,
            valid_y=valid_y,
        )
        return None, None, None, None, hdfs_path_dic

    def _broadcast(value):
        # None is passed through untouched; everything else is broadcast.
        if value is None:
            return value
        return spark.sparkContext.broadcast(value)

    bc_train_x = _broadcast(train_x)
    bc_train_y = _broadcast(train_y)
    bc_valid_x = _broadcast(valid_x)
    bc_valid_y = _broadcast(valid_y)
    return bc_train_x, bc_train_y, bc_valid_x, bc_valid_y, None
def load_driver_data(hdfs_path, train_x, train_y, valid_x, valid_y, save_method):
    """
    Recover the train/validation data, either from broadcast handles or
    from HDFS.

    :param hdfs_path: None to use the passed-in data; otherwise forwarded to
        ``load_train_valid_to_local`` as ``ret_dic``
    :param train_x: training features or a ``Broadcast`` wrapping them
    :param train_y: training labels or a ``Broadcast`` wrapping them
    :param valid_x: validation features or a ``Broadcast`` wrapping them
    :param valid_y: validation labels or a ``Broadcast`` wrapping them
    :param save_method: forwarded to ``load_train_valid_to_local`` as ``method``
    :return: ``(train_x, train_y, valid_x, valid_y)``
    """
    if hdfs_path is not None:
        hdfs = InsecureClient(WEB_HDFS_URL, user=WEB_USER)
        loaded = load_train_valid_to_local(hdfs=hdfs, method=save_method, ret_dic=hdfs_path)
        return loaded["train_x"], loaded["train_y"], loaded["valid_x"], loaded["valid_y"]

    from pyspark.broadcast import Broadcast

    def _unwrap(item):
        # Broadcast handles expose their payload through ``.value``.
        return item.value if isinstance(item, Broadcast) else item

    return _unwrap(train_x), _unwrap(train_y), _unwrap(valid_x), _unwrap(valid_y)
def run_conv_file_local_to_hdfs(configData: ConfigData):
    """
    Process the dated file found directly under ``<data_path>/<date>`` and
    upload it to HDFS.

    Files are matched case-insensitively against the configured file name.
    Each match is passed through ``MyLocalFile.conv_file_local`` into
    ``<utf8_path>/<date>/`` and then uploaded to ``<hdfs_path>/<date>/``
    unless a remote entry already exists there.

    :param configData: configuration object supplying the date string, the
        local/HDFS base paths, the expected file name and the HDFS address.
    :return: None
    """
    f_date_str = configData.get_f_date()  # e.g. "20181101"
    a_client = InsecureClient(configData.hdfs_ip(), user="******")  # e.g. "http://10.2.201.197:50070"

    root_path = os.path.join(configData.get_data_path(), f_date_str)
    dest_dir1 = os.path.join(configData.get_utf8_path(), f_date_str)
    # HDFS paths are always POSIX-style, regardless of the local platform.
    dest_dir2 = str(pathlib.PurePosixPath(configData.get_hdfs_path()).joinpath(f_date_str))

    print("Start\n")

    file_name = configData.get_file_name(f_date_str).lower()

    for src_file in MyLocalFile.get_child_file(root_path):
        short_name = os.path.basename(src_file).lower()
        f_name = pathlib.PurePath(src_file).name  # original-case name, reused for both targets
        if short_name != file_name:
            continue
        local_target = str(pathlib.PurePath(dest_dir1).joinpath(f_name))
        hdfs_target = str(pathlib.PurePosixPath(dest_dir2).joinpath(f_name))
        head_value = configData.get_hive_add_date(f_date_str)
        keep_first_line = not configData.get_hive_head()
        MyLocalFile.conv_file_local(
            src_file,
            local_target,
            need_first_line=keep_first_line,
            p_add_head=head_value,
        )
        MyHdfsFile.safe_make_dir(a_client, hdfs_target)
        # strict=False: a missing remote path yields None instead of raising.
        remote_status = a_client.status(hdfs_target, strict=False)
        if remote_status is None:
            a_client.upload(hdfs_target, local_target)
            a_client.set_permission(hdfs_target, 777)
        elif remote_status['type'].lower() == 'file':
            # Already uploaded earlier: only (re)apply the permission.
            a_client.set_permission(hdfs_target, 777)
示例#19
0
文件: util.py 项目: nataliaking/ibis
def connect_test(env, with_hdfs=True):
    """
    Build an ibis client for the test environment described by ``env``.

    :param env: object providing the impala connection settings, the
        ``use_kerberos`` flag and ``hdfs_url``.
    :param with_hdfs: when true, also attach an HDFS client (Kerberos or
        insecure, depending on ``env.use_kerberos``).
    :return: the result of ``ibis.make_client``.
    """
    impala_con = ibis.impala_connect(host=env.impala_host,
                                     protocol=env.impala_protocol,
                                     database=env.test_data_db,
                                     port=env.impala_port,
                                     use_kerberos=env.use_kerberos,
                                     pool_size=2)
    if not with_hdfs:
        return ibis.make_client(impala_con)

    if env.use_kerberos:
        from hdfs.ext.kerberos import KerberosClient
        fs_client = KerberosClient(env.hdfs_url, mutual_auth='REQUIRED')
    else:
        from hdfs.client import InsecureClient
        fs_client = InsecureClient(env.hdfs_url)
    return ibis.make_client(impala_con, fs_client)
示例#20
0
def hdfs_connect(host='localhost',
                 port=50070,
                 protocol='webhdfs',
                 use_kerberos=False,
                 verify=True,
                 **kwds):
    """
    Connect to HDFS

    Parameters
    ----------
    host : string
    port : int, default 50070 (webhdfs default)
    protocol : {'webhdfs'}
    use_kerberos : boolean, default False
        When true, connect over HTTPS with a Kerberos client; otherwise
        connect over plain HTTP with an insecure client.
    verify : boolean, default True
        Set to False to turn off verifying SSL certificates

    Other keywords are forwarded to hdfs library classes

    Returns
    -------
    client : ibis HDFS client
    """
    if not use_kerberos:
        from hdfs.client import InsecureClient
        url = 'http://{0}:{1}'.format(host, port)
        return WebHDFS(InsecureClient(url, verify=verify, **kwds))

    try:
        import requests_kerberos
    except ImportError:
        raise IbisError(
            "Unable to import requests-kerberos, which is required for "
            "Kerberos HDFS support. Install it by executing `pip install "
            "requests-kerberos` or `pip install hdfs[kerberos]`.")
    from hdfs.ext.kerberos import KerberosClient
    url = 'https://{0}:{1}'.format(host, port)  # note SSL
    kerberos_client = KerberosClient(url,
                                     mutual_auth='OPTIONAL',
                                     verify=verify,
                                     **kwds)
    return WebHDFS(kerberos_client)
示例#21
0
 def save_to_hdfs(self, key, url, title, content):
     """
     Append one record to today's HDFS file (``hdfs_dir`` + YYYYMMDD).

     The record is ``key`` and ``url`` on their own lines, followed by
     ``title`` and ``content`` when they are neither None nor empty. If
     the append raises ``HdfsError``, the record is written with a plain
     (non-append) write instead.

     :param key: first line of the record
     :param url: second line of the record
     :param title: optional third line
     :param content: optional last line
     """
     current_date = datetime.datetime.now().strftime("%Y%m%d")
     hdfs_path = hdfs_dir + current_date
     # Python 2 only: force the default str<->unicode codec to UTF-8.
     import sys
     reload(sys)
     sys.setdefaultencoding('utf-8')
     data = "\n" + key + "\n" + url + "\n"
     if title is not None and title != '':
         data = data + title + "\n"
     if content is not None and content != '':
         data = data + content + "\n"
     # Built outside the try so the fallback below can never see an
     # unbound ``client``.
     client = InsecureClient(hdfs_web, user=hdfs_user)
     try:
         client.write(hdfs_path=hdfs_path, data=data, append=True)
     except HdfsError:
         # Presumably the file does not exist yet, so create it -- TODO confirm.
         client.write(hdfs_path=hdfs_path, data=data)
示例#22
0
 def __init__(self,
              url,
              root=None,
              user=None,
              proxy=None,
              timeout=None,
              session=None):
     """Connect to HDFS.

     url: host name or IP address of the HDFS NameNode, including the port
     root: root path; it is prefixed to every HDFS path passed to the client
     user: when given, an InsecureClient is used and HDFS is accessed as
         this user; otherwise a plain Client is used, which acts as the
         default user dr.who
     proxy: user to proxy as
     timeout: connection timeout, forwarded to the request handler
     session: requests.Session instance used to issue all requests
     """
     # These options apply to both client flavours. Previously they were
     # silently dropped whenever ``user`` was given.
     options = dict(root=root, proxy=proxy, timeout=timeout, session=session)
     if user:
         self.client = InsecureClient(url, user=user, **options)
     else:
         self.client = Client(url, **options)
示例#23
0
def run_conv_file_hdfs(configData: ConfigData):
    """
    Pass each branch's dated local file to ``MyHdfsFile.conv_file_hdfs``.

    For every branch directory under ``<data_path>/<date>`` that passes
    ``MyLocalFile.check_branch``, each file accepted by
    ``MyLocalFile.check_file`` is handed over together with the HDFS
    destination ``<hdfs_path>/<date>/<branch>/<file_name>``.

    :param configData: configuration object supplying the date string, the
        local/HDFS base paths, the expected file name and the HDFS address.
    """
    # HDFS paths always use "/" separators. os.path.join would produce
    # backslashes when this runs on Windows.
    import posixpath

    f_date_str = configData.get_f_date()  # e.g. "20181101"

    client = InsecureClient(configData.hdfs_ip(), user="******")  # e.g. "http://10.2.201.197:50070"
    root_path = configData.get_data_path()  # e.g. 'D:/DATA/UNZIP/'
    dest_dir = configData.get_hdfs_path()

    f_name = configData.get_file_name(f_date_str)

    print("Start\n")

    branches = MyLocalFile.get_child(os.path.join(root_path, f_date_str))
    for aBranch in branches:
        if not MyLocalFile.check_branch(aBranch):
            continue
        # The branch directory name is a local path component, so it is
        # still extracted with os.path.
        branch_name = os.path.basename(aBranch)
        for aFile in MyLocalFile.get_child(aBranch):
            if MyLocalFile.check_file(aFile, f_name):
                MyHdfsFile.conv_file_hdfs(aFile,
                                          posixpath.join(dest_dir,
                                                         f_date_str,
                                                         branch_name,
                                                         f_name),
                                          client)
示例#24
0
class HDFSTensorboardManager(base.TensorboardManager):
    """
    Upload tfevents files to HDFS through an ``InsecureClient``.
    """
    @util.preserve_random_state
    def __init__(
        self,
        hdfs_url: str,
        hdfs_path: str,
        user: Optional[str] = None,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Create the HDFS client and make sure the sync directory exists.

        :param hdfs_url: URL handed to ``InsecureClient``.
        :param hdfs_path: passed to the client as its ``root``.
        :param user: user name passed to the client; may be ``None``.
        :param args: forwarded to the base class constructor.
        :param kwargs: forwarded to the base class constructor.
        """
        super().__init__(*args, **kwargs)
        self.hdfs_url, self.hdfs_path, self.user = hdfs_url, hdfs_path, user

        hdfs_client = InsecureClient(hdfs_url, root=hdfs_path, user=user)
        self.client = hdfs_client
        hdfs_client.makedirs(str(self.sync_path))

    @util.preserve_random_state
    def sync(
        self,
        selector: Callable[[pathlib.Path], bool] = lambda _: True,
        mangler: Callable[[pathlib.Path, int], pathlib.Path] = lambda p, __: p,
        rank: int = 0,
    ) -> None:
        """
        Upload the paths yielded by ``to_sync(selector)``.

        :param selector: handed to ``to_sync`` to choose which paths to upload.
        :param mangler: maps ``(path relative to base_path, rank)`` to the
            relative path used under the sync directory.
        :param rank: passed through to ``mangler``.
        """
        for path in self.to_sync(selector):
            remote_relative = mangler(path.relative_to(self.base_path), rank)
            remote_target = str(self.sync_path.joinpath(remote_relative))
            logging.debug(f"Uploading {path} to {self.hdfs_path}")

            self.client.upload(remote_target, str(path))

    def delete(self) -> None:
        """Recursively delete the sync directory from HDFS."""
        self.client.delete(self.sync_path, recursive=True)
示例#25
0
class HDFSStorageManager(storage.CloudStorageManager):
    """
    Store and load checkpoints from HDFS.
    """

    def __init__(
        self,
        hdfs_url: str,
        hdfs_path: str,
        user: Optional[str] = None,
        temp_dir: Optional[str] = None,
    ) -> None:
        """
        :param hdfs_url: URL handed to ``InsecureClient``.
        :param hdfs_path: passed to the client as its ``root``.
        :param user: user name passed to the client; may be ``None``.
        :param temp_dir: directory handed to the base class; defaults to
            ``tempfile.gettempdir()`` when ``None``.
        """
        if temp_dir is None:
            temp_dir = tempfile.gettempdir()
        super().__init__(temp_dir)

        self.hdfs_url, self.hdfs_path, self.user = hdfs_url, hdfs_path, user
        self.client = InsecureClient(hdfs_url, root=hdfs_path, user=user)

    @util.preserve_random_state
    def upload(self, src: Union[str, os.PathLike], dst: str) -> None:
        """Upload the local path ``src`` to the HDFS path ``dst``."""
        local_src = os.fspath(src)
        logging.info(f"Uploading to HDFS: {dst}")
        self.client.upload(dst, local_src)

    @util.preserve_random_state
    def download(self, src: str, dst: Union[str, os.PathLike]) -> None:
        """Download the HDFS path ``src`` to the local path ``dst``, overwriting it."""
        local_dst = os.fspath(dst)
        logging.info(f"Downloading {src} from HDFS")
        self.client.download(src, local_dst, overwrite=True)

    @util.preserve_random_state
    def delete(self, tgt: str) -> None:
        """Recursively delete the HDFS path ``tgt``."""
        logging.info(f"Deleting {tgt} from HDFS")
        self.client.delete(tgt, recursive=True)
示例#26
0
def hdfs_connect(host='localhost',
                 port=50070,
                 protocol='webhdfs',
                 use_https='default',
                 auth_mechanism='NOSASL',
                 verify=True,
                 session=None,
                 **kwds):
    """Connect to HDFS.

    Parameters
    ----------
    host : str
        Host name of the HDFS NameNode
    port : int
        NameNode's WebHDFS port
    protocol : str,
        The protocol used to communicate with HDFS. The only valid value is
        ``'webhdfs'``.
    use_https : bool
        Connect to WebHDFS with HTTPS, otherwise plain HTTP. With the
        default value ``'default'``, HTTPS is used for GSSAPI/LDAP and plain
        HTTP otherwise.
    auth_mechanism : str
        Set to NOSASL or PLAIN for non-secure clusters.
        Set to GSSAPI or LDAP for Kerberos-secured clusters.
    verify : bool
        Set to :data:`False` to turn off verifying SSL certificates.
    session : Optional[requests.Session]
        A custom :class:`requests.Session` object. A new one is created
        when omitted.

    Notes
    -----
    Other keywords are forwarded to HDFS library classes.

    Returns
    -------
    WebHDFS

    """
    import requests

    session = requests.Session() if session is None else session
    session.verify = verify

    kerberized = auth_mechanism in ('GSSAPI', 'LDAP')
    if use_https == 'default':
        prefix = 'https' if kerberized else 'http'
    elif use_https:
        prefix = 'https'
    else:
        prefix = 'http'

    if not kerberized:
        from hdfs.client import InsecureClient

        url = '{}://{}:{}'.format(prefix, host, port)
        return WebHDFS(InsecureClient(url, session=session, **kwds))

    try:
        import requests_kerberos  # noqa: F401
    except ImportError:
        raise IbisError(
            "Unable to import requests-kerberos, which is required for "
            "Kerberos HDFS support. Install it by executing `pip install "
            "requests-kerberos` or `pip install hdfs[kerberos]`.")
    from hdfs.ext.kerberos import KerberosClient

    url = '{0}://{1}:{2}'.format(prefix, host, port)
    kwds.setdefault('mutual_auth', 'OPTIONAL')
    return WebHDFS(KerberosClient(url, session=session, **kwds))
示例#27
0
def hdfs_connect(
    host: str = 'localhost',
    port: int = 50070,
    protocol: Literal['webhdfs'] = 'webhdfs',
    use_https: str = 'default',
    auth_mechanism: str = 'NOSASL',
    verify: bool = True,
    session: Any = None,
    **kwds: Any,
) -> WebHDFS:
    """Connect to HDFS.

    Parameters
    ----------
    host
        Host name of the HDFS NameNode
    port
        NameNode's WebHDFS port
    protocol
        The protocol used to communicate with HDFS. The only valid value is
        ``'webhdfs'``.
    use_https
        Connect to WebHDFS with HTTPS, otherwise plain HTTP. With the
        default value ``'default'``, HTTPS is used for GSSAPI/LDAP and plain
        HTTP otherwise.
    auth_mechanism
        Set to NOSASL or PLAIN for non-secure clusters.
        Set to GSSAPI or LDAP for Kerberos-secured clusters.
    verify
        Set to `False` to turn off verifying SSL certificates.
    session
        A custom `requests.Session` object. A new one is created when
        omitted.
    kwds
        Other keywords are forwarded to the HDFS client class.

    Returns
    -------
    WebHDFS
        WebHDFS client
    """
    import requests

    session = requests.Session() if session is None else session
    session.verify = verify

    kerberized = auth_mechanism in ('GSSAPI', 'LDAP')
    if not kerberized:
        if use_https == 'default':
            prefix = 'http'
        elif use_https:
            prefix = 'https'
        else:
            prefix = 'http'
        from hdfs.client import InsecureClient

        url = f'{prefix}://{host}:{port}'
        return WebHDFS(InsecureClient(url, session=session, **kwds))

    from hdfs.ext.kerberos import KerberosClient

    # Secure authentication defaults to HTTPS unless the caller says otherwise.
    if use_https == 'default':
        prefix = 'https'
    elif use_https:
        prefix = 'https'
    else:
        prefix = 'http'

    url = f'{prefix}://{host}:{port}'
    kwds.setdefault('mutual_auth', 'OPTIONAL')
    return WebHDFS(KerberosClient(url, session=session, **kwds))
from hdfs.client import Client, InsecureClient
import redis
import json
import time
import pymysql

# Module-level HDFS client, created at import time.
client = InsecureClient("http://192.168.1.176:50070", user='******')
# Module-level Redis client on the same host, created at import time.
r = redis.StrictRedis(host='192.168.1.176', port=6379, decode_responses=True)


def connect_to_db(DB_NAME):
    """
    Open a pymysql connection to the given database on the local MySQL server.

    :param DB_NAME: name of the database to connect to.
    :return: the open pymysql connection.
    """
    # Database connection settings
    settings = {
        'host': 'localhost',
        'port': 3306,
        'user': '******',
        'passwd': 'recipe',
        'db': DB_NAME,
        'charset': 'utf8mb4',
    }

    # Establish the connection
    conn = pymysql.connect(**settings)
    print('Successfully connected to DB : {} !'.format(DB_NAME))
    return conn

def main():
    db = 'recipe'
    conn = connect_to_db(db)
    cursor = conn.cursor()

    # 從mysql同義詞庫抓同義詞存進redis (格式hash-  synonym(key)- {總詞彙(sub key):定義食材(value)
    sql = """
示例#29
0
        # skip the first argument, which is FunctionContext*
        arg_types = tuple([
            llvm2impala[arg.pointee.name]
            for arg in function.type.pointee.args[1:]
        ])
        functions.append((symbol, arg_types))
    except (AttributeError, KeyError):
        # this process could fail for non-UDF helper functions...just ignore
        # them, because we're not going to be registering them anyway
        log("Had trouble with function %s; moving on..." % symbol)
        pass

# transfer the LLVM module to HDFS
# (`bc` is defined outside this excerpt; presumably the serialized LLVM
# module -- TODO confirm)
url = 'http://{nn_host}:{webhdfs_port}'.format(nn_host=args.nn_host,
                                               webhdfs_port=args.webhdfs_port)
hdfs_client = InsecureClient(url, user=args.user)
# overwrite is driven by the --force style option carried in args.force
hdfs_client.write(args.hdfs_path, bc, overwrite=args.force)
log("Transferred LLVM IR to HDFS at %s" % args.hdfs_path)

# register the functions with impala
conn = impala.dbapi.connect(host=args.impala_host, port=args.impala_port)
cursor = conn.cursor(user=args.user)
log("Connected to impalad: %s" % args.impala_host)
if args.db:
    # NOTE(review): the database name is interpolated directly into the
    # statement; acceptable only because it comes from trusted CLI args.
    cursor.execute('USE %s' % args.db)
# Snapshot of the functions already registered in the current database.
cursor.execute("SHOW FUNCTIONS")
registered_functions = cursor.fetchall()
for (udf_name, return_type) in zip(args.name, args.return_type):
    log("Registering function %s" % udf_name)
    # find matching LLVM symbols to the current UDF name
    matches = [pair for pair in functions if udf_name in pair[0]]
示例#30
0
from hdfs.client import Client, InsecureClient
import redis
import json
# r = redis.StrictRedis(host='192.168.1.176', port=6379,decode_responses=True)

# HDFS client used to read the recipe JSON below.
client = InsecureClient("http://192.168.1.176:50070", user='******')
# Paths are given without the hdfs:// prefix
# client.list("/") -> ['recipe', 'tmp', 'user']

# Parse the JSON file straight from the HDFS read stream.
with client.read("/recipe/recipe1018_V8.json") as reader:
    data = json.load(reader)

# Show the first ten entries as a quick sanity check.
print(data[:10])
        symbol = function.name
        log("Loading types for function %s" % symbol)
        # skip the first argument, which is FunctionContext*
        arg_types = tuple([llvm2impala[arg.pointee.name]
                           for arg in function.type.pointee.args[1:]])
        functions.append((symbol, arg_types))
    except (AttributeError, KeyError):
        # this process could fail for non-UDF helper functions...just ignore
        # them, because we're not going to be registering them anyway
        log("Had trouble with function %s; moving on..." % symbol)
        pass

# transfer the LLVM module to HDFS
# (`bc` is defined outside this excerpt; presumably the serialized LLVM
# module -- TODO confirm)
url = 'http://{nn_host}:{webhdfs_port}'.format(
    nn_host=args.nn_host, webhdfs_port=args.webhdfs_port)
hdfs_client = InsecureClient(url, user=args.user)
# overwrite is driven by the --force style option carried in args.force
hdfs_client.write(args.hdfs_path, bc, overwrite=args.force)
log("Transferred LLVM IR to HDFS at %s" % args.hdfs_path)

# register the functions with impala
conn = impala.dbapi.connect(host=args.impala_host, port=args.impala_port)
cursor = conn.cursor(user=args.user)
log("Connected to impalad: %s" % args.impala_host)
if args.db:
    # NOTE(review): the database name is interpolated directly into the
    # statement; acceptable only because it comes from trusted CLI args.
    cursor.execute('USE %s' % args.db)
# Snapshot of the functions already registered in the current database.
cursor.execute("SHOW FUNCTIONS")
registered_functions = cursor.fetchall()
for (udf_name, return_type) in zip(args.name, args.return_type):
    log("Registering function %s" % udf_name)
    # find matching LLVM symbols to the current UDF name
    matches = [pair for pair in functions if udf_name in pair[0]]