def basic():
    """Run a tour of basic HdfsClient calls against 'study:50070'.

    In order: list the root directory, test two paths for existence, print
    the checksum of one file and the content summary of '/', copy one file
    from HDFS to local disk and one from local disk to HDFS, then print the
    home directory.  Results are printed to stdout; nothing is returned.
    """
    # A single client serves every call below; the original rebuilt an
    # identical client halfway through for no effect.
    client = HdfsClient(hosts='study:50070')

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(client.list_status('/'))

    # The message below announces the path-existence checks.
    print('判断某个路径是否存在')
    print(client.exists("/test"))
    print(client.exists("/data/gz/thrift-0.9.2.tar.gz"))

    print(client.get_file_checksum("/data/gz/bison-2.5.1.tar.gz"))

    summary = client.get_content_summary("/")
    print(summary)

    # File copy: from HDFS to the local filesystem.
    client.copy_to_local("/data/gz/pip-7.1.2.tar.gz",
                         "/root/data/pip-7.1.2.tar.gz")
    # File copy: from the local filesystem into HDFS.
    client.copy_from_local("/root/data/thrift-0.9.2.tar.gz",
                           "/data/gz/thrift-0.9.2.tar.gz")

    print(client.get_home_directory())
logger_requests = logging.getLogger('requests') logger_requests.setLevel(logging.ERROR) constants = Constants() logger.info('constants loaded') logger.info('init : hosts_hdfs = %s' % hosts_hdfs) logger.info('init : dataroot = %s' % dir_dataroot) logger.info('init : path main = %s' % path_main) logger.info('init : path codemap = %s' % path_codemap) client = HdfsClient(hosts=hosts_hdfs) logger.info('connect hdfs') logger.info('---- start working ----') # type='DIRECTORY' type='FILE' while 1: list_dirs = [ x['pathSuffix'] for x in client.list_status(dir_dataroot) if x['type'] == 'DIRECTORY' ] list_dirs.sort() for subdir in list_dirs: dir_subdata = os.path.join(dir_dataroot, subdir) logger.debug('data path : %s' % dir_subdata) dir_subdata_cleaned = os.path.join(dir_subdata, 'cleaned4netsec') logger.debug('data path for cleaned files : %s' % dir_subdata_cleaned) list_subdir_date = [ x['pathSuffix'] for x in client.list_status(dir_subdata) if x['type'] == 'FILE' ] if len(list_subdir_date) > 0: if not client.exists(dir_subdata_cleaned):
def yesterday():
    """Return the value of ``today()`` minus one day."""
    return today() - datetime.timedelta(days=1)


# Script entry point: check yesterday's HDFS output directory of every table.
if __name__ == '__main__':
    print("监控HDFS......")
    yesterday_datetime_format = yesterday()
    for table in CHECK_TABLE:
        is_success = False  # set when a "_SUCCESS" entry is found
        has_data = False    # set when a non-empty "part-" file is found
        content = ""        # error report text; stays empty when no problem is detected
        try:
            path = ROOT_DIR + table + "/" + str(yesterday_datetime_format)
            if client.exists(path):
                for file_status in client.list_status(path):
                    suffix = file_status.get("pathSuffix")
                    if suffix.startswith('part-') and int(file_status.get("length")) > 0:
                        has_data = True
                    elif suffix == "_SUCCESS":
                        is_success = True
            else:
                # The directory for yesterday does not exist at all.
                content = ("异常信息：HDFS路径不存在 <br>" +
                           str("HDFS路径:") + path)
        except Exception as e:
            # Best-effort monitoring: any failure while probing HDFS is
            # turned into report text instead of aborting the whole run.
            content = ("异常信息:" + str(e) + "<br>" +
                       str("HDFS路径:") + path)
        # No error so far but no "_SUCCESS" marker either: report the job
        # for this table as failed.
        if content == "" and not is_success:
            content = ("异常信息:" + table + "相关job运行失败" + "<br>" +
                       str("HDFS路径:") + path)