def getFileSystem(fs="dfs"): """ Returns a Hadoop FileSystem object, either "dfs" (default) or "local". """ if fs == "dfs": return FileSystem.get(happy.getJobConf()) elif fs == "local": return FileSystem.getLocal(happy.getJobConf()) else: raise Exception("Unknown filesystem " + fs)
def __init__(self, hdfs_uri, kerberos=False, kerberos_principal=None, keytab_file=None):
    """
    :param hdfs_uri: hdfs://hadoop-name-node:port
    :param kerberos: optional, if kerberos authentication is needed
    :param kerberos_principal: optional, [email protected]
    :param keytab_file: optional, absolute path to keytab file
    """
    self.logger = LoggerFactory.getLogger(self.__class__.__name__)
    self.logger.info("keytab_file: " + str(keytab_file))  # str() guards against None
    hdfs_conf = Configuration()
    if hdfs_uri.startswith('hdfs://'):
        hdfs_conf.set(Hdfs.FS_DEFAULT_NAME_KEY, hdfs_uri)
    elif hdfs_uri:
        self.logger.error("%s is an invalid uri for hdfs namenode ipc bind." % hdfs_uri)
    if kerberos:
        # init kerberos and keytab; both principal and keytab are required
        if not kerberos_principal or not keytab_file:
            raise ValueError("Kerberos Principal and Keytab File Name/Path are required!")
        hdfs_conf.set("hadoop.security.authentication", "kerberos")
        hdfs_conf.set("dfs.namenode.kerberos.principal.pattern", "*")
        UserGroupInformation.setConfiguration(hdfs_conf)
        UserGroupInformation.loginUserFromKeytab(kerberos_principal, keytab_file)
    self.fs = Hdfs.get(hdfs_conf)
    requests.packages.urllib3.disable_warnings()
    self.logger.info("Initiated SchemaUrlHelper")
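# A hedged usage sketch; the class name SchemaUrlHelper is taken from the
# final log message, and the host, principal, and keytab path below are
# illustrative placeholders.
helper = SchemaUrlHelper("hdfs://namenode.example.com:8020",
                         kerberos=True,
                         kerberos_principal="[email protected]",
                         keytab_file="/etc/security/keytabs/etl.keytab")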
def __init__(self, hdfs_uri, kerberos=False, kerberos_principal=None, keytab_file=None):
    """
    :param hdfs_uri: hdfs://hadoop-name-node:port
    :param kerberos: optional, if kerberos authentication is needed
    :param kerberos_principal: optional, [email protected]
    :param keytab_file: optional, user.keytab or ~/.kerberos/user.keytab
    """
    # requires: import os; from os.path import expanduser
    self.logger = LoggerFactory.getLogger(self.__class__.__name__)
    hdfs_conf = Configuration()
    if hdfs_uri.startswith('hdfs://'):
        hdfs_conf.set(Hdfs.FS_DEFAULT_NAME_KEY, hdfs_uri)
    elif hdfs_uri:
        self.logger.error("%s is an invalid uri for hdfs namenode ipc bind." % hdfs_uri)
    if kerberos:
        # init kerberos and keytab; both principal and keytab are required
        if not kerberos_principal or not keytab_file:
            raise ValueError("Kerberos Principal and Keytab File Name/Path are required!")
        keytab_path = keytab_file
        if keytab_file.startswith('/'):
            # absolute path: use it directly if it exists
            if os.path.exists(keytab_file):
                keytab_path = keytab_file
                print "Using keytab at %s" % keytab_path
        else:
            # relative path: search a list of well-known locations
            all_locations = [os.getcwd(),
                             expanduser("~") + "/.ssh",
                             expanduser("~") + "/.kerberos",
                             expanduser("~") + "/.wherehows",
                             os.getenv("APP_HOME"),
                             os.getenv("WH_HOME")]
            for loc in all_locations:
                # skip unset environment variables (os.getenv returns None)
                if loc and os.path.exists(loc + '/' + keytab_file):
                    keytab_path = loc + '/' + keytab_file
                    print "Using keytab at %s" % keytab_path
                    break
        hdfs_conf.set("hadoop.security.authentication", "kerberos")
        hdfs_conf.set("dfs.namenode.kerberos.principal.pattern", "*")
        UserGroupInformation.setConfiguration(hdfs_conf)
        UserGroupInformation.loginUserFromKeytab(kerberos_principal, keytab_path)
    self.fs = Hdfs.get(hdfs_conf)
    requests.packages.urllib3.disable_warnings()
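# A hedged usage sketch of the relative-path resolution above: a bare
# keytab file name is searched for in the cwd, ~/.ssh, ~/.kerberos,
# ~/.wherehows, $APP_HOME, and $WH_HOME. The class name and all values
# below are illustrative assumptions.
helper = SchemaUrlHelper("hdfs://namenode.example.com:8020",
                         kerberos=True,
                         kerberos_principal="[email protected]",
                         keytab_file="etl.keytab")  # resolved via the search path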
def __init__(self, hdfsCluster):
    self.logger = Logger.getLogger("Hdfs")
    # self.logger.setLevel(Level.DEBUG)
    coreSite = "/etc/hadoop/conf/core-site.xml"
    hdfsSite = "/etc/hadoop/conf/hdfs-site.xml"
    self.cHdfs = Configuration()
    self.cHdfs.addResource(Path(coreSite))
    self.cHdfs.addResource(Path(hdfsSite))
    self.cHdfs.set("fs.defaultFS", hdfsCluster)
    self.fileSystem = FileSystem.get(self.cHdfs)
    self.fileUtil = FileUtil()
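# A hedged usage sketch, assuming the enclosing class is the Hdfs helper
# named in the logger; the cluster URI and path are illustrative.
hdfs = Hdfs("hdfs://namenode.example.com:8020")
print hdfs.fileSystem.exists(Path("/user"))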
from java.io import IOException
from java.util import Properties
from java.lang import SecurityException
from java.lang import System

import sys
import time

startTime = time.time()

if len(sys.argv) != 2:
    print 'Usage: %s <basedir>' % sys.argv[0]
    System.exit(1)

# Get reference to the Hadoop FileSystem object. Everything we do in
# this script that interacts with HDFS is through this object.
fs = FileSystem.get(Configuration())

# Make sure the requested collection exists.
collection = sys.argv[1]
collectionDir = Path(collection)

if not fs.exists(collectionDir):
    print '\nERROR: no collection directory: %s' % collectionDir
    System.exit(1)

# Check for "guard" file. Like a semaphore, it ensures that we don't try
# to update this collection while it's in the middle of being updated.
# Since file creation in HDFS is atomic, we don't check for the existence
# of the guardFile; rather, we try to create it. If the file already exists
# then fs.createNewFile() will return False.
guardFile = Path(collectionDir, '_updating')
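# A minimal sketch of the step the comment above sets up, assuming the
# atomic-create semantics it describes: createNewFile() returns False
# when the guard file already exists, i.e. another update of this
# collection is in progress.
if not fs.createNewFile(guardFile):
    print '\nERROR: collection %s is already being updated' % collection
    System.exit(1)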