# Relies on module-level 'settings', 'HdfsUtil', 'readMapFile', and os.path.join.
def readMapFileFromHdfs(mapFileId):
    # Stage the map file from HDFS into the local cache, then read it locally.
    sourceMapfilePath = join(settings.images_hdfs_path, mapFileId)
    localDistPath = join(settings.mapfile_cache_folder, mapFileId)
    if HdfsUtil.copyFromHDFS(sourceMapfilePath, localDistPath):
        return readMapFile(mapFileId)
    else:
        return False
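# Usage sketch (hedged: "map_0042" is an illustrative ID, and readMapFile is
# assumed to return a truthy value on success, so False distinguishes a failed
# HDFS copy from a successful read):
#
#   mapData = readMapFileFromHdfs("map_0042")
#   if not mapData:
#       printError("Could not stage map file map_0042 from HDFS")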
class HawqUtil:
    def __init__(self, conf):
        self.conf = conf
        self.hdfsUtil = HdfsUtil(conf)
        self.pgUtil = PostgresUtil(conf)
        self.segDirs = self.getSegmentDirs()

    def printReport(self):
        printInfo("Getting database names from HAWQ")
        dbNames = self.getDatabaseNames()
        printInfo("Getting quotas from these databases %s" % (dbNames))
        for db in dbNames:
            self.printDatabaseQuota(db)
        self.printTopKLargestDatabases(dbNames)

    def getDatabaseNames(self):
        names = queryPostgres(self.conf.get(Config.HAWQ_PORT),
                              self.conf.get(Config.HAWQ_USERNAME),
                              self.conf.get(Config.HAWQ_METADATA_DB),
                              "SELECT DISTINCT datname FROM pg_database",
                              split=False)
        retval = []
        for name in names.split():
            if name not in self.conf.get(Config.HAWQ_SYSTEM_DB_BLACKLIST):
                retval.append(name)
        return retval

    def getDatabaseOID(self, name):
        oids = queryPostgres(self.conf.get(Config.HAWQ_PORT),
                             self.conf.get(Config.HAWQ_USERNAME),
                             self.conf.get(Config.HAWQ_METADATA_DB),
                             "SELECT oid FROM pg_database WHERE datname = '%s'" % (name))
        if oids is not None and len(oids) == 1:
            return oids[0]
        elif oids is None:
            printError("Database %s not found" % (name))
        else:
            printError("Received %i OIDs, expecting 1" % (len(oids)))
        return None

    def getTableOID(self, database, table):
        oids = queryPostgres(self.conf.get(Config.HAWQ_PORT),
                             self.conf.get(Config.HAWQ_USERNAME),
                             database,
                             "SELECT oid FROM pg_class WHERE relname = '%s'" % (table))
        if len(oids) == 1:
            return oids[0]
        else:
            printError("Received %i OIDs, expecting 1" % (len(oids)))
            return None

    def getSegmentDirs(self):
        cmd = "hdfs dfs -ls %s | grep gpseg | awk '{print $8}'" % (self.conf.get(Config.HAWQ_HDFS_DIR))
        dirs = getCommandOutput(cmd).split()
        if len(dirs) == 0:
            printError("Failed to get any segment directories from HDFS")
            sys.exit(1)
        return dirs

    def getSchemaTables(self, database):
        # 'port' and 'username' were undefined here in the original; they are
        # resolved from the configuration, matching the other queries above.
        output = queryPostgres(self.conf.get(Config.HAWQ_PORT),
                               self.conf.get(Config.HAWQ_USERNAME),
                               database,
                               "SELECT table_schema, table_name FROM information_schema.tables",
                               split=False)
        schemaTableMap = dict()
        for record in output.split('\n'):
            (schema, table) = record.strip().replace('|', '').split()
            # self.schemaBlackList is assumed to be populated elsewhere;
            # it is never assigned in this class as written.
            if schema not in self.schemaBlackList:
                try:
                    schemaTableMap[schema] = schemaTableMap[schema] + [table]
                except KeyError:
                    schemaTableMap[schema] = [table]
        return schemaTableMap

    def printDatabaseQuota(self, db):
        dbOID = self.getDatabaseOID(db)
        if dbOID is not None:
            printInfo("Getting quota status for database %s" % (db))
            # 16385 is the HDFS filespace directory used in this
            # deployment's segment paths.
            hdfsDBDirs = []
            for segDir in self.segDirs:
                hdfsDBDirs.append("%s/16385/%s" % (segDir, dbOID))
            quotas = self.hdfsUtil.getSpaceQuotas(hdfsDBDirs)
            self.__printDBQuotaInserts(db, quotas)
            row = namedtuple('Row', ['Database', 'Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR'])
            toPrint = []
            for (directory, quota, remainingQuota) in quotas:
                quotaHR = bytes2human(quota) if quota != 'none' else quota
                remainingQuotaHR = bytes2human(remainingQuota) if remainingQuota != 'inf' else remainingQuota
                toPrint.append(row(db, directory, quota, remainingQuota, quotaHR, remainingQuotaHR))
            pprinttable(toPrint)

    def __printDBQuotaInserts(self, db, quotas):
        for (directory, quota, remainingQuota) in quotas:
            row = HawqDBQuotaRow()
            row.database = db
            row.dir = directory
            if quota != 'none':
                row.quota = int(quota)
                row.quotaRemaining = int(remainingQuota)
                row.quotaUsed = row.quota - row.quotaRemaining
            else:
                row.quota = None
                row.quotaRemaining = None
                row.quotaUsed = None
            self.pgUtil.writeInsert(row)

    def getDatabaseSize(self, db):
        dbOID = self.getDatabaseOID(db)
        if dbOID is not None:
            dbDir = "%s/*/16385/%s" % (self.conf.get(Config.HAWQ_HDFS_DIR), dbOID)
            sizes = self.hdfsUtil.getDirSizes([dbDir])
            total = 0  # renamed from 'sum' to avoid shadowing the builtin
            for (dir, size) in sizes:
                total += size
            return (db, total)
        else:
            return None

    def printTopKLargestDatabases(self, dbNames):
        k = self.conf.get(Config.REPORTER_K)
        printInfo("Getting top %s largest HAWQ databases" % (k))
        dbSizes = []
        for db in dbNames:
            tDbSize = self.getDatabaseSize(db)
            if tDbSize is not None:
                dbSizes.append(tDbSize)
        if len(dbSizes) == 0:
            printInfo("No HAWQ databases found in HDFS")
            return
        dbSizes.sort(key=operator.itemgetter(1), reverse=True)
        if len(dbSizes) > k:
            dbSizes = dbSizes[0:k]
        self.__printTopKLargestDatabasesInserts(dbSizes)
        # print sizes
        row = namedtuple('Row', ['Database', 'Size', 'SizeHR'])
        toPrint = []
        for (db, size) in dbSizes:
            sizeHR = bytes2human(size)
            toPrint.append(row(db, str(size), str(sizeHR)))
        pprinttable(toPrint)

    def __printTopKLargestDatabasesInserts(self, dbSizes):
        for (db, size) in dbSizes:
            row = HawqDBSizeRow()
            row.database = db
            row.size = size
            self.pgUtil.writeInsert(row)

    def setDatabaseQuota(self, db, quota):
        # 'in' rather than '==': the blacklist splits into a list of names
        if db in self.conf.get(Config.HAWQ_DB_BLACKLIST).split():
            printError("Database %s is in the blacklist. Remove to set quota" % (db))
            return
        dbOID = self.getDatabaseOID(db)
        if dbOID is not None:
            printInfo("Setting quota for %s to %s bytes" % (db, quota))
            hdfsDBDirs = []
            for segDir in self.segDirs:
                hdfsDBDirs.append("%s/16385/%s" % (segDir, dbOID))
            self.hdfsUtil.setSpaceQuotas(hdfsDBDirs, quota)
        else:
            sys.exit(1)

    def clearDatabaseQuota(self, db):
        dbOID = self.getDatabaseOID(db)
        if dbOID is not None:
            printInfo("Clearing quota for database %s" % (db))
            hdfsDBDirs = []
            for segDir in self.segDirs:
                hdfsDBDirs.append("%s/16385/%s" % (segDir, dbOID))
            self.hdfsUtil.clearSpaceQuotas(hdfsDBDirs)
        else:
            sys.exit(1)
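# A minimal driver sketch for HawqUtil (hedged: assumes a Config built from
# phd-metrics.ini as in the CLI wiring below; "analytics" and the 10 GiB
# quota are hypothetical, and HDFS space quotas count post-replication bytes):
def _example_hawq_usage(conf):
    hawq = HawqUtil(conf)
    hawq.printReport()                                   # quotas plus top-k sizes
    hawq.setDatabaseQuota("analytics", 10 * 1024 ** 3)   # hypothetical DB name
    hawq.printDatabaseQuota("analytics")
    hawq.clearDatabaseQuota("analytics")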
class UserUtil:
    def __init__(self, conf):
        self.conf = conf
        self.hdfsUtil = HdfsUtil(conf)
        self.pgUtil = PostgresUtil(conf)

    def printReport(self):
        self.printUserSpaceQuotas()
        self.printUserINodeQuotas()
        self.printTopKSpaceUsers()
        self.printTopKINodeUsers()

    def printUserSpaceQuotas(self):
        printInfo("Getting space quota status for users")
        quotas = self.hdfsUtil.getSpaceQuotas(self.getUserDirectories())
        if len(quotas) == 0:
            printInfo("No user directories found in HDFS")
            return
        quotas.sort()
        self.__printUserSpaceQuotasInserts(quotas)
        row = namedtuple('Row', ['Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR'])
        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            quotaHR = bytes2human(quota) if quota != 'none' else quota
            # Sometimes the remaining quota is negative...
            # (original tested the not-yet-defined remainingQuotaHR here)
            if remainingQuota != 'inf':
                if long(remainingQuota) < 0:
                    remainingQuotaHR = "-" + bytes2human(-long(remainingQuota))
                else:
                    remainingQuotaHR = bytes2human(remainingQuota)
            else:
                remainingQuotaHR = remainingQuota
            toPrint.append(row(directory, quota, remainingQuota, quotaHR, remainingQuotaHR))
        pprinttable(toPrint)

    def __printUserSpaceQuotasInserts(self, quotas):
        for (directory, quota, remainingQuota) in quotas:
            row = UserSpaceQuotaRow()
            # Strip the leading "/user/" to recover the username
            row.username = directory[6:]
            row.dir = directory
            if quota != 'none':
                row.quota = int(quota)
                row.quotaRemaining = int(remainingQuota)
                row.quotaUsed = row.quota - row.quotaRemaining
            else:
                row.quota = None
                row.quotaRemaining = None
                row.quotaUsed = None
            self.pgUtil.writeInsert(row)

    def printUserINodeQuotas(self):
        printInfo("Getting inode quota status for users")
        quotas = self.hdfsUtil.getINodeQuotas(self.getUserDirectories())
        if len(quotas) == 0:
            printInfo("No user directories found in HDFS")
            return
        quotas.sort()
        self.__printUserINodeQuotasInserts(quotas)
        row = namedtuple('Row', ['Directory', 'Quota', 'Remaining'])
        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            toPrint.append(row(directory, quota, remainingQuota))
        pprinttable(toPrint)

    def __printUserINodeQuotasInserts(self, quotas):
        for (directory, quota, remainingQuota) in quotas:
            row = UserINodeQuotaRow()
            row.username = directory[6:]
            row.dir = directory
            if quota != 'none':
                row.quota = int(quota)
                row.quotaRemaining = int(remainingQuota)
                row.quotaUsed = row.quota - row.quotaRemaining
            else:
                row.quota = None
                row.quotaRemaining = None
                row.quotaUsed = None
            self.pgUtil.writeInsert(row)

    def printUserSpaceQuota(self, user):
        printInfo("Getting space quota status for user %s" % (user))
        quotas = self.hdfsUtil.getSpaceQuotas(["/user/%s" % (user)])
        if len(quotas) == 0:
            printInfo("Directory for user %s not found in HDFS" % (user))
            return
        row = namedtuple('Row', ['Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR'])
        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            quotaHR = bytes2human(quota) if quota != 'none' else quota
            remainingQuotaHR = bytes2human(remainingQuota) if remainingQuota != 'inf' else remainingQuota
            toPrint.append(row(directory, quota, remainingQuota, quotaHR, remainingQuotaHR))
        pprinttable(toPrint)

    def printUserINodeQuota(self, user):
        printInfo("Getting inode quota status for user %s" % (user))
        quotas = self.hdfsUtil.getINodeQuotas(["/user/%s" % (user)])
        if len(quotas) == 0:
            printInfo("Directory for user %s not found in HDFS" % (user))
            return
        row = namedtuple('Row', ['Directory', 'Quota', 'Remaining'])
        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            toPrint.append(row(directory, quota, remainingQuota))
        pprinttable(toPrint)

    def printTopKSpaceUsers(self):
        k = self.conf.get(Config.REPORTER_K)
        printInfo("Getting top %s space users" % (k))
        sizes = self.hdfsUtil.getDirSizes(['/user'])
        if len(sizes) == 0:
            printInfo("No user directories found in HDFS")
            return
        sizes.sort(key=operator.itemgetter(1), reverse=True)
        if len(sizes) > k:
            sizes = sizes[0:k]
        self.__printTopKSpaceInserts(sizes)
        row = namedtuple('Row', ['User', 'Size', 'SizeHR'])
        toPrint = []
        for (dir, size) in sizes:
            sizeHR = bytes2human(size)
            toPrint.append(row(dir, str(size), str(sizeHR)))
        pprinttable(toPrint)

    def __printTopKSpaceInserts(self, sizes):
        for (dir, size) in sizes:
            row = UserSpaceSizeRow()
            row.username = dir[6:]
            row.dir = dir
            row.size = size
            self.pgUtil.writeInsert(row)

    def printTopKINodeUsers(self):
        k = self.conf.get(Config.REPORTER_K)
        printInfo("Getting top %s inode users" % (k))
        counts = self.hdfsUtil.getINodeCounts(self.getUserDirectories())
        if len(counts) == 0:
            printInfo("No user directories found in HDFS")
            return
        counts.sort(key=operator.itemgetter(1), reverse=True)
        if len(counts) > k:
            counts = counts[0:k]
        self.__printTopKINodeUsersInserts(counts)
        row = namedtuple('Row', ['User', 'INodes'])
        toPrint = []
        for (dir, count) in counts:
            toPrint.append(row(dir, str(count)))
        pprinttable(toPrint)

    def __printTopKINodeUsersInserts(self, counts):
        for (dir, count) in counts:
            row = UserINodeSizeRow()
            row.username = dir[6:]
            row.dir = dir
            row.size = count
            self.pgUtil.writeInsert(row)

    def setUserSpaceQuota(self, user, quota):
        # 'in' rather than '==': the blacklist splits into a list of users;
        # the original also formatted an undefined 'db' into the message.
        if user in self.conf.get(Config.USER_DIR_BLACKLIST).split():
            printError("User %s is in the blacklist. Remove to set quota" % (user))
            return
        self.hdfsUtil.setSpaceQuotas(["/user/%s" % (user)], quota)

    def clearUserSpaceQuota(self, user):
        self.hdfsUtil.clearSpaceQuotas(["/user/%s" % (user)])

    def setUserINodeQuota(self, user, quota):
        if user in self.conf.get(Config.USER_DIR_BLACKLIST).split():
            printError("User %s is in the blacklist. Remove to set quota" % (user))
            return
        self.hdfsUtil.setINodeQuotas(["/user/%s" % (user)], quota)

    def clearUserINodeQuota(self, user):
        self.hdfsUtil.clearINodeQuotas(["/user/%s" % (user)])

    def getUserDirectories(self):
        return self.hdfsUtil.listDirs(['/user'])
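# A minimal driver sketch for UserUtil (hedged: 'alice' and the quota values
# are illustrative; space quotas are bytes, inode quotas are file/directory
# counts, matching the HdfsUtil wrappers used above):
def _example_user_usage(conf):
    users = UserUtil(conf)
    users.printReport()
    users.setUserSpaceQuota("alice", 50 * 1024 ** 3)  # hypothetical user, 50 GiB
    users.setUserINodeQuota("alice", 100000)
    users.printUserSpaceQuota("alice")
    users.clearUserSpaceQuota("alice")
    users.clearUserINodeQuota("alice")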
    elif conf.get(Config.ACTION) == 'clear' and query_yes_no(
            "Are you sure you want to clear the %s database's quota?"
            % (conf.get(Config.DATABASE)), default="no"):
        hawqUtil.clearDatabaseQuota(conf.get(Config.DATABASE))
        hawqUtil.printDatabaseQuota(conf.get(Config.DATABASE))
    else:
        printError("Unknown action %s" % (conf.get(Config.ACTION)))

# HDFS option
elif sys.argv[1] == "hdfs-util":
    parser = OptionParser()
    parser.add_option("-c", "--config", dest="configFile",
                      help="Configuration file (default phd-metrics.ini)",
                      default="phd-metrics.ini")
    parser.add_option("-a", "--action", dest="action",
                      help="Choose an action: report", default=None)
    (options, args) = parser.parse_args(sys.argv[2:])
    conf = Config(parser, sys.argv[2:])
    hdfsUtil = HdfsUtil(conf)

    ### Main program
    if conf.get(Config.ACTION) == 'report':
        hdfsUtil.printReport()
    else:
        printError("Unknown action %s" % (conf.get(Config.ACTION)))

# Hive option
elif sys.argv[1] == "hive-util":
    parser = OptionParser()
    parser.add_option("-c", "--config", dest="configFile",
                      help="Configuration file (default phd-metrics.ini)",
                      default="phd-metrics.ini")
    parser.add_option("-a", "--action", dest="action",
                      help="Choose an action: report, get, set, clear",
                      default=None)
    parser.add_option("-d", "--database", dest="database",
                      help="Database to get or set (Only for get/set actions)",
                      default=None)
    parser.add_option("-q", "--quota", dest="quota",
                      help="Database quota, in bytes. Keep in mind the 3x replication. (Only for set action)",
                      default=None)
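    # Example invocations for the subcommands wired above (illustrative only;
    # the flags match the parser definitions, the script name is assumed):
    #   phd-metrics.py hdfs-util -a report
    #   phd-metrics.py hive-util -a report
    #   phd-metrics.py hive-util -a set -d sales -q 1073741824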
class HiveUtil:
    def __init__(self, conf):
        self.conf = conf
        self.hdfsUtil = HdfsUtil(conf)
        self.pgUtil = PostgresUtil(conf)

    def printReport(self):
        printInfo("Fetching contents of Hive warehouse")
        hivedbdirs = self.getHiveDatabaseDirectories()
        self.printDatabaseQuotas(hivedbdirs)
        self.printTopKLargestDatabases(hivedbdirs)

    def getHiveDatabaseDirectories(self):
        hivedirs = self.hdfsUtil.listDirs([self.conf.get(Config.HIVE_WAREHOUSE_DIR)])
        retval = []
        for dir in hivedirs:
            if dir.endswith(".db"):
                retval.append(dir)
        return retval

    def printDatabaseQuota(self, db):
        printInfo("Getting quota status for Hive database %s" % (db))
        quotas = self.hdfsUtil.getSpaceQuotas(["%s%s.db" % (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db)])
        if len(quotas) == 0:
            printInfo("No Hive databases found")
            return
        row = namedtuple('Row', ['Database', 'Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR'])
        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            dbName = directory.replace(".db", "").replace(self.conf.get(Config.HIVE_WAREHOUSE_DIR), "")
            quotaHR = bytes2human(quota) if quota != 'none' else quota
            remainingQuotaHR = bytes2human(remainingQuota) if remainingQuota != 'inf' else remainingQuota
            toPrint.append(row(dbName, directory, quota, remainingQuota, quotaHR, remainingQuotaHR))
        pprinttable(toPrint)

    def printDatabaseQuotas(self, hivedbdirs):
        printInfo("Getting quota status for Hive databases")
        hdfsDirs = []
        for dir in hivedbdirs:
            db = self.getDbNameFromPath(dir)
            hdfsDirs.append("%s/%s.db" % (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db))
        quotas = self.hdfsUtil.getSpaceQuotas(hdfsDirs)
        if len(quotas) == 0:
            printInfo("No Hive databases found")
            return
        quotas.sort()
        self.__printDBQuotasInserts(quotas)
        row = namedtuple('Row', ['Database', 'Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR'])
        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            dbName = directory.replace(".db", "").replace(self.conf.get(Config.HIVE_WAREHOUSE_DIR), "")
            quotaHR = bytes2human(quota) if quota != 'none' else quota
            remainingQuotaHR = bytes2human(remainingQuota) if remainingQuota != 'inf' else remainingQuota
            toPrint.append(row(dbName, directory, quota, remainingQuota, quotaHR, remainingQuotaHR))
        pprinttable(toPrint)

    def __printDBQuotasInserts(self, quotas):
        for (directory, quota, remainingQuota) in quotas:
            row = HiveDBQuotaRow()
            row.database = directory.replace(".db", "").replace(self.conf.get(Config.HIVE_WAREHOUSE_DIR), "")
            row.dir = directory
            if quota != 'none':
                row.quota = int(quota)
                row.quotaRemaining = int(remainingQuota)
                row.quotaUsed = row.quota - row.quotaRemaining
            else:
                row.quota = None
                row.quotaRemaining = None
                row.quotaUsed = None
            self.pgUtil.writeInsert(row)

    def getDatabaseSize(self, dbDir):
        sizes = self.hdfsUtil.getDirSizes([dbDir])
        total = 0  # renamed from 'sum' to avoid shadowing the builtin
        for (dir, size) in sizes:
            total += size
        return (dbDir, total)

    def printTopKLargestDatabases(self, hivedbdirs):
        k = self.conf.get(Config.REPORTER_K)
        printInfo("Getting top %s largest Hive databases" % (k))
        dbSizes = []
        for dbDir in hivedbdirs:
            tDbSize = self.getDatabaseSize(dbDir)
            if tDbSize is not None:
                dbSizes.append(tDbSize)
        if len(dbSizes) == 0:
            printInfo("No Hive databases found in HDFS")
            return
        dbSizes.sort(key=operator.itemgetter(1), reverse=True)
        if len(dbSizes) > k:
            dbSizes = dbSizes[0:k]
        self.__printTopKLargestDatabasesInserts(dbSizes)
        # print sizes
        row = namedtuple('Row', ['Database', 'Size', 'SizeHR'])
        toPrint = []
        for (db, size) in dbSizes:
            sizeHR = bytes2human(size)
            toPrint.append(row(db, str(size), str(sizeHR)))
        pprinttable(toPrint)

    # Renamed with the 'Inserts' suffix for consistency with the other
    # insert-writing helpers in this module.
    def __printTopKLargestDatabasesInserts(self, dbSizes):
        for (db, size) in dbSizes:
            row = HiveDBSizeRow()
            row.database = db
            row.size = size
            self.pgUtil.writeInsert(row)

    def setDatabaseQuota(self, db, quota):
        # 'in' rather than '==': the blacklist splits into a list of names
        if db in self.conf.get(Config.HIVE_DB_BLACKLIST).split():
            printError("Database %s is in the blacklist. Remove to set quota" % (db))
            return
        printInfo("Setting quota for %s to %s bytes" % (db, quota))
        self.hdfsUtil.setSpaceQuotas([self.getDbPathFromName(db)], quota)

    def clearDatabaseQuota(self, db):
        printInfo("Clearing quota for database %s" % (db))
        self.hdfsUtil.clearSpaceQuotas([self.getDbPathFromName(db)])

    def getDbNameFromPath(self, dir):
        return dir.replace(self.conf.get(Config.HIVE_WAREHOUSE_DIR), "").replace(".db", "")

    def getDbPathFromName(self, db):
        return "%s%s.db" % (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db)
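# A minimal driver sketch for HiveUtil (hedged: 'sales' is a hypothetical
# database; its path is derived from Config.HIVE_WAREHOUSE_DIR via
# getDbPathFromName above):
def _example_hive_usage(conf):
    hive = HiveUtil(conf)
    hive.printReport()
    hive.setDatabaseQuota("sales", 5 * 1024 ** 3)  # hypothetical DB, 5 GiB
    hive.printDatabaseQuota("sales")
    hive.clearDatabaseQuota("sales")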
dest="configFile", help="Configuration file (default phd-metrics.ini)", default="phd-metrics.ini") parser.add_option( "-s", "--sqlfile", dest="sqlFile", help="Filename to write SQL statements to (default none)", default=None) conf = Config(parser, sys.argv[2:]) pgutil = PostgresUtil(conf) pgutil.open() HdfsUtil(conf).printReport() HawqUtil(conf).printReport() HiveUtil(conf).printReport() UserUtil(conf).printReport() FsUtil(conf).printReport() pgutil.close() # Local filesystem option elif sys.argv[1] == "fs-util": parser = OptionParser() parser.add_option("-c", "--config", dest="configFile", help="Configuration file (default phd-metrics.ini)", default="phd-metrics.ini")
    def get_hdfs(self):
        conf = self.hdfs_conf
        h = HdfsUtil.HDFS(**conf)
        h.connect()
        return h
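    # Usage sketch (hedged: assumes self.hdfs_conf maps directly onto
    # HdfsUtil.HDFS keyword arguments such as host and port, and that the
    # wrapper exposes a teardown method; both are assumptions here):
    #
    #   client = self.get_hdfs()
    #   try:
    #       ...  # issue HDFS calls through the connected client
    #   finally:
    #       client.disconnect()  # hypothetical teardown method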