class FsUtil:
    def __init__(self, conf):
        self.conf = conf
        self.pgUtil = PostgresUtil(conf)

    def printReport(self):
        printInfo("Grabbing and printing drive metrics")
        self.printDriveUsage()
        self.printDriveWriteTest()

    # This function prints usage statistics for drives that are available.
    # It runs an initial test to make sure the drive exists on the host;
    # if the drive does not exist, it is omitted from this report.
    def printDriveUsage(self):
        hostfile = self.conf.get(Config.HOST_FILE)
        dataDrives = self.conf.get(Config.DATA_DRIVES)

        driveMetrics = []
        for dataDrive in dataDrives.split(" "):
            # First, test if the drive exists on all hosts, getting the list of hosts where it does
            output = getCommandOutput("massh %s worked test -e %s" % (hostfile, dataDrive))

            if len(output) > 0:
                # We have some hosts that have this drive
                tmpHostfile = self.writeHostFile(output.split("\n"))

                cmd = "massh %s verbose \"df %s | grep -v Filesystem\" | awk '{print $1,$8,$7,$4,$5,$6}'" % (tmpHostfile, dataDrive)
                output = getCommandOutput(cmd).split("\n")

                for line in output:
                    # massh verbose prefixes each line with the host in brackets
                    (host, drive, perc, size, used, avail) = line.split(" ")
                    driveMetrics.append((host.replace("[", "").replace("]", ""), drive, perc.replace("%", ""), size, used, avail))

        driveMetrics.sort()
        self.__printDriveUsageInserts(driveMetrics)

        row = namedtuple('Row', ['Host', 'Drive', 'PercentUsed', 'Size', 'Used', 'Avail'])
        toPrint = []
        for (host, drive, perc, size, used, avail) in driveMetrics:
            toPrint.append(row(host, drive, perc, size, used, avail))
        pprinttable(toPrint)

    def __printDriveUsageInserts(self, driveMetrics):
        for (host, drive, perc, size, used, avail) in driveMetrics:
            row = DriveUsageRow()
            row.host = host
            row.drive = drive
            row.perc = perc
            row.size = size
            row.used = used
            row.avail = avail
            self.pgUtil.writeInsert(row)

    def printDriveWriteTest(self):
        printInfo("Getting non-writeable drives")
        hostfile = self.conf.get(Config.HOST_FILE)
        dataDrives = self.conf.get(Config.DATA_DRIVES)

        failedDrives = []
        for drive in dataDrives.split(" "):
            # Check if the drive exists; hosts where it does not get reason 'dne'
            output = getCommandOutput("massh %s bombed sudo test -e %s" % (hostfile, drive))
            if len(output) > 0:
                for host in output.split("\n"):
                    failedDrives.append((host, drive, 'dne'))

            # On the hosts that do have the drive, test that it is writeable;
            # hosts where the write test bombs get reason 'ro'
            output = getCommandOutput("massh %s worked sudo test -e %s" % (hostfile, drive))
            if len(output) > 0:
                tmpHostFile = self.writeHostFile(output.split("\n"))
                output = getCommandOutput("massh %s bombed sudo test -w %s" % (tmpHostFile, drive))
                if len(output) > 0:
                    for host in output.split("\n"):
                        failedDrives.append((host, drive, 'ro'))

        if len(failedDrives) == 0:
            printInfo("No non-writeable drives to report")
        else:
            row = namedtuple('Row', ['Host', 'Drive', 'Reason'])
            failedDrives.sort()
            self.__printDriveWriteTest(failedDrives)
            toPrint = []
            for (host, drive, reason) in failedDrives:
                toPrint.append(row(host, drive, reason))
            pprinttable(toPrint)

    def __printDriveWriteTest(self, failedDrives):
        for (host, drive, reason) in failedDrives:
            row = DriveWriteTestRow()
            row.host = host
            row.drive = drive
            row.reason = reason
            self.pgUtil.writeInsert(row)

    def writeHostFile(self, hosts):
        fName = self.conf.get(Config.TMP_DIR) + "/fsutil.txt"
        f = open(fName, 'w')
        for item in hosts:
            f.write(item + "\n")
        f.flush()
        f.close()
        return fName
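# pprinttable is used by every report method but is not defined in this file.
# A minimal sketch of the assumed behavior -- column-aligned output of
# namedtuple rows, with the field names as headers -- is below for reference.
def pprinttable(rows):
    if len(rows) == 0:
        return
    headers = rows[0]._fields
    # Each column is as wide as the widest of its header and values
    widths = [max(len(str(x)) for x in [h] + [r[i] for r in rows])
              for (i, h) in enumerate(headers)]
    fmt = " | ".join("%%-%ds" % w for w in widths)
    print fmt % headers
    print "-+-".join("-" * w for w in widths)
    for r in rows:
        print fmt % tuple(str(x) for x in r)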
class UserUtil:
    def __init__(self, conf):
        self.conf = conf
        self.hdfsUtil = HdfsUtil(conf)
        self.pgUtil = PostgresUtil(conf)

    def printReport(self):
        self.printUserSpaceQuotas()
        self.printUserINodeQuotas()
        self.printTopKSpaceUsers()
        self.printTopKINodeUsers()

    def printUserSpaceQuotas(self):
        printInfo("Getting space quota status for users")
        quotas = self.hdfsUtil.getSpaceQuotas(self.getUserDirectories())

        if len(quotas) == 0:
            printInfo("No user directories found in HDFS")
            return

        quotas.sort()
        self.__printUserSpaceQuotasInserts(quotas)

        row = namedtuple('Row', ['Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR'])
        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            quotaHR = bytes2human(quota) if quota != 'none' else quota

            # Sometimes the remaining quota is negative...
            if remainingQuota != 'inf':
                if long(remainingQuota) < 0:
                    remainingQuotaHR = "-" + bytes2human(-long(remainingQuota))
                else:
                    remainingQuotaHR = bytes2human(remainingQuota)
            else:
                remainingQuotaHR = remainingQuota

            toPrint.append(row(directory, quota, remainingQuota, quotaHR, remainingQuotaHR))
        pprinttable(toPrint)

    def __printUserSpaceQuotasInserts(self, quotas):
        for (directory, quota, remainingQuota) in quotas:
            row = UserSpaceQuotaRow()
            row.username = directory[6:]  # strip the leading "/user/"
            row.dir = directory
            if not quota == 'none':
                row.quota = int(quota)
                row.quotaRemaining = int(remainingQuota)
                row.quotaUsed = row.quota - row.quotaRemaining
            else:
                row.quota = None
                row.quotaRemaining = None
                row.quotaUsed = None
            self.pgUtil.writeInsert(row)

    def printUserINodeQuotas(self):
        printInfo("Getting inode quota status for users")
        quotas = self.hdfsUtil.getINodeQuotas(self.getUserDirectories())

        if len(quotas) == 0:
            printInfo("No user directories found in HDFS")
            return

        quotas.sort()
        self.__printUserINodeQuotasInserts(quotas)

        row = namedtuple('Row', ['Directory', 'Quota', 'Remaining'])
        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            toPrint.append(row(directory, quota, remainingQuota))
        pprinttable(toPrint)

    def __printUserINodeQuotasInserts(self, quotas):
        for (directory, quota, remainingQuota) in quotas:
            row = UserINodeQuotaRow()
            row.username = directory[6:]
            row.dir = directory
            if not quota == 'none':
                row.quota = int(quota)
                row.quotaRemaining = int(remainingQuota)
                row.quotaUsed = row.quota - row.quotaRemaining
            else:
                row.quota = None
                row.quotaRemaining = None
                row.quotaUsed = None
            self.pgUtil.writeInsert(row)

    def printUserSpaceQuota(self, user):
        printInfo("Getting space quota status for user %s" % (user))
        quotas = self.hdfsUtil.getSpaceQuotas(["/user/%s" % (user)])

        if len(quotas) == 0:
            printInfo("Directory for user %s not found in HDFS" % (user))
            return

        row = namedtuple('Row', ['Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR'])
        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            quotaHR = bytes2human(quota) if quota != 'none' else quota
            remainingQuotaHR = bytes2human(remainingQuota) if remainingQuota != 'inf' else remainingQuota
            toPrint.append(row(directory, quota, remainingQuota, quotaHR, remainingQuotaHR))
        pprinttable(toPrint)

    def printUserINodeQuota(self, user):
        printInfo("Getting inode quota status for user %s" % (user))
        quotas = self.hdfsUtil.getINodeQuotas(["/user/%s" % (user)])

        if len(quotas) == 0:
            printInfo("Directory for user %s not found in HDFS" % (user))
            return

        row = namedtuple('Row', ['Directory', 'Quota', 'Remaining'])
        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            toPrint.append(row(directory, quota, remainingQuota))
        pprinttable(toPrint)

    def printTopKSpaceUsers(self):
        k = self.conf.get(Config.REPORTER_K)
        printInfo("Getting top %s space users" % (k))
        sizes = self.hdfsUtil.getDirSizes(['/user'])

        if len(sizes) == 0:
            printInfo("No user directories found in HDFS")
            return

        sizes.sort(key=operator.itemgetter(1), reverse=True)
        if len(sizes) > k:
            sizes = sizes[0:k]

        self.__printTopKSpaceInserts(sizes)

        row = namedtuple('Row', ['User', 'Size', 'SizeHR'])
        toPrint = []
        for (dir, size) in sizes:
            sizeHR = bytes2human(size)
            toPrint.append(row(dir, str(size), str(sizeHR)))
        pprinttable(toPrint)

    def __printTopKSpaceInserts(self, sizes):
        for (dir, size) in sizes:
            row = UserSpaceSizeRow()
            row.username = dir[6:]
            row.dir = dir
            row.size = size
            self.pgUtil.writeInsert(row)

    def printTopKINodeUsers(self):
        k = self.conf.get(Config.REPORTER_K)
        printInfo("Getting top %s inode users" % (k))
        counts = self.hdfsUtil.getINodeCounts(self.getUserDirectories())

        if len(counts) == 0:
            printInfo("No user directories found in HDFS")
            return

        counts.sort(key=operator.itemgetter(1), reverse=True)
        if len(counts) > k:
            counts = counts[0:k]

        self.__printTopKINodeUsersInserts(counts)

        row = namedtuple('Row', ['User', 'INodes'])
        toPrint = []
        for (dir, count) in counts:
            toPrint.append(row(dir, str(count)))
        pprinttable(toPrint)

    def __printTopKINodeUsersInserts(self, counts):
        for (dir, count) in counts:
            row = UserINodeSizeRow()
            row.username = dir[6:]
            row.dir = dir
            row.size = count
            self.pgUtil.writeInsert(row)

    def setUserSpaceQuota(self, user, quota):
        if user in self.conf.get(Config.USER_DIR_BLACKLIST).split():
            printError("User %s is in the blacklist. Remove to set quota" % (user))
            return
        self.hdfsUtil.setSpaceQuotas(["/user/%s" % (user)], quota)

    def clearUserSpaceQuota(self, user):
        self.hdfsUtil.clearSpaceQuotas(["/user/%s" % (user)])

    def setUserINodeQuota(self, user, quota):
        if user in self.conf.get(Config.USER_DIR_BLACKLIST).split():
            printError("User %s is in the blacklist. Remove to set quota" % (user))
            return
        self.hdfsUtil.setINodeQuotas(["/user/%s" % (user)], quota)

    def clearUserINodeQuota(self, user):
        self.hdfsUtil.clearINodeQuotas(["/user/%s" % (user)])

    def getUserDirectories(self):
        return self.hdfsUtil.listDirs(['/user'])
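# bytes2human is likewise imported from elsewhere; a rough sketch of the
# assumed contract (accepts an int/long or numeric string, returns a
# human-readable binary-unit string) for reference:
def bytes2human(n):
    n = long(n)
    for (unit, factor) in [('PB', 1 << 50), ('TB', 1 << 40), ('GB', 1 << 30),
                           ('MB', 1 << 20), ('KB', 1 << 10)]:
        if n >= factor:
            return "%.1f%s" % (float(n) / factor, unit)
    return "%dB" % n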
printError("`which hdfs` returned a non-zero exit code. Make sur eyou are using this utility from an HDFS node") sys.exit(1) if getCommandOutput("whoami") != "gpadmin": printError("Please execute this utility as gpadmin") sys.exit(2) ## Report option if sys.argv[1] == "report": parser = OptionParser() parser.add_option("-c", "--config", dest="configFile", help="Configuration file (default phd-metrics.ini)", default="phd-metrics.ini") parser.add_option("-s", "--sqlfile", dest="sqlFile", help="Filename to write SQL statements to (default none)", default=None) conf = Config(parser, sys.argv[2:]) pgutil = PostgresUtil(conf) pgutil.open() HdfsUtil(conf).printReport() HawqUtil(conf).printReport() HiveUtil(conf).printReport() UserUtil(conf).printReport() FsUtil(conf).printReport() pgutil.close() # Local filesystem option elif sys.argv[1] == "fs-util": parser = OptionParser() parser.add_option("-c", "--config", dest="configFile", help="Configuration file (default phd-metrics.ini)", default="phd-metrics.ini") parser.add_option("-a", "--action", dest="action", help="Choose an action: report", default=None)
class HiveUtil:
    def __init__(self, conf):
        self.conf = conf
        self.hdfsUtil = HdfsUtil(conf)
        self.pgUtil = PostgresUtil(conf)

    def printReport(self):
        printInfo("Fetching contents of Hive warehouse")
        hivedbdirs = self.getHiveDatabaseDirectories()
        self.printDatabaseQuotas(hivedbdirs)
        self.printTopKLargestDatabases(hivedbdirs)

    def getHiveDatabaseDirectories(self):
        hivedirs = self.hdfsUtil.listDirs([self.conf.get(Config.HIVE_WAREHOUSE_DIR)])
        retval = []
        for dir in hivedirs:
            if dir.endswith(".db"):
                retval.append(dir)
        return retval

    def printDatabaseQuota(self, db):
        printInfo("Getting quota status for Hive database %s" % (db))
        quotas = self.hdfsUtil.getSpaceQuotas(["%s%s.db" % (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db)])

        if len(quotas) == 0:
            printInfo("No Hive databases found")
            return

        row = namedtuple('Row', ['Database', 'Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR'])
        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            dbName = directory.replace(".db", "").replace(self.conf.get(Config.HIVE_WAREHOUSE_DIR), "")
            quotaHR = bytes2human(quota) if quota != 'none' else quota
            remainingQuotaHR = bytes2human(remainingQuota) if remainingQuota != 'inf' else remainingQuota
            toPrint.append(row(dbName, directory, quota, remainingQuota, quotaHR, remainingQuotaHR))
        pprinttable(toPrint)

    def printDatabaseQuotas(self, hivedbdirs):
        printInfo("Getting quota status for Hive databases")

        hdfsDirs = []
        for dir in hivedbdirs:
            db = self.getDbNameFromPath(dir)
            hdfsDirs.append("%s/%s.db" % (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db))

        quotas = self.hdfsUtil.getSpaceQuotas(hdfsDirs)
        if len(quotas) == 0:
            printInfo("No Hive databases found")
            return

        quotas.sort()
        self.__printDBQuotasInserts(quotas)

        row = namedtuple('Row', ['Database', 'Directory', 'Quota', 'Remaining', 'QuotaHR', 'RemainingHR'])
        toPrint = []
        for (directory, quota, remainingQuota) in quotas:
            dbName = directory.replace(".db", "").replace(self.conf.get(Config.HIVE_WAREHOUSE_DIR), "")
            quotaHR = bytes2human(quota) if quota != 'none' else quota
            remainingQuotaHR = bytes2human(remainingQuota) if remainingQuota != 'inf' else remainingQuota
            toPrint.append(row(dbName, directory, quota, remainingQuota, quotaHR, remainingQuotaHR))
        pprinttable(toPrint)

    def __printDBQuotasInserts(self, quotas):
        for (directory, quota, remainingQuota) in quotas:
            row = HiveDBQuotaRow()
            row.database = directory.replace(".db", "").replace(self.conf.get(Config.HIVE_WAREHOUSE_DIR), "")
            row.dir = directory
            if not quota == 'none':
                row.quota = int(quota)
                row.quotaRemaining = int(remainingQuota)
                row.quotaUsed = row.quota - row.quotaRemaining
            else:
                row.quota = None
                row.quotaRemaining = None
                row.quotaUsed = None
            self.pgUtil.writeInsert(row)

    def getDatabaseSize(self, dbDir):
        sizes = self.hdfsUtil.getDirSizes([dbDir])
        sum = 0
        for (dir, size) in sizes:
            sum += size
        return (dbDir, sum)

    def printTopKLargestDatabases(self, hivedbdirs):
        k = self.conf.get(Config.REPORTER_K)
        printInfo("Getting top %s largest Hive databases" % (k))

        dbSizes = []
        for dbDir in hivedbdirs:
            tDbSize = self.getDatabaseSize(dbDir)
            if not tDbSize is None:
                dbSizes.append(tDbSize)

        if len(dbSizes) == 0:
            printInfo("No Hive databases found in HDFS")
            return

        dbSizes.sort(key=operator.itemgetter(1), reverse=True)
        if len(dbSizes) > k:
            dbSizes = dbSizes[0:k]

        self.__printTopKLargestDatabases(dbSizes)

        # print sizes
        row = namedtuple('Row', ['Database', 'Size', 'SizeHR'])
        toPrint = []
        for (db, size) in dbSizes:
            sizeHR = bytes2human(size)
            toPrint.append(row(db, str(size), str(sizeHR)))
        pprinttable(toPrint)

    def __printTopKLargestDatabases(self, dbSizes):
        for (db, size) in dbSizes:
            row = HiveDBSizeRow()
            row.database = db
            row.size = size
            self.pgUtil.writeInsert(row)

    def setDatabaseQuota(self, db, quota):
        if db in self.conf.get(Config.HIVE_DB_BLACKLIST).split():
            printError("Database %s is in the blacklist. Remove to set quota" % (db))
            return
        printInfo("Setting quota for %s to %s bytes" % (db, quota))
        self.hdfsUtil.setSpaceQuotas([self.getDbPathFromName(db)], quota)

    def clearDatabaseQuota(self, db):
        printInfo("Clearing quota for database %s" % (db))
        self.hdfsUtil.clearSpaceQuotas([self.getDbPathFromName(db)])

    def getDbNameFromPath(self, dir):
        return dir.replace(self.conf.get(Config.HIVE_WAREHOUSE_DIR), "").replace(".db", "")

    def getDbPathFromName(self, db):
        return "%s%s.db" % (self.conf.get(Config.HIVE_WAREHOUSE_DIR), db)
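# A hypothetical usage sketch for HiveUtil's quota management (the database
# name "analytics" and the already-parsed Config instance are assumptions for
# illustration):
#
#   hive = HiveUtil(conf)
#   hive.printDatabaseQuota("analytics")              # report a single database
#   hive.setDatabaseQuota("analytics", 10 * 1024**4)  # 10 TB space quota, in bytes
#   hive.clearDatabaseQuota("analytics")              # remove the quota again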
class HdfsUtil:
    def __init__(self, conf):
        self.conf = conf
        self.pgutil = PostgresUtil(conf)

    def printReport(self):
        self.printFsckSummary()
        self.printNameNodeReport()

    def listDirs(self, directories):
        if len(directories) == 0:
            return []

        dirStr = ""
        for d in directories:
            dirStr = dirStr + "%s " % (d)

        cmd = "hdfs dfs -ls %s | awk '{print $8}'" % (dirStr)
        out = getCommandOutput(cmd)
        if len(out) > 0:
            return out.split("\n")
        else:
            return []

    def getDirSizes(self, directories):
        if len(directories) == 0:
            return []

        cmd = "hdfs dfs -du "
        for directory in directories:
            cmd = cmd + " " + directory
        cmd = cmd + " | awk '{print $1,$2}'"

        out = getCommandOutput(cmd)
        if len(out) == 0:
            return []

        # Returns list of (dir, size) pairs
        retval = []
        for line in out.split('\n'):
            retval.append((line.split(' ')[1], int(line.split(' ')[0])))
        return retval

    def printFsckSummary(self):
        printInfo("Getting FSCK summary")
        # Redirecting syslog to /dev/null
        cmd = "hdfs fsck / 2> /dev/null | grep -v \"^\.\""
        out = getCommandOutput(cmd)
        self.__printFsckInserts(out)
        print out

    def __printFsckInserts(self, lines):
        row = FsckRow()
        for line in lines.split("\n"):
            if "Total size" in line:
                row.totalSize = int(re.sub(r"\D", "", line))
            elif "Total dirs" in line:
                row.totalDirs = int(re.sub(r"\D", "", line))
            elif "Total files" in line:
                row.totalFiles = int(re.sub(r"\D", "", line))
            elif "Total symlinks" in line:
                row.totalSymlinks = int(re.sub(r"\D", "", line))
            elif "Total blocks" in line:
                tmp = line.split('\t')[1]
                row.totalBlocks = int(tmp[0:tmp.index(' ')])
            elif "Minimally replicated blocks" in line:
                tmp = line.split('\t')[1]
                row.minRepBlocks = int(tmp[0:tmp.index(' ')])
            elif "Over-replicated blocks" in line:
                tmp = line.split('\t')[1]
                row.overRepBlocks = int(tmp[0:tmp.index(' ')])
            elif "Under-replicated blocks" in line:
                tmp = line.split('\t')[1]
                row.underRepBlocks = int(tmp[0:tmp.index(' ')])
            elif "Mis-replicated blocks" in line:
                tmp = line.split('\t')[2]
                row.misRepBlocks = int(tmp[0:tmp.index(' ')])
            elif "Corrupt blocks" in line:
                row.corruptBlocks = int(re.sub(r"\D", "", line))
            elif "Missing replicas" in line:
                tmp = line.split('\t')[2]
                row.missReplicas = int(tmp[0:tmp.index(' ')])
            elif "Number of data-nodes" in line:
                row.numDataNodes = int(re.sub(r"\D", "", line))
            elif "Number of racks" in line:
                row.numRacks = int(re.sub(r"\D", "", line))
        self.pgutil.writeInsert(row)

    def printNameNodeReport(self):
        printInfo("Getting NameNode report")
        # Redirecting syslog to /dev/null
        cmd = "hdfs dfsadmin -report 2> /dev/null | grep -v \"^\.\""
        out = getCommandOutput(cmd)
        self.__printNameNodeReportInserts(out)
        print out

    def __printNameNodeReportInserts(self, lines):
        row = None
        alive = True
        hitLive = False
        for line in lines.split("\n"):
            if "Live datanodes:" in line:
                alive = True
                hitLive = True
            elif "Dead datanodes:" in line:
                alive = False

            if not hitLive:
                continue

            if "Name:" in line:
                # Write out the previous row once we hit a new node report
                if not row is None:
                    self.pgutil.writeInsert(row)
                # Make a new row
                row = HdfsReportRow()
                row.name = line[line.index(' ') + 1:line.index('(') - 1]
                row.alive = alive
            elif "Hostname:" in line:
                row.hostname = line[line.index(' ') + 1:]
            elif "Rack:" in line:
                row.rack = line[line.index(' ') + 1:]
            elif "Decommission Status :" in line:
                row.decommission_status = line.split(' ')[3]
            elif "Configured Capacity:" in line:
                row.conf_capacity = int(line.split(' ')[2])
            elif "DFS Used:" in line[0:9]:
                row.dfs_used = int(line.split(' ')[2])
            elif "Non DFS Used:" in line:
                row.non_dfs_used = int(line.split(' ')[3])
            elif "DFS Remaining:" in line:
                row.dfs_remaining = int(line.split(' ')[2])
            elif "DFS Used%:" in line:
                row.dfs_used_perc = float(line.split(' ')[2][0:len(line.split(' ')[2]) - 1])
            elif "DFS Remaining%:" in line:
                row.dfs_remaining_perc = float(line.split(' ')[2][0:len(line.split(' ')[2]) - 1])
            elif "Last contact:" in line:
                row.last_contact = line[14:]

        # Write out the last row
        if not row is None:
            self.pgutil.writeInsert(row)

    def getINodeCounts(self, directories):
        if len(directories) == 0:
            return []

        retval = []
        for directory in directories:
            # Redirecting syslog to /dev/null
            cmd = "hdfs fsck %s 2> /dev/null | grep Total | egrep \"Total dirs|Total files|Total blocks\"" % (directory)

            iNodeCount = 0
            for line in getCommandOutput(cmd).split('\n'):
                if 'dirs' in line:
                    iNodeCount += int(line.split('\t')[1])
                if 'files' in line:
                    iNodeCount += int(line.split('\t')[1])
                if 'blocks' in line:
                    # The block count is followed by text, e.g. "28 (avg. block size ...)",
                    # so take only the leading number
                    iNodeCount += int(line.split('\t')[1].split(' ')[0])

            retval.append((directory, iNodeCount))
        return retval

    def getSpaceQuotas(self, directories):
        if len(directories) == 0:
            return []

        cmd = "hdfs dfs -count -q"
        for directory in directories:
            cmd = cmd + " " + directory

        try:
            quotas = getCommandOutput(cmd).split("\n")
        except subprocess.CalledProcessError:
            printError("Directories not found: %s" % (cmd))
            sys.exit(1)

        # Returns list of (directory, quota, remainingQuota)
        retval = []
        for quota in quotas:
            retval.append((quota.split()[7], quota.split()[2], quota.split()[3]))
        return retval

    def setSpaceQuotas(self, directories, quota):
        if len(directories) == 0:
            return

        cmd = "hdfs dfsadmin -setSpaceQuota %s" % (quota)
        for directory in directories:
            cmd = cmd + " " + directory

        try:
            getCommandOutput(cmd)
        except subprocess.CalledProcessError:
            printError("Directories not found: %s" % (cmd))
            sys.exit(1)

    def clearSpaceQuotas(self, directories):
        if len(directories) == 0:
            return

        cmd = "hdfs dfsadmin -clrSpaceQuota"
        for directory in directories:
            cmd = cmd + " " + directory

        try:
            getCommandOutput(cmd)
        except subprocess.CalledProcessError:
            printError("Directories not found: %s" % (cmd))
            sys.exit(1)

    def getINodeQuotas(self, directories):
        if len(directories) == 0:
            return []

        cmd = "hdfs dfs -count -q"
        for directory in directories:
            cmd = cmd + " " + directory

        try:
            quotas = getCommandOutput(cmd).split("\n")
        except subprocess.CalledProcessError:
            printError("Directories not found: %s" % (cmd))
            sys.exit(1)

        retval = []
        for quota in quotas:
            # TODO get the proper indexes from the count
            retval.append((quota.split()[7], quota.split()[0], quota.split()[1]))
        return retval

    def setINodeQuotas(self, directories, quota):
        if len(directories) == 0:
            return

        cmd = "hdfs dfsadmin -setQuota %s" % (quota)
        for directory in directories:
            cmd = cmd + " " + directory

        try:
            getCommandOutput(cmd)
        except subprocess.CalledProcessError:
            printError("Directories not found: %s" % (cmd))
            sys.exit(1)

    def clearINodeQuotas(self, directories):
        if len(directories) == 0:
            return

        cmd = "hdfs dfsadmin -clrQuota"
        for directory in directories:
            cmd = cmd + " " + directory

        try:
            getCommandOutput(cmd)
        except subprocess.CalledProcessError:
            printError("Directories not found: %s" % (cmd))
            sys.exit(1)
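# getCommandOutput is the shell helper every class above relies on; it is not
# defined in this file. A minimal sketch of the assumed contract: run the
# command through a shell, strip the trailing newline, and let
# subprocess.CalledProcessError propagate on a non-zero exit code (matching
# the try/except blocks above).
import subprocess

def getCommandOutput(cmd):
    return subprocess.check_output(cmd, shell=True).rstrip("\n")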