# Assumes `settings` (command/constant definitions) and `SSHClient` (SSH execution
# wrapper) are provided by the surrounding project.
class StorageConfiguration(object):
    ''' Storage configuration '''

    def __init__(self, job_id, all_hosts):
        ''' Constructor '''
        self.__job_id = job_id
        self.__all_hosts = all_hosts
        self.__sshClient = SSHClient()

    # Broken for VMs!
    def configure_and_mount_nfs(self):
        ''' Configures and mounts the NFS share on all hosts '''
        # To access the G5K mount point from VMs, align the hadoop user's UID
        uid_set_command = "usermod -u" + settings.WHITESPACE + settings.G5K_USERID + settings.WHITESPACE \
            + settings.SYSTEM_HADOOP_USER_NAME
        # Mount the NFS directory exported for this job
        mount_command = "mount -t nfs" + settings.WHITESPACE + settings.NFS_STORAGE_SERVER + ":/data/" \
            + settings.G5K_USERNAME + "_" + self.__job_id + settings.WHITESPACE + settings.NFS_MOUNT_DIRECTORY
        # Chown the hadoop directory, fix the UID, then remount NFS
        commands = [settings.HADOOP_CHOWN, uid_set_command, settings.SYSTEM_UMOUNT_NFS, mount_command]
        self.__sshClient.run_same_commands_on_hosts(self.__all_hosts, commands,
                                                    settings.SYSTEM_ROOT_USER_NAME,
                                                    settings.SYSTEM_ROOT_USER_PASSWORD)
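A minimal usage sketch, assuming the project-level `settings` and `SSHClient` above; the job id and host addresses are made-up illustration values:

# Hedged usage sketch: job id and host list are hypothetical.
hosts = ["10.0.0.11", "10.0.0.12", "10.0.0.13"]
storage = StorageConfiguration("42424", hosts)
# Runs the chown/usermod/umount/mount commands on every host as root over SSH.
storage.configure_and_mount_nfs()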
# Assumes `log`, `constants`, and `SSHClient` come from the enclosing module,
# and that `os` and `sys` (standard library) are imported there.
def copy_images(self):
    """ copy nxos bin and issu images to /tftpboot/sanity-image """
    try:
        log.debug("Copying images for job")
        # check download_image
        ssh_client = SSHClient(constants.TFTP_SERVER, constants.TFTP_USER, constants.TFTP_PASS)
        if self.job["download_image"] != "":
            log.debug("checking for image in tftpboot since download_image is set")
            tftp_path = "{0}/{1}".format(constants.TFTP_SAVE_PATH,
                                         self.job["download_image"].strip("/"))
            # TODO: check in tftp server
            # if not os.path.exists(tftp_path):
            if not ssh_client.check_path_exists(tftp_path):
                log.debug("download_image path does not exist: {0}".format(
                    self.job["download_image"]))
                sys.exit(1)
            self.update_job_details(constants.FINAL_IMAGE_PATH, tftp_path)
        else:
            # copy nxos bin to /tftpboot
            log.debug("checking to copy image to tftpboot")
            nxos_bin = self.get_image_path()
            if nxos_bin is None:
                log.debug("Couldn't derive nxos image path")
                sys.exit(1)
            # copy nxos bin
            dest_image_name = "{0}-{1}-{2}".format(self.job["submitter"],
                                                   self.job["submit_id"],
                                                   os.path.basename(nxos_bin))
            dest_path = "{0}/{1}".format(constants.TFTP_SAVE_PATH, dest_image_name)
            # scp the image to tftp server
            if not ssh_client.scp_img(nxos_bin, dest_path):
                log.error("setup_data failed in copy_images: scp nxos to tftp failed")
                sys.exit(-1)
            # shutil.copyfile(nxos_bin, dest_path)
            self.update_job_details(constants.FINAL_IMAGE_PATH, dest_path)
            # copy issu nxos bin to /tftpboot
            issu_bin = self.get_issu_image_path()
            if issu_bin is None:
                return
            dest_issu_name = "{0}-{1}-{2}".format(self.job["submitter"],
                                                  self.job["submit_id"],
                                                  os.path.basename(issu_bin))
            dest_issu_path = "{0}/{1}".format(constants.TFTP_SAVE_PATH, dest_issu_name)
            # scp the image to tftp server
            if not ssh_client.scp_img(issu_bin, dest_issu_path):
                log.error("setup_data failed in copy_images: scp issu to tftp failed")
                sys.exit(-1)
            self.update_job_details(constants.FINAL_ISSU_PATH, dest_issu_path)
    except Exception as e:
        log.error("setup_data failed in copy_images: " + repr(e))
        sys.exit(-1)
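The `SSHClient` helpers used above (`check_path_exists`, `scp_img`) are project-specific; a hedged sketch of how such a wrapper could be built on paramiko follows. The class and method names only mirror the calls above, the real implementation may differ:

# Hedged sketch of a paramiko-based SSH/SFTP wrapper; an assumption, not the
# project's actual SSHClient.
import paramiko


class SSHClient(object):
    def __init__(self, host, user, password):
        self._client = paramiko.SSHClient()
        self._client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self._client.connect(host, username=user, password=password)

    def check_path_exists(self, remote_path):
        # True when the remote path can be stat'ed over SFTP.
        sftp = self._client.open_sftp()
        try:
            sftp.stat(remote_path)
            return True
        except IOError:
            return False
        finally:
            sftp.close()

    def scp_img(self, local_path, remote_path):
        # Uploads the image over SFTP; returns False on any failure.
        sftp = self._client.open_sftp()
        try:
            sftp.put(local_path, remote_path)
            return True
        except (IOError, OSError):
            return False
        finally:
            sftp.close()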
# Assumes `settings`, `logger`, and `SSHClient` are provided by the surrounding
# project. Python 2 is assumed (raw_input prompts).
class HadoopBenchmark(object):
    ''' Hadoop MapReduce benchmarks '''

    def __init__(self, master):
        ''' Constructor '''
        self.__master = master
        self.__sshClient = SSHClient()

    def __read_maps_reduces(self):
        number_of_maps = raw_input("Number of maps:")
        number_of_reduces = raw_input("Number of reduces:")
        return number_of_maps, number_of_reduces

    def __read_input_census_and_wiki(self):
        ''' Reads the input '''
        test_number = raw_input("Test number (0 -> read=write, 1 -> read>write, 2 -> read<write):")
        number_of_maps, number_of_reduces = self.__read_maps_reduces()
        return test_number, number_of_maps, number_of_reduces

    def dfsio(self):
        number_of_files = raw_input("Number of files:")
        file_size = raw_input("File size: ")
        logger.info("Write test started")
        write_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_TEST_BENCHMARK \
            + "TestDFSIO -write -nrFiles " + number_of_files + " -fileSize " + file_size
        logger.info("Read test started")
        read_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_TEST_BENCHMARK \
            + "TestDFSIO -read -nrFiles " + number_of_files + " -fileSize " + file_size
        logger.info("Cleaning")
        clean_command = settings.HADOOP_START_TEST_BENCHMARK + "TestDFSIO -clean"
        return [write_command, read_command, clean_command]

    def dfsthroughput(self):
        start_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_TEST_BENCHMARK + "dfsthroughput"
        clean_command = settings.HADOOP_START_TEST_BENCHMARK + "dfsthroughput -clean"
        return [start_command, clean_command]

    def mrbench(self):
        number_of_runs = raw_input("Number of runs:")
        command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_TEST_BENCHMARK \
            + "mrbench -numRuns " + number_of_runs
        return [command]

    def nnbench(self):
        logger.info("nnbench")
        operation = raw_input("Operation (create_write/open_read/rename/delete):")
        number_of_maps, number_of_reduces = self.__read_maps_reduces()
        number_of_files = raw_input("Number of files:")
        command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_TEST_BENCHMARK \
            + "nnbench -operation " + operation + " -maps " + number_of_maps \
            + " -reduces " + number_of_reduces + " -blockSize 1 -bytesToWrite 0 -numberOfFiles " \
            + number_of_files + " -replicationFactorPerFile 3 -readFileAfterOpen true"
        return [command]

    def pi(self):
        number_of_maps = raw_input("Number of maps:")
        number_of_samples = raw_input("Number of samples:")
        command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_EXAMPLE_BENCHMARK + "pi " \
            + number_of_maps + settings.WHITESPACE + number_of_samples
        return [command]

    def __get_number_of_maps_reduces_parameter(self):
        ''' Computes the number-of-maps/reduces parameter string '''
        number_of_maps, number_of_reduces = self.__read_maps_reduces()
        return settings.HADOOP_SET_NUMBER_OF_MAPS + number_of_maps + settings.WHITESPACE \
            + settings.HADOOP_SET_NUMBER_OF_REDUCES + number_of_reduces + settings.WHITESPACE

    def teragen(self):
        number_of_rows = raw_input("Number of 100 byte rows:")
        number_of_maps = raw_input("Number of maps:")
        teragen_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_EXAMPLE_BENCHMARK \
            + "teragen" + settings.WHITESPACE + settings.HADOOP_SET_NUMBER_OF_MAPS + number_of_maps \
            + settings.WHITESPACE + number_of_rows + settings.WHITESPACE + settings.TERAGEN_LOCATION_INPUT
        return [settings.TERAGEN_CLEAN_INPUT, teragen_command]

    def terasort(self):
        maps_reduces_parameter = self.__get_number_of_maps_reduces_parameter()
        terasort_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_EXAMPLE_BENCHMARK \
            + "terasort" + settings.WHITESPACE + maps_reduces_parameter \
            + settings.WHITESPACE + settings.TERAGEN_LOCATION_INPUT \
            + settings.WHITESPACE + settings.TERAGEN_LOCATION_OUTPUT
        return [settings.TERAGEN_CLEAN_OUTPUT, terasort_command]

    def teravalidate(self):
        maps_reduces_parameter = self.__get_number_of_maps_reduces_parameter()
        teravalidate_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_EXAMPLE_BENCHMARK \
            + "teravalidate" + settings.WHITESPACE + maps_reduces_parameter \
            + settings.WHITESPACE + settings.TERAGEN_LOCATION_OUTPUT \
            + settings.WHITESPACE + settings.TERAGEN_LOCATION_VALIDATE
        return [settings.TERAGEN_CLEAN_VALIDATE, teravalidate_command]

    def census_data(self):
        data_source_directory = raw_input("Input data file/directory on NAS:")
        mkdir_command = settings.HADOOP_MKDIR_COMMAND + settings.CENSUSPROC_LOCATION_INPUT
        move_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_COPY_FROM_LOCAL \
            + data_source_directory + settings.WHITESPACE + settings.CENSUSPROC_LOCATION_INPUT
        return [settings.CENSUSPROC_CLEAN_INPUT, mkdir_command, move_command]

    def census_bench(self):
        test_number, number_of_maps, number_of_reducers = self.__read_input_census_and_wiki()
        start_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_COMMAND + settings.CENSUSPROC_LOCATION_BIN \
            + settings.WHITESPACE + test_number + settings.WHITESPACE + number_of_maps + settings.WHITESPACE \
            + number_of_reducers + settings.WHITESPACE + settings.CENSUSPROC_LOCATION_INPUT + settings.WHITESPACE \
            + settings.CENSUSPROC_LOCATION_OUTPUT
        return [settings.CENSUSPROC_CLEAN_OUTPUT, start_command]

    def wikipedia_data(self):
        data_source_directory = raw_input("Input data file/directory on NAS: ")
        mkdir_command = settings.HADOOP_MKDIR_COMMAND + settings.WIKIPROC_LOCATION_INPUT
        move_command = settings.SYSTEM_TIME_COMMAND + settings.WHITESPACE + settings.HADOOP_COPY_FROM_LOCAL \
            + data_source_directory + settings.WHITESPACE + settings.WIKIPROC_LOCATION_INPUT
        return [settings.WIKIPROC_CLEAN_INPUT, mkdir_command, move_command]

    def wikipedia_bench(self):
        test_number, number_of_maps, number_of_reducers = self.__read_input_census_and_wiki()
        start_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_COMMAND + settings.WIKIPROC_LOCATION_BIN \
            + settings.WHITESPACE + test_number + settings.WHITESPACE + number_of_maps + settings.WHITESPACE \
            + number_of_reducers + settings.WHITESPACE + settings.WIKIPROC_LOCATION_INPUT + settings.WHITESPACE \
            + settings.WIKIPROC_LOCATION_OUTPUT
        return [settings.WIKIPROC_CLEAN_OUTPUT, start_command]

    def run_benchmark(self, name):
        logger.debug("Starting %s benchmark on master node: %s" % (name, self.__master))
        benchmarks = {
            "dfsio": self.dfsio,
            "dfsthroughput": self.dfsthroughput,
            "mrbench": self.mrbench,
            "nnbench": self.nnbench,
            "pi": self.pi,
            "teragen": self.teragen,
            "terasort": self.terasort,
            "teravalidate": self.teravalidate,
            "censusdata": self.census_data,
            "censusbench": self.census_bench,
            "wikidata": self.wikipedia_data,
            "wikibench": self.wikipedia_bench,
        }
        benchmark = benchmarks[name]
        commands = benchmark()
        self.__sshClient.run_commands_on_host(self.__master, commands,
                                              settings.SYSTEM_HADOOP_USER_NAME,
                                              settings.SYSTEM_HADOOP_USER_PASSWORD)
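A minimal usage sketch, assuming the master node is reachable over SSH; the address and benchmark name are illustrative only:

# Hedged usage sketch: the master address is a made-up illustration value.
benchmark = HadoopBenchmark("10.0.0.11")
# Prompts interactively for maps/reduces, then runs terasort on the master.
benchmark.run_benchmark("terasort")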
# Assumes `settings`, `logger`, `remove_dots`, and `SSHClient` come from the
# surrounding project; `time` is standard library. Python 2 is assumed.
import time


class HadoopConfigureNormal(object):
    """ Hadoop related logic. """

    def __init__(self, master_host, slave_hosts, storage_mode):
        self.master_host = master_host
        self.slave_hosts = slave_hosts
        self.__storage_mode = storage_mode
        self.__hosts = [master_host] + slave_hosts
        self.sshClient = SSHClient()

    def __configure_master_host(self):
        # Clear master/slave files and add the master host address to the hadoop masters file
        commands = [
            settings.HADOOP_CLEAN_SLAVES_FILE,
            "echo " + settings.SYSTEM_HOSTNAME_PREFIX + remove_dots(self.master_host)
            + " > " + settings.HADOOP_MASTER_FILE,
        ]
        # Add the slave host names to the hadoop slaves file
        for host in self.slave_hosts:
            commands.append(
                "echo " + settings.SYSTEM_HOSTNAME_PREFIX + remove_dots(host)
                + " >> " + settings.HADOOP_SLAVES_FILE
            )
        if self.__storage_mode == "nfs":
            commands.append(settings.HADOOP_START_MAPRED)
        elif self.__storage_mode == "hdfs":
            commands.append(settings.HADOOP_FORMAT_DFS)
            commands.append(settings.HADOOP_START_ALL_SERVICES)
        # Run the commands on the master host
        self.sshClient.run_commands_on_host(
            self.master_host, commands,
            settings.SYSTEM_HADOOP_USER_NAME, settings.SYSTEM_HADOOP_USER_PASSWORD
        )
        print "Waiting %s seconds for nodes to become ready" % (settings.HADOOP_WAIT_TIME)
        time.sleep(settings.HADOOP_WAIT_TIME)

    def __generate_hosts_update_command(self):
        """ Generates the /etc/hosts update commands """
        hosts_file_update = [settings.SYSTEM_CLEAN_HOSTS_FILE]
        for host in self.__hosts:
            hosts_file_update.append(
                "echo '" + host + settings.WHITESPACE + settings.SYSTEM_HOSTNAME_PREFIX
                + remove_dots(host) + "' >> /etc/hosts"
            )
        return hosts_file_update

    def prepare_environment(self):
        """ Prepares the system environment (updates the hosts list, sets the hostname,
        applies the urandom and ulimit fixes) """
        hosts_file_update_command = self.__generate_hosts_update_command()
        hosts_dict = {}
        for host in self.__hosts:
            commands = [settings.SYSTEM_URANDOM_FIX, settings.SYSTEM_ULIMIT_FIX]
            commands.append("echo " + settings.SYSTEM_HOSTNAME_PREFIX + remove_dots(host) + " > /etc/hostname")
            commands.append("hostname -v " + settings.SYSTEM_HOSTNAME_PREFIX + remove_dots(host))
            commands.extend(hosts_file_update_command)
            hosts_dict.update({host: commands})
        self.sshClient.run_distinct_commands_on_hosts(
            hosts_dict, settings.SYSTEM_ROOT_USER_NAME, settings.SYSTEM_ROOT_USER_PASSWORD
        )

    def start(self):
        self.prepare_environment()
        if self.__storage_mode == "nfs":
            self.configure_slave_hosts_nfs()
        elif self.__storage_mode == "hdfs":
            self.configure_slave_hosts_hdfs()
        self.__configure_master_host()

    def configure_slave_hosts_nfs(self):
        logger.info("Preparing the following VMs with NFS: %s" % self.__hosts)
        commands = [
            settings.SYSTEM_KILL_JAVA,
            settings.SYSTEM_CLEAN_TMP,
            settings.HADOOP_DISABLE_HOST_KEY_CHECK,
            settings.HADOOP_UPDATE_ENV,
        ]
        commands.append("""cat >""" + settings.HADOOP_INSTALL_DIR + """/conf/mapred-site.xml <<EOF
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>mapred.job.tracker</name>
    <value>""" + settings.SYSTEM_HOSTNAME_PREFIX + remove_dots(self.master_host) + """:8021</value>
  </property>
  <property>
    <name>mapred.child.java.opts</name>
    <value>-Xmx""" + settings.HADOOP_XMX_SIZE + """m -Xmn""" + settings.HADOOP_XMN_SIZE + """m</value>
  </property>
  <property>
    <name>mapred.tasktracker.map.tasks.maximum</name>
    <value>""" + settings.HADOOP_MAX_NUMBER_OF_MAP_SLOTS + """</value>
  </property>
  <property>
    <name>mapred.tasktracker.reduce.tasks.maximum</name>
    <value>""" + settings.HADOOP_MAX_NUMBER_OF_REDUCE_SLOTS + """</value>
  </property>
  <property>
    <name>mapred.local.dir</name>
    <value>""" + settings.HADOOP_MAPRED_LOCAL_DIR + """</value>
  </property>
  <property>
    <name>mapred.system.dir</name>
    <value>""" + settings.HADOOP_MAPRED_SYSTEM_DIR + """</value>
  </property>
  <property>
    <name>mapred.temp.dir</name>
    <value>""" + settings.HADOOP_MAPRED_TEMP_DIR + """</value>
  </property>
</configuration>
EOF""")
        commands.append("""cat >""" + settings.HADOOP_INSTALL_DIR + """/conf/core-site.xml <<EOF
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>fs.default.name</name>
    <value>file:///</value>
  </property>
  <property>
    <name>io.file.buffer.size</name>
    <value>""" + settings.HADOOP_IO_FILE_BUFFER_SIZE + """</value>
  </property>
</configuration>
EOF""")
        self.sshClient.run_same_commands_on_hosts(
            self.__hosts, commands,
            settings.SYSTEM_HADOOP_USER_NAME, settings.SYSTEM_HADOOP_USER_PASSWORD
        )

    def configure_slave_hosts_hdfs(self):
        logger.info("Preparing the following VMs with HDFS: %s" % self.__hosts)
        commands = [
            settings.SYSTEM_KILL_JAVA,
            settings.SYSTEM_CLEAN_TMP,
            settings.HADOOP_DISABLE_HOST_KEY_CHECK,
            settings.HADOOP_UPDATE_ENV,
        ]
        commands.append("""cat >""" + settings.HADOOP_INSTALL_DIR + """/conf/hdfs-site.xml <<EOF
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>dfs.block.size</name>
    <value>""" + settings.HADOOP_BLOCK_SIZE + """</value>
    <final>true</final>
  </property>
  <property>
    <name>dfs.datanode.max.xcievers</name>
    <value>""" + settings.HADOOP_MAX_XCIEVERS + """</value>
  </property>
  <property>
    <name>dfs.replication</name>
    <value>""" + settings.HADOOP_RELICATION_FACTOR + """</value>
    <final>true</final>
  </property>
</configuration>
EOF""")
        commands.append("""cat >""" + settings.HADOOP_INSTALL_DIR + """/conf/mapred-site.xml <<EOF
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>mapred.job.tracker</name>
    <value>""" + settings.SYSTEM_HOSTNAME_PREFIX + remove_dots(self.master_host) + """:8021</value>
  </property>
  <property>
    <name>mapred.child.java.opts</name>
    <value>-Xmx""" + settings.HADOOP_XMX_SIZE + """m -Xmn""" + settings.HADOOP_XMN_SIZE + """m</value>
  </property>
  <property>
    <name>mapred.tasktracker.map.tasks.maximum</name>
    <value>""" + settings.HADOOP_MAX_NUMBER_OF_MAP_SLOTS + """</value>
  </property>
  <property>
    <name>mapred.tasktracker.reduce.tasks.maximum</name>
    <value>""" + settings.HADOOP_MAX_NUMBER_OF_REDUCE_SLOTS + """</value>
  </property>
</configuration>
EOF""")
        commands.append("""cat >""" + settings.HADOOP_INSTALL_DIR + """/conf/core-site.xml <<EOF
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://""" + settings.SYSTEM_HOSTNAME_PREFIX + remove_dots(self.master_host) + """</value>
  </property>
  <property>
    <name>io.file.buffer.size</name>
    <value>""" + settings.HADOOP_IO_FILE_BUFFER_SIZE + """</value>
  </property>
</configuration>
EOF""")
        self.sshClient.run_same_commands_on_hosts(
            self.__hosts, commands,
            settings.SYSTEM_HADOOP_USER_NAME, settings.SYSTEM_HADOOP_USER_PASSWORD
        )
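A minimal usage sketch, assuming `settings` carries the cluster constants referenced above; the host addresses are made-up illustration values:

# Hedged usage sketch: master/slave addresses are hypothetical.
master = "10.0.0.11"
slaves = ["10.0.0.12", "10.0.0.13"]
cluster = HadoopConfigureNormal(master, slaves, "hdfs")
# Prepares /etc/hosts and hostnames, writes the Hadoop XML configs on every
# node, then formats HDFS and starts all services from the master.
cluster.start()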