class StorageConfiguration(object):
    '''
    Storage configuration
    '''

    def __init__(self, job_id, all_hosts):
        '''
        Constructor
        '''
        self.__job_id = job_id
        self.__all_hosts = all_hosts
        self.__sshClient = SSHClient()
        
    # Broken for VMs!
    def configure_and_mount_nfs(self):
        '''
        Configures and mounts the NFS storage on all hosts
        '''

        # To access G5K mount point from VMs
        uid_set_command = "usermod -u" + settings.WHITESPACE + settings.G5K_USERID + settings.WHITESPACE \
                          + settings.SYSTEM_HADOOP_USER_NAME
                    
        # To mount NFS directory
        mount_command = "mount -t nfs" + settings.WHITESPACE + settings.NFS_STORAGE_SERVER + ":/data/" \
                        + settings.G5K_USERNAME + "_" + self.__job_id + settings.WHITESPACE + settings.NFS_MOUNT_DIRECTORY
                        
        # Chown hadoop directory
        commands = [settings.HADOOP_CHOWN, uid_set_command, settings.SYSTEM_UMOUNT_NFS, mount_command]
        
        self.__sshClient.run_same_commands_on_hosts(self.__all_hosts, 
                                                    commands, 
                                                    settings.SYSTEM_ROOT_USER_NAME, 
                                                    settings.SYSTEM_ROOT_USER_PASSWORD)
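# A minimal usage sketch for StorageConfiguration. It assumes the settings module and
# the SSHClient wrapper used above are importable; the job id and host addresses below
# are hypothetical placeholders.
if __name__ == "__main__":
    all_hosts = ["10.158.0.1", "10.158.0.2"]              # hypothetical VM addresses
    storage = StorageConfiguration("424242", all_hosts)   # G5K job id as a string
    storage.configure_and_mount_nfs()                     # chown, usermod, umount and mount NFS on every host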
    def copy_images(self):
        """ copy nxos bin and issu images to /tftpboot/sanity-image """
        try:
            log.debug("Copying images for job")
            # check download_image
            ssh_client = SSHClient(constants.TFTP_SERVER, constants.TFTP_USER, constants.TFTP_PASS)
            if self.job["download_image"] != "":
                log.debug("checking for image in tftpboot since download_image is set")
                tftp_path = "{0}/{1}".format(constants.TFTP_SAVE_PATH,
                                             self.job["download_image"].strip("/"))
                # TODO: check in tftp server
                # if not os.path.exists(tftp_path):
                if not ssh_client.check_path_exists(tftp_path):
                    log.debug("download_image path does not exist: {0}".format(
                        self.job["download_image"]))
                    sys.exit(1)
                self.update_job_details(constants.FINAL_IMAGE_PATH, tftp_path)

            else:
                # copy nxos bin to /tftpboot
                log.debug("checking to copy image to tftpboot")
                nxos_bin = self.get_image_path()
                if nxos_bin is None:
                    log.debug("Couldn't derive nxos image path")
                    sys.exit(1)
                # copy nxos bin
                dest_image_name = "{0}-{1}-{2}".format(self.job["submitter"], self.job["submit_id"],
                                                       os.path.basename(nxos_bin))
                dest_path = "{0}/{1}".format(constants.TFTP_SAVE_PATH, dest_image_name)

                # scp the image to tftp server
                if not ssh_client.scp_img(nxos_bin, dest_path):
                    log.error("setup_data failed in copy_images: scp issu to tftp failed")
                    sys.exit(-1)
                # shutil.copyfile(nxos_bin, dest_path)
                self.update_job_details(constants.FINAL_IMAGE_PATH, dest_path)

                # copy issu nxos bin to /tftpboot
                issu_bin = self.get_issu_image_path()
                if issu_bin is None:
                    return
                dest_issu_name = "{0}-{1}-{2}".format(self.job["submitter"], self.job["submit_id"],
                                                       os.path.basename(issu_bin))
                dest_issu_path = "{0}/{1}".format(constants.TFTP_SAVE_PATH, dest_issu_name)

                # scp the image to tftp server
                if not ssh_client.scp_img(issu_bin, dest_issu_path):
                    log.error("setup_data failed in copy_images: scp issu to tftp failed")
                    sys.exit(-1)

                self.update_job_details(constants.FINAL_ISSU_PATH, dest_issu_path)
        except Exception as e:
            log.error("setup_data failed in copy_images:" + repr(e))
            sys.exit(-1)
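# The SSHClient used above is an in-house wrapper that is not shown here. A minimal
# paramiko-based sketch of the two helpers copy_images relies on (check_path_exists
# and scp_img) could look like the class below; the name and signatures are assumptions
# taken from the calls above, not the real implementation.
import paramiko


class SSHClientSketch(object):
    def __init__(self, host, user, password):
        self._client = paramiko.SSHClient()
        self._client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self._client.connect(host, username=user, password=password)

    def check_path_exists(self, remote_path):
        # 'test -e' exits 0 when the path exists on the remote host
        _, stdout, _ = self._client.exec_command("test -e '%s'" % remote_path)
        return stdout.channel.recv_exit_status() == 0

    def scp_img(self, local_path, remote_path):
        # copy the image over SFTP; return False on any transfer error
        try:
            sftp = self._client.open_sftp()
            sftp.put(local_path, remote_path)
            sftp.close()
            return True
        except Exception:
            return False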
class HadoopBenchmark(object):
    '''
    Hadoop MapReduce benchmarks
    '''
    def __init__(self, master):
        '''
        Constructor
        '''
        self.__master = master
        self.__sshClient = SSHClient()

    def __read_maps_reduces(self):
        number_of_maps = raw_input("Number of maps:")
        number_of_reduces = raw_input("Numer of reduces:")
        return number_of_maps, number_of_reduces

    def __read_input_census_and_wiki(self):
        '''
        Reads the input
        '''
        test_number = raw_input(
            "Test number(0 -> read=write, 1 -> read>write, 2 -> read<write):")
        number_of_maps, number_of_reduces = self.__read_maps_reduces()
        return test_number, number_of_maps, number_of_reduces

    def dfsio(self):
        number_of_files = raw_input("Number of files:")
        file_size = raw_input("File size: ")
        logger.info("Write test started")
        write_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_TEST_BENCHMARK \
                        + "TestDFSIO -write -nrFiles " + number_of_files + " -fileSize " + file_size

        logger.info("Read test started")
        read_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_TEST_BENCHMARK \
                       + "TestDFSIO -read -nrFiles " + number_of_files + " -fileSize " + file_size

        logger.info("Cleaning")
        clean_command = settings.HADOOP_START_TEST_BENCHMARK + "TestDFSIO -clean"
        return [write_command, read_command, clean_command]

    def dfsthroughput(self):
        start_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_TEST_BENCHMARK + "dfsthroughput"
        clean_command = settings.HADOOP_START_TEST_BENCHMARK + "dfsthroughput -clean"
        return [start_command, clean_command]

    def mrbench(self):
        number_of_runs = raw_input("Number of runs:")
        command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_TEST_BENCHMARK \
                  + "mrbench -numRuns " + number_of_runs
        return [command]

    def nnbench(self):
        logger.info("nnbench")
        operation = raw_input(
            "Operation (create_write/open_read/rename/delete):")
        number_of_maps, number_of_reduces = self.__read_maps_reduces()
        number_of_files = raw_input("Number of files:")

        command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_TEST_BENCHMARK + "nnbench -operation " + operation + " -maps " \
                  + number_of_maps + " -reduces " + number_of_reduces + " -blockSize 1 -bytesToWrite 0 -numberOfFiles "\
                  + number_of_files + " -replicationFactorPerFile 3 -readFileAfterOpen true"
        return [command]

    def pi(self):
        number_of_maps = raw_input("Number of maps:")
        number_of_samples = raw_input("Numer of samples:")
        command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_EXAMPLE_BENCHMARK + "pi " + number_of_maps + \
                  settings.WHITESPACE + number_of_samples
        return [command]

    def __get_number_of_maps_reduces_parameter(self):
        '''
        Builds the number-of-maps / number-of-reduces parameter string
        '''
        number_of_maps, number_of_reduces = self.__read_maps_reduces()
        return settings.HADOOP_SET_NUMBER_OF_MAPS + number_of_maps + settings.WHITESPACE \
               + settings.HADOOP_SET_NUMBER_OF_REDUCES + number_of_reduces + settings.WHITESPACE

    def teragen(self):
        number_of_rows = raw_input("Number of 100 byte rows:")
        number_of_maps = raw_input("Number of maps:")
        teragen_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_EXAMPLE_BENCHMARK \
                          + "teragen" + settings.WHITESPACE +  settings.HADOOP_SET_NUMBER_OF_MAPS + number_of_maps \
                          + settings.WHITESPACE + number_of_rows + settings.WHITESPACE + settings.TERAGEN_LOCATION_INPUT
        return [settings.TERAGEN_CLEAN_INPUT, teragen_command]

    def terasort(self):
        maps_reduces_parameter = self.__get_number_of_maps_reduces_parameter()
        terasort_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_EXAMPLE_BENCHMARK \
                          + "terasort" + settings.WHITESPACE + maps_reduces_parameter \
                          + settings.WHITESPACE + settings.TERAGEN_LOCATION_INPUT \
                          + settings.WHITESPACE + settings.TERAGEN_LOCATION_OUTPUT
        return [settings.TERAGEN_CLEAN_OUTPUT, terasort_command]

    def teravalidate(self):
        maps_reduces_parameter = self.__get_number_of_maps_reduces_parameter()
        teravalidate_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_EXAMPLE_BENCHMARK \
                               + "teravalidate" + settings.WHITESPACE + maps_reduces_parameter \
                               + settings.WHITESPACE + settings.TERAGEN_LOCATION_OUTPUT \
                               + settings.WHITESPACE + settings.TERAGEN_LOCATION_VALIDATE
        return [settings.TERAGEN_CLEAN_VALIDATE, teravalidate_command]

    def census_data(self):
        data_source_directory = raw_input("Input data file/directory on NAS:")
        mkdir_command = settings.HADOOP_MKDIR_COMMAND + settings.CENSUSPROC_LOCATION_INPUT
        move_command = settings.SYSTEM_TIME_COMMAND  + settings.HADOOP_COPY_FROM_LOCAL \
                       + data_source_directory + settings.WHITESPACE + settings.CENSUSPROC_LOCATION_INPUT
        return [settings.CENSUSPROC_CLEAN_INPUT, mkdir_command, move_command]

    def census_bench(self):
        test_number, number_of_maps, number_of_reducers = self.__read_input_census_and_wiki()
        start_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_COMMAND + settings.CENSUSPROC_LOCATION_BIN \
                        + settings.WHITESPACE + test_number + settings.WHITESPACE + number_of_maps + settings.WHITESPACE \
                        + number_of_reducers + settings.WHITESPACE + settings.CENSUSPROC_LOCATION_INPUT + settings.WHITESPACE \
                        + settings.CENSUSPROC_LOCATION_OUTPUT
        return [settings.CENSUSPROC_CLEAN_OUTPUT, start_command]

    def wikipedia_data(self):
        data_source_directory = raw_input("Input data file/directory on NAS: ")
        mkdir_command = settings.HADOOP_MKDIR_COMMAND + settings.WIKIPROC_LOCATION_INPUT
        move_command = settings.SYSTEM_TIME_COMMAND + settings.WHITESPACE + settings.HADOOP_COPY_FROM_LOCAL \
                       + data_source_directory + settings.WHITESPACE + settings.WIKIPROC_LOCATION_INPUT
        return [settings.WIKIPROC_CLEAN_INPUT, mkdir_command, move_command]

    def wikipedia_bench(self):
        test_number, number_of_maps, number_of_reducers = self.__read_input_census_and_wiki()
        start_command = settings.SYSTEM_TIME_COMMAND + settings.HADOOP_START_COMMAND + settings.WIKIPROC_LOCATION_BIN \
                        + settings.WHITESPACE + test_number + settings.WHITESPACE + number_of_maps + settings.WHITESPACE \
                        + number_of_reducers + settings.WHITESPACE + settings.WIKIPROC_LOCATION_INPUT + settings.WHITESPACE \
                        + settings.WIKIPROC_LOCATION_OUTPUT
        return [settings.WIKIPROC_CLEAN_OUTPUT, start_command]

    def run_benchmark(self, name):
        logger.debug("Starting %s benchmark on master node: %s" %
                     (name, self.__master))

        benchmarks = {
            "dfsio": self.dfsio,
            "dfsthroughput": self.dfsthroughput,
            "mrbench": self.mrbench,
            "nnbench": self.nnbench,
            "pi": self.pi,
            "teragen": self.teragen,
            "terasort": self.terasort,
            "teravalidate": self.teravalidate,
            "censusdata": self.census_data,
            "censusbench": self.census_bench,
            "wikidata": self.wikipedia_data,
            "wikibench": self.wikipedia_bench
        }

        benchmark = benchmarks[name]
        commands = benchmark()

        self.__sshClient.run_commands_on_host(
            self.__master, commands, settings.SYSTEM_HADOOP_USER_NAME,
            settings.SYSTEM_HADOOP_USER_PASSWORD)
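# A minimal usage sketch for HadoopBenchmark. The master address is a hypothetical
# placeholder; each benchmark prompts for its parameters via raw_input and the
# resulting commands are run on the master over SSH.
if __name__ == "__main__":
    benchmark = HadoopBenchmark("10.158.0.1")   # Hadoop master node address
    benchmark.run_benchmark("dfsio")            # any key of the benchmarks dict, e.g. "terasort"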
class HadoopConfigureNormal(object):
    """
    Hadoop related logic.
    """

    def __init__(self, master_host, slave_hosts, storage_mode):
        self.master_host = master_host
        self.slave_hosts = slave_hosts
        self.__storage_mode = storage_mode
        self.__hosts = [master_host] + slave_hosts
        self.sshClient = SSHClient()

    def __configure_master_host(self):
        # Clear the masters/slaves files and add the master host address to the Hadoop masters file
        commands = [
            settings.HADOOP_CLEAN_SLAVES_FILE,
            "echo "
            + settings.SYSTEM_HOSTNAME_PREFIX
            + remove_dots(self.master_host)
            + " > "
            + settings.HADOOP_MASTER_FILE,
        ]

        # add each slave host address to the Hadoop slaves file
        for host in self.slave_hosts:
            commands.append(
                "echo " + settings.SYSTEM_HOSTNAME_PREFIX + remove_dots(host) + " >> " + settings.HADOOP_SLAVES_FILE
            )

        if self.__storage_mode == "nfs":
            commands.append(settings.HADOOP_START_MAPRED)
        elif self.__storage_mode == "hdfs":
            commands.append(settings.HADOOP_FORMAT_DFS)
            commands.append(settings.HADOOP_START_ALL_SERVICES)

        # run the commands on the master host
        self.sshClient.run_commands_on_host(
            self.master_host, commands, settings.SYSTEM_HADOOP_USER_NAME, settings.SYSTEM_HADOOP_USER_PASSWORD
        )

        print "Waiting %s seconds for nodes to become ready" % (settings.HADOOP_WAIT_TIME)
        time.sleep(settings.HADOOP_WAIT_TIME)

    def __generate_hosts_update_command(self):
        """
        Generates a hosts update command
        """

        hosts_file_update = [settings.SYSTEM_CLEAN_HOSTS_FILE]
        for host in self.__hosts:
            hosts_file_update.append(
                "echo '"
                + host
                + settings.WHITESPACE
                + settings.SYSTEM_HOSTNAME_PREFIX
                + remove_dots(host)
                + "' >> /etc/hosts"
            )
        return hosts_file_update

    def prepare_environment(self):
        """
        Prepares the system environment (updates hosts list, 
                                         sets hostname,
                                         apply urandom and ulimit fixes)
        """

        hosts_file_update_command = self.__generate_hosts_update_command()
        hosts_dict = {}
        for host in self.__hosts:
            commands = [settings.SYSTEM_URANDOM_FIX, settings.SYSTEM_ULIMIT_FIX]
            commands.append("echo " + settings.SYSTEM_HOSTNAME_PREFIX + remove_dots(host) + " > /etc/hostname")
            commands.append("hostname -v " + settings.SYSTEM_HOSTNAME_PREFIX + remove_dots(host))
            commands.extend(hosts_file_update_command)
            hosts_dict.update({host: commands})

        self.sshClient.run_distinct_commands_on_hosts(
            hosts_dict, settings.SYSTEM_ROOT_USER_NAME, settings.SYSTEM_ROOT_USER_PASSWORD
        )

    def start(self):
        self.prepare_environment()

        if self.__storage_mode == "nfs":
            self.configure_slave_hosts_nfs()
        elif self.__storage_mode == "hdfs":
            self.configure_slave_hosts_hdfs()

        self.__configure_master_host()

    def configure_slave_hosts_nfs(self):
        logger.info("Preparing the following VMs with NFS: %s" % self.__hosts)
        commands = [
            settings.SYSTEM_KILL_JAVA,
            settings.SYSTEM_CLEAN_TMP,
            settings.HADOOP_DISABLE_HOST_KEY_CHECK,
            settings.HADOOP_UPDATE_ENV,
        ]

        commands.append(
            """cat >"""
            + settings.HADOOP_INSTALL_DIR
            + """/conf/mapred-site.xml <<EOF 
<?xml version="1.0"?> 
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>  
<configuration>
  <property> 
    <name>mapred.job.tracker</name> 
    <value>"""
            + settings.SYSTEM_HOSTNAME_PREFIX
            + remove_dots(self.master_host)
            + """:8021</value> 
  </property> 

  <property>
    <name>mapred.child.java.opts</name>
    <value>-Xmx"""
            + settings.HADOOP_XMX_SIZE
            + """m -Xmn"""
            + settings.HADOOP_XMN_SIZE
            + """m</value>
  </property>

  <property>
    <name>mapred.tasktracker.map.tasks.maximum</name>
    <value>"""
            + settings.HADOOP_MAX_NUMBER_OF_MAP_SLOTS
            + """</value>
  </property>

  <property>
    <name>mapred.tasktracker.reduce.tasks.maximum</name>
    <value>"""
            + settings.HADOOP_MAX_NUMBER_OF_REDUCE_SLOTS
            + """</value>
  </property>
  
  <property>
      <name>mapred.local.dir</name>
      <value>"""
            + settings.HADOOP_MAPRED_LOCAL_DIR
            + """</value>
   </property>
   
  <property>
      <name>mapred.system.dir</name>
      <value>"""
            + settings.HADOOP_MAPRED_SYSTEM_DIR
            + """</value>
   </property>
   
  <property>
      <name>mapred.temp.dir</name>
      <value>"""
            + settings.HADOOP_MAPRED_TEMP_DIR
            + """</value>
   </property>
</configuration> 
EOF"""
        )

        commands.append(
            """cat >"""
            + settings.HADOOP_INSTALL_DIR
            + """/conf/core-site.xml <<EOF 
<?xml version="1.0"?> 
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>  
<configuration> 
  <property> 
    <name>fs.default.name</name> 
    <value>file:///</value> 
  </property>

  <property> 
    <name>io.file.buffer.size</name> 
    <value>"""
            + settings.HADOOP_IO_FILE_BUFFER_SIZE
            + """</value> 
  </property>
</configuration>
EOF"""
        )

        self.sshClient.run_same_commands_on_hosts(
            self.__hosts, commands, settings.SYSTEM_HADOOP_USER_NAME, settings.SYSTEM_HADOOP_USER_PASSWORD
        )

    def configure_slave_hosts_hdfs(self):
        logger.info("Preparing the following VMs with HDFS: %s" % self.__hosts)
        commands = [
            settings.SYSTEM_KILL_JAVA,
            settings.SYSTEM_CLEAN_TMP,
            settings.HADOOP_DISABLE_HOST_KEY_CHECK,
            settings.HADOOP_UPDATE_ENV,
        ]

        commands.append(
            """cat >"""
            + settings.HADOOP_INSTALL_DIR
            + """/conf/hdfs-site.xml <<EOF 
<?xml version="1.0"?> 
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>  
<configuration> 
    <property> 
        <name>dfs.block.size</name> 
        <value>"""
            + settings.HADOOP_BLOCK_SIZE
            + """</value> 
        <final>true</final>
    </property>

    <property>
       <name>dfs.datanode.max.xcievers</name>
       <value>"""
            + settings.HADOOP_MAX_XCIEVERS
            + """</value>
    </property>
      
    <property> 
        <name>dfs.replication</name> 
        <value>"""
            + settings.HADOOP_RELICATION_FACTOR
            + """</value> 
        <final>true</final>
    </property>
</configuration>
EOF"""
        )

        commands.append(
            """cat >"""
            + settings.HADOOP_INSTALL_DIR
            + """/conf/mapred-site.xml <<EOF 
<?xml version="1.0"?> 
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>  
<configuration>
  <property> 
    <name>mapred.job.tracker</name> 
    <value>"""
            + settings.SYSTEM_HOSTNAME_PREFIX
            + remove_dots(self.master_host)
            + """:8021</value> 
  </property> 

  <property>
    <name>mapred.child.java.opts</name>
    <value>-Xmx"""
            + settings.HADOOP_XMX_SIZE
            + """m -Xmn"""
            + settings.HADOOP_XMN_SIZE
            + """m</value>
  </property>

  <property>
    <name>mapred.tasktracker.map.tasks.maximum</name>
    <value>"""
            + settings.HADOOP_MAX_NUMBER_OF_MAP_SLOTS
            + """</value>
  </property>

  <property>
    <name>mapred.tasktracker.reduce.tasks.maximum</name>
    <value>"""
            + settings.HADOOP_MAX_NUMBER_OF_REDUCE_SLOTS
            + """</value>
  </property>
</configuration> 
EOF"""
        )

        commands.append(
            """cat >"""
            + settings.HADOOP_INSTALL_DIR
            + """/conf/core-site.xml <<EOF 
<?xml version="1.0"?> 
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>  
<configuration> 
  <property> 
    <name>fs.default.name</name> 
    <value>hdfs://"""
            + settings.SYSTEM_HOSTNAME_PREFIX
            + remove_dots(self.master_host)
            + """</value> 
  </property>
  
  <property> 
    <name>io.file.buffer.size</name> 
    <value>hdfs://"""
            + settings.HADOOP_IO_FILE_BUFFER_SIZE
            + """</value> 
  </property>
</configuration>
EOF"""
        )

        self.sshClient.run_same_commands_on_hosts(
            self.__hosts, commands, settings.SYSTEM_HADOOP_USER_NAME, settings.SYSTEM_HADOOP_USER_PASSWORD
        )
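# A minimal usage sketch for HadoopConfigureNormal. The master and slave addresses are
# hypothetical placeholders; storage_mode must be "nfs" or "hdfs", matching the branches
# handled in start() above.
if __name__ == "__main__":
    master = "10.158.0.1"
    slaves = ["10.158.0.2", "10.158.0.3"]
    cluster = HadoopConfigureNormal(master, slaves, "hdfs")
    cluster.start()   # updates /etc/hosts and hostnames, configures all nodes, then starts Hadoop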