Example #1
    def _runCommand(self, cmd, servers, wait_timeout, on_output, on_process_start, on_process_done, factory):
        processes = []
        if servers is None:
            processes.append(executeCommand(cmd, factory=factory))
        else:
            processes.extend(executeRemoteCommand(servers, cmd, factory=factory))
        return waitForProcesses(processes,
                                wait_timeout=wait_timeout,
                                on_output=on_output,
                                on_process_start=on_process_start,
                                on_process_done=on_process_done)
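
For context, a minimal usage sketch of the helper API exercised above (executeCommand, waitForProcesses and BasicProcess.getFactory as they appear in these examples); the command, title, log path and callback are made up for illustration:

# Minimal sketch, assuming the helper API shown in the examples; the command,
# title and log path below are illustrative values only.
def on_output(line, process):
    # Called once per output line of a running process.
    print(line)

factory = BasicProcess.getFactory("uname", "/tmp/uname.log")
processes = [executeCommand("uname -a", factory=factory)]
ok = waitForProcesses(processes, wait_timeout=30, on_output=on_output)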
Example #2
    def _getDevices(self, ips):
        links = []
        self._devices = {}
        
        def linkNameParser(line, process):
            debug(line)
            links.append(line)
        
        def deviceNameAndPortParser(line, process):
            debug(line)
            parts = line.split()
            res = RemoteDeviceInfo()
            res.name = parts[0]
            res.port = int(parts[2])
            res.is_up = parts[5] == "(Up)"
            self._devices[process.server] = res

        # Phase 1: resolve the network link name behind each IP.
        procs = []
        for ip in ips:
            server = TestEnvironment.Get().getServer(ip)
            procs.extend(executeRemoteCommand([server], "ip -o a s | grep %s | cut -d ' ' -f 2 | cut -d'.' -f1" % ip))
        if not waitForProcesses(procs, wait_timeout=5, on_output=linkNameParser):
            for proc in procs:
                if proc.exception is not None:
                    raise proc.exception
            raise Exception("Internal Error")
        
        # Phase 2: map each link name to its RDMA device and port via ibdev2netdev.
        procs = []
        for ip in ips:
            server = TestEnvironment.Get().getServer(ip)
            link = links[len(procs)]  # assumes links[] was filled in the same order as ips
            procs.extend(executeRemoteCommand([server], "ibdev2netdev | grep %s" % link))
        if not waitForProcesses(procs, wait_timeout=5, on_output=deviceNameAndPortParser):
            for proc in procs:
                if proc.exception is not None:
                    raise proc.exception
            raise Exception("Internal Error")
Example #3
    def _findRemoteProcessIDs(self, processes):
        remote_process_ids = {}
        def parser(line, find_process):
            debug(line)
            key = find_process.name
            remote_process_ids[key] = int(line.split()[0])
       
        res = True
        num_attempts = 0
        max_num_attempts = 3
        # Retry a few times: the remote processes may not have shown up in ps yet.
        while len(remote_process_ids) < len(processes):
            find_processes = []
            for process in processes:
                if process.name in remote_process_ids:
                    continue
                find_process = executeRemoteCommand([process.server], "ps --no-headers -o\"pid,args\" | grep -e '^ *[0-9]\\+ %s$'" % process.tf_command)[0]
                find_process.name = process.name
                find_processes.append(find_process)
                
            waitForProcesses(find_processes, wait_timeout=5, on_output=parser)
            time.sleep(1)
            num_attempts += 1
            if num_attempts == max_num_attempts:
                error("Failed to find remote process IDs. Most likely some processes failed to run.")
                res = False
                break

        table = FormattedTable()
        table.border_style = UniBorder.BORDER_STYLE_SINGLE
        table.addColumn(FormattedTable.Column("IP"))
        table.addColumn(FormattedTable.Column("Job"))
        table.addColumn(FormattedTable.Column("#"))
        table.addColumn(FormattedTable.Column("PID"))
        table.addColumn(FormattedTable.Column("RPID"))
        table.addColumn(FormattedTable.Column("Flags"))
        table.addColumn(FormattedTable.Column("Command"))
        for process in processes:
            if process.name in remote_process_ids:
                process.remote_pid = remote_process_ids[process.name]
            else:
                process.remote_pid = -1
            table.addRow([process.server_info.ip, process.job_name, process.task_id, process.instance.pid, process.remote_pid, process.tf_flags, process.tf_command])
        table.printFormatted(LogWriter(None, LOG_LEVEL_NOTE))
        return res
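
For reference, parser above takes the PID from the first column of the grep output; a tiny sketch with a made-up PID, work dir and flags:

# Illustrative line returned by the "ps ... | grep" command above; the PID,
# work dir and flags are made-up values.
line = " 12345 python -u /tmp/tmp.work/tf_cnn_benchmarks.py --job_name=worker --task_index=0"
remote_pid = int(line.split()[0])
print(remote_pid)  # -> 12345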
Example #4
    def perform(self, index):
        Step.perform(self, index)

        ##########
        # Build: #
        ##########
        title("Building:", UniBorder.BORDER_STYLE_SINGLE)
        config_cuda = "--config=cuda" if self.config_cuda else ""
        if self.additional_flags == [""]:
            additional_flags = ""
        else:
            additional_flags = "--copt \"%s\"" % " ".join(
                self.additional_flags)
        cmd = "cd %s; rm -rf tensorflow_pkg; bazel build -c opt %s %s //tensorflow/tools/pip_package:build_pip_package" % (
            self.tensorflow_home, config_cuda, additional_flags)
        res = self.runSeperate(cmd,
                               title="Build %s" % self.tensorflow_home,
                               log_file_path=os.path.join(
                                   self._logs_dir, "build.log"),
                               wait_timeout=3600)
        if not res:
            return False

        cmd = "cd %s; bazel-bin/tensorflow/tools/pip_package/build_pip_package tensorflow_pkg" % (
            self.tensorflow_home)
        res = self.runInline(cmd, wait_timeout=60)
        if not res:
            return False

        ############
        # Install: #
        ############
        servers = TestEnvironment.Get().getServers(self.install_servers)
        title("Installing:", UniBorder.BORDER_STYLE_SINGLE)
        src_dir = os.path.join(self.tensorflow_home, "tensorflow_pkg")
        temp_dir_name = "tmp." + next(tempfile._get_candidate_names()) + next(
            tempfile._get_candidate_names())
        temp_dir = os.path.join(tempfile._get_default_tempdir(), temp_dir_name)
        res = self.runSCP(servers, [src_dir], temp_dir, wait_timeout=10)
        if not res:
            return False

        cmd = "pip install --user --upgrade %s/tensorflow-*" % temp_dir
        process_title = lambda process: "Installing on %s..." % process.server
        log_file_path = lambda process: os.path.join(
            self._logs_dir, "install_%s.log" % re.sub("[^0-9a-zA-Z]", "_",
                                                      process.server))
        res = self.runSeperate(cmd,
                               title=process_title,
                               servers=servers,
                               log_file_path=log_file_path)
        if not res:
            return False

        ##########
        # Clean: #
        ##########
        title("Cleaning:", UniBorder.BORDER_STYLE_SINGLE)
        processes = executeRemoteCommand(servers, "rm -rf %s" % temp_dir)
        res = waitForProcesses(processes, wait_timeout=10)
        if not res:
            return False
        return True
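
For orientation, the build command assembled above expands to a single shell line; a minimal sketch, where the tensorflow_home path and the extra copt are made-up illustration values:

# Standalone sketch of the build-command assembly above; paths and flags are
# illustrative only.
tensorflow_home = "/home/user/tensorflow"
config_cuda = "--config=cuda"
additional_flags = "--copt \"-march=native\""
cmd = "cd %s; rm -rf tensorflow_pkg; bazel build -c opt %s %s //tensorflow/tools/pip_package:build_pip_package" % (
    tensorflow_home, config_cuda, additional_flags)
print(cmd)
# -> cd /home/user/tensorflow; rm -rf tensorflow_pkg; bazel build -c opt
#    --config=cuda --copt "-march=native" //tensorflow/tools/pip_package:build_pip_package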
Example #5
    def perform(self, index):
        Step.perform(self, index)
        log("<img src='%s' width=600 style='border:1px solid black'/>" % pkg_resources.resource_filename("mltester", "images/tensorflow.jpg")) #https://www.skylinelabs.in/blog/images/tensorflow.jpg?width=500'/>")
        for attr in self._attributes:
            log(" + %s: %s" % (attr.desc.display_name, str(attr.val)))
        self._perf = TFPerformanceMeasurements()
        self._processes = []
        self._stopping = False
        self._servers = {}
        work_dir_name = "tmp." + next(tempfile._get_candidate_names()) + next(tempfile._get_candidate_names())
        work_dir = os.path.join(tempfile._get_default_tempdir(), work_dir_name)
        script_dir = os.path.dirname(self.script)
        
        user = getuser()
        self._work_dir = work_dir
        ips = self._getIPs()
        servers = TestEnvironment.Get().getServers(ips)
            
        #########################
        # Kill other instances: #
        #########################
        apps_to_kill = ["tf_cnn_benchmarks.py", "ml_monitor"]
        for app in apps_to_kill:
            kill_cmd = "ps -f | grep %s | grep -v grep | grep -v %s | sed -e 's@%s *\\([0-9]\\+\\) .*@\\1@g' | xargs kill -9" % (app, work_dir, user)
            res = self.runInline(kill_cmd, servers, wait_timeout=5)
            if not res:
                return False
        if self._stop:
            return False
    
        ##################
        # Build cluster: #
        ##################
        port = self.base_port
        self._cluster_ps = []
        self._cluster_workers = []
        for ip in self.ps:
            self._cluster_ps.append("%s:%u" % (ip, port))
            port += 1
        for ip in self.workers:
            self._cluster_workers.append("%s:%u" % (ip, port))
            port += 1
    
        #########
        # Copy: #
        #########
        title("Copying scripts:", UniBorder.BORDER_STYLE_SINGLE)    
        if not self.runSCP(servers, [script_dir], work_dir, wait_timeout = 10): # Also create it
            return False
        if self._stop:
            return False
            
        ########
        # Run: #
        ########
        self._openPerformanceFile()
        self._getDevices(ips)
        
        title("Running:", UniBorder.BORDER_STYLE_SINGLE)
        processes = []
        if self.mode == TFCnnBenchmarksStep.MODE_PARAMETER_SERVER:
            for i in range(len(self.ps)):
                ip = self.ps[i]
                process = self._runJob(work_dir, ip, "ps", i)
                processes.append(process)
        elif self.mode == TFCnnBenchmarksStep.MODE_DISTRIBUTED_ALL_REDUCE:
            # Note: "ip" still holds the last value bound by the cluster-build loop
            # above, so the controller is started on that host.
            process = self._runJob(work_dir, ip, "controller", 0)
            processes.append(process)
        ################
        # Run workers: #
        ################
        if self.mode == TFCnnBenchmarksStep.MODE_LOCAL:
            process = self._runJob(work_dir, self.workers[0], "worker", 0)
            processes.append(process)
        else:
            for i in range(len(self.workers)):
                ip = self.workers[i]
                process = self._runJob(work_dir, ip, "worker", i)
                processes.append(process)
        
        time.sleep(0.5)
        res = self._findRemoteProcessIDs(processes)
        if not res or self._stop:
            return False
        
        for server in self._servers.values():
            if not self._initServerMonitors(server):
                return False

        res = waitForProcesses(processes, 
                               wait_timeout=600,
                               on_output=self._onOut,
                               on_process_start=self._onJobStart,
                               on_process_done=self._onJobDone)

        for server in self._servers.values():
            res = res and self._stopServerMonitors(server)
        
        if not res or self._stop:
            return False

        self._appendToPerformanceFile()
        
        ############
        # Cleanup: #
        ############
        title("Cleaning:", UniBorder.BORDER_STYLE_SINGLE)
        sources = ["%s:%s %s:%s" % (server, os.path.join(self._work_dir, "graph.txt"), server, os.path.join(self._work_dir, "*.json")) for server in servers]
        dst = self._logs_dir
        cmd = "scp %s %s" % (" ".join(sources), dst)
        self.runInline(cmd)
        processes = executeRemoteCommand(servers, "rm -rf %s" % work_dir)
        waitForProcesses(processes, wait_timeout=10)
        return True
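
The cluster-build loop above produces plain "host:port" strings, one port per job, starting from base_port; a standalone sketch with made-up IPs and port:

# Standalone sketch of the cluster-build loop above; IPs and base port are
# made-up illustration values.
ps_ips = ["192.168.1.10"]
worker_ips = ["192.168.1.11", "192.168.1.12"]
port = 5000
cluster_ps = []
cluster_workers = []
for ip in ps_ips:
    cluster_ps.append("%s:%u" % (ip, port))
    port += 1
for ip in worker_ips:
    cluster_workers.append("%s:%u" % (ip, port))
    port += 1
print(cluster_ps)       # ['192.168.1.10:5000']
print(cluster_workers)  # ['192.168.1.11:5001', '192.168.1.12:5002']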
Example #6
    def _runJob(self, work_dir, ip, job_name, task_id):
        hostname = TestEnvironment.Get().getServer(ip)
        server_info = self._getOrCreateServer(hostname, ip)
        device_info = self._devices[server_info.hostname]
        #####################
        # Build TF command: #
        #####################
        tf_flags = ""
        tf_command = ""
        
        ##################
        # Env variables: #
        ##################
        tf_flags += "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/gdrcopy"
        tf_flags += " TF_CPP_MIN_VLOG_LEVEL=%s" % self.log_level
        tf_flags += " RDMA_DEVICE=%s" % device_info.name
        tf_flags += " RDMA_DEVICE_PORT=%u" % device_info.port
        tf_flags += " RDMA_GID_INDEX=3"
        tf_flags += " RDMA_PKEY=0"
        tf_flags += " RDMA_QUEUE_DEPTH=1024"
        tf_flags += " RDMA_TIMEOUT=10"
        tf_flags += " RDMA_RETRY_CNT=10"
        tf_flags += " RDMA_SL=1"
        tf_flags += " RDMA_MTU=512"
        tf_flags += " RDMA_TRAFFIC_CLASS=8"
        tf_flags += " UCX_NET_DEVICES=%s:%u" % (device_info.name, device_info.port)
        if (job_name in ["ps", "controller"]) or (self.num_gpus == 0):
            tf_flags += " CUDA_VISIBLE_DEVICES="

        ##############  
        # UCX stuff: #
        ##############
        # UCX should be built with ./contrib/configure-devel --enable-debug
        if self.server_protocol == "grpc+ucx":
            tf_flags += " UCX_LOG_LEVEL=data"
            tf_flags += " UCX_TLS=rc_x,gdr_copy,cuda_copy"
        #export UCX_IB_ETH_PAUSE_ON=y
        #export UCX_LOG_LEVEL=trace 

        ###############
        # GRPC Debug: #
        ###############
        #export GRPC_VERBOSITY=DEBUG
        #export GRPC_TRACE=api,call_combiner
        #export GRPC_TRACE=queue_pluck,flowctl,http1,http2_stream_state,http,op_failure
        #export GRPC_TRACE=client_channel,call_error,channel,server_channel,channel_stack_builder,connectivity_state  #all

        ##############
        # Arguments: #
        ##############
        # tf_command += " gdb --args"
        tf_command += "python -u %s/tf_cnn_benchmarks.py" % self._work_dir
        if self.mode != TFCnnBenchmarksStep.MODE_LOCAL:
            tf_command += " --job_name=%s" % job_name
            tf_command += " --task_index=%u" % task_id
            tf_command += " --worker_hosts=%s" % ",".join(self._cluster_workers)
        if self.mode == TFCnnBenchmarksStep.MODE_PARAMETER_SERVER:
            tf_command += " --ps_hosts=%s" % ",".join(self._cluster_ps)
        elif self.mode == TFCnnBenchmarksStep.MODE_DISTRIBUTED_ALL_REDUCE:
            tf_command += " --variable_update=distributed_all_reduce"
            tf_command += " --all_reduce_spec=%s" % self.all_reduce_spec
        if job_name in ["worker", "controller"]:
            tf_command += " --model=%s" % self.model
            tf_command += " --batch_size=%s" % self.batch_size
            if self.data_dir != "":
                tf_command += " --data_dir=%s" % self.data_dir
            if self.num_gpus > 0:
                tf_command += " --num_gpus=%s --local_parameter_device=gpu" % self.num_gpus
            if self.trace_file:
                tf_command += "--trace_file=trace_%s_%u.json" % (job_name, task_id)
        if self.mode != TFCnnBenchmarksStep.MODE_LOCAL:
            tf_command += " --server_protocol=%s" % self.server_protocol
        if self.forward_only:
            tf_command += " --forward_only"
        
        if job_name == "worker":
            if self.model_graph_file and (task_id == 0):
                tf_command += " --graph_file=%s" % os.path.join(self._work_dir, "graph.txt")
            if self.trace_file:
                tf_command += " --trace_file=%s" % os.path.join(self._work_dir, "trace_%s_%u.json" % (job_name, task_id))
            
        command = tf_flags + " " + tf_command

        title = "[%s] %s - %u" % (ip, job_name, task_id)
        log_file_path = os.path.join(self._logs_dir, "%s_%u.log" % (job_name, task_id))
        factory = BasicProcess.getFactory(title, log_file_path)
        process = executeRemoteCommand([server_info.hostname], command, factory=factory)[0]
        process.name = "%s_%u" % (job_name, task_id)
        process.job_name = job_name 
        process.task_id = task_id
        process.is_worker = job_name == "worker"
        process.rdma_device = device_info
        process.server_info = server_info
        process.tf_flags = tf_flags
        process.tf_command = tf_command
        process.remote_pid = None
        self._processes.append(process)
        server_info.processes.append(process)
        return process
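
Putting it together, the string handed to executeRemoteCommand is simply tf_flags + " " + tf_command; a rough, non-authoritative sketch of what it might look like for a worker, with made-up device name, hosts and work dir, and most RDMA_*/UCX_* variables omitted for brevity:

# Rough illustration of the final command string; all concrete values below
# (device, hosts, work dir, model) are made up, and many env vars are omitted.
tf_flags = ("LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/gdrcopy"
            " TF_CPP_MIN_VLOG_LEVEL=0 RDMA_DEVICE=mlx5_0 RDMA_DEVICE_PORT=1")
tf_command = ("python -u /tmp/tmp.work/tf_cnn_benchmarks.py"
              " --job_name=worker --task_index=0"
              " --worker_hosts=192.168.1.11:5001,192.168.1.12:5002"
              " --ps_hosts=192.168.1.10:5000"
              " --model=resnet50 --batch_size=64"
              " --server_protocol=grpc+ucx")
command = tf_flags + " " + tf_command
print(command)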