Example #1
    def checkMPI(self):
        """Detect a parallel CASA session and record the number of MPI servers."""
        from mpi4casa.MPIInterface import MPIInterface as mpi_clustermanager
        try:
            self.nproc = len(
                mpi_clustermanager.getCluster()._cluster.get_engines())
            return True
        except Exception:
            # getCluster() raises when CASA runs serially (no MPI servers)
            self.nproc = 0
            return False
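The same check can also be written as a standalone helper; a minimal sketch (the function name get_nproc is made up for illustration):

from mpi4casa.MPIInterface import MPIInterface as mpi_clustermanager

def get_nproc():
    # Number of MPI command servers, or 0 when CASA runs serially
    # (getCluster() raises in that case)
    try:
        return len(mpi_clustermanager.getCluster()._cluster.get_engines())
    except Exception:
        return 0

nproc = get_nproc()
if nproc > 0:
    print('Parallel CASA session with %d servers' % nproc)
else:
    print('Serial CASA session')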
Example #2
    def test_mpi4casa_log_level_default_to_debug(self):
        """Test changing the global log level from default to DEBUG."""

        # Change the log level globally (tested via MPIInterface, which internally
        # uses MPICommandClient, so both are exercised)
        mpi_interface = MPIInterface()
        mpi_interface.set_log_level("DEBUG")

        # Use a separate log file per server to facilitate analysis
        for server in self.server_list:
            logfile = 'test_mpi4casa_log_level_debug-server-%s.log' % str(server)
            self.client.push_command_request("casalog.setlogfile('%s')" % (logfile), True, server)

        # Run flagdata
        flagdata(vis=self.vis, mode='summary')

        # Iterate through the log files looking for command-handling messages
        for server in self.server_list:
            # Get the current working directory (we might be in the 'nosedir' subdirectory)
            cwd = self.client.push_command_request("os.getcwd()", True, server)[0]['ret']
            logfile = '%s/test_mpi4casa_log_level_debug-server-%s.log' % (cwd, str(server))
            with open(logfile, 'r') as f:
                content = f.read()
            if content.find('flagdata') >= 0:  # Check only servers that processed a flagdata sub-job
                self.assertEqual(content.find("MPICommandServer") < 0, True, "MPICommandServer msgs should be filtered out")
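A matching tearDown sketch that restores the global level after the test, assuming 'INFO' is the default casalog priority:

    def tearDown(self):
        # Restore the global log level so later tests see the default
        # filtering again ('INFO' is assumed to be the default priority)
        MPIInterface().set_log_level("INFO")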
Example #3
    def setupCluster(self):
        # Initialize the cluster. With log mode 'redirect' the logs go to:
        # * Terminal: Client logs + Server logs
        # * casapy-<timestamp>.log: Client logs
        # * casapy-<timestamp>.log-server-<rank>-host-<hostname>-pid-<pid>: Server logs
        mpi_clustermanager.set_log_mode('redirect')

        self.sc = mpi_clustermanager.getCluster()
        self.sc.set_log_level('DEBUG')

        self.CL = self.sc._cluster
        self.nodeList = self.CL.get_engines()
        numproc = len(self.nodeList)

        # Make the servers import what they need and match the client's
        # working directory
        owd = os.getcwd()
        self.CL.pgc('import os')
        self.CL.pgc('from numpy import array,int32')
        self.CL.pgc('os.chdir("' + owd + '")')
        os.chdir(owd)
        print("Setting up", numproc, "engines.")
        return numproc
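A sketch of how a test class might call this from setUp; the skipTest guard is an added assumption, not part of the original:

    def setUp(self):
        # Initialize the cluster once per test; skip the test when no
        # MPI engines are available (serial CASA session)
        self.nproc = self.setupCluster()
        if self.nproc < 1:
            self.skipTest('No MPI engines available')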
Example #4
    def getNParts(self, imprefix='', imexts=None):

        from mpi4casa.MPIInterface import MPIInterface as mpi_clustermanager
        try:
            self.nproc = len(
                mpi_clustermanager.getCluster()._cluster.get_engines())
        except Exception:
            self.nproc = 0

        # Build the list of per-engine image part names; empty in a serial run
        imlist = []
        if self.nproc > 0:
            for imext in (imexts or []):
                for part in range(1, self.nproc + 1):
                    imlist.append(imprefix + '.workdirectory/' + imprefix +
                                  '.n' + str(part) + '.' + imext)
            #self.checkall(imexist = imlist)
        else:
            print('Not a parallel run of CASA')

        return imlist
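For instance, with two MPI servers a call like the one below (the 'try' prefix is made up for illustration) returns the part names grouped by extension:

    parts = self.getNParts(imprefix='try', imexts=['image', 'residual'])
    # ['try.workdirectory/try.n1.image',
    #  'try.workdirectory/try.n2.image',
    #  'try.workdirectory/try.n1.residual',
    #  'try.workdirectory/try.n2.residual']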
Example #5
    def test_PyParallelImagerHelper_interface(self):
        
        # Get cluster (getCluster should automatically initialize it)
        self.sc = MPIInterface.getCluster()
        self.CL = self.sc._cluster
        self.assertEqual(self.sc.isClusterRunning(),True,"Error instantiating cluster")

        # Get engines
        engines = self.CL.get_engines()
        self.assertEqual(engines,list(range(1,MPIEnvironment.mpi_world_size)),"Error getting list of engines")
        
        # Get nodes
        if int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])>1:
            nodes = self.CL.get_nodes()
            self.assertTrue(socket.gethostname() in nodes,"Error getting list of nodes")
        
        # Run imports in all engines
        self.CL.pgc('import os')
        self.CL.pgc('from numpy import array,int32')
        os_is_module = self.CL.pgc('os is not None')[0]['ret']
        self.assertEqual(os_is_module,True,"Error importing os module")
        
        # Change current working directory
        cwd=os.getcwd()
        self.CL.pgc('os.chdir("' + cwd + '")')
        res = self.CL.pgc('os.getcwd()')[0]['ret']
        self.assertEqual(res,cwd,"Error changing work directory")
        
        # Get engine working directory
        cwd=os.getcwd()
        res = self.sc.get_engine_store(1)
        self.assertEqual(res,cwd,"Error getting engine store")
        
        # pgc/Pull variable to/from all servers
        self.CL.pgc("initrec = casac.utils().hostinfo()['endian']")
        res = self.CL.pull('initrec')
        self.assertEqual(res[1],casac.utils().hostinfo()['endian'],"Error pulling a variable")
        
        # Push/Pull variable to/from a subset of servers
        var_dict={}
        var_dict['a'] = 33
        var_dict['b'] = {'test':29.2}
        self.CL.push(var_dict,[1,2])
        res = self.CL.pull('a',[1,2])
        self.assertEqual(res[1],var_dict['a'],"Error pulling a variable after a push operation to a subset of servers")
        res = self.CL.pull('b',[1,2])
        self.assertEqual(res[2],var_dict['b'],"Error pulling a variable after a push operation to a subset of servers")      
        
        # Push/Pull variable to/from all servers
        var_dict={}
        var_dict['c'] = False
        var_dict['d'] = "bla"
        self.CL.push(var_dict)
        res = self.CL.pull('c')
        self.assertEqual(res[1],var_dict['c'],"Error pulling a variable after a push operation to all servers")
        res = self.CL.pull('d')
        self.assertEqual(res[2],var_dict['d'],"Error pulling a variable after a push operation to all servers")           
        
        # Run various commands in parallel
        self.CL.pgc({1:'ya=3',2:'ya="b"'})
        res = self.CL.pull('ya',[1,2])
        self.assertEqual(res,{1: 3, 2: 'b'},"Error running various commands in parallel")        
        
        # Async execution of a job in a subset of servers via odo
        jobIds = self.CL.odo("time.sleep(2.5)",1)
        status = self.CL.check_job(jobIds)
        ntries = 0
        while not status and ntries < 10:
            ntries += 1
            time.sleep(1)
            status = self.CL.check_job(jobIds)        
        self.assertEqual(status,True,"Error executing a job asynchronously via odo")   
        
        # Async execution of a job in a subset of servers via do_and_record with defined target server
        jobIds = self.sc.do_and_record("time.sleep(2.5)",1)
        status = self.CL.check_job(jobIds)
        ntries = 0
        while not status and ntries < 10:
            ntries += 1
            time.sleep(1)
            status = self.CL.check_job(jobIds)        
        self.assertEqual(status,True,"Error executing a job asynchronously via do_and_record with defined target server")   
        
        # Async execution of a job in a subset of servers via do_and_record with undefined target server
        jobIds = self.sc.do_and_record("time.sleep(2.5)")
        status = self.CL.check_job(jobIds)
        ntries = 0
        while not status and ntries < 10:
            ntries += 1
            time.sleep(1)
            status = self.CL.check_job(jobIds)        
        self.assertEqual(status,True,"Error executing a job asynchronously via do_and_record with undefined target server")          
        
        # Re-throw exception
        jobIds = self.CL.odo("1/0",[1, 2])
        ntries = 0
        res = False
        rethrow = False
        trace = ''
        while not res and ntries < 10:
            try:
                res = self.CL.check_job(jobIds)
                time.sleep(1)
                ntries += 1
            except Exception as e:
                # Capture the re-thrown server-side exception here; in Python 3
                # sys.exc_info() is cleared once the except block exits
                rethrow = True
                trace = str(e)
                break
        self.assertEqual(rethrow,True,"Exception not rethrown")
        self.assertEqual(trace.find("ZeroDivisionError:")>=0, True, "Trace-back should contain ZeroDivisionError")
        
        # Check queue status
        jobIds = self.CL.odo("time.sleep(5)",1)
        time.sleep(1)
        status = self.sc.get_status()
        self.assertEqual(len(status)-1,len(self.CL.get_command_request_list()),"Error retrieving job queue status")
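The three polling loops above share the same shape; a small helper sketch that factors it out (wait_for_job is a made-up name, not part of mpi4casa):

import time

def wait_for_job(cluster, job_ids, timeout=10):
    # Poll cluster.check_job() until the job(s) finish or the timeout
    # (in seconds) elapses; server-side exceptions re-thrown by
    # check_job() propagate to the caller
    for _ in range(timeout):
        if cluster.check_job(job_ids):
            return True
        time.sleep(1)
    return cluster.check_job(job_ids)

With it, each block reduces to something like self.assertTrue(wait_for_job(self.CL, self.sc.do_and_record("time.sleep(2.5)"))).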
Example #6
    def setUp(self):

        MPIInterface.set_log_mode('redirect')
        self.sc = MPIInterface.getCluster()
        self.CL = self.sc._cluster
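A hypothetical smoke test built on the handles this setUp creates:

    def test_cluster_is_up(self):
        # Sanity-check the cluster wired up in setUp (test name is made up)
        self.assertTrue(self.sc.isClusterRunning())
        self.assertTrue(len(self.CL.get_engines()) > 0)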