def _check_constraints():
    """Verify the host GPUs meet the recommended hardware for training.

    Raises:
        HWConstraintViolated: if no GPU is visible, or if any GPU has
            less than the recommended amount of RAM (8 GiB).
    """
    gb = 1024 * 1024 * 1024
    recommended_gpu_ram = 8 * gb  # 8 GiB per GPU recommended
    gpu_list = nvidia_smi.list_gpus()
    # Training without any GPU is allowed upstream but impractically slow,
    # so treat it as a hard constraint violation here.
    if not gpu_list:
        raise HWConstraintViolated(
            'No GPU for Neural engine training, the process will take very long time to complete.'
        )
    for gpu in gpu_list:
        gpu_ram = nvidia_smi.get_ram(gpu)
        if gpu_ram < recommended_gpu_ram:
            # %.f is a precision-0 float conversion: whole gigabytes only.
            raise HWConstraintViolated(
                'The RAM of GPU %d is only %.fG. More than %.fG of RAM recommended for each GPU.'
                % (gpu, round(float(gpu_ram) / gb), recommended_gpu_ram / gb))
def _get_default_threads(self):
    """Suggest a thread count: two per (GPU, cluster node) pair, floor of 10.

    GPU and node counts are each clamped to at least 1 so a CPU-only or
    single-node setup still yields a sane default.
    """
    gpu_count = len(nvidia_smi.list_gpus()) or 1
    node_count = len(self._api.info()['cluster']['nodes']) or 1
    suggested = gpu_count * node_count * 2
    return suggested if suggested > 10 else 10