def __init__(self, total_process, mi, batch_template,
             cb_remote_initialize=None, cb_remote_batch_process=None,
             args=None):
    ''' Initialize `SharedData` class with a few hooks

    Args:
        total_process: total number of processes (the parent plus
            ``total_process - 1`` worker processes)
        mi: ModelInterface holding the model to distribute
        batch_template: template batch; a pinned copy is shared with
            each worker process
        cb_remote_initialize: callback for remote initialization
        cb_remote_batch_process: callback for remote batch processing
        args: additional arguments
    '''
    self.server = ParameterServer(total_process)
    self.cb_remote_initialize = cb_remote_initialize
    self.cb_remote_batch_process = cb_remote_batch_process
    self.args = args

    # All workers currently share GPU 0; the commented-out variant
    # assigns one GPU per worker instead.
    # def get_gpu_id(i): return i + 1
    def get_gpu_id(i): return 0

    # Share only training batches. Each worker gets a pinned copy of the
    # batch template, a one-slot queue to hand it over, and a pair of
    # condition variables for the send/receive handshake.
    shared_batches = []
    cvs_send = []
    cvs_recv = []
    qs = []
    for i in range(total_process - 1):
        # gpu_id = get_gpu_id(i)
        # shared_batches.append(cpu2gpu(all_batches[train_idx][0], gpu=gpu_id))
        shared_batches.append(utils_elf.pin_clone(batch_template))
        qs.append(mp.Queue(1))
        qs[-1].put(shared_batches[i])
        cvs_send.append(Cond())
        cvs_recv.append(Cond())

    self.cvs_send = cvs_send
    self.cvs_recv = cvs_recv
    self.shared_batches = shared_batches
    self.qs = qs
    self.b = mp.Barrier(total_process)

    self.optimizers = [
        mp.Process(target=self.process_main, args=(i, get_gpu_id(i)))
        for i in range(total_process - 1)
    ]

    for optimizer in self.optimizers:
        optimizer.start()

    # Wait until all worker processes have received the shared memory,
    # then broadcast the model through the parameter server.
    self.b.wait()
    self.server.server_send_model(mi)
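# `process_main` is the entry point of each worker spawned above, but its
# body is not part of this snippet. Below is a minimal sketch of how the
# worker side of the handshake might look, under the assumptions that
# `Cond` exposes wait()/notify(), that the shared batch is a dict of
# pinned tensors, that `client_receive_model` is the receiving counterpart
# of `server_send_model`, and that the callback signatures are as shown;
# this is a hypothetical reconstruction, not the original implementation.
def process_main(self, i, gpu_id):
    # Receive this worker's pinned shared batch from the parent.
    batch = self.qs[i].get()

    # Meet the parent at the barrier, then receive the broadcast model.
    self.b.wait()
    mi = self.server.client_receive_model()

    # One-time per-worker setup (e.g., moving the model to gpu_id).
    context = None
    if self.cb_remote_initialize is not None:
        context = self.cb_remote_initialize(mi, gpu_id, self.args)

    while True:
        # Block until the parent has filled the shared buffer.
        self.cvs_send[i].wait()
        # Copy the data off the shared buffer before acknowledging, so
        # the parent can safely refill it (synchronous copy on purpose).
        batch_gpu = {k: v.cuda(gpu_id) for k, v in batch.items()}
        self.cvs_recv[i].notify()
        # User-supplied hook performs the actual training step.
        if self.cb_remote_batch_process is not None:
            self.cb_remote_batch_process(context, batch_gpu)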
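# On the parent side, handing a batch to worker `process_idx` might look
# like the sketch below (hypothetical; neither `send_batch` nor the copy
# loop appears in the snippet above). The condition-variable pair turns
# each exchange into a rendezvous: the parent must not overwrite the
# pinned buffer until the worker has acknowledged picking it up.
def send_batch(self, batch, process_idx):
    # Copy fresh data into the worker's pinned shared buffer; assumes
    # the batch is a dict of identically shaped tensors.
    for key, v in self.shared_batches[process_idx].items():
        v.copy_(batch[key])

    # Wake the worker, then wait for its acknowledgment before returning.
    self.cvs_send[process_idx].notify()
    self.cvs_recv[process_idx].wait()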