def _worker(reader: DatasetReader, input_queue: Queue, output_queue: Queue, index: int) -> None:
    """
    A worker that pulls filenames off the input queue, uses the dataset reader
    to read them, and places the generated instances on the output queue. When
    there are no filenames left on the input queue, it puts its ``index`` on the
    output queue and doesn't do anything else.
    """
    # Keep going until you get a file_path that's None.
    while True:
        file_path = input_queue.get()
        if file_path is None:
            # Put my index on the queue to signify that I'm finished.
            output_queue.put(index)
            break

        logger.info(f"reading instances from {file_path}")
        for instance in reader.read(file_path):
            output_queue.put(instance)
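# A minimal, self-contained sketch of the same poison-pill pattern used by
# ``_worker`` above, with a stand-in reader function instead of AllenNLP's
# ``DatasetReader`` (the names below are illustrative, not part of the original code).
from multiprocessing import Process, Queue


def toy_reader(file_path):
    # Pretend each "file" yields three instances.
    for i in range(3):
        yield f"{file_path}:instance{i}"


def toy_worker(input_queue, output_queue, index):
    while True:
        file_path = input_queue.get()
        if file_path is None:          # poison pill: no more work
            output_queue.put(index)    # report completion with the worker id
            break
        for instance in toy_reader(file_path):
            output_queue.put(instance)


if __name__ == "__main__":
    input_queue, output_queue = Queue(), Queue()
    for shard in ["shard0.txt", "shard1.txt"]:
        input_queue.put(shard)
    input_queue.put(None)  # one pill per worker; here a single worker

    worker = Process(target=toy_worker, args=(input_queue, output_queue, 0))
    worker.start()
    finished = 0
    while finished < 1:
        item = output_queue.get()
        if isinstance(item, int):
            finished += 1
        else:
            print(item)
    worker.join()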
def _instances(self, file_path: str, manager: Manager, output_queue: Queue) -> Iterator[Instance]:
    """
    A generator that reads instances off the output queue and yields them up
    until none are left (signified by all ``num_workers`` workers putting their
    ids into the queue).
    """
    shards = glob.glob(file_path)
    num_shards = len(shards)

    # If we want multiple epochs per read, put shards in the queue multiple times.
    input_queue = manager.Queue(num_shards * self.epochs_per_read + self.num_workers)
    for _ in range(self.epochs_per_read):
        random.shuffle(shards)
        for shard in shards:
            input_queue.put(shard)

    # Then put a None per worker to signify no more files.
    for _ in range(self.num_workers):
        input_queue.put(None)

    processes: List[Process] = []
    num_finished = 0

    for worker_id in range(self.num_workers):
        process = Process(target=_worker, args=(self.reader, input_queue, output_queue, worker_id))
        logger.info(f"starting worker {worker_id}")
        process.start()
        processes.append(process)

    # Keep going as long as not all the workers have finished.
    while num_finished < self.num_workers:
        item = output_queue.get()
        if isinstance(item, int):
            # Means a worker has finished, so increment the finished count.
            num_finished += 1
            logger.info(f"worker {item} finished ({num_finished}/{self.num_workers})")
        else:
            # Otherwise it's an ``Instance``, so yield it up.
            yield item

    for process in processes:
        process.join()
    processes.clear()
def main(args): if args.labels: data = [] with open(args.data_dir, encoding="utf-8") as f: for line in csv.reader(f, delimiter="\t"): data.append(line) text, labels = list(zip(*data[1:])) else: text = [] with open(args.data_dir, encoding="utf-8") as f: for line in f.readlines(): text.append(line.strip()) labels = None if isinstance(text, tuple): text = list(text) if "imdb" in args.data_dir or "IMDB" in args.data_dir: text = [clean_for_imdb(t) for t in text] logger.info("Do back-translation for {} sentences".format(len(text))) if args.gpus is not None and len(args.gpus) > 1: logger.info("Use Multiple GPUs: {}".format(", ".join([str(i) for i in args.gpus]))) split_point = len(text) // len(args.gpus) text_splitted = [] for gpu_id in args.gpus: text_splitted.append(text[gpu_id * split_point : (gpu_id + 1) * split_point]) if gpu_id == len(args.gpus) - 1: text_splitted[-1] += text[(gpu_id + 1) * split_point :] assert sum(len(s) for s in text_splitted) == len(text) set_start_method("spawn") q = Queue() procs = [] for i in range(len(args.gpus)): proc = Process(target=multi_translate, args=(args, i, text_splitted[i], q)) procs.append(proc) proc.start() q_result = [] for p in procs: q_result.append(q.get()) back_translated_docs = [] for doc_split in sorted(q_result): back_translated_docs += doc_split[1] q.close() q.join_thread() for proc in procs: proc.join() else: if args.gpus is not None: gpu = args.gpus[0] logger.info("Use only one GPU: {}".format(gpu)) back_translated_docs = translate(args, text, args.gpus[0])[1] else: logger.info("Use cpu") back_translated_docs = translate(args, text) output_file_name = "bt_" + os.path.basename(args.data_dir) output_dir = os.path.join(args.output_dir, output_file_name) folder_name = os.path.dirname(output_dir) if not os.path.isdir(folder_name): os.makedirs(folder_name) if args.return_sentence_pair: # Save original sentence pair filename, ext = os.path.splitext(output_dir) with open(filename + ".pickle", "wb") as f: pickle.dump(back_translated_docs, f) # Save back-translated sentences bt_doc = [" ".join(list(zip(*d))[1]) for d in back_translated_docs] with open(output_dir, "wt") as f: if labels is not None: tsv_writer = csv.writer(f, delimiter="\t") tsv_writer.writerow(data[0]) for line, label in zip(bt_doc, labels): tsv_writer.writerow([line, label]) else: for line in bt_doc: f.write(line) f.write('\n') # Save cross sentences new_back_translated_docs = [] for doc in back_translated_docs: new_doc = [] for j, sent in enumerate(doc): if j % 2 == 0: new_doc.append(sent) else: new_doc.append(sent[::-1]) new_back_translated_docs.append(new_doc) new_docs1, new_docs2 = [], [] for doc in new_back_translated_docs: n1, n2 = list(zip(*doc)) new_docs1.append(" ".join(n1)) new_docs2.append(" ".join(n2)) filename, ext = os.path.splitext(output_dir) with open(filename + "_pair1" + ext, "wt") as f: if labels is not None: tsv_writer = csv.writer(f, delimiter="\t") tsv_writer.writerow(data[0]) for line, label in zip(new_docs1, labels): tsv_writer.writerow([line, label]) else: for line in new_docs1: f.write(line) f.write('\n') with open(filename + "_pair2" + ext, "wt") as f: if labels is not None: tsv_writer = csv.writer(f, delimiter="\t") tsv_writer.writerow(data[0]) for line, label in zip(new_docs2, labels): tsv_writer.writerow([line, label]) else: for line in new_docs2: f.write(line) f.write('\n') else: with open(output_dir, "wt") as f: if labels is not None: tsv_writer = csv.writer(f, delimiter="\t") tsv_writer.writerow(data[0]) for line, label in zip(back_translated_docs, 
labels): tsv_writer.writerow([line, label]) else: for line in back_translated_docs: f.write(line) f.write('\n') logger.info("Translated documents are saved in {}".format(output_dir))
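# A minimal sketch of the split / multiprocess / reassemble pattern used in the
# back-translation ``main`` above: each worker tags its chunk with its index so
# ``sorted`` restores the original document order even though results arrive on
# the queue in arbitrary order.  The worker body is a placeholder for what
# ``multi_translate`` does on its assigned GPU.
from multiprocessing import Process, Queue


def fake_worker(rank, chunk, q):
    # Placeholder for per-GPU back-translation.
    q.put((rank, [s.upper() for s in chunk]))


if __name__ == "__main__":
    text = ["a", "b", "c", "d", "e"]
    n_workers = 2
    split_point = len(text) // n_workers
    chunks = [text[i * split_point:(i + 1) * split_point] for i in range(n_workers)]
    chunks[-1] += text[n_workers * split_point:]   # last worker takes the remainder
    assert sum(len(c) for c in chunks) == len(text)

    q = Queue()
    procs = [Process(target=fake_worker, args=(i, chunks[i], q)) for i in range(n_workers)]
    for p in procs:
        p.start()
    results = [q.get() for _ in procs]             # arrives in arbitrary order
    merged = []
    for _, docs in sorted(results):                # sort by worker index
        merged += docs
    for p in procs:
        p.join()
    print(merged)                                  # ['A', 'B', 'C', 'D', 'E']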
parser = argparse.ArgumentParser()
parser.add_argument('-size', type=int, help='total number of nodes')
parser.add_argument('-path', help='path of the shared file system')
args = parser.parse_args()

print("size:" + str(args.size))
print("path:" + args.path)

processes = []
num_blocks = [2, 2, 2, 2]
# stop_flag = Value('i', 0)
e = Event()

buffer_queues = []
for _ in range(6):
    buffer_queues.append(Queue(400))

layers = []
input_layer = ResInputLayer()
input_layer.share_memory()
layers.append(input_layer)

block1 = ResBlockLayer(BasicBlock, 64, num_blocks[0], 1)
block1.share_memory()
layers.append(block1)
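# A small sketch of why ``share_memory()`` is called on each layer above: once a
# module's parameters live in shared memory, child processes started with
# torch.multiprocessing see (and can update) the same storage, which is the basis
# of pipeline/Hogwild-style training.  The tiny layer and worker below are
# illustrative only, not part of the original script.
import torch
import torch.nn as nn
import torch.multiprocessing as mp


def bump(layer):
    # Runs in a child process; the update is visible to the parent because
    # the parameter tensor lives in shared memory.
    with torch.no_grad():
        layer.weight.add_(1.0)


if __name__ == "__main__":
    layer = nn.Linear(2, 2, bias=False)
    layer.share_memory()
    before = layer.weight.clone()

    p = mp.Process(target=bump, args=(layer,))
    p.start()
    p.join()

    assert torch.allclose(layer.weight, before + 1.0)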
def train(training_dbs, validation_db, system_config, model, args):
    # reading arguments from command
    start_iter = args.start_iter
    initialize = args.initialize
    gpu = args.gpu

    # reading arguments from json file
    learning_rate = system_config.learning_rate
    max_iteration = system_config.max_iter
    pretrained_model = system_config.pretrain
    stepsize = system_config.stepsize
    snapshot = system_config.snapshot
    val_iter = system_config.val_iter
    display = system_config.display
    decay_rate = system_config.decay_rate

    print("building model...")
    nnet = NetworkFactory(system_config, model, gpu=gpu)
    if initialize:
        nnet.save_params(0)
        exit(0)

    # queues storing data for training
    training_queue = Queue(system_config.prefetch_size)
    validation_queue = Queue(5)

    # queues storing pinned data for training
    pinned_training_queue = queue.Queue(system_config.prefetch_size)
    pinned_validation_queue = queue.Queue(5)

    # allocating resources for parallel reading of training data into the queue:
    # each worker gets its own training_db, so one data-reading process is spawned per worker
    training_tasks = init_parallel_jobs(system_config, training_dbs, training_queue, data_sampling_func, True)
    if val_iter:
        validation_tasks = init_parallel_jobs(system_config, [validation_db], validation_queue, data_sampling_func, False)

    # set up the stop semaphores; the pin threads copy data from training_queue into pinned_training_queue
    training_pin_semaphore = threading.Semaphore()
    validation_pin_semaphore = threading.Semaphore()
    training_pin_semaphore.acquire()
    validation_pin_semaphore.acquire()

    training_pin_args = (training_queue, pinned_training_queue, training_pin_semaphore)
    training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args)
    training_pin_thread.daemon = True
    training_pin_thread.start()

    validation_pin_args = (validation_queue, pinned_validation_queue, validation_pin_semaphore)
    validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args)
    validation_pin_thread.daemon = True
    validation_pin_thread.start()

    if pretrained_model is not None:
        if not os.path.exists(pretrained_model):
            raise ValueError("pretrained model does not exist")
        print("loading from pretrained model")
        nnet.load_pretrained_params(pretrained_model)

    if start_iter:
        nnet.load_params(start_iter)
        learning_rate /= (decay_rate ** (start_iter // stepsize))
        learning_rate = max(5e-5, learning_rate)
        nnet.set_lr(learning_rate)
        print("training starts from iteration {} with learning_rate {}".format(start_iter + 1, learning_rate))
    else:
        nnet.set_lr(learning_rate)
        print("training start, max iteration {}".format(max_iteration))

    nnet.cuda()
    nnet.train_mode()

    with stdout_to_tqdm() as save_stdout:
        for iteration in tqdm(range(start_iter + 1, max_iteration + 1), file=save_stdout, ncols=80):
            # for iteration in range(start_iter + 1, max_iteration + 1):
            training = pinned_training_queue.get(block=True)
            training_loss, focal_loss, pull_loss, push_loss, off_loss = nnet.train(training["xs"], training["ys"])

            # if display and iteration % display == 0:
            #     print("training loss at iteration {}: {}".format(iteration, training_loss.item()))
            #     # print("[log-loss]:{}={}".format(iteration, training_loss.item()))
            writer.add_scalar('train_loss', training_loss, global_step=iteration)
            writer.add_scalar('focal_loss', focal_loss, global_step=iteration)
            writer.add_scalar('pull_loss', pull_loss, global_step=iteration)
            writer.add_scalar('push_loss', push_loss, global_step=iteration)
            writer.add_scalar('off_loss', off_loss, global_step=iteration)
            del training_loss

            if val_iter and validation_db.db_inds.size and iteration % val_iter == 0:
                nnet.eval_mode()
                validation = pinned_validation_queue.get(block=True)
                validation_loss = nnet.validate(validation["xs"], validation["ys"])
                print("[log-validation-loss]:{}={}".format(iteration, validation_loss.item()))
                writer.add_scalar('validation_loss', validation_loss, global_step=iteration)
                nnet.train_mode()

            if iteration % snapshot == 0:
                nnet.save_params(iteration)

            if iteration % stepsize == 0:
                learning_rate /= decay_rate
                learning_rate = max(5e-5, learning_rate)
                nnet.set_lr(learning_rate)
                print("set learning rate {}".format(learning_rate))

    # sending the signal to kill the pin-memory threads
    training_pin_semaphore.release()
    validation_pin_semaphore.release()

    # terminating data fetching processes
    terminate_tasks(training_tasks)
    terminate_tasks(validation_tasks)
    writer.close()
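# ``pin_memory`` is referenced above but not shown here.  A plausible minimal
# implementation (an assumption, not necessarily the project's exact code): it
# moves batches from the multiprocessing queue into a regular ``queue.Queue``
# and pins the tensors so later ``.cuda(non_blocking=True)`` copies can overlap
# with compute.  The semaphore acts as a stop signal: it starts acquired, and
# the main thread releases it when training is done.
def pin_memory(data_queue, pinned_data_queue, sema):
    while True:
        data = data_queue.get()

        data["xs"] = [x.pin_memory() for x in data["xs"]]
        data["ys"] = [y.pin_memory() for y in data["ys"]]
        pinned_data_queue.put(data)

        # Semaphore released by the main thread => exit the loop.
        if sema.acquire(blocking=False):
            return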
def train(model: nn.Module, data: Union[MoleculeDataset, List[MoleculeDataset]], loss_func: Callable, optimizer: Optimizer, scheduler: _LRScheduler, args: Namespace, n_iter: int = 0, logger: logging.Logger = None, writer: SummaryWriter = None, chunk_names: bool = False, val_smiles: List[str] = None, test_smiles: List[str] = None) -> int: """ Trains a model for an epoch. :param model: Model. :param data: A MoleculeDataset (or a list of MoleculeDatasets if using moe). :param loss_func: Loss function. :param optimizer: An Optimizer. :param scheduler: A learning rate scheduler. :param args: Arguments. :param n_iter: The number of iterations (training examples) trained on so far. :param logger: A logger for printing intermediate results. :param writer: A tensorboardX SummaryWriter. :param chunk_names: Whether to train on the data in chunks. In this case, data must be a list of paths to the data chunks. :param val_smiles: Validation smiles strings without targets. :param test_smiles: Test smiles strings without targets, used for adversarial setting. :return: The total number of iterations (training examples) trained on so far. """ debug = logger.debug if logger is not None else print model.train() if args.dataset_type == 'bert_pretraining': features_loss = nn.MSELoss() if chunk_names: for path, memo_path in tqdm(data, total=len(data)): featurization.SMILES_TO_FEATURES = dict() if os.path.isfile(memo_path): found_memo = True with open(memo_path, 'rb') as f: featurization.SMILES_TO_FEATURES = pickle.load(f) else: found_memo = False with open(path, 'rb') as f: chunk = pickle.load(f) if args.moe: for source in chunk: source.shuffle() else: chunk.shuffle() n_iter = train(model=model, data=chunk, loss_func=loss_func, optimizer=optimizer, scheduler=scheduler, args=args, n_iter=n_iter, logger=logger, writer=writer, chunk_names=False, val_smiles=val_smiles, test_smiles=test_smiles) if not found_memo: with open(memo_path, 'wb') as f: pickle.dump(featurization.SMILES_TO_GRAPH, f, protocol=pickle.HIGHEST_PROTOCOL) return n_iter if not args.moe: data.shuffle() loss_sum, iter_count = 0, 0 if args.adversarial: if args.moe: train_smiles = [] for d in data: train_smiles += d.smiles() else: train_smiles = data.smiles() train_val_smiles = train_smiles + val_smiles d_loss_sum, g_loss_sum, gp_norm_sum = 0, 0, 0 if args.moe: test_smiles = list(test_smiles) random.shuffle(test_smiles) train_smiles = [] for d in data: d.shuffle() train_smiles.append(d.smiles()) num_iters = min(len(test_smiles), min([len(d) for d in data])) elif args.maml: num_iters = args.maml_batches_per_epoch * args.maml_batch_size model.zero_grad() maml_sum_loss = 0 else: num_iters = len(data) if args.last_batch else len( data) // args.batch_size * args.batch_size if args.parallel_featurization: batch_queue = Queue(args.batch_queue_max_size) exit_queue = Queue(1) batch_process = Process(target=async_mol2graph, args=(batch_queue, data, args, num_iters, args.batch_size, exit_queue, args.last_batch)) batch_process.start() currently_loaded_batches = [] iter_size = 1 if args.maml else args.batch_size for i in trange(0, num_iters, iter_size): if args.moe: if not args.batch_domain_encs: model.compute_domain_encs( train_smiles) # want to recompute every batch mol_batch = [ MoleculeDataset(d[i:i + args.batch_size]) for d in data ] train_batch, train_targets = [], [] for b in mol_batch: tb, tt = b.smiles(), b.targets() train_batch.append(tb) train_targets.append(tt) test_batch = test_smiles[i:i + args.batch_size] loss = model.compute_loss(train_batch, 
train_targets, test_batch) model.zero_grad() loss_sum += loss.item() iter_count += len(mol_batch) elif args.maml: task_train_data, task_test_data, task_idx = data.sample_maml_task( args) mol_batch = task_test_data smiles_batch, features_batch, target_batch = task_train_data.smiles( ), task_train_data.features(), task_train_data.targets(task_idx) # no mask since we only picked data points that have the desired target targets = torch.Tensor(target_batch).unsqueeze(1) if next(model.parameters()).is_cuda: targets = targets.cuda() preds = model(smiles_batch, features_batch) loss = loss_func(preds, targets) loss = loss.sum() / len(smiles_batch) grad = torch.autograd.grad( loss, [p for p in model.parameters() if p.requires_grad]) theta = [ p for p in model.named_parameters() if p[1].requires_grad ] # comes in same order as grad theta_prime = { p[0]: p[1] - args.maml_lr * grad[i] for i, p in enumerate(theta) } for name, nongrad_param in [ p for p in model.named_parameters() if not p[1].requires_grad ]: theta_prime[name] = nongrad_param + torch.zeros( nongrad_param.size()).to(nongrad_param) else: # Prepare batch if args.parallel_featurization: if len(currently_loaded_batches) == 0: currently_loaded_batches = batch_queue.get() mol_batch, featurized_mol_batch = currently_loaded_batches.pop( ) else: if not args.last_batch and i + args.batch_size > len(data): break mol_batch = MoleculeDataset(data[i:i + args.batch_size]) smiles_batch, features_batch, target_batch = mol_batch.smiles( ), mol_batch.features(), mol_batch.targets() if args.dataset_type == 'bert_pretraining': batch = mol2graph(smiles_batch, args) mask = mol_batch.mask() batch.bert_mask(mask) mask = 1 - torch.FloatTensor(mask) # num_atoms features_targets = torch.FloatTensor( target_batch['features'] ) if target_batch[ 'features'] is not None else None # num_molecules x features_size targets = torch.FloatTensor(target_batch['vocab']) # num_atoms if args.bert_vocab_func == 'feature_vector': mask = mask.reshape(-1, 1) else: targets = targets.long() else: batch = smiles_batch mask = torch.Tensor([[x is not None for x in tb] for tb in target_batch]) targets = torch.Tensor([[0 if x is None else x for x in tb] for tb in target_batch]) if next(model.parameters()).is_cuda: mask, targets = mask.cuda(), targets.cuda() if args.dataset_type == 'bert_pretraining' and features_targets is not None: features_targets = features_targets.cuda() if args.class_balance: class_weights = [] for task_num in range(data.num_tasks()): class_weights.append( args.class_weights[task_num][targets[:, task_num].long()]) class_weights = torch.stack( class_weights).t() # num_molecules x num_tasks else: class_weights = torch.ones(targets.shape) if args.cuda: class_weights = class_weights.cuda() # Run model model.zero_grad() if args.parallel_featurization: previous_graph_input_mode = model.encoder.graph_input model.encoder.graph_input = True # force model to accept already processed input preds = model(featurized_mol_batch, features_batch) model.encoder.graph_input = previous_graph_input_mode else: preds = model(batch, features_batch) if args.dataset_type == 'regression_with_binning': preds = preds.view(targets.size(0), targets.size(1), -1) targets = targets.long() loss = 0 for task in range(targets.size(1)): loss += loss_func( preds[:, task, :], targets[:, task] ) * class_weights[:, task] * mask[:, task] # for some reason cross entropy doesn't support multi target loss = loss.sum() / mask.sum() else: if args.dataset_type == 'unsupervised': targets = targets.long().reshape(-1) if 
args.dataset_type == 'bert_pretraining': features_preds, preds = preds['features'], preds['vocab'] if args.dataset_type == 'kernel': preds = preds.view(int(preds.size(0) / 2), 2, preds.size(1)) preds = model.kernel_output_layer(preds) loss = loss_func(preds, targets) * class_weights * mask if args.predict_features_and_task: loss = (loss.sum() + loss[:, :-args.features_size].sum() * (args.task_weight-1)) \ / (mask.sum() + mask[:, :-args.features_size].sum() * (args.task_weight-1)) else: loss = loss.sum() / mask.sum() if args.dataset_type == 'bert_pretraining' and features_targets is not None: loss += features_loss(features_preds, features_targets) loss_sum += loss.item() iter_count += len(mol_batch) if args.maml: model_prime = build_model(args=args, params=theta_prime) smiles_batch, features_batch, target_batch = task_test_data.smiles( ), task_test_data.features(), [ t[task_idx] for t in task_test_data.targets() ] # no mask since we only picked data points that have the desired target targets = torch.Tensor([[t] for t in target_batch]) if next(model_prime.parameters()).is_cuda: targets = targets.cuda() model_prime.zero_grad() preds = model_prime(smiles_batch, features_batch) loss = loss_func(preds, targets) loss = loss.sum() / len(smiles_batch) loss_sum += loss.item() iter_count += len( smiles_batch ) # TODO check that this makes sense, but it's just for display maml_sum_loss += loss if i % args.maml_batch_size == args.maml_batch_size - 1: maml_sum_loss.backward() optimizer.step() model.zero_grad() maml_sum_loss = 0 else: loss.backward() if args.max_grad_norm is not None: clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() if args.adjust_weight_decay: current_pnorm = compute_pnorm(model) if current_pnorm < args.pnorm_target: for i in range(len(optimizer.param_groups)): optimizer.param_groups[i]['weight_decay'] = max( 0, optimizer.param_groups[i]['weight_decay'] - args.adjust_weight_decay_step) else: for i in range(len(optimizer.param_groups)): optimizer.param_groups[i][ 'weight_decay'] += args.adjust_weight_decay_step if isinstance(scheduler, NoamLR): scheduler.step() if args.adversarial: for _ in range(args.gan_d_per_g): train_val_smiles_batch = random.sample(train_val_smiles, args.batch_size) test_smiles_batch = random.sample(test_smiles, args.batch_size) d_loss, gp_norm = model.train_D(train_val_smiles_batch, test_smiles_batch) train_val_smiles_batch = random.sample(train_val_smiles, args.batch_size) test_smiles_batch = random.sample(test_smiles, args.batch_size) g_loss = model.train_G(train_val_smiles_batch, test_smiles_batch) # we probably only care about the g_loss honestly d_loss_sum += d_loss * args.batch_size gp_norm_sum += gp_norm * args.batch_size g_loss_sum += g_loss * args.batch_size n_iter += len(mol_batch) # Log and/or add to tensorboard if (n_iter // args.batch_size) % args.log_frequency == 0: lrs = scheduler.get_lr() pnorm = compute_pnorm(model) gnorm = compute_gnorm(model) loss_avg = loss_sum / iter_count if args.adversarial: d_loss_avg, g_loss_avg, gp_norm_avg = d_loss_sum / iter_count, g_loss_sum / iter_count, gp_norm_sum / iter_count d_loss_sum, g_loss_sum, gp_norm_sum = 0, 0, 0 loss_sum, iter_count = 0, 0 lrs_str = ', '.join(f'lr_{i} = {lr:.4e}' for i, lr in enumerate(lrs)) debug( f'Loss = {loss_avg:.4e}, PNorm = {pnorm:.4f}, GNorm = {gnorm:.4f}, {lrs_str}' ) if args.adversarial: debug( f'D Loss = {d_loss_avg:.4e}, G Loss = {g_loss_avg:.4e}, GP Norm = {gp_norm_avg:.4}' ) if writer is not None: writer.add_scalar('train_loss', loss_avg, n_iter) 
writer.add_scalar('param_norm', pnorm, n_iter) writer.add_scalar('gradient_norm', gnorm, n_iter) for i, lr in enumerate(lrs): writer.add_scalar(f'learning_rate_{i}', lr, n_iter) if args.parallel_featurization: exit_queue.put( 0) # dummy var to get the subprocess to know that we're done batch_process.join() return n_iter
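# ``async_mol2graph`` is launched above as a separate featurization process but
# is not shown here.  A minimal stand-in for the same pattern (an assumption,
# not chemprop's real implementation): a producer keeps a bounded queue of
# pre-processed batches ahead of the training loop, and waits on a tiny
# ``exit_queue`` at the end so the parent can tell it to stop.
from multiprocessing import Process, Queue


def async_producer(batch_queue, data, batch_size, num_iters, exit_queue):
    for i in range(0, num_iters, batch_size):
        batch = data[i:i + batch_size]
        featurized = [len(x) for x in batch]    # placeholder for mol2graph()
        batch_queue.put([(batch, featurized)])  # blocks when the queue is full
    exit_queue.get()                            # wait for the consumer's "done" token


if __name__ == "__main__":
    data = ["CCO", "c1ccccc1", "CC(=O)O", "CN"]
    batch_queue, exit_queue = Queue(2), Queue(1)
    p = Process(target=async_producer, args=(batch_queue, data, 2, len(data), exit_queue))
    p.start()
    loaded = []
    for _ in range(2):
        loaded.extend(batch_queue.get())
    exit_queue.put(0)                           # dummy token, mirroring the code above
    p.join()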
def train(training_dbs, validation_db, system_config, model, args): # reading arguments from command start_iter = args.start_iter distributed = args.distributed world_size = args.world_size initialize = args.initialize gpu = args.gpu rank = args.rank # reading arguments from json file batch_size = system_config.batch_size print(batch_size) learning_rate = system_config.learning_rate max_iteration = system_config.max_iter pretrained_model = system_config.pretrain stepsize = system_config.stepsize snapshot = system_config.snapshot val_iter = system_config.val_iter display = system_config.display decay_rate = system_config.decay_rate stepsize = system_config.stepsize print("Process {}: building model...".format(rank)) nnet = NetworkFactory(system_config, model, distributed=distributed, gpu=gpu) if initialize: nnet.save_params(0) exit(0) # queues storing data for training training_queue = Queue(system_config.prefetch_size) # validation_queue = Queue(5) # queues storing pinned data for training pinned_training_queue = queue.Queue(system_config.prefetch_size) # pinned_validation_queue = queue.Queue(5) # allocating resources for parallel reading training_tasks = init_parallel_jobs(system_config, training_dbs, training_queue, data_sampling_func, True) # if val_iter: # validation_tasks = init_parallel_jobs(system_config, [validation_db], validation_queue, data_sampling_func, False) training_pin_semaphore = threading.Semaphore() # validation_pin_semaphore = threading.Semaphore() training_pin_semaphore.acquire() # validation_pin_semaphore.acquire() training_pin_args = (training_queue, pinned_training_queue, training_pin_semaphore) training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args) training_pin_thread.daemon = True training_pin_thread.start() # validation_pin_args = (validation_queue, pinned_validation_queue, validation_pin_semaphore) # validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args) # validation_pin_thread.daemon = True # validation_pin_thread.start() if pretrained_model is not None: if not os.path.exists(pretrained_model): raise ValueError("pretrained model does not exist") print("Process {}: loading from pretrained model".format(rank)) nnet.load_pretrained_params(pretrained_model) if start_iter: nnet.load_params(start_iter) learning_rate /= (decay_rate**(start_iter // stepsize)) nnet.set_lr(learning_rate) print( "Process {}: training starts from iteration {} with learning_rate {}" .format(rank, start_iter + 1, learning_rate)) else: nnet.set_lr(learning_rate) if rank == 0: print("training start...") nnet.cuda() nnet.train_mode() with stdout_to_tqdm() as save_stdout: for iteration in range(start_iter + 1, max_iteration + 1): training = pinned_training_queue.get(block=True) training_loss = nnet.train(**training) if display and iteration % display == 0: print("Process {}: training loss at iteration {}: {}".format( rank, iteration, training_loss.item())) del training_loss # if val_iter and validation_db.db_inds.size and iteration % val_iter == 0: # nnet.eval_mode() # validation = pinned_validation_queue.get(block=True) # validation_loss = nnet.validate(**validation) # print("Process {}: validation loss at iteration {}: {}".format(rank, iteration, validation_loss.item())) # nnet.train_mode() if iteration % snapshot == 0 and rank == 0: nnet.save_params(iteration) if iteration % stepsize == 0: learning_rate /= decay_rate nnet.set_lr(learning_rate) # sending signal to kill the thread training_pin_semaphore.release() # 
validation_pin_semaphore.release() # terminating data fetching processes terminate_tasks(training_tasks)
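# Both training loops above implement the same step decay: the learning rate is
# divided by ``decay_rate`` every ``stepsize`` iterations (the first variant also
# floors it at 5e-5).  Restoring a run from ``start_iter`` therefore only needs
# the closed form below; the numbers are illustrative.
def decayed_lr(base_lr, iteration, stepsize, decay_rate, floor=None):
    lr = base_lr / (decay_rate ** (iteration // stepsize))
    return max(floor, lr) if floor is not None else lr


print(decayed_lr(2.5e-4, 250000, 200000, 10))              # ~2.5e-05
print(decayed_lr(2.5e-4, 450000, 200000, 10, floor=5e-5))  # 5e-05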
class SkipGramModel(nn.Module): """ Negative sampling based skip-gram """ def __init__( self, emb_size, emb_dimension, batch_size, only_cpu, only_gpu, only_fst, only_snd, mix, neg_weight, negative, lr, lap_norm, fast_neg, record_loss, async_update, num_threads, ): """ initialize embedding on CPU Paremeters ---------- emb_size int : number of nodes emb_dimension int : embedding dimension batch_size int : number of node sequences in each batch only_cpu bool : training with CPU only_gpu bool : training with GPU only_fst bool : only embedding for first-order proximity only_snd bool : only embedding for second-order proximity mix bool : mixed training with CPU and GPU negative int : negative samples for each positve node pair neg_weight float : negative weight lr float : initial learning rate lap_norm float : weight of laplacian normalization fast_neg bool : do negative sampling inside a batch record_loss bool : print the loss during training use_context_weight : give different weights to the nodes in a context window async_update : asynchronous training """ super(SkipGramModel, self).__init__() self.emb_size = emb_size self.batch_size = batch_size self.only_cpu = only_cpu self.only_gpu = only_gpu if only_fst: self.fst = True self.snd = False self.emb_dimension = emb_dimension elif only_snd: self.fst = False self.snd = True self.emb_dimension = emb_dimension else: self.fst = True self.snd = True self.emb_dimension = int(emb_dimension / 2) self.mixed_train = mix self.neg_weight = neg_weight self.negative = negative self.lr = lr self.lap_norm = lap_norm self.fast_neg = fast_neg self.record_loss = record_loss self.async_update = async_update self.num_threads = num_threads # initialize the device as cpu self.device = torch.device("cpu") # embedding initrange = 1.0 / self.emb_dimension if self.fst: self.fst_u_embeddings = nn.Embedding(self.emb_size, self.emb_dimension, sparse=True) init.uniform_(self.fst_u_embeddings.weight.data, -initrange, initrange) if self.snd: self.snd_u_embeddings = nn.Embedding(self.emb_size, self.emb_dimension, sparse=True) init.uniform_(self.snd_u_embeddings.weight.data, -initrange, initrange) self.snd_v_embeddings = nn.Embedding(self.emb_size, self.emb_dimension, sparse=True) init.constant_(self.snd_v_embeddings.weight.data, 0) # lookup_table is used for fast sigmoid computing self.lookup_table = torch.sigmoid(torch.arange(-6.01, 6.01, 0.01)) self.lookup_table[0] = 0. self.lookup_table[-1] = 1. if self.record_loss: self.logsigmoid_table = torch.log( torch.sigmoid(torch.arange(-6.01, 6.01, 0.01))) self.loss_fst = [] self.loss_snd = [] # indexes to select positive/negative node pairs from batch_walks self.index_emb_negu, self.index_emb_negv = init_emb2neg_index( self.negative, self.batch_size) # adam if self.fst: self.fst_state_sum_u = torch.zeros(self.emb_size) if self.snd: self.snd_state_sum_u = torch.zeros(self.emb_size) self.snd_state_sum_v = torch.zeros(self.emb_size) def create_async_update(self): """ Set up the async update subprocess. """ self.async_q = Queue(1) self.async_p = mp.Process(target=async_update, args=(self.num_threads, self, self.async_q)) self.async_p.start() def finish_async_update(self): """ Notify the async update subprocess to quit. 
""" self.async_q.put((None, None, None, None, None)) self.async_p.join() def share_memory(self): """ share the parameters across subprocesses """ if self.fst: self.fst_u_embeddings.weight.share_memory_() self.fst_state_sum_u.share_memory_() if self.snd: self.snd_u_embeddings.weight.share_memory_() self.snd_v_embeddings.weight.share_memory_() self.snd_state_sum_u.share_memory_() self.snd_state_sum_v.share_memory_() def set_device(self, gpu_id): """ set gpu device """ self.device = torch.device("cuda:%d" % gpu_id) print("The device is", self.device) self.lookup_table = self.lookup_table.to(self.device) if self.record_loss: self.logsigmoid_table = self.logsigmoid_table.to(self.device) self.index_emb_negu = self.index_emb_negu.to(self.device) self.index_emb_negv = self.index_emb_negv.to(self.device) def all_to_device(self, gpu_id): """ move all of the parameters to a single GPU """ self.device = torch.device("cuda:%d" % gpu_id) self.set_device(gpu_id) if self.fst: self.fst_u_embeddings = self.fst_u_embeddings.cuda(gpu_id) self.fst_state_sum_u = self.fst_state_sum_u.to(self.device) if self.snd: self.snd_u_embeddings = self.snd_u_embeddings.cuda(gpu_id) self.snd_v_embeddings = self.snd_v_embeddings.cuda(gpu_id) self.snd_state_sum_u = self.snd_state_sum_u.to(self.device) self.snd_state_sum_v = self.snd_state_sum_v.to(self.device) def fast_sigmoid(self, score): """ do fast sigmoid by looking up in a pre-defined table """ idx = torch.floor((score + 6.01) / 0.01).long() return self.lookup_table[idx] def fast_logsigmoid(self, score): """ do fast logsigmoid by looking up in a pre-defined table """ idx = torch.floor((score + 6.01) / 0.01).long() return self.logsigmoid_table[idx] def fast_pos_bp(self, emb_pos_u, emb_pos_v, first_flag): """ get grad for positve samples """ pos_score = torch.sum(torch.mul(emb_pos_u, emb_pos_v), dim=1) pos_score = torch.clamp(pos_score, max=6, min=-6) # [batch_size, 1] score = (1 - self.fast_sigmoid(pos_score)).unsqueeze(1) if self.record_loss: if first_flag: self.loss_fst.append( torch.mean(self.fast_logsigmoid(pos_score)).item()) else: self.loss_snd.append( torch.mean(self.fast_logsigmoid(pos_score)).item()) # [batch_size, dim] if self.lap_norm > 0: grad_u_pos = score * emb_pos_v + self.lap_norm * (emb_pos_v - emb_pos_u) grad_v_pos = score * emb_pos_u + self.lap_norm * (emb_pos_u - emb_pos_v) else: grad_u_pos = score * emb_pos_v grad_v_pos = score * emb_pos_u return grad_u_pos, grad_v_pos def fast_neg_bp(self, emb_neg_u, emb_neg_v, first_flag): """ get grad for negative samples """ neg_score = torch.sum(torch.mul(emb_neg_u, emb_neg_v), dim=1) neg_score = torch.clamp(neg_score, max=6, min=-6) # [batch_size * negative, 1] score = -self.fast_sigmoid(neg_score).unsqueeze(1) if self.record_loss: if first_flag: self.loss_fst.append( self.negative * self.neg_weight * torch.mean(self.fast_logsigmoid(-neg_score)).item()) else: self.loss_snd.append( self.negative * self.neg_weight * torch.mean(self.fast_logsigmoid(-neg_score)).item()) grad_u_neg = self.neg_weight * score * emb_neg_v grad_v_neg = self.neg_weight * score * emb_neg_u return grad_u_neg, grad_v_neg def fast_learn(self, batch_edges, neg_nodes=None): """ Learn a batch of edges in a fast way. It has the following features: 1. It calculating the gradients directly without the forward operation. 2. It does sigmoid by a looking up table. 
Specifically, for each positive/negative node pair (i,j), the updating procedure is as following: score = self.fast_sigmoid(u_embedding[i].dot(v_embedding[j])) # label = 1 for positive samples; label = 0 for negative samples. u_embedding[i] += (label - score) * v_embedding[j] v_embedding[i] += (label - score) * u_embedding[j] Parameters ---------- batch_edges list : a list of node sequnces neg_nodes torch.LongTensor : a long tensor of sampled true negative nodes. If neg_nodes is None, then do negative sampling randomly from the nodes in batch_walks as an alternative. Usage example ------------- batch_walks = torch.LongTensor([[1,2], [3,4], [5,6]]) neg_nodes = None """ lr = self.lr # [batch_size, 2] nodes = batch_edges if self.only_gpu: nodes = nodes.to(self.device) if neg_nodes is not None: neg_nodes = neg_nodes.to(self.device) bs = len(nodes) if self.fst: emb_u = self.fst_u_embeddings(nodes[:, 0]).view( -1, self.emb_dimension).to(self.device) emb_v = self.fst_u_embeddings(nodes[:, 1]).view( -1, self.emb_dimension).to(self.device) ## Postive emb_pos_u, emb_pos_v = emb_u, emb_v grad_u_pos, grad_v_pos = self.fast_pos_bp(emb_pos_u, emb_pos_v, True) ## Negative emb_neg_u = emb_pos_u.repeat((self.negative, 1)) if bs < self.batch_size: index_emb_negu, index_emb_negv = init_emb2neg_index( self.negative, bs) index_emb_negu = index_emb_negu.to(self.device) index_emb_negv = index_emb_negv.to(self.device) else: index_emb_negu = self.index_emb_negu index_emb_negv = self.index_emb_negv if neg_nodes is None: emb_neg_v = torch.index_select(emb_v, 0, index_emb_negv) else: emb_neg_v = self.fst_u_embeddings.weight[neg_nodes].to( self.device) grad_u_neg, grad_v_neg = self.fast_neg_bp(emb_neg_u, emb_neg_v, True) ## Update grad_u_pos.index_add_(0, index_emb_negu, grad_u_neg) grad_u = grad_u_pos if neg_nodes is None: grad_v_pos.index_add_(0, index_emb_negv, grad_v_neg) grad_v = grad_v_pos else: grad_v = grad_v_pos # use adam optimizer grad_u = adam(grad_u, self.fst_state_sum_u, nodes[:, 0], lr, self.device, self.only_gpu) grad_v = adam(grad_v, self.fst_state_sum_u, nodes[:, 1], lr, self.device, self.only_gpu) if neg_nodes is not None: grad_v_neg = adam(grad_v_neg, self.fst_state_sum_u, neg_nodes, lr, self.device, self.only_gpu) if self.mixed_train: grad_u = grad_u.cpu() grad_v = grad_v.cpu() if neg_nodes is not None: grad_v_neg = grad_v_neg.cpu() else: grad_v_neg = None if self.async_update: grad_u.share_memory_() grad_v.share_memory_() nodes.share_memory_() if neg_nodes is not None: neg_nodes.share_memory_() grad_v_neg.share_memory_() self.async_q.put( (grad_u, grad_v, grad_v_neg, nodes, neg_nodes, True)) if not self.async_update: self.fst_u_embeddings.weight.data.index_add_( 0, nodes[:, 0], grad_u) self.fst_u_embeddings.weight.data.index_add_( 0, nodes[:, 1], grad_v) if neg_nodes is not None: self.fst_u_embeddings.weight.data.index_add_( 0, neg_nodes, grad_v_neg) if self.snd: emb_u = self.snd_u_embeddings(nodes[:, 0]).view( -1, self.emb_dimension).to(self.device) emb_v = self.snd_v_embeddings(nodes[:, 1]).view( -1, self.emb_dimension).to(self.device) ## Postive emb_pos_u, emb_pos_v = emb_u, emb_v grad_u_pos, grad_v_pos = self.fast_pos_bp(emb_pos_u, emb_pos_v, False) ## Negative emb_neg_u = emb_pos_u.repeat((self.negative, 1)) if bs < self.batch_size: index_emb_negu, index_emb_negv = init_emb2neg_index( self.negative, bs) index_emb_negu = index_emb_negu.to(self.device) index_emb_negv = index_emb_negv.to(self.device) else: index_emb_negu = self.index_emb_negu index_emb_negv = self.index_emb_negv if neg_nodes is 
None: emb_neg_v = torch.index_select(emb_v, 0, index_emb_negv) else: emb_neg_v = self.snd_v_embeddings.weight[neg_nodes].to( self.device) grad_u_neg, grad_v_neg = self.fast_neg_bp(emb_neg_u, emb_neg_v, False) ## Update grad_u_pos.index_add_(0, index_emb_negu, grad_u_neg) grad_u = grad_u_pos if neg_nodes is None: grad_v_pos.index_add_(0, index_emb_negv, grad_v_neg) grad_v = grad_v_pos else: grad_v = grad_v_pos # use adam optimizer grad_u = adam(grad_u, self.snd_state_sum_u, nodes[:, 0], lr, self.device, self.only_gpu) grad_v = adam(grad_v, self.snd_state_sum_v, nodes[:, 1], lr, self.device, self.only_gpu) if neg_nodes is not None: grad_v_neg = adam(grad_v_neg, self.snd_state_sum_v, neg_nodes, lr, self.device, self.only_gpu) if self.mixed_train: grad_u = grad_u.cpu() grad_v = grad_v.cpu() if neg_nodes is not None: grad_v_neg = grad_v_neg.cpu() else: grad_v_neg = None if self.async_update: grad_u.share_memory_() grad_v.share_memory_() nodes.share_memory_() if neg_nodes is not None: neg_nodes.share_memory_() grad_v_neg.share_memory_() self.async_q.put( (grad_u, grad_v, grad_v_neg, nodes, neg_nodes, False)) if not self.async_update: self.snd_u_embeddings.weight.data.index_add_( 0, nodes[:, 0], grad_u) self.snd_v_embeddings.weight.data.index_add_( 0, nodes[:, 1], grad_v) if neg_nodes is not None: self.snd_v_embeddings.weight.data.index_add_( 0, neg_nodes, grad_v_neg) return def get_embedding(self): if self.fst: embedding_fst = self.fst_u_embeddings.weight.cpu().data.numpy() embedding_fst /= np.sqrt(np.sum(embedding_fst * embedding_fst, 1)).reshape(-1, 1) if self.snd: embedding_snd = self.snd_u_embeddings.weight.cpu().data.numpy() embedding_snd /= np.sqrt(np.sum(embedding_snd * embedding_snd, 1)).reshape(-1, 1) if self.fst and self.snd: embedding = np.concatenate((embedding_fst, embedding_snd), 1) embedding /= np.sqrt(np.sum(embedding * embedding, 1)).reshape(-1, 1) elif self.fst and not self.snd: embedding = embedding_fst elif self.snd and not self.fst: embedding = embedding_snd else: pass return embedding def save_embedding(self, dataset, file_name): """ Write embedding to local file. Only used when node ids are numbers. Parameter --------- dataset DeepwalkDataset : the dataset file_name str : the file name """ embedding = self.get_embedding() np.save(file_name, embedding) def save_embedding_pt(self, dataset, file_name): """ For ogb leaderboard. """ embedding = torch.Tensor(self.get_embedding()).cpu() embedding_empty = torch.zeros_like(embedding.data) valid_nodes = torch.LongTensor(dataset.valid_nodes) valid_embedding = embedding.data.index_select(0, valid_nodes) embedding_empty.index_add_(0, valid_nodes, valid_embedding) torch.save(embedding_empty, file_name)
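# A usage sketch for SkipGramModel (argument values are illustrative; the real
# driver builds them from command-line flags and a DGL random-walk dataloader).
# With ``mix=True`` the embeddings stay in shared CPU memory, a trainer calls
# ``set_device`` for its GPU, and gradients are pushed back asynchronously
# through ``create_async_update``.
import torch

model = SkipGramModel(
    emb_size=10000, emb_dimension=128, batch_size=1024,
    only_cpu=False, only_gpu=False, only_fst=False, only_snd=False, mix=True,
    neg_weight=1.0, negative=5, lr=0.025, lap_norm=0.01, fast_neg=True,
    record_loss=False, async_update=True, num_threads=4,
)
model.share_memory()           # parameters visible to all trainer processes
model.create_async_update()    # spawn the background update process
model.set_device(0)            # lookup tables / index buffers onto GPU 0

batch_edges = torch.LongTensor([[1, 2], [3, 4], [5, 6]])
model.fast_learn(batch_edges)  # manual forward/backward, no autograd

model.finish_async_update()
embedding = model.get_embedding()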
def evaluate(config, directories, seeds_per_thread=5, repeats=1, model_workers=None, average_weights=False): models = [] if model_workers is None: model_workers = config['training']['num_threads_model_workers'] threads = config['training']['num_threads_exploring_virtual'] + config['training']['num_threads_exploiting_virtual'] for model_directory in directories: models.append(load_model(model_directory)) if average_weights: average_model = create_model(config['model']) average_model.train() average_model.to(torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')) #print("start averaging") average_update(average_model.actor, [model.actor for model in models]) average_model = AverageModel([average_model], config, repeats) else: average_model = AverageModel(models, config, repeats) processes = [] results = Queue() observation_queue = Queue() action_queue = Queue() observation_conns = [mp.Pipe(duplex=False) for _ in range(threads)] action_conns = [mp.Pipe(duplex=False) for _ in range(threads)] try: for p_id in range(model_workers): p = mp.Process( target=client_model_worker, args=(average_model, observation_queue, action_queue) ) p.start() processes.append(p) in_observation_conns = _get_in_connections(observation_conns) out_observation_conns = _get_out_connections(observation_conns) p = mp.Process( target=client_observation_worker, args=(in_observation_conns, observation_queue) ) p.start() processes.append(p) in_action_conns = _get_in_connections(action_conns) out_action_conns = _get_out_connections(action_conns) p = mp.Process( target=client_action_worker, args=(out_action_conns, action_queue) ) p.start() processes.append(p) for p_id in range(threads): p = mp.Process( target=evaluate_single_thread, args=( p_id, RemoteModel(in_action_conns[p_id], out_observation_conns[p_id]), config, seeds_per_thread, results ) ) p.start() processes.append(p) rewards_total = [] rewards_without_falling = [] modified_rewards_total = [] step_counts_total = [] infos_total = [] for _ in range(threads): rewards, modified_rewards, step_counts, infos = results.get() rewards_total += rewards modified_rewards_total += modified_rewards step_counts_total += step_counts infos_total += infos for r, s in zip(rewards, step_counts): if s > 999: rewards_without_falling.append(r) finally: for p in processes: p.terminate() return rewards_total, modified_rewards_total, step_counts_total, infos_total, rewards_without_falling
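# ``RemoteModel`` above wraps one end of each one-way pipe pair: the evaluation
# process sends observations through its observation pipe and blocks on its
# action pipe, while the worker processes batch observations from all threads
# (routing replies by which pipe they arrived on) and send actions back.  A
# plausible minimal client, stated as an assumption about the interface rather
# than the original implementation:
class RemoteModel:
    def __init__(self, action_conn, observation_conn):
        self.action_conn = action_conn            # read end: actions from the model worker
        self.observation_conn = observation_conn  # write end: observations to the model worker

    def act(self, observation):
        self.observation_conn.send(observation)
        return self.action_conn.recv()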
class QIterable(Iterable[Instance]): """ You can't set attributes on Iterators, so this is just a dumb wrapper that exposes the output_queue. """ def __init__(self, output_queue_size, epochs_per_read, num_workers, reader, file_path) -> None: self.output_queue = Queue(output_queue_size) self.epochs_per_read = epochs_per_read self.num_workers = num_workers self.reader = reader self.file_path = file_path # Initialized in start. self.input_queue: Optional[Queue] = None self.processes: List[Process] = [] # The num_active_workers and num_inflight_items counts in conjunction # determine whether there could be any outstanding instances. self.num_active_workers: Optional[Value] = None self.num_inflight_items: Optional[Value] = None def __iter__(self) -> Iterator[Instance]: self.start() # Keep going as long as not all the workers have finished or there are items in flight. while self.num_active_workers.value > 0 or self.num_inflight_items.value > 0: # Inner loop to minimize locking on self.num_active_workers. while True: try: # Non-blocking to handle the empty-queue case. yield self.output_queue.get(block=False, timeout=1.0) with self.num_inflight_items.get_lock(): self.num_inflight_items.value -= 1 except Empty: # The queue could be empty because the workers are # all finished or because they're busy processing. # The outer loop distinguishes between these two # cases. break self.join() def start(self) -> None: shards = glob.glob(self.file_path) # Ensure a consistent order before shuffling for testing. shards.sort() num_shards = len(shards) # If we want multiple epochs per read, put shards in the queue multiple times. self.input_queue = Queue(num_shards * self.epochs_per_read + self.num_workers) for _ in range(self.epochs_per_read): np.random.shuffle(shards) for shard in shards: self.input_queue.put(shard) # Then put a None per worker to signify no more files. for _ in range(self.num_workers): self.input_queue.put(None) assert ( not self.processes ), "Process list non-empty! You must call QIterable.join() before restarting." self.num_active_workers = Value("i", self.num_workers) self.num_inflight_items = Value("i", 0) for worker_id in range(self.num_workers): process = Process( target=_worker, args=( self.reader, self.input_queue, self.output_queue, self.num_active_workers, self.num_inflight_items, worker_id, ), ) logger.info(f"starting worker {worker_id}") process.start() self.processes.append(process) def join(self) -> None: for process in self.processes: process.join() self.processes.clear() def __del__(self) -> None: """ Terminate processes if the user hasn't joined. This is necessary as leaving stray processes running can corrupt shared state. In brief, we've observed shared memory counters being reused (when the memory was free from the perspective of the parent process) while the stray workers still held a reference to them. For a discussion of using destructors in Python in this manner, see https://eli.thegreenplace.net/2009/06/12/safely-using-destructors-in-python/. """ for process in self.processes: process.terminate()
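# The ``_worker`` launched by ``QIterable.start`` takes the two shared counters,
# unlike the earlier queue-only version above.  A plausible counter-aware worker
# (an assumption about the protocol, not verbatim library code): it bumps
# ``num_inflight_items`` under its lock *before* putting each instance, and only
# decrements ``num_active_workers`` once its input is exhausted, so the consumer
# can safely conclude that "no active workers and nothing in flight" means done.
def _worker(reader, input_queue, output_queue, num_active_workers, num_inflight_items, worker_id):
    try:
        while True:
            file_path = input_queue.get()
            if file_path is None:
                break
            logger.info(f"reading instances from {file_path}")
            for instance in reader.read(file_path):
                with num_inflight_items.get_lock():
                    num_inflight_items.value += 1
                output_queue.put(instance)
    finally:
        # Mark this worker as finished last, after all of its puts are counted.
        with num_active_workers.get_lock():
            num_active_workers.value -= 1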
def train_controller(current_time): """ Train the controllers by using the CMA-ES algorithm to improve candidature solutions by testing them in parallel using multiprocessing """ current_time = str(current_time) number_generations = 1 games = GAMES levels = LEVELS current_game = False result_queue = Queue() vae, lstm, best_controller, solver, checkpoint = init_models( current_time, sequence=1, load_vae=True, load_controller=True, load_lstm=True) if checkpoint: current_ctrl_version = checkpoint["version"] current_solver_version = checkpoint["solver_version"] new_results = solver.result() current_best = new_results[1] else: current_ctrl_version = 1 current_solver_version = 1 current_best = 0 while True: solutions = solver.ask() fitlist = np.zeros(POPULATION) eval_left = 0 ## Once a level is beaten, remove it from the training set of levels if current_best > SCORE_CAP or not current_game: if not current_game or len(levels[current_game]) == 0: current_game = games[0] games.remove(current_game) current_best = 0 current_level = np.random.choice(levels[current_game]) levels[current_game].remove(current_level) print("[CONTROLLER] Current game: %s and level is: %s" % (current_game, current_level)) while eval_left < POPULATION: jobs = [] todo = PARALLEL if eval_left + PARALLEL <= POPULATION else ( eval_left + PARALLEL) % POPULATION ## Create the child processes to evaluate in parallel print("[CONTROLLER] Starting new batch") for job in range(todo): process_id = eval_left + job ## Assign new weights to the controller, given by the CMA controller = Controller(PARAMS_CONTROLLER, ACTION_SPACE).to(DEVICE) init_controller(controller, solutions[process_id]) ## Start the evaluation new_game = VAECGame(process_id, vae, lstm, controller, current_game, current_level, result_queue) new_game.start() jobs.append(new_game) ## Wait for the evaluation to be completed for p in jobs: p.join() eval_left = eval_left + todo print("[CONTROLLER] Done with batch") ## Get the results back from the processes times = create_results(result_queue, fitlist) ## For display current_score = np.max(fitlist) average_score = np.mean(fitlist) ## Update solver with results max_idx = np.argmax(fitlist) fitlist = rankmin(fitlist) solver.tell(fitlist) new_results = solver.result() ## Display print("[CONTROLLER] Total duration for generation: %.3f seconds, average duration:" " %.3f seconds per process, %.3f seconds per run" % ((np.sum(times), \ np.mean(times), np.mean(times) / REPEAT_ROLLOUT))) print("[CONTROLLER] Creating generation: {} ...".format( number_generations + 1)) print("[CONTROLLER] Current best score: {}, new run best score: {}". 
format(current_best, current_score)) print( "[CONTROLLER] Best score ever: {}, current number of improvements: {}" .format(current_best, current_ctrl_version)) print( "[CONTROLLER] Average score on all of the processes: {}\n".format( average_score)) ## Save the new best controller if current_score > current_best: init_controller(best_controller, solutions[max_idx]) state = { 'version': current_ctrl_version, 'solver_version': current_solver_version, 'score': current_score, 'level': current_level, 'game': current_game, 'generation': number_generations } save_checkpoint(best_controller, "controller", state, current_time) current_ctrl_version += 1 current_best = current_score ## Save solver and change level to a random one if number_generations % SAVE_SOLVER_TICK == 0: dir_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), \ 'saved_models', current_time, "{}-solver.pkl".format(current_solver_version)) pickle.dump(solver, open(dir_path, 'wb')) current_solver_version += 1 current_level = np.random.choice(levels[current_game]) number_generations += 1
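# ``create_results`` is called above but not shown.  A plausible minimal version
# (an assumption about its contract): each VAECGame process puts
# ``(process_id, fitness, elapsed_seconds)`` on ``result_queue``; this drains the
# queue, writes the fitness into the right slot of ``fitlist``, and returns the
# per-process durations used for the timing printout.  It runs after all jobs
# have been joined, so ``empty()`` is a safe stop condition here.
def create_results(result_queue, fitlist):
    times = []
    while not result_queue.empty():
        process_id, fitness, elapsed = result_queue.get()
        fitlist[process_id] = fitness
        times.append(elapsed)
    return times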
    frames = torch.cat(frames, dim=0).cuda()
    H, W = frames.size()[2:]
    frames = F.interpolate(frames, size=(768, 768), mode='bilinear',
                           align_corners=False)  # must be divisible by 32
    out = net(frames)[0]
    out = F.interpolate(out, size=(H, W), mode='bilinear',
                        align_corners=False).argmax(dim=1).detach().cpu()
    out_q.put(out)


if __name__ == '__main__':
    torch.multiprocessing.set_start_method('spawn')

    in_q = Queue(1024)
    out_q = Queue(1024)

    in_worker = Process(target=get_func, args=(args.input, in_q))
    out_worker = Process(target=save_func, args=(args.input, args.output, out_q))
    in_worker.start()
    out_worker.start()

    net = get_model()

    frames = []
    while True:
        frame = in_q.get()
        if frame == 'quit':
            break
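# ``get_func`` and ``save_func`` are launched above but not defined in this
# snippet.  A minimal stand-in pair (assumptions, not the original code): the
# reader pushes frames and finally a 'quit' sentinel, and the writer drains
# results until it receives its own sentinel.  ``read_frames`` and
# ``write_frames`` are hypothetical helpers standing in for the real I/O.
def get_func(input_path, in_q):
    for frame in read_frames(input_path):        # hypothetical frame source
        in_q.put(frame)
    in_q.put('quit')


def save_func(input_path, output_path, out_q):
    while True:
        out = out_q.get()
        if isinstance(out, str) and out == 'quit':
            break
        write_frames(output_path, out)           # hypothetical sink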
class KGEmbedding: """Sparse Embedding for Knowledge Graph It is used to store both entity embeddings and relation embeddings. Parameters ---------- num : int Number of embeddings. dim : int Embedding dimention size. device : th.device Device to store the embedding. """ def __init__(self, device): self.emb = None self.is_train = False self.async_q = None self.device = device def init(self, emb_init, lr, async_threads, num=-1, dim=-1, init_strat='uniform', optimizer='Adagrad', device=None): """Initializing the embeddings for training. Parameters ---------- emb_init : float or tuple The intial embedding range should be [-emb_init, emb_init]. """ self.async_threads = async_threads if device is not None: self.device = device if self.emb is None: self.emb = th.empty(num, dim, dtype=th.float32, device=self.device) self.num = self.emb.shape[0] self.dim = self.emb.shape[1] if optimizer == 'Adagrad': self.optim = Adagrad(self.emb, device=self.device, lr=lr) elif optimizer == 'Adam': self.optim = Adam(self.emb, device=self.device, lr=lr) else: raise NotImplementedError(f'optimizer {optimizer} is not supported by dglke yet.') self.trace = [] self.has_cross_rel = False if init_strat == 'uniform': INIT.uniform_(self.emb, -emb_init, emb_init) elif init_strat == 'normal': if type(emb_init) is tuple or type(emb_init) is list: if len(emb_init) == 0: mean = emb_init std = 1 else: mean, std = emb_init INIT.normal_(self.emb.data, mean, std) else: init_size = emb_init INIT.normal_(self.emb.data) self.emb.data *= init_size elif init_strat == 'random': if type(emb_init) is tuple: x, y = emb_init self.emb.data = th.rand(num, dim, dtype=th.float32, device=self.device) * x + y elif init_strat == 'xavier': INIT.xavier_normal_(self.emb.data) elif init_strat == 'constant': INIT.constant_(self.emb.data, emb_init) def clone(self, device): clone_emb = copy.deepcopy(self) clone_emb.device = device clone_emb.emb = clone_emb.emb.to(device) clone_emb.optim = clone_emb.optim.to(device) return clone_emb def load(self, path, name): """Load embeddings. Parameters ---------- path : str Directory to load the embedding. name : str Embedding name. """ file_name = os.path.join(path, name) self.emb = th.Tensor(np.load(file_name)) def load_emb(self, emb_array): """Load embeddings from numpy array. Parameters ---------- emb_array : numpy.array or torch.tensor Embedding array in numpy array or torch.tensor """ if isinstance(emb_array, np.ndarray): self.emb = th.Tensor(emb_array) else: self.emb = emb_array def save(self, path, name): """Save embeddings. Parameters ---------- path : str Directory to save the embedding. name : str Embedding name. """ file_name = os.path.join(path, name) np.save(file_name, self.emb.cpu().detach().numpy()) def train(self): self.is_train = True def eval(self): self.is_train = False def setup_cross_rels(self, cross_rels, global_emb): cpu_bitmap = th.zeros((self.num,), dtype=th.bool) for i, rel in enumerate(cross_rels): cpu_bitmap[rel] = 1 self.cpu_bitmap = cpu_bitmap self.has_cross_rel = True self.global_emb = global_emb def get_noncross_idx(self, idx): cpu_mask = self.cpu_bitmap[idx] gpu_mask = ~cpu_mask return idx[gpu_mask] def share_memory(self): """Use torch.tensor.share_memory_() to allow cross process tensor access """ self.emb.share_memory_() self.optim.share_memory() def __call__(self, idx, gpu_id=-1, trace=True): """ Return sliced tensor. Parameters ---------- idx : th.tensor Slicing index gpu_id : int Which gpu to put sliced data in. trace : bool If True, trace the computation. 
This is required in training. If False, do not trace the computation. Default: True """ # for inference or evaluation if self.is_train is False: return self.emb[idx].cuda(gpu_id, non_blocking=True) if self.has_cross_rel: cpu_idx = idx.cpu() cpu_mask = self.cpu_bitmap[cpu_idx] cpu_idx = cpu_idx[cpu_mask] cpu_idx = th.unique(cpu_idx) if cpu_idx.shape[0] != 0: cpu_emb = self.global_emb.emb[cpu_idx] self.emb[cpu_idx] = cpu_emb.cuda(gpu_id, non_blocking=True) s = self.emb[idx] if gpu_id >= 0: s = s.cuda(gpu_id, non_blocking=True) # During the training, we need to trace the computation. # In this case, we need to record the computation path and compute the gradients. if trace: data = s.clone().detach().requires_grad_(True) self.trace.append((idx, data)) else: data = s return data def update(self, gpu_id=-1): """ Update embeddings in a sparse manner Sparse embeddings are updated in mini batches. we maintains gradient states for each embedding so they can be updated separately. Parameters ---------- gpu_id : int Which gpu to accelerate the calculation. if -1 is provided, cpu is used. """ with th.no_grad(): for idx, data in self.trace: grad = data.grad.data # the update is non-linear so indices must be unique grad_indices = idx grad_values = grad if self.async_q is not None: grad_indices.share_memory_() grad_values.share_memory_() self.async_q.put((grad_indices, grad_values, gpu_id)) else: if self.has_cross_rel: cpu_mask = self.cpu_bitmap[grad_indices] cpu_idx = grad_indices[cpu_mask] if cpu_idx.shape[0] > 0: cpu_grad = grad_values[cpu_mask] self.global_emb.optim.step(cpu_idx, self.global_emb.emb, cpu_grad, gpu_id) self.optim.step(grad_indices, self.emb, grad_values, gpu_id) self.trace = [] def create_async_update(self): """Set up the async update subprocess. """ self.async_q = Queue(1) self.async_p = mp.Process(target=self.async_update) self.async_p.start() def finish_async_update(self): """Notify the async update subprocess to quit. """ self.async_q.put((None, None, None)) self.async_p.join() def async_update(self): th.set_num_threads(self.async_threads) while True: (grad_indices, grad_values, gpu_id) = self.async_q.get() if grad_indices is None: return with th.no_grad(): if self.has_cross_rel: cpu_mask = self.cpu_bitmap[grad_indices] cpu_idx = grad_indices[cpu_mask] if cpu_idx.shape[0] > 0: cpu_grad = grad_values[cpu_mask] self.global_emb.optim.step(cpu_idx, self.global_emb.emb, cpu_grad, gpu_id) self.optim.step(grad_indices, self.emb, grad_values, gpu_id) def curr_emb(self): """Return embeddings in trace. """ data = [data for _, data in self.trace] return th.cat(data, 0)
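# A short usage sketch for KGEmbedding (values are illustrative).  The typical
# flow in multi-process training: initialize on CPU, share the storage, start
# the async updater, then inside each trainer slice / backward / update.
emb = KGEmbedding(device=th.device('cpu'))
emb.init(emb_init=0.1, lr=0.01, async_threads=2, num=1000, dim=200)
emb.share_memory()          # expose storage to trainer processes
emb.create_async_update()   # gradient updates applied by a background process

emb.train()
idx = th.LongTensor([1, 5, 7])
vec = emb(idx)              # traced slice, requires_grad=True
loss = (vec * vec).sum()
loss.backward()
emb.update()                # pushes (indices, grads) to the async queue

emb.finish_async_update()
emb.save('.', 'entity.npy')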
def create_async_update(self):
    """Set up the async update subprocess."""
    self.async_q = Queue(1)
    self.async_p = mp.Process(target=self.async_update)
    self.async_p.start()
if __name__ == '__main__':
    opt = TestOptions().parse()

    data_info = data.dataset_info()
    datanum = data_info.get_dataset(opt)[0]
    folderlevel = data_info.folder_level[datanum]

    dataloaders = data.create_dataloader_test(opt)

    visualizer = Visualizer(opt)
    iter_counter = IterationCounter(opt, len(dataloaders[0]) * opt.render_thread)

    # create a webpage that summarizes all the results
    testing_queue = Queue(10)

    ngpus = opt.device_count
    render_gpu_ids = list(range(ngpus - opt.render_thread, ngpus))
    render_layer_list = []
    for gpu in render_gpu_ids:
        opt.gpu_ids = gpu
        render_layer = TestRender(opt)
        render_layer_list.append(render_layer)

    opt.gpu_ids = list(range(0, ngpus - opt.render_thread))
    print('Testing gpu ', opt.gpu_ids)
    if opt.names is None:
        model = TestModel(opt)
        model.eval()
def call_mods(args):
    print("[main]call_mods starts..")
    start = time.time()

    model_path = os.path.abspath(args.model_path)
    if not os.path.exists(model_path):
        raise ValueError("--model_path is not set right!")
    input_path = os.path.abspath(args.input_path)
    if not os.path.exists(input_path):
        raise ValueError("--input_path does not exist!")

    success_file = input_path.rstrip("/") + "." + str(uuid.uuid1()) + ".success"
    if os.path.exists(success_file):
        os.remove(success_file)

    if os.path.isdir(input_path):
        motif_seqs, chrom2len, fast5s_q, len_fast5s, positions = _extract_preprocess(
            input_path, str2bool(args.recursively), args.motifs, str2bool(args.is_dna),
            args.reference_path, args.f5_batch_size, args.positions)
        if use_cuda:
            _call_mods_from_fast5s_gpu(motif_seqs, chrom2len, fast5s_q, len_fast5s,
                                       positions, model_path, success_file, args)
        else:
            _call_mods_from_fast5s_cpu2(motif_seqs, chrom2len, fast5s_q, len_fast5s,
                                        positions, model_path, success_file, args)
    else:
        # features_batch_q = mp.Queue()
        features_batch_q = Queue()
        p_rf = mp.Process(target=_read_features_file,
                          args=(input_path, features_batch_q, args.batch_size))
        p_rf.daemon = True
        p_rf.start()

        # pred_str_q = mp.Queue()
        pred_str_q = Queue()

        predstr_procs = []
        if use_cuda:
            nproc_dp = args.nproc_gpu
            if nproc_dp < 1:
                nproc_dp = 1
        else:
            nproc = args.nproc
            if nproc < 3:
                print("--nproc must be >= 3!!")
                nproc = 3
            nproc_dp = nproc - 2
            if nproc_dp > nproc_to_call_mods_in_cpu_mode:
                nproc_dp = nproc_to_call_mods_in_cpu_mode

        for _ in range(nproc_dp):
            p = mp.Process(target=_call_mods_q,
                           args=(model_path, features_batch_q, pred_str_q, success_file, args))
            p.daemon = True
            p.start()
            predstr_procs.append(p)

        # print("write_process started..")
        p_w = mp.Process(target=_write_predstr_to_file, args=(args.result_file, pred_str_q))
        p_w.daemon = True
        p_w.start()

        for p in predstr_procs:
            p.join()

        # print("finishing the write_process..")
        pred_str_q.put("kill")

        p_rf.join()
        p_w.join()

    if os.path.exists(success_file):
        os.remove(success_file)

    print("[main]call_mods costs %.2f seconds.." % (time.time() - start))
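# ``_write_predstr_to_file`` (the writer process above) is not shown here.  A
# plausible minimal version, assuming each item on ``pred_str_q`` is a list of
# result lines and the parent pushes the "kill" sentinel once the prediction
# workers have all joined:
def _write_predstr_to_file(write_fp, predstr_q):
    with open(write_fp, "w") as wf:
        while True:
            pred_str = predstr_q.get()
            if pred_str == "kill":
                break
            for one_pred_str in pred_str:
                wf.write(one_pred_str + "\n")
            wf.flush()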
def main(method): args = built_parser(method=method) env = gym.make(args.env_name) state_dim = env.observation_space.shape action_dim = env.action_space.shape[0] args.state_dim = state_dim args.action_dim = action_dim action_high = env.action_space.high action_low = env.action_space.low args.action_high = action_high.tolist() args.action_low = action_low.tolist() args.seed = np.random.randint(0, 30) args.init_time = time.time() if args.alpha == 'auto' and args.target_entropy == 'auto': delta_a = np.array(args.action_high, dtype=np.float32) - np.array( args.action_low, dtype=np.float32) args.target_entropy = -1 * args.action_dim + sum(np.log(delta_a / 2)) Q_net1 = QNet(args) Q_net1.train() Q_net1.share_memory() Q_net1_target = QNet(args) Q_net1_target.train() Q_net1_target.share_memory() Q_net2 = QNet(args) Q_net2.train() Q_net2.share_memory() Q_net2_target = QNet(args) Q_net2_target.train() Q_net2_target.share_memory() actor1 = PolicyNet(args) if args.code_model == "eval": actor1.load_state_dict( torch.load('./' + args.env_name + '/method_' + str(args.method) + '/model/policy_' + str(args.max_train) + '.pkl')) actor1.train() actor1.share_memory() actor1_target = PolicyNet(args) actor1_target.train() actor1_target.share_memory() actor2 = PolicyNet(args) actor2.train() actor2.share_memory() actor2_target = PolicyNet(args) actor2_target.train() actor2_target.share_memory() Q_net1_target.load_state_dict(Q_net1.state_dict()) Q_net2_target.load_state_dict(Q_net2.state_dict()) actor1_target.load_state_dict(actor1.state_dict()) actor2_target.load_state_dict(actor2.state_dict()) Q_net1_optimizer = my_optim.SharedAdam(Q_net1.parameters(), lr=args.critic_lr) Q_net1_optimizer.share_memory() Q_net2_optimizer = my_optim.SharedAdam(Q_net2.parameters(), lr=args.critic_lr) Q_net2_optimizer.share_memory() actor1_optimizer = my_optim.SharedAdam(actor1.parameters(), lr=args.actor_lr) actor1_optimizer.share_memory() actor2_optimizer = my_optim.SharedAdam(actor2.parameters(), lr=args.actor_lr) actor2_optimizer.share_memory() log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True) log_alpha.share_memory_() alpha_optimizer = my_optim.SharedAdam([log_alpha], lr=args.alpha_lr) alpha_optimizer.share_memory() share_net = [ Q_net1, Q_net1_target, Q_net2, Q_net2_target, actor1, actor1_target, actor2, actor2_target, log_alpha ] share_optimizer = [ Q_net1_optimizer, Q_net2_optimizer, actor1_optimizer, actor2_optimizer, alpha_optimizer ] experience_in_queue = [] experience_out_queue = [] for i in range(args.num_buffers): experience_in_queue.append(Queue(maxsize=10)) experience_out_queue.append(Queue(maxsize=10)) shared_queue = [experience_in_queue, experience_out_queue] step_counter = mp.Value('i', 0) stop_sign = mp.Value('i', 0) iteration_counter = mp.Value('i', 0) shared_value = [step_counter, stop_sign, iteration_counter] lock = mp.Lock() procs = [] if args.code_model == "train": for i in range(args.num_actors): procs.append( Process(target=actor_agent, args=(args, shared_queue, shared_value, [actor1, Q_net1], lock, i))) for i in range(args.num_buffers): procs.append( Process(target=buffer, args=(args, shared_queue, shared_value, i))) procs.append( Process(target=evaluate_agent, args=(args, shared_value, share_net))) for i in range(args.num_learners): #device = torch.device("cuda") device = torch.device("cpu") procs.append( Process(target=leaner_agent, args=(args, shared_queue, shared_value, share_net, share_optimizer, device, lock, i))) elif args.code_model == "simu": 
procs.append(Process(target=simu_agent, args=(args, shared_value))) for p in procs: p.start() for p in procs: p.join()
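# Rough sketch of the share_memory() pattern the training setup above relies
# on: the parent builds the networks once, moves their parameters to shared
# memory, and every actor/learner process then reads and updates the same
# storage (Hogwild-style). SharedAdam is project-specific, so plain SGD is
# used here to keep the example self-contained.
import torch
import torch.nn as nn
import torch.multiprocessing as mp


def learner(net, steps):
    opt = torch.optim.SGD(net.parameters(), lr=0.1)
    for _ in range(steps):
        loss = net(torch.randn(8, 4)).pow(2).mean()
        opt.zero_grad()
        loss.backward()
        opt.step()                  # updates the shared parameters in place


if __name__ == "__main__":
    net = nn.Linear(4, 2)
    net.share_memory()              # same parameter tensors in every child process
    procs = [mp.Process(target=learner, args=(net, 10)) for _ in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(net.weight)               # reflects updates made by both learners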
class Base(object): def __init__(self): self.epoch = 0 self.iteration = 0 self.offset = 0 # for multiprocessing self._epoch = 0 # Setting for multiprocessing self.preloading_process = None self.queue = Queue() self.queue_size = 0 def count_vocab_size(self, dict_path): vocab_count = 1 # for <blank> with codecs.open(dict_path, 'r', 'utf-8') as f: for line in f: if line.strip() != '': vocab_count += 1 return vocab_count def __len__(self): return len(self.df) def __getitem__(self, index): raise NotImplementedError() def __iter__(self): """Returns self.""" return self @property def epoch_detail(self): # Floating point version of epoch return self.epoch + (self.offset / len(self)) def __next__(self, batch_size=None): """Generate each mini-batch. Args: batch_size (int): the size of mini-batch Returns: batch (tuple): is_new_epoch (bool): If true, 1 epoch is finished """ if batch_size is None: batch_size = self.batch_size if self.nques is None: if self.max_epoch is not None and self.epoch >= self.max_epoch: raise StopIteration() # NOTE: max_epoch == None means infinite loop data_indices, is_new_epoch = self.sample_index(batch_size) batch = self.make_batch(data_indices) self.iteration += len(data_indices) else: # Clean up multiprocessing if self.preloading_process is not None and self.queue_size == 0: self.preloading_process.terminate() self.preloading_process.join() if self.max_epoch is not None and self.epoch >= self.max_epoch: # Clean up multiprocessing self.preloading_process.terminate() self.preloading_process.join() raise StopIteration() # NOTE: max_epoch == None means infinite loop # Enqueue mini-batches if self.queue_size == 0: self.data_indices_list = [] self.is_new_epoch_list = [] for _ in six.moves.range(self.nques): data_indices, is_new_epoch = self.sample_index(batch_size) self.data_indices_list.append(data_indices) self.is_new_epoch_list.append(is_new_epoch) self.preloading_process = Process( self.preloading_loop, args=(self.queue, self.data_indices_list)) self.preloading_process.start() self.queue_size += self.nques time.sleep(3) # print(self.queue.qsize()) # print(self.queue_size) self.iteration += len(self.data_indices_list[self.nques - self.queue_size]) self.queue_size -= 1 batch = self.queue.get() is_new_epoch = self.is_new_epoch_list.pop(0) if is_new_epoch: self.epoch += 1 return batch, is_new_epoch def next(self, batch_size=None): # For python2 return self.__next__(batch_size) def sample_index(self, batch_size): """Sample data indices of mini-batch. 
Args: batch_size (int): the size of mini-batch Returns: data_indices (np.ndarray): is_new_epoch (bool): """ is_new_epoch = False if self.sort_by_input_length or not self.shuffle: if self.sort_by_input_length: # Change batch size dynamically min_num_frames_batch = self.df[self.offset:self.offset + 1]['x_len'].values[0] _batch_size = self.select_batch_size(batch_size, min_num_frames_batch) else: _batch_size = batch_size if len(self.rest) > _batch_size: data_indices = list(self.df[self.offset:self.offset + _batch_size].index) self.rest -= set(data_indices) # NOTE: rest is in uttrance length order when sort_by_input_length == True # NOTE: otherwise in name length order when shuffle == False self.offset += len(data_indices) else: # Last mini-batch data_indices = list(self.df[self.offset:self.offset + len(self.rest)].index) self._reset() is_new_epoch = True self._epoch += 1 if self._epoch == self.sort_stop_epoch: self.sort_by_input_length = False self.shuffle = True # Sort in the descending order for pytorch data_indices = data_indices[::-1] else: # Randomly sample uttrances if len(self.rest) > batch_size: data_indices = random.sample(list(self.rest), batch_size) self.rest -= set(data_indices) else: # Last mini-batch data_indices = list(self.rest) self._reset() is_new_epoch = True self._epoch += 1 self.offset += len(data_indices) return data_indices, is_new_epoch def select_batch_size(self, batch_size, min_num_frames_batch): if not self.dynamic_batching: return batch_size if min_num_frames_batch <= 800: pass elif min_num_frames_batch <= 1600: batch_size = int(batch_size / 2) else: batch_size = int(batch_size / 4) if batch_size < 1: batch_size = 1 return batch_size def reset(self): self._reset() self.queue = Queue() self.queue_size = 0 # Clean up multiprocessing if self.preloading_process is not None: self.preloading_process.terminate() self.preloading_process.join() def _reset(self): """Reset data counter and offset.""" self.rest = set(list(self.df.index)) self.offset = 0 def preloading_loop(self, queue, data_indices_list): """. Args: queue (): data_indices_list (np.ndarray): """ # print("Pre-loading started.") for i in six.moves.range(len(data_indices_list)): queue.put(self.make_batch(data_indices_list[i]))
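# Condensed sketch of the queue-based prefetching done in __next__ above: a
# child process materialises the next few mini-batches while the iterator
# drains them one by one. make_batch is a stand-in for the real method.
import multiprocessing as mp


def make_batch(indices):
    return [i * 10 for i in indices]          # placeholder for real batch building


def preloading_loop(queue, data_indices_list):
    for indices in data_indices_list:
        queue.put(make_batch(indices))


if __name__ == "__main__":
    queue = mp.Queue()
    indices_list = [[0, 1], [2, 3], [4, 5]]
    proc = mp.Process(target=preloading_loop, args=(queue, indices_list))
    proc.start()
    for _ in indices_list:
        print(queue.get())                    # consume batches as they are produced
    proc.join()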
def train(training_dbs, validation_db, start_iter=0, freeze=False): learning_rate = system_configs.learning_rate max_iteration = system_configs.max_iter pretrained_model = system_configs.pretrain snapshot = system_configs.snapshot val_iter = system_configs.val_iter display = system_configs.display decay_rate = system_configs.decay_rate stepsize = system_configs.stepsize batch_size = system_configs.batch_size # getting the size of each database training_size = len(training_dbs[0].db_inds) validation_size = len(validation_db.db_inds) # queues storing data for training training_queue = Queue(system_configs.prefetch_size) # 5 validation_queue = Queue(5) # queues storing pinned data for training pinned_training_queue = queue.Queue(system_configs.prefetch_size) # 5 pinned_validation_queue = queue.Queue(5) # load data sampling function data_file = "sample.{}".format(training_dbs[0].data) # "sample.coco" sample_data = importlib.import_module(data_file).sample_data # print(type(sample_data)) # function # allocating resources for parallel reading training_tasks = init_parallel_jobs(training_dbs, training_queue, sample_data) if val_iter: validation_tasks = init_parallel_jobs([validation_db], validation_queue, sample_data) training_pin_semaphore = threading.Semaphore() validation_pin_semaphore = threading.Semaphore() training_pin_semaphore.acquire() validation_pin_semaphore.acquire() training_pin_args = (training_queue, pinned_training_queue, training_pin_semaphore) training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args) training_pin_thread.daemon = True training_pin_thread.start() validation_pin_args = (validation_queue, pinned_validation_queue, validation_pin_semaphore) validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args) validation_pin_thread.daemon = True validation_pin_thread.start() print("building model...") nnet = NetworkFactory(flag=True) if pretrained_model is not None: if not os.path.exists(pretrained_model): raise ValueError("pretrained model does not exist") print("loading from pretrained model") nnet.load_pretrained_params(pretrained_model) if start_iter: learning_rate /= (decay_rate**(start_iter // stepsize)) nnet.load_params(start_iter) nnet.set_lr(learning_rate) print("training starts from iteration {} with learning_rate {}".format( start_iter + 1, learning_rate)) else: nnet.set_lr(learning_rate) print("training start...") nnet.cuda() nnet.train_mode() header = None metric_logger = utils.MetricLogger(delimiter=" ") metric_logger.add_meter( 'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) metric_logger.add_meter( 'class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) with stdout_to_tqdm() as save_stdout: for iteration in metric_logger.log_every(tqdm(range( start_iter + 1, max_iteration + 1), file=save_stdout, ncols=67), print_freq=10, header=header): training = pinned_training_queue.get(block=True) viz_split = 'train' save = True if (display and iteration % display == 0) else False (set_loss, loss_dict) \ = nnet.train(iteration, save, viz_split, **training) (loss_dict_reduced, loss_dict_reduced_unscaled, loss_dict_reduced_scaled, loss_value) = loss_dict metric_logger.update(loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled) metric_logger.update(class_error=loss_dict_reduced['class_error']) metric_logger.update(lr=learning_rate) del set_loss if val_iter and validation_db.db_inds.size and iteration % val_iter == 0: nnet.eval_mode() viz_split = 'val' save = True validation = 
pinned_validation_queue.get(block=True) (val_set_loss, val_loss_dict) \ = nnet.validate(iteration, save, viz_split, **validation) (loss_dict_reduced, loss_dict_reduced_unscaled, loss_dict_reduced_scaled, loss_value) = val_loss_dict print('[VAL LOG]\t[Saving training and evaluating images...]') metric_logger.update(loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled) metric_logger.update( class_error=loss_dict_reduced['class_error']) metric_logger.update(lr=learning_rate) nnet.train_mode() if iteration % snapshot == 0: nnet.save_params(iteration) if iteration % stepsize == 0: learning_rate /= decay_rate nnet.set_lr(learning_rate) if iteration % (training_size // batch_size) == 0: metric_logger.synchronize_between_processes() print("Averaged stats:", metric_logger) # sending signal to kill the thread training_pin_semaphore.release() validation_pin_semaphore.release() # terminating data fetching processes for training_task in training_tasks: training_task.terminate() for validation_task in validation_tasks: validation_task.terminate()
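# Sketch of the pin-memory bridge used above, under the assumption that the
# pin_memory() helper moves CPU batches from the multiprocessing queue into a
# thread-local queue of pinned tensors until the semaphore is released by the
# main thread. Pinning itself only happens when CUDA is available.
import threading
import queue
import torch
import torch.multiprocessing as mp


def pin_memory(data_queue, pinned_queue, semaphore):
    while True:
        # Exit as soon as the main thread releases the semaphore.
        if semaphore.acquire(blocking=False):
            return
        try:
            batch = data_queue.get(timeout=0.1)
        except queue.Empty:
            continue
        if torch.cuda.is_available():
            batch = batch.pin_memory()        # page-locked memory for fast H2D copies
        pinned_queue.put(batch)


if __name__ == "__main__":
    data_q, pinned_q = mp.Queue(5), queue.Queue(5)
    sem = threading.Semaphore()
    sem.acquire()                             # hold it while the thread should keep running
    t = threading.Thread(target=pin_memory, args=(data_q, pinned_q, sem), daemon=True)
    t.start()
    data_q.put(torch.ones(2, 3))
    print(pinned_q.get().is_pinned())
    sem.release()                             # signal the thread to exit
    t.join()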
class ExternalEmbedding: """Sparse Embedding for Knowledge Graph It is used to store both entity embeddings and relation embeddings. Parameters ---------- args : Global configs. num : int Number of embeddings. dim : int Embedding dimention size. device : th.device Device to store the embedding. """ def __init__(self, args, num, dim, device): self.gpu = args.gpu self.args = args self.num = num self.trace = [] self.emb = th.empty(num, dim, dtype=th.float32, device=device) self.state_sum = self.emb.new().resize_(self.emb.size(0)).zero_() self.state_step = 0 self.has_cross_rel = False # queue used by asynchronous update self.async_q = None # asynchronous update process self.async_p = None def init(self, emb_init): """Initializing the embeddings. Parameters ---------- emb_init : float The intial embedding range should be [-emb_init, emb_init]. """ INIT.uniform_(self.emb, -emb_init, emb_init) INIT.zeros_(self.state_sum) def setup_cross_rels(self, cross_rels, global_emb): cpu_bitmap = th.zeros((self.num, ), dtype=th.bool) for i, rel in enumerate(cross_rels): cpu_bitmap[rel] = 1 self.cpu_bitmap = cpu_bitmap self.has_cross_rel = True self.global_emb = global_emb def get_noncross_idx(self, idx): cpu_mask = self.cpu_bitmap[idx] gpu_mask = ~cpu_mask return idx[gpu_mask] def share_memory(self): """Use torch.tensor.share_memory_() to allow cross process tensor access """ self.emb.share_memory_() self.state_sum.share_memory_() def __call__(self, idx, gpu_id=-1, trace=True): """ Return sliced tensor. Parameters ---------- idx : th.tensor Slicing index gpu_id : int Which gpu to put sliced data in. trace : bool If True, trace the computation. This is required in training. If False, do not trace the computation. Default: True """ if self.has_cross_rel: cpu_idx = idx.cpu() cpu_mask = self.cpu_bitmap[cpu_idx] cpu_idx = cpu_idx[cpu_mask] cpu_idx = th.unique(cpu_idx) if cpu_idx.shape[0] != 0: cpu_emb = self.global_emb.emb[cpu_idx] self.emb[cpu_idx] = cpu_emb.cuda(gpu_id) s = self.emb[idx] if gpu_id >= 0: s = s.cuda(gpu_id) # During the training, we need to trace the computation. # In this case, we need to record the computation path and compute the gradients. if trace: data = s.clone().detach().requires_grad_(True) self.trace.append((idx, data)) else: data = s return data def update(self, gpu_id=-1): """ Update embeddings in a sparse manner Sparse embeddings are updated in mini batches. we maintains gradient states for each embedding so they can be updated separately. Parameters ---------- gpu_id : int Which gpu to accelerate the calculation. if -1 is provided, cpu is used. 
""" self.state_step += 1 with th.no_grad(): for idx, data in self.trace: grad = data.grad.data clr = self.args.lr #clr = self.args.lr / (1 + (self.state_step - 1) * group['lr_decay']) # the update is non-linear so indices must be unique grad_indices = idx grad_values = grad if self.async_q is not None: grad_indices.share_memory_() grad_values.share_memory_() self.async_q.put((grad_indices, grad_values, gpu_id)) else: grad_sum = (grad_values * grad_values).mean(1) device = self.state_sum.device if device != grad_indices.device: grad_indices = grad_indices.to(device) if device != grad_sum.device: grad_sum = grad_sum.to(device) if self.has_cross_rel: cpu_mask = self.cpu_bitmap[grad_indices] cpu_idx = grad_indices[cpu_mask] if cpu_idx.shape[0] > 0: cpu_grad = grad_values[cpu_mask] cpu_sum = grad_sum[cpu_mask].cpu() cpu_idx = cpu_idx.cpu() self.global_emb.state_sum.index_add_( 0, cpu_idx, cpu_sum) std = self.global_emb.state_sum[cpu_idx] if gpu_id >= 0: std = std.cuda(gpu_id) std_values = std.sqrt_().add_(1e-10).unsqueeze(1) tmp = (-clr * cpu_grad / std_values) tmp = tmp.cpu() self.global_emb.emb.index_add_(0, cpu_idx, tmp) self.state_sum.index_add_(0, grad_indices, grad_sum) std = self.state_sum[grad_indices] # _sparse_mask if gpu_id >= 0: std = std.cuda(gpu_id) std_values = std.sqrt_().add_(1e-10).unsqueeze(1) tmp = (-clr * grad_values / std_values) if tmp.device != device: tmp = tmp.to(device) # TODO(zhengda) the overhead is here. self.emb.index_add_(0, grad_indices, tmp) self.trace = [] def create_async_update(self): """Set up the async update subprocess. """ self.async_q = Queue(1) self.async_p = mp.Process(target=async_update, args=(self.args, self, self.async_q)) self.async_p.start() def finish_async_update(self): """Notify the async update subprocess to quit. """ self.async_q.put((None, None, None)) self.async_p.join() def curr_emb(self): """Return embeddings in trace. """ data = [data for _, data in self.trace] return th.cat(data, 0) def save(self, path, name): """Save embeddings. Parameters ---------- path : str Directory to save the embedding. name : str Embedding name. """ file_name = os.path.join(path, name + '.npy') np.save(file_name, self.emb.cpu().detach().numpy()) def load(self, path, name): """Load embeddings. Parameters ---------- path : str Directory to load the embedding. name : str Embedding name. """ file_name = os.path.join(path, name + '.npy') self.emb = th.Tensor(np.load(file_name))
with torch.no_grad(): r_gen = RolloutGenerator(args.logdir, device, time_limit) while e_queue.empty(): if p_queue.empty(): sleep(.1) else: s_id, params = p_queue.get() r_queue.put((s_id, r_gen.rollout(params))) ################################################################################ # Define queues and start workers # ################################################################################ p_queue = Queue() r_queue = Queue() e_queue = Queue() for p_index in range(num_workers): Process(target=slave_routine, args=(p_queue, r_queue, e_queue, p_index)).start() ################################################################################ # Evaluation # ################################################################################ def evaluate(solutions, results, rollouts=100): """ Give current controller evaluation. Evaluation is minus the cumulated reward averaged over rollout runs.
class PlmVectorizationPredictor(object): def __init__(self, model_info, config=None, vectorization=False): if isinstance(model_info, dict): self.args = model_info['config'] self.model = model_info['model'] self.tokenizer = model_info['tokenizer'] elif isinstance(model_info, str): self.model_path = model_info self.args = self.load_config(config) model = PlmModel(self.args) self.model = self.load_model(model, model_info) self.tokenizer = self.model.tokenizer else: raise ValueError('error..') if self.args.data_type == 'query': self.id_field = 'description_id' else: self.id_field = 'paper_id' if vectorization: self.dest_filename = self.args.dest_filename self.output_queue = Queue(-1) self.worker = Process(target=self.np2str) self.worker.daemon = True self.worker.start() def predict(self, src_filename, dest_filename): self.model.eval() existed_ids = set() for item in read_jsonline_lazy(dest_filename, default=[]): existed_ids.add(item[self.id_field]) loader = VectorizationDataLoader(src_filename, self.tokenizer, self.args) cos = nn.CosineSimilarity(dim=1) tp_count = 0 total_count = 0 for batch in loader: with torch.no_grad(): query_embed = self.model(batch, 'query') true_embed = self.model(batch, 'true') false_embed = self.model(batch, 'false') true_scores = cos(query_embed, true_embed) false_scores = cos(query_embed, false_embed) print(true_scores, false_scores) total_count += query_embed.size(0) tp_count += (true_scores > false_scores).sum().cpu().numpy().tolist() accuray = tp_count / total_count return accuray def np2str(self): while True: batch = self.output_queue.get(block=True) lines = [] for sent_embed, data_id in zip(batch['vector'], batch['index']): vec_str = np.array2string(sent_embed, separator=' ', floatmode='maxprec')[1:-1] vec_str = ' '.join([line.strip() for line in vec_str.splitlines(False)]) line = data_id + ' ' + vec_str lines.append(line) # if not getattr(self,'dest_filename',None): # print(lines, batch) append_lines(self.dest_filename, lines) def vectorize(self, src_filename, dest_filename): self.dest_filename = dest_filename loader = VectorizationDataLoader(src_filename, self.tokenizer, self.args) for batch in loader: with torch.no_grad(): sent_embed_list = self.model(batch, prefix=None).cpu().numpy() self.output_queue.put({'vector': sent_embed_list, 'index': batch['data_ids']}) def load_model(self, model, model_path): if torch.cuda.is_available(): checkpoint = torch.load(model_path) else: checkpoint = torch.load(model_path, map_location=torch.device('cpu')) state_dict = OrderedDict() # avoid error when load parallel trained model for k, v in checkpoint.items(): if k.startswith('module.'): k = k[7:] state_dict[k] = v model.load_state_dict(state_dict) if torch.cuda.is_available(): model = model.cuda() return model def load_config(self, custom_config): # default_config = vars(parse_args(parser=self.parser)) config_path = os.path.splitext(self.model_path)[0] + '.json' model_config = read_json(config_path) if custom_config: config_dict = {**model_config, **custom_config} else: config_dict = model_config config = Munch(config_dict) return config
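# Simplified sketch of the background np2str writer above: a daemon process
# consumes {vector, index} batches from a queue and appends formatted lines to
# a file, so vectorisation never blocks on disk I/O. The file name, the output
# format and the None sentinel (added so the example terminates) are
# illustrative only.
import multiprocessing as mp
import numpy as np


def vector_writer(q, dest_filename):
    while True:
        batch = q.get(block=True)
        if batch is None:                      # sentinel: stop the writer
            return
        with open(dest_filename, "a", encoding="utf-8") as f:
            for vec, data_id in zip(batch["vector"], batch["index"]):
                vec_str = " ".join("%.6f" % v for v in vec)
                f.write(data_id + " " + vec_str + "\n")


if __name__ == "__main__":
    output_queue = mp.Queue(-1)
    worker = mp.Process(target=vector_writer, args=(output_queue, "vectors.txt"))
    worker.daemon = True
    worker.start()
    output_queue.put({"vector": np.random.rand(2, 4), "index": ["doc1", "doc2"]})
    output_queue.put(None)
    worker.join()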
class VariationalAutoEncoder(nn.Module): ''' Implementation of a Variational AutoEncoder in pytorch. Currently two decoder/encoder units are supported. The first unit features a two layer dense neural network and the second a deep convolutional net. The number of laternt units can be specified and is usually around 4-12 units. The implmentation supports cuda. If cuda is not used the multiprocessing framework is/can be used to send the computation to the background, so the jupyter notebook it runs in will not be blocked. ''' def __init__(self, n_latent_units, drop_ratio, convolutional=False): ''' Constructor :param n_latent_units: :param drop_ratio: ''' super(VariationalAutoEncoder, self).__init__() self.encoder = Encoder.Encoder(n_latent_units, drop_ratio) if not convolutional \ else ConvEncoder.Encoder(n_latent_units, drop_ratio) self.decoder = Decoder.Decoder(n_latent_units, drop_ratio) if not convolutional \ else ConvDecoder.Decoder(n_latent_units, drop_ratio) self.proc = None self.counter_epoch = Counter() self.counter_interation = Counter() self.loss_queue = Queue() self.stop_signal = Signal() self.losses = [] def forward(self, x): ''' The forward method, calles the encoder and decoder :param x: :return: ''' z, mu, log_std = self.encoder.forward(x) self.mu = mu self.log_std = log_std return self.decoder.forward(z) def loss(self, _in, _out, mu, log_std): ''' The loss function, the loss is calculated as the reconstruction error and the error given by the deviation of latent variable from the normal distirbution :param _in: :param _out: :param mu: :param log_std: :return: ''' # img_loss = self.img_loss_func(_in, _out) # img_loss = F.mse_loss(_in, _out) img_loss = _in.sub(_out).pow(2).sum() mean_sq = mu * mu # -0.5 * tf.reduce_sum(1.0 + 2.0 * logsd - tf.square(mn) - tf.exp(2.0 * logsd), 1) latent_loss = -0.5 * torch.sum(1.0 + 2.0 * log_std - mean_sq - torch.exp(2.0 * log_std)) return img_loss + latent_loss, img_loss, latent_loss def start(self, train=None): ''' This runs the training in the background. Currently only works with the cpu version (cuda not supported atm) :param train: :return: ''' if self.proc is not None: raise Exception("Process already started.") self.share_memory() self.losses = [] if train is None: train = VariationalAutoEncoder._get_training_test_method() self.proc = mp.Process(target=train, args=(self, self.train_loader, self.test_loader, self.counter_epoch, self.counter_interation, self.loss_queue, self.stop_signal)) self.proc.start() def restart(self, train=None): ''' Running in the background can be stopped. This method should be used if the computation should be resumed. As with start(), does currently not work with cuda. :param train: :return: ''' if self.proc is None: raise Exception("Process has not been started before.") if self.proc.is_alive(): raise Exception("Process is still active.") self.stop_signal.set_signal(False) if train is None: train = VariationalAutoEncoder._get_training_test_method() self.proc = mp.Process(target=train, args=(self, self.train_loader, self.test_loader, self.counter_epoch, self.counter_interation, self.loss_queue, self.stop_signal)) self.proc.start() def stop(self): ''' This functions sends a stop signal to the background process. 
:return: ''' if self.proc is None: raise Exception("Process has been started.") if not self.proc.is_alive(): raise Exception("Process is not alive.") self.stop_signal.set_signal(True) self.proc.join() self.stop_signal.set_signal(False) def get_progress(self): ''' Functions gets the progress of the computation running in the background. :return: ''' while self.loss_queue.qsize() > 0: self.losses.append(self.loss_queue.get()) return self.losses def set_train_loader(self, train_loader, test_loader=None): self.train_loader = train_loader self.test_loader = test_loader def cuda(self): super(VariationalAutoEncoder, self).cuda() self.decoder.cuda() self.encoder.cuda() @staticmethod def _get_training_test_method(): def train(model, train_loader, test_loader, counter_epoch, counter_iterations, loss_queue, stop_signal): print("started", stop_signal.value) train_op = optim.Adam(model.parameters(), lr=0.0005) while not stop_signal.value: loss_train = [] loss_test = [] n_train = [] n_test = [] for _, data in enumerate(train_loader): # data = Variable(data.view(-1,784)) data = Variable(data) train_op.zero_grad() dec = model(data) loss, loss_1, loss_2 = model.loss(dec, data, model.mu, model.log_std) loss_train.append( (loss.data[0], loss_1.data[0], loss_2.data[0])) n_train.append(len(data)) loss.backward() train_op.step() counter_iterations.increment() for _, data in enumerate(test_loader): # data = Variable(data.view(-1,784)) data = Variable(data) dec = model(data) loss, _, _ = model.loss(dec, data, model.mu, model.log_std) loss_test.append(loss.data[0]) n_test.append(len(data)) counter_epoch.increment() epoch = counter_epoch.value loss_train_mean = numpy.mean(loss_train, axis=0) # / numpy.sum(n_train) loss_test_mean = numpy.mean(loss_test) # / numpy.sum(n_test) loss_queue.put((epoch, loss_train_mean, loss_test_mean)) #print("{}: ".format(epoch), loss_train_mean, loss_test_mean) return train @staticmethod def get_MNIST_train_loader(batch_size=32, keep_classes=False): train_loader = torch.utils.data.DataLoader(datasets.MNIST( './data/datasets/MNIST', train=True, download=True, transform=transforms.Compose([transforms.ToTensor()])), batch_size=batch_size) test_loader = torch.utils.data.DataLoader(datasets.MNIST( './data/datasets/MNIST', train=False, transform=transforms.Compose([transforms.ToTensor()])), batch_size=batch_size) if keep_classes: return train_loader, test_loader return DataIterator(train_loader), DataIterator(test_loader) @staticmethod def get_FashionMNIST_train_loader(batch_size=32, keep_classes=False): train_loader = torch.utils.data.DataLoader(datasets.FashionMNIST( './data/datasets/FMNIST', train=True, download=True, transform=transforms.Compose([transforms.ToTensor()])), batch_size=batch_size) test_loader = torch.utils.data.DataLoader(datasets.FashionMNIST( './data/datasets/FMNIST', train=False, transform=transforms.Compose([transforms.ToTensor()])), batch_size=batch_size) if keep_classes: return train_loader, test_loader return DataIterator(train_loader), DataIterator(test_loader)
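# Tiny numerical check of the loss used above: squared-error reconstruction
# plus the closed-form KL divergence between N(mu, std) and N(0, 1), written
# in terms of log_std exactly as in VariationalAutoEncoder.loss. With mu = 0
# and log_std = 0 the latent term vanishes, as expected.
import torch

x = torch.rand(2, 784)          # "input"
x_rec = torch.rand(2, 784)      # "reconstruction"
mu = torch.zeros(2, 4)
log_std = torch.zeros(2, 4)     # std = 1

img_loss = x.sub(x_rec).pow(2).sum()
latent_loss = -0.5 * torch.sum(1.0 + 2.0 * log_std - mu * mu - torch.exp(2.0 * log_std))
print(img_loss.item(), latent_loss.item())   # latent_loss == 0 for a standard normal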
def controller_train_proc(ctrl_dir, controller, vae, mdrnn, target_return=950, skip_train=False, display=True): step_log('4-2. controller_train_proc START!!') # define current best and load parameters cur_best = None if not os.path.exists(ctrl_dir): os.mkdir(ctrl_dir) ctrl_file = os.path.join(ctrl_dir, 'best.tar') p_queue = Queue() r_queue = Queue() #e_queue = Queue() # pipaek : not necessary if not multiprocessing print("Attempting to load previous best...") if os.path.exists(ctrl_file): #state = torch.load(ctrl_file, map_location={'cuda:0': 'cpu'}) state = torch.load(ctrl_file) cur_best = -state['reward'] controller.load_state_dict(state['state_dict']) print("Previous best was {}...".format(-cur_best)) if skip_train: return # pipaek : 트레이닝을 통한 모델 개선을 skip하고 싶을 때.. def evaluate(solutions, results, rollouts=100): # pipaek : rollout 100 -> 10 , originally 100 """ Give current controller evaluation. Evaluation is minus the cumulated reward averaged over rollout runs. :args solutions: CMA set of solutions :args results: corresponding results :args rollouts: number of rollouts :returns: minus averaged cumulated reward """ index_min = np.argmin(results) best_guess = solutions[index_min] restimates = [] for s_id in range(rollouts): print('p_queue.put(), s_id=%d' % s_id) p_queue.put((s_id, best_guess)) print('>>>rollout_routine!!') rollout_routine() # pipaek : 여기서도 p_queue.put 하자마자 바로 처리.. print(">>>Evaluating...") for _ in tqdm(range(rollouts)): #while r_queue.empty(): # sleep(.1) # pipaek : multi-process가 아니므로 if not r_queue.empty( ): # pipaek : 20180718 r_queue.get()에서 stuck되어 있는 것을 방지하기 위해 체크!! #print('r_queue.get()') #restimates.append(r_queue.get()[1]) r_s_id, r = r_queue.get() print( 'in evaluate r_queue.get() r_s_id=%d, r_queue remain=%d' % (r_s_id, r_queue.qsize())) restimates.append(r) else: print('r_queue.empty() -> break!!') break return best_guess, np.mean(restimates), np.std(restimates) def rollout_routine(): """ Thread routine. Threads interact with p_queue, the parameters queue, r_queue, the result queue and e_queue the end queue. They pull parameters from p_queue, execute the corresponding rollout, then place the result in r_queue. Each parameter has its own unique id. Parameters are pulled as tuples (s_id, params) and results are pushed as (s_id, result). The same parameter can appear multiple times in p_queue, displaying the same id each time. As soon as e_queue is non empty, the thread terminate. When multiple gpus are involved, the assigned gpu is determined by the process index p_index (gpu = p_index % n_gpus). 
:args p_queue: queue containing couples (s_id, parameters) to evaluate :args r_queue: where to place results (s_id, results) :args e_queue: as soon as not empty, terminate :args p_index: the process index """ # init routine #gpu = p_index % torch.cuda.device_count() #device = torch.device('cuda:{}'.format(gpu) if torch.cuda.is_available() else 'cpu') # redirect streams #if not os.path.exists(tmp_dir): # os.mkdir(tmp_dir) #sys.stdout = open(os.path.join(tmp_dir, 'rollout.out'), 'a') #sys.stderr = open(os.path.join(tmp_dir, 'rollout.err'), 'a') with torch.no_grad(): r_gen = RolloutGenerator(vae, mdrnn, controller, device, rollout_time_limit) while not p_queue.empty(): print('in rollout_routine, p_queue.get()') s_id, params = p_queue.get() print('r_queue.put() sid=%d' % s_id) r_queue.put((s_id, r_gen.rollout(params))) print('r_gen.rollout OK, r_queue.put()') #r_queue.qsize() parameters = controller.parameters() es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), 0.1, {'popsize': C_POP_SIZE}) print("CMAEvolutionStrategy start OK!!") epoch = 0 log_step = 3 while not es.stop(): print("--------------------------------------") print("CURRENT EPOCH = %d" % epoch) if cur_best is not None and -cur_best > target_return: print("Already better than target, breaking...") break r_list = [0] * C_POP_SIZE # result list solutions = es.ask() print("CMAEvolutionStrategy-ask") # push parameters to queue for s_id, s in enumerate( solutions): # pipaek : 이 for가 C_POP_SIZE 만큼 반복된다. #for _ in range(C_POP_SIZE * C_N_SAMPLES): for _ in range(C_N_SAMPLES): print('in controller_train_proc p_queue.put() s_id : %d' % s_id) p_queue.put((s_id, s)) #print("p_queue.put %d" % s_id) rollout_routine( ) # pipaek : p_queue.put 하자마자 바로 get해서 rollout하고 나서 r_queue에 결과 입력. print("rollout_routine OK, r_queue size=%d" % r_queue.qsize()) # retrieve results if display: pbar = tqdm(total=C_POP_SIZE * C_N_SAMPLES) #for idx in range(C_POP_SIZE * C_N_SAMPLES): while not r_queue.empty( ): # pipaek : 20180718 여기서 r_queue.get을 못해서 영원히 걸려있는 상태를 방지하기 위해 for문을 while문으로 바꾼다. #while r_queue.empty(): # sleep(.1) try: r_s_id, r = r_queue.get() print( 'in controller_train_proc r_queue.get() r_s_id=%d, r_queue remain=%d' % (r_s_id, r_queue.qsize())) r_list[r_s_id] += r / C_N_SAMPLES if display: pbar.update(1) except IndexError as err: print('IndexError during r_queue.get()') print('cur r_list size:%d, index:%d' % (len(r_list), r_s_id)) if display: pbar.close() es.tell(solutions, r_list) # pipaek : solution array에다가 r_list 결과를 업데이트.. es.disp() # evaluation and saving if epoch % log_step == log_step - 1: print(">>>> TRYING EVALUATION, CURRENT EPOCH = %d" % epoch) best_params, best, std_best = evaluate( solutions, r_list, rollouts=100 ) # pipaek : evaluate을 위해서 rollout은 10번만 하자.. originally 100 print("Current evaluation: {}".format(best)) if not cur_best or cur_best > best: cur_best = best print("Saving new best with value {}+-{}...".format( -cur_best, std_best)) load_parameters(best_params, controller) torch.save( { 'epoch': epoch, 'reward': -cur_best, 'state_dict': controller.state_dict() }, os.path.join(ctrl_dir, 'best.tar')) if -best > target_return: print( "Terminating controller training with value {}...".format( best)) break epoch += 1 print("es.stop!!") es.result_pretty()
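# Stand-alone sketch of the CMA-ES ask/tell loop that drives the controller
# search above, on a toy quadratic objective instead of environment rollouts
# (cma is the pycma package, `pip install cma`; the names here are illustrative).
import numpy as np
import cma


def objective(params):
    return float(np.sum(np.square(params - 0.5)))   # stand-in for minus the average reward


es = cma.CMAEvolutionStrategy(np.zeros(5), 0.1, {'popsize': 8})
while not es.stop():
    solutions = es.ask()                  # candidate parameter vectors
    results = [objective(s) for s in solutions]
    es.tell(solutions, results)           # CMA-ES minimises the reported values
es.result_pretty()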
class VideoProcessingPipeline(object): """ Manages the acquisition and preprocessing of video frames from the webcam. A pipeline with two processes is used: the first process denoises frames and queues the result to the second process which calculates the optical flows on CPU, and queues back the moving average to the main process. This moving average is used as attention prior by the model. """ def __init__(self, img_size, img_cfg, frames_window=13, flows_window=5, skip_frames=2, cam_res=(640, 480), denoising=True): """ :param img_size: the images input size of the neural network. :param img_cfg: the config parameters for image processing. :param frames_window: the number of webcam frames input at once into the neural network to make a prediction step. Best results tend to be obtained for roughly a bit less than one second. :param flows_window: the number of optical flows used to calculate an attention prior. Defaults to 5. Change at your own risks. :param skip_frames: down-sampling factor of the webcam frames. Defaults to 2 in order to roughly obtain 15 FPS with a 30 FPS webcam. This down-sampling is basic and could be improved to support ratios such as 2/3 to obtain 20 FPS. :param cam_res: webcam resolution (width, height). The application was only tested in 640x480. Change at your own risks. :param denoising: activate the denoising process. Defaults to True. Most usefull with low quality webcams. """ if frames_window not in [9, 13, 17, 21]: raise ValueError('Invalid window size for webcam frames: `%s`' % str(frames_window)) if flows_window not in [3, 5, 7, 9]: raise ValueError('Invalid window size for optical flows: `%s`' % str(flows_window)) if flows_window > frames_window: raise ValueError( 'Optical flow window cannot be wider than camera frames window' ) self.img_size = img_size # optical flows can be computed in lower resolution w/o harming results self.opt_size = img_size // 2 self.frames_window = frames_window self.flows_window = flows_window self.skip_frames = skip_frames self.total_frames = 0 # total number of frames acquired self.cam_res = cam_res self.denoising = denoising self.img_frames = [ np.zeros((self.img_size, self.img_size, 3), dtype=np.uint8) ] * (self.frames_window // 2) self.gray_frames = [ np.zeros((self.opt_size, self.opt_size), dtype=np.uint8) ] * (self.frames_window // 2) self.priors = [] # init multiprocessing self.q_parent, self.q_prior = Queue(), Queue() # start denoising process if self.denoising: self.q_denoise = Queue() self.p_denoise = Process( target=denoise_frame, args=(self.q_denoise, self.q_prior, img_cfg.getint('h'), img_cfg.getint('template_window_size'), img_cfg.getint('search_window_size'))) self.p_denoise.start() print('Denoising enabled') else: print('Denoising disabled') # start prior calculation process self.p_prior = Process(target=calc_attention_prior, args=(self.opt_size, self.flows_window, self.q_prior, self.q_parent)) self.p_prior.start() # initialise camera self.cap = cv.VideoCapture(0) if self.cap.isOpened(): self.cap_fps = int(round(self.cap.get(cv.CAP_PROP_FPS))) self.cap.set(3, self.cam_res[0]) self.cap.set(4, self.cam_res[1]) print('Device @%d FPS' % self.cap_fps) else: raise IOError('Failed to open webcam capture') # raw images self.last_frame = collections.deque(maxlen=self.cap_fps) # cropped region of the raw images self.last_cropped_frame = collections.deque(maxlen=self.cap_fps) # acquire and preprocess the exact number of frames needed # to make the first prior map for i in range((frames_window // 2) + 1): 
self.acquire_next_frame(enable_skip=False) # now wait for the first prior to be returned while len(self.priors) == 0: if not self.q_parent.empty(): # de-queue a prior prior, flow = self.q_parent.get(block=False) self.priors.append(prior) # sleep while the queue is empty time.sleep(0.01) def _center_crop(self, img, target_shape): """ Returns a center crop of the provided image. :param img: the image to crop. :param target_shape: the dimensions of the crop. :return the cropped image """ h, w = target_shape y, x = img.shape[:2] start_y = max(0, y // 2 - (h // 2)) start_x = max(0, x // 2 - (w // 2)) return img[start_y:start_y + h, start_x:start_x + w] def acquire_next_frame(self, enable_skip=True): """ Reads the next frame from the webcam and starts the asynchronous preprocessing. The video stream is down-sampled as necessary to reach the desired FPS. :param enable_skip: enables down-sampling of the webcam stream. Must be True except during initialisation. :return: the last frame acquired or None if that frame was skipped due to down-sampling of the webcam stream. """ ret, frame = self.cap.read() if not ret: self.terminate() raise IOError('Failed to read the next frame from webcam') self.total_frames += 1 if not enable_skip: return self._preprocess_frame(frame) elif (self.total_frames % self.skip_frames) == 0: return self._preprocess_frame(frame) return None def _preprocess_frame(self, frame): """ Crops, change to gray scale, resizes and sends the newly acquired webcam frame to the preprocessing pipeline. :param frame: the last acquired frame. :return the last acquired frame. """ # crop a square at the center of the frame rgb = cv.cvtColor(frame, cv.COLOR_BGR2RGB) rgb = self._center_crop(rgb, (self.cam_res[1], self.cam_res[1])) self.last_frame.append(frame) self.last_cropped_frame.append(rgb) # convert to gray scale and resize gray = cv.cvtColor(rgb, cv.COLOR_RGB2GRAY) gray = cv.resize(gray, (self.opt_size, self.opt_size)) rgb = cv.resize(rgb, (self.img_size, self.img_size)) # queue to relevant child process if self.denoising: self.q_denoise.put(gray) else: self.q_prior.put(gray) self.img_frames.append(rgb) self.gray_frames.append(gray) return frame def get_model_input(self, dequeue=True): """ Gets the list of images and the prior needed for the inference of the current frame. Use `dequeue` to retrieve the next prior from the queue. The caller must first verify that the queue is non-empty. :param dequeue: must be set to True except during initialisation. :return: images ndarray and the corresponding prior """ # de-queue a prior if dequeue: prior, flow = self.q_parent.get(block=False) self.priors.append(prior) # ensure enough frames have been preprocessed n_frames = self.frames_window assert len(self.img_frames) >= n_frames assert len(self.gray_frames) >= n_frames assert len(self.priors) == 1 imgs = np.stack(self.img_frames[:self.frames_window], axis=0) self.img_frames.pop(0) # slide window to the right self.gray_frames.pop(0) return imgs, [self.priors.pop(0)] def terminate(self): """Terminates processes, closes queues and releases video capture.""" if self.denoising: self.q_denoise.put(None) time.sleep(0.2) self.p_denoise.terminate() else: self.q_prior.put(None) time.sleep(0.2) self.p_prior.terminate() time.sleep(0.1) if self.denoising: self.p_denoise.join(timeout=0.5) self.p_prior.join(timeout=0.5) if self.denoising: self.q_denoise.close() self.q_parent.close() self.cap.release()
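# Small sketch of the teardown order used in terminate() above: send a None
# sentinel, give the worker a moment to drain its queue, then join (with
# terminate as a last resort) and close the queue.
import time
import multiprocessing as mp


def frame_worker(q):
    while True:
        item = q.get()
        if item is None:        # sentinel asks the worker to exit
            return
        # ... process the frame ...


if __name__ == "__main__":
    q = mp.Queue()
    p = mp.Process(target=frame_worker, args=(q,))
    p.start()
    q.put(None)
    time.sleep(0.2)             # let the worker drain the queue
    p.join(timeout=0.5)
    if p.is_alive():
        p.terminate()           # last resort if it did not exit in time
    q.close()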
def dynamic_power(model, input_shape): q = Queue() power_return = Queue() interval_return = Queue() latency_return = Queue() input_tensor_queue = Queue() model_queue = Queue() input_tensor = torch.ones([*input_shape]) input_tensor_queue.put(input_tensor) model.share_memory() model_queue.put(model) context = torch.multiprocessing.get_context('spawn') p_thread = context.Process(target=power_thread, args=(power_return, interval_return, q)) l_thread = context.Process(target=latency_thread, args=(model_queue, input_tensor_queue, latency_return, q)) l_thread.start() p_thread.start() power_l = list() # GPU power list interval_l = list() # power interval list latency_l = list() # latency list l_thread.join() while True: if not power_return.empty(): power_l.append(power_return.get()) if not interval_return.empty(): interval_l.append(interval_return.get()) if not latency_return.empty(): latency_l.append(latency_return.get()) if power_return.empty() and interval_return.empty( ) and latency_return.empty(): break power_return.close() interval_return.close() latency_return.close() q.close() del q del power_return del latency_return del interval_return return latency_l, power_l, interval_l
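# Sketch of the 'spawn' start-method pattern used above: CUDA cannot be safely
# (re)initialised in a forked child, so processes that will touch the GPU are
# created from a spawn context. gpu_worker is an illustrative stand-in for the
# power/latency workers.
import torch
import torch.multiprocessing


def gpu_worker(q):
    x = torch.ones(4)
    if torch.cuda.is_available():
        x = x.cuda()
    q.put(float(x.sum().cpu()))


if __name__ == "__main__":
    ctx = torch.multiprocessing.get_context('spawn')
    q = ctx.Queue()
    p = ctx.Process(target=gpu_worker, args=(q,))
    p.start()
    print(q.get())
    p.join()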
def main():
    q = Queue()
    idx_q = Queue()

    epochs = 3
    learning_rate = 0.001
    batch_size = 32
    test_batch_size = 16
    log_interval = 100
    cpu_pth_path = "/home/yoon/Yoon/pytorch/research/part_train/cpu.pth"
    gpu_pth_path = "/home/yoon/Yoon/pytorch/research/part_train/gpu.pth"

    #print(torch.cuda.get_device_name(0))
    print(torch.cuda.is_available())
    use_cuda = torch.cuda.is_available()
    print("use_cuda : ", use_cuda)
    #device = torch.device("cuda" if use_cuda else "cpu")
    device1 = "cpu"
    device2 = "cuda"
    nThreads = 1 if use_cuda else 2
    if platform.system() == 'Windows':
        nThreads = 0  # if you use windows

    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, ), (0.5, ))])

    # datasets
    testset = torchvision.datasets.FashionMNIST('./data',
                                                download=True,
                                                train=False,
                                                transform=transform)
    test_loader = torch.utils.data.DataLoader(testset,
                                              batch_size=test_batch_size,
                                              shuffle=False,
                                              num_workers=nThreads)

    # constant for classes
    classes = ('T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 'Sandal',
               'Shirt', 'Sneaker', 'Bag', 'Ankle Boot')

    # model
    model1 = Net(q).to(device1)
    model1.share_memory()
    # imshow example
    model2 = Net2(q).to(device2)
    model2.share_memory()

    # Freeze model weights
    for param in model1.parameters():  # freeze so the parameters stay fixed even if the whole layer is trained
        param.requires_grad = False
    for param in model2.parameters():  # freeze so the parameters stay fixed even if the whole layer is trained
        param.requires_grad = False

    proc1 = Process(target=my_run,
                    args=(model1, testset, device1, cpu_pth_path, idx_q))
    proc2 = Process(target=my_run,
                    args=(model2, testset, device2, gpu_pth_path, idx_q))
    num_processes = (proc2, proc1)
    processes = []
    for procs in num_processes:
        procs.start()
        processes.append(procs)
    for proc in processes:
        proc.join()
def data_runner(queue1: Queue, queue2: Queue):
    """Two-step handshake worker: wait for a signal on ``queue1`` and acknowledge on ``queue2``, twice."""
    queue1.get()
    queue2.put(1)
    queue1.get()
    queue2.put(1)
def train(training_dbs, validation_db, start_iter=0): learning_rate = system_configs.learning_rate max_iteration = system_configs.max_iter pretrained_model = system_configs.pretrain snapshot = system_configs.snapshot val_iter = system_configs.val_iter display = system_configs.display decay_rate = system_configs.decay_rate stepsize = system_configs.stepsize # getting the size of each database training_size = len(training_dbs[0].db_inds) validation_size = len(validation_db.db_inds) # queues storing data for training training_queue = Queue(system_configs.prefetch_size) validation_queue = Queue(5) # queues storing pinned data for training pinned_training_queue = queue.Queue(system_configs.prefetch_size) pinned_validation_queue = queue.Queue(5) # load data sampling function data_file = "sample.{}".format(training_dbs[0].data) sample_data = importlib.import_module(data_file).sample_data # allocating resources for parallel reading training_tasks = init_parallel_jobs(training_dbs, training_queue, sample_data, True) if val_iter: validation_tasks = init_parallel_jobs([validation_db], validation_queue, sample_data, False) training_pin_semaphore = threading.Semaphore() validation_pin_semaphore = threading.Semaphore() training_pin_semaphore.acquire() validation_pin_semaphore.acquire() training_pin_args = (training_queue, pinned_training_queue, training_pin_semaphore) training_pin_thread = threading.Thread(target=pin_memory, args=training_pin_args) training_pin_thread.daemon = True training_pin_thread.start() validation_pin_args = (validation_queue, pinned_validation_queue, validation_pin_semaphore) validation_pin_thread = threading.Thread(target=pin_memory, args=validation_pin_args) validation_pin_thread.daemon = True validation_pin_thread.start() print("building model...") nnet = NetworkFactory(training_dbs[0]) if pretrained_model is not None: if not os.path.exists(pretrained_model): raise ValueError("pretrained model does not exist") print("loading from pretrained model") nnet.load_pretrained_params(pretrained_model) if start_iter: learning_rate /= (decay_rate**(start_iter // stepsize)) nnet.load_params(start_iter) nnet.set_lr(learning_rate) print("training starts from iteration {} with learning_rate {}".format( start_iter + 1, learning_rate)) else: nnet.set_lr(learning_rate) print("training start...") nnet.cuda() nnet.train_mode() with stdout_to_tqdm() as save_stdout: for iteration in tqdm(range(start_iter + 1, max_iteration + 1), file=save_stdout, ncols=80): training = pinned_training_queue.get(block=True) training_loss, focal_loss, pull_loss, push_loss, regr_loss = nnet.train( **training) #training_loss, focal_loss, pull_loss, push_loss, regr_loss, cls_loss = nnet.train(**training) display = 1250 if display and iteration % display == 0: print("training loss at iteration {}: {}".format( iteration, training_loss.item())) print("focal loss at iteration {}: {}".format( iteration, focal_loss.item())) print("pull loss at iteration {}: {}".format( iteration, pull_loss.item())) print("push loss at iteration {}: {}".format( iteration, push_loss.item())) print("regr loss at iteration {}: {}".format( iteration, regr_loss.item())) #print("cls loss at iteration {}: {}\n".format(iteration, cls_loss.item())) del training_loss, focal_loss, pull_loss, push_loss, regr_loss #, cls_loss if val_iter and validation_db.db_inds.size and iteration % val_iter == 0: nnet.eval_mode() validation = pinned_validation_queue.get(block=True) validation_loss = nnet.validate(**validation) print("validation loss at iteration {}: 
{}".format( iteration, validation_loss.item())) # testing(validation_db, nnet, result_dir, debug=debug) nnet.train_mode() if iteration % snapshot == 0: nnet.save_params(iteration) if iteration % stepsize == 0: learning_rate /= decay_rate nnet.set_lr(learning_rate) # sending signal to kill the thread training_pin_semaphore.release() validation_pin_semaphore.release() # terminating data fetching processes for training_task in training_tasks: training_task.terminate() for validation_task in validation_tasks: validation_task.terminate()
class MultiprocessAsyncGameExecutor(AsyncGameExecutor): def __init__(self, game_factory: GameExecutorFactory, network: nn.Module, device: torch.device, processes: int, batches_ahead: int, batch_size: int, states_on_device: bool): self._states_on_device = states_on_device self._device = device self._experience_queue = Queue(maxsize=processes + 1) block_size = max(1, batches_ahead - processes) self.block_buffer = [] print('* starting %d workers (batch size: %d, block size: %d)' % (processes, batch_size, block_size)) self._processes = [] self._request_queues = [] for i in range(processes): request_queue = Queue(maxsize=10) # Transfer to GPU in the other process does not work.. it does not throw an error, but training does not converge p = Process(target=_run_game, args=( i, game_factory, network, device, request_queue, self._experience_queue, batch_size, block_size, False, )) p.start() self._request_queues.append(request_queue) self._processes.append(p) def _send_to_all(self, request, block=False): for request_queue in self._request_queues: request_queue.put(request, block=block) def get_experiences(self): if len(self.block_buffer) == 0: block_buffer = self._experience_queue.get(block=True) if self._states_on_device: for eps, exps in block_buffer: exps = [e.to_device(self._device) for e in exps] self.block_buffer.append((eps, exps)) else: self.block_buffer.extend(block_buffer) return self.block_buffer.pop() def update_exploration_rate(self, exploration_rate): self._send_to_all( _RunGameRequest(set_exploration_rate=exploration_rate), block=True) def close(self): print('* shutting down workers') self._send_to_all(_RunGameRequest(do_terminate=True)) # wake the workers try: while not self._experience_queue.empty(): try: self._experience_queue.get(block=False) except queue.Empty: pass except ConnectionResetError: pass except FileNotFoundError: pass self._experience_queue.close() for p in self._processes: p.join(1000) for q in self._request_queues: q.close() self._experience_queue.close()
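# Loose sketch of the per-worker request-queue fan-out used above: each worker
# owns its own request queue, keeps pushing experience onto a shared queue, and
# the parent broadcasts small request dicts (a stand-in for _RunGameRequest) to
# all workers, e.g. to change the exploration rate or to terminate.
import time
import multiprocessing as mp


def game_worker(request_q, experience_q, worker_id):
    exploration = 1.0
    while True:
        while not request_q.empty():              # poll for new requests
            req = request_q.get()
            if req.get("terminate"):
                return
            if "exploration" in req:
                exploration = req["exploration"]
        experience_q.put((worker_id, exploration))
        time.sleep(0.01)                          # stand-in for playing one step


if __name__ == "__main__":
    experience_q = mp.Queue(maxsize=1000)
    request_qs = [mp.Queue(maxsize=10) for _ in range(2)]
    procs = [mp.Process(target=game_worker, args=(rq, experience_q, i))
             for i, rq in enumerate(request_qs)]
    for p in procs:
        p.start()
    print(experience_q.get())                     # first experience from some worker
    for rq in request_qs:                         # broadcast to every worker
        rq.put({"exploration": 0.1})
        rq.put({"terminate": True})
    for p in procs:
        p.join()
    while not experience_q.empty():               # drain leftovers before closing
        experience_q.get()
    experience_q.close()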
def _call_mods_from_fast5s_cpu2(motif_seqs, chrom2len, fast5s_q, len_fast5s, positions, model_path, success_file, args): # features_batch_q = mp.Queue() # errornum_q = mp.Queue() features_batch_q = Queue() errornum_q = Queue() # pred_str_q = mp.Queue() pred_str_q = Queue() nproc = args.nproc nproc_call_mods = nproc_to_call_mods_in_cpu_mode if nproc <= nproc_call_mods + 1: nproc = nproc_call_mods + 1 + 1 fast5s_q.put("kill") features_batch_procs = [] for _ in range(nproc - nproc_call_mods - 1): p = mp.Process(target=_read_features_fast5s_q, args=(fast5s_q, features_batch_q, errornum_q, motif_seqs, chrom2len, positions, args)) p.daemon = True p.start() features_batch_procs.append(p) call_mods_gpu_procs = [] for _ in range(nproc_call_mods): p_call_mods_gpu = mp.Process(target=_call_mods_q, args=(model_path, features_batch_q, pred_str_q, success_file, args)) p_call_mods_gpu.daemon = True p_call_mods_gpu.start() call_mods_gpu_procs.append(p_call_mods_gpu) # print("write_process started..") p_w = mp.Process(target=_write_predstr_to_file, args=(args.result_file, pred_str_q)) p_w.daemon = True p_w.start() errornum_sum = 0 while True: running = any(p.is_alive() for p in features_batch_procs) while not errornum_q.empty(): errornum_sum += errornum_q.get() if not running: break for p in features_batch_procs: p.join() features_batch_q.put("kill") for p_call_mods_gpu in call_mods_gpu_procs: p_call_mods_gpu.join() # print("finishing the write_process..") pred_str_q.put("kill") p_w.join() print("%d of %d fast5 files failed.." % (errornum_sum, len_fast5s))
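# Compact sketch of the "wait for producers while draining a side queue" loop
# above: poll is_alive() on the feature producers and keep accumulating
# whatever arrives on the error-count queue until all of them have finished.
import multiprocessing as mp


def feature_producer(err_q, n_errors):
    for _ in range(n_errors):
        err_q.put(1)


if __name__ == "__main__":
    errornum_q = mp.Queue()
    procs = [mp.Process(target=feature_producer, args=(errornum_q, n)) for n in (2, 3)]
    for p in procs:
        p.start()
    errornum_sum = 0
    while True:
        running = any(p.is_alive() for p in procs)
        while not errornum_q.empty():
            errornum_sum += errornum_q.get()
        if not running:
            break
    for p in procs:
        p.join()
    while not errornum_q.empty():       # final drain for items still in flight
        errornum_sum += errornum_q.get()
    print("errors reported:", errornum_sum)   # 5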