def multiprocess_generator(self, max_queue_size=32, num_processes=8):
    # Re-shuffle the file list between epochs
    if self.shuffle and cfg.NUM_TRAINERS > 1:
        np.random.RandomState(self.shuffle_seed).shuffle(self.all_lines)
        num_lines = len(self.all_lines) // cfg.NUM_TRAINERS
        self.lines = self.all_lines[num_lines * cfg.TRAINER_ID:
                                    num_lines * (cfg.TRAINER_ID + 1)]
        self.shuffle_seed += 1
    elif self.shuffle:
        np.random.shuffle(self.lines)

    # Create one sharding generator per worker process
    generators = []
    for pid in range(num_processes):
        generators.append(self.sharding_generator(pid, num_processes))

    enqueuer = None  # guard so the finally block is safe if construction fails
    try:
        enqueuer = GeneratorEnqueuer(generators)
        enqueuer.start(max_queue_size=max_queue_size, workers=num_processes)
        while True:
            generator_out = None
            # Poll the queue until a batch arrives or all workers finish
            while enqueuer.is_running():
                if not enqueuer.queue.empty():
                    generator_out = enqueuer.queue.get(timeout=5)
                    break
                else:
                    time.sleep(0.01)
            if generator_out is None:  # workers exhausted: end of epoch
                break
            yield generator_out
    finally:
        if enqueuer is not None:
            enqueuer.stop()
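# The sharding_generator method called above is not shown in the source. A
# minimal sketch of what it might look like, assuming self.lines holds this
# trainer's file list and a hypothetical process_line() loads one sample:
# worker pid takes every num_processes-th line, so the shards are disjoint
# and together cover the whole list.
def sharding_generator(self, pid, num_processes):
    for index, line in enumerate(self.lines):
        if index % num_processes == pid:
            yield self.process_line(line)  # hypothetical per-line loader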
def reader():
    cnt = 0
    enqueuer = None  # guard so the finally block is safe if construction fails
    try:
        enqueuer = GeneratorEnqueuer(
            infinite_reader(), use_multiprocessing=use_multiprocess_reader)
        enqueuer.start(max_queue_size=max_queue, workers=num_workers)
        generator_out = None
        while True:
            # Poll the queue until a batch is available
            while enqueuer.is_running():
                if not enqueuer.queue.empty():
                    generator_out = enqueuer.queue.get()
                    break
                else:
                    time.sleep(0.02)
            yield generator_out
            cnt += 1
            if cnt >= total_iter:
                enqueuer.stop()
                return
            generator_out = None
    except Exception as e:
        print("Exception occurred in reader: {}".format(str(e)))
    finally:
        if enqueuer is not None:
            enqueuer.stop()
def reader():
    enqueuer = None  # guard so the finally block is safe if construction fails
    try:
        enqueuer = GeneratorEnqueuer(
            batches, use_multiprocessing=use_multiprocessing)
        enqueuer.start(max_queue_size=max_queue, workers=num_workers)
        generator_out = None
        for i in range(total_step):
            # Poll the queue until a batch is available
            while enqueuer.is_running():
                if not enqueuer.queue.empty():
                    generator_out = enqueuer.queue.get()
                    break
                else:
                    time.sleep(0.02)
            yield generator_out
            generator_out = None
    finally:
        # the single stop here replaces the redundant stop the loop used to
        # issue on its way out
        if enqueuer is not None:
            enqueuer.stop()
def reader():
    cnt = 0
    enqueuer = None  # guard so the finally block is safe if construction fails
    try:
        enqueuer = GeneratorEnqueuer(
            infinite_reader(), use_multiprocessing=use_multiprocess_reader)
        enqueuer.start(max_queue_size=max_queue, workers=num_workers)
        generator_out = None
        while True:
            # Poll the queue until a batch is available
            while enqueuer.is_running():
                if not enqueuer.queue.empty():
                    generator_out = enqueuer.queue.get()
                    break
                else:
                    time.sleep(0.02)
            yield generator_out
            cnt += 1
            if cnt >= total_iter:
                enqueuer.stop()
                return
            generator_out = None
    finally:
        if enqueuer is not None:
            enqueuer.stop()
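# The three readers above poll enqueuer.queue by hand. The Keras
# GeneratorEnqueuer also exposes get(), which wraps the same polling in a
# generator (the snippets further below use it); a minimal sketch of an
# equivalent reader, where my_batches, max_queue, num_workers, and total_iter
# are placeholders:
from keras.utils.data_utils import GeneratorEnqueuer  # location in classic Keras

def reader():
    enqueuer = GeneratorEnqueuer(my_batches, use_multiprocessing=True)
    enqueuer.start(max_queue_size=max_queue, workers=num_workers)
    try:
        output_generator = enqueuer.get()
        for _ in range(total_iter):
            yield next(output_generator)
    finally:
        enqueuer.stop()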
def train(net, args):
    current_dir = os.path.dirname(os.path.realpath(__file__))
    if not os.path.exists(os.path.join(current_dir, 'results/')):
        os.mkdir(os.path.join(current_dir, 'results/'))
    save_path = 'results/%s/' % args.dataset
    if not os.path.exists(os.path.join(current_dir, save_path)):
        os.mkdir(os.path.join(current_dir, save_path))
    save_path += '%s/' % args.model_name
    if not os.path.exists(os.path.join(current_dir, save_path)):
        os.mkdir(os.path.join(current_dir, save_path))
    logger = Logger(save_path + 'logs/')

    train_file_path, val_file_path, data_dir, label_dir = get_dataset_path(
        args.dataset)
    classes = get_dataset_classes(args.dataset)
    transformer = DataTransformer(ch_mean=args.ch_mean, ch_std=args.ch_std,
                                  resize_size=args.resize_size,
                                  pad_size=args.pad_size,
                                  crop_mode=args.crop_mode,
                                  crop_size=args.crop_size,
                                  zoom_range=[0.5, 2.0], horizontal_flip=True,
                                  color_jittering_range=20.,
                                  fill_mode='constant', cval=0., label_cval=255,
                                  data_format='channels_first',
                                  color_format='RGB', x_dtype=np.float32)
    dataloader = VOC12(data_list_file=train_file_path, data_source_dir=data_dir,
                       label_source_dir=label_dir, data_transformer=transformer,
                       batch_size=args.batch_size, shuffle=True)

    ctx = [gpu(i) for i in args.gpus]
    net = net(classes)
    net.collect_params().initialize(ctx=ctx)
    net.load_base_model(ctx)
    # net.hybridize()
    # print(net)

    num_sample = dataloader.get_num_sample()
    num_steps = num_sample // args.batch_size
    if num_sample % args.batch_size > 0:
        num_steps += 1

    enqueuer = GeneratorEnqueuer(generator=dataloader)
    enqueuer.start(workers=args.workers, max_queue_size=args.max_queue_size)
    output_generator = enqueuer.get()

    trainer = gluon.Trainer(net.collect_params(), 'nag',
                            {'momentum': 0.9, 'wd': 0.0001,
                             'learning_rate': args.base_lr,
                             'lr_scheduler': PolyScheduler(
                                 args.base_lr, args.lr_power,
                                 num_steps * args.epochs)})
    loss = MySoftmaxCrossEntropyLoss(axis=1, ignore_label=255)
    metrics = [AccuracyWithIgnoredLabel(axis=1, ignore_label=255)]

    for epoch in range(args.epochs):
        print('training epoch %d/%d:' % (epoch + 1, args.epochs))
        for m in metrics:
            m.reset()
        train_loss = 0.
        for i in range(num_steps):
            batch_x, batch_y = next(output_generator)
            batch_x = mx.nd.array(batch_x)
            batch_y = mx.nd.array(batch_y)
            losses = train_batch(batch_x, batch_y, ctx, net, trainer,
                                 loss, metrics)
            train_loss += mx.nd.mean(
                mx.nd.add_n(*losses)).asscalar() / len(args.gpus)
            info = 'loss: %.3f' % (train_loss / (i + 1))
            for m in metrics:
                name, value = m.get()
                info += ' | %s: %.3f' % (name, value)
            progress_bar(i, num_steps, info)

        # write logs for this epoch
        logger.scalar_summary('loss', train_loss / num_steps, epoch)
        for m in metrics:
            name, value = m.get()
            logger.scalar_summary(name, value, epoch)
        mx.nd.waitall()
        net.save_params(save_path + 'checkpoint.params')
    enqueuer.stop()
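# PolyScheduler is not defined in the source. A minimal sketch of a poly
# learning-rate decay matching the call above, assuming MXNet's
# mx.lr_scheduler.LRScheduler base class:
# lr = base_lr * (1 - num_update / max_steps) ** power.
import mxnet as mx

class PolyScheduler(mx.lr_scheduler.LRScheduler):
    def __init__(self, base_lr, power, max_steps):
        super(PolyScheduler, self).__init__(base_lr)
        self.power = power
        self.max_steps = max_steps

    def __call__(self, num_update):
        # clamp so the rate never goes negative past max_steps
        frac = min(float(num_update) / self.max_steps, 1.0)
        return self.base_lr * (1.0 - frac) ** self.power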
# NOTE: the opening of this DataTransformer call (its leading arguments) is
# truncated in the source; the surviving keyword arguments are kept as-is.
transformer = DataTransformer(color_jittering_range=20., fill_mode='constant',
                              cval=0., label_cval=255,
                              data_format='channels_first', color_format='RGB',
                              x_dtype=np.float32)
data_dir = '/home/aurora/Learning/Data/VOC2012/JPEGImages'
label_dir = '/home/aurora/Learning/Data/VOC2012/SegmentationClass'
val_file_path = '/home/aurora/Learning/Data/VOC2012/ImageSets/Segmentation/val.txt'
dataloader = VOC12(data_list_file=val_file_path, data_source_dir=data_dir,
                   label_source_dir=label_dir, data_transformer=transformer,
                   batch_size=1, shuffle=True)

enqueuer = GeneratorEnqueuer(generator=dataloader)
enqueuer.start(workers=1, max_queue_size=10)
output_generator = enqueuer.get()
x, y = next(output_generator)

# visualize one (image, label) pair; the palette is copied from a ground-truth
# PNG so the 'P'-mode label renders with the VOC colors
img_y = Image.open(os.path.join(label_dir, '2007_000033.png'))
result_x = array_to_img(x[0], 'channels_first')
result_y = Image.fromarray(y[0, 0, :, :], mode='P')
result_y.putpalette(img_y.getpalette())
result_x.show(title='result_x', command=None)
result_y.show(title='result_y', command=None)
enqueuer.stop()
def train(net, args):
    current_dir = os.path.dirname(os.path.realpath(__file__))
    if not os.path.exists(os.path.join(current_dir, 'results/')):
        os.mkdir(os.path.join(current_dir, 'results/'))
    save_path = 'results/%s/' % args.dataset
    if not os.path.exists(os.path.join(current_dir, save_path)):
        os.mkdir(os.path.join(current_dir, save_path))
    save_path += '%s/' % args.model_name
    if not os.path.exists(os.path.join(current_dir, save_path)):
        os.mkdir(os.path.join(current_dir, save_path))
    logger = Logger(save_path + 'logs/')

    train_file_path, val_file_path, data_dir, label_dir = get_dataset_path(
        args.dataset)
    classes = get_dataset_classes(args.dataset)
    transformer = DataTransformer(ch_mean=args.ch_mean, ch_std=args.ch_std,
                                  resize_size=args.resize_size,
                                  pad_size=args.pad_size,
                                  crop_mode=args.crop_mode,
                                  crop_size=args.crop_size,
                                  zoom_range=[0.5, 2.0], horizontal_flip=True,
                                  color_jittering_range=20.,
                                  fill_mode='constant', cval=0., label_cval=255,
                                  data_format='channels_first',
                                  color_format='RGB', x_dtype=np.float32)
    dataloader = VOC12(data_list_file=train_file_path, data_source_dir=data_dir,
                       label_source_dir=label_dir, data_transformer=transformer,
                       batch_size=args.batch_size, shuffle=True)

    num_sample = dataloader.get_num_sample()
    num_steps = num_sample // args.batch_size
    if num_sample % args.batch_size > 0:
        num_steps += 1

    enqueuer = GeneratorEnqueuer(generator=dataloader)
    enqueuer.start(workers=args.workers, max_queue_size=args.max_queue_size)
    output_generator = enqueuer.get()

    net = net(classes)
    net.cuda()
    net = torch.nn.DataParallel(net, device_ids=args.gpus)
    cudnn.benchmark = True

    optimizer = torch.optim.SGD(net.parameters(), lr=args.base_lr, momentum=0.9,
                                weight_decay=args.weight_decay, nesterov=True)
    scheduler = get_polyscheduler(optimizer, args.lr_power, args.epochs)
    loss_functions = [nn.CrossEntropyLoss(ignore_index=255)]
    metric_functions = [SparseAccuracy(ignore_label=255, name='Acc')]

    for epoch in range(args.epochs):
        scheduler.step()
        print('training epoch %d/%d, lr=%.4f:' %
              (epoch + 1, args.epochs,
               optimizer.state_dict()['param_groups'][0]['lr']))
        for m in metric_functions:
            m.reset()
        train_loss = 0.
        for i in range(num_steps):
            batch_x, batch_y = next(output_generator)
            batch_x = torch.Tensor(batch_x).cuda()
            batch_y = torch.LongTensor(np.squeeze(batch_y).astype(int)).cuda()
            losses = train_batch(batch_x, batch_y, net, optimizer,
                                 loss_functions, metric_functions)
            train_loss += sum([loss.cpu().data.numpy()[0] for loss in losses])
            info = '| loss: %.3f' % (train_loss / (i + 1))
            for m in metric_functions:
                name, value = m.get()
                info += ' | %s: %.3f' % (name, value)
            progress_bar(i, num_steps, info)

        # write logs for this epoch
        logger.scalar_summary('loss', train_loss / num_steps, epoch)
        for m in metric_functions:
            name, value = m.get()
            logger.scalar_summary(name, value, epoch)
        torch.save(net.state_dict(), save_path + 'checkpoint.params')
    enqueuer.stop()
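# get_polyscheduler is not defined in the source. A minimal sketch using
# torch.optim.lr_scheduler.LambdaLR, assuming the same per-epoch poly decay
# as the MXNet variant: factor = (1 - epoch / max_epochs) ** power.
import torch

def get_polyscheduler(optimizer, power, max_epochs):
    poly = lambda epoch: (1.0 - float(epoch) / max_epochs) ** power
    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=poly)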