def disable_configs(service, accessor, cluster, conf_file):
    logger.debug("Reading SSL configs from {0}".format(conf_file))
    ssl_configs = read_service_configs(service, conf_file)
    logger.debug("ssl_configs for {0} are {1}".format(service, ssl_configs))
    for section in ssl_configs:
        config_type = section['config_type']
        keys = section.keys()
        del section['config_type']
        if "delete" in keys:
            del section['delete']
            for k in section:
                try:
                    configs.update_config(cluster, config_type, configs.delete_specific_property(k), accessor)
                except Exception:
                    logger.warn("Unable to get/delete configs for config_type:{0} from Ambari".format(config_type))
                    return 1
            logger.info("Disabled SSL for service {0}[{1}]".format(service, config_type))
        else:
            try:
                config = get_configs(accessor, cluster, config_type)
            except Exception:
                logger.warn("Unable to get configs for config_type:{0} from Ambari".format(config_type))
                return 1
            for k in section:
                if section[k] == "$historyserver":
                    section[k] = config[0].get("yarn.log.server.url").replace('https:', 'http:').replace('19890', '19888')
                elif section[k] == "$timelineserver":
                    section[k] = config[0].get("yarn.log.server.web-service.url").replace('https:', 'http:').replace('8190', '8188')
                config[0].update({k: section[k]})
            logger.debug("New configs for {0} are: {1}".format(config_type, json.dumps(config, indent=2)))
            updater = put_configs(config)
            configs.update_config(cluster, config_type, updater, accessor)
            logger.info("Disabled SSL for service {0}[{1}]".format(service, config_type))
    return
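# A hypothetical sketch (not from the source) of the structure that
# disable_configs above assumes read_service_configs returns: a list of
# per-config-type sections, where an optional 'delete' key switches the
# section from "rewrite these properties" to "remove these properties",
# and values like "$historyserver" are placeholders resolved at runtime.
EXAMPLE_SSL_CONFIGS = [
    {'config_type': 'ssl-server', 'delete': 'true',
     'ssl.server.keystore.location': ''},
    {'config_type': 'yarn-site',
     'yarn.log.server.url': '$historyserver'},
]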
def update_configs_ambari(services, accessor, cluster, conf_file):
    config = {}
    for s_name in services.split(','):
        logger.debug("Reading SSL configs from {0}".format(conf_file))
        ssl_configs = read_service_configs(s_name.upper(), conf_file)
        logger.debug("ssl_configs for {0} are {1}".format(s_name.upper(), ssl_configs))
        for section in ssl_configs:
            config_type = section['config_type']
            del section['config_type']
            try:
                config = get_configs(accessor, cluster, config_type)
            except Exception:
                logger.warn("Unable to get configs for config_type:{0} from Ambari".format(config_type))
                return 1
            for k in section:
                if section[k] == "$keystore":
                    section[k] = KEYSTORE_LOCATION
                elif section[k] == "$truststore":
                    section[k] = TRUSTSTORE_LOCATION
                elif section[k] == "$keystorepassword":
                    section[k] = keystorepassword
                elif section[k] == "$truststorepassword":
                    section[k] = truststorepassword
                elif section[k] == "$historyserver":
                    section[k] = config[0].get("yarn.log.server.url").replace('http:', 'https:').replace('19888', '19890')
                elif section[k] == "$timelineserver":
                    section[k] = config[0].get("yarn.log.server.web-service.url").replace('http:', 'https:').replace('8188', '8190')
                config[0].update({k: section[k]})
            updater = put_configs(config)
            configs.update_config(cluster, config_type, updater, accessor)
            logger.info("Updated configurations for service {0}[{1}]".format(s_name, config_type))
    return
def delete_properties(cluster, config_type, args, accessor):
    logger.info('### Performing "delete":')
    if len(args) == 0:
        logger.error("Not enough arguments. Expected config key.")
        return -1
    config_name = args[0]
    logger.info('### on property "{0}"'.format(config_name))
    configs.update_config(cluster, config_type, configs.delete_specific_property(config_name), accessor)
    return 0
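# A minimal usage sketch, assuming the standard Ambari configs.py helpers
# (api_accessor, update_config, delete_specific_property) are imported as
# `configs`; the host, credentials, cluster and property names below are
# placeholders, not values taken from this module.
def example_delete_one_property():
    accessor = configs.api_accessor('ambari.example.com', 'admin', 'admin', 'http', '8080')
    return delete_properties('mycluster', 'yarn-site', ['yarn.http.policy'], accessor)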
def run(params):
    """Encodes a MIDI file into an image according to the configured style.

    :param params: a Config object containing all required parameters.
    :return: None
    """
    config_update_map = dict()
    config_update_map['input'] = params.input
    config_update_map['output'] = params.output
    conf = configs.CONFIG_MAP[params.config]
    conf = configs.update_config(conf, config_update_map)
    encoder_style = conf.encoder_style
    encoder_style.encode_midi(conf.input, conf.output, conf.style_params)
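# A usage sketch for run(): params only needs input, output and config
# attributes, so a SimpleNamespace stands in for the real Config object here.
# The file names and the 'default' config key are illustrative assumptions;
# valid keys are whatever configs.CONFIG_MAP defines.
from types import SimpleNamespace

example_params = SimpleNamespace(input='song.mid', output='song.png', config='default')
# run(example_params)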
                    help='you can choose a directory containing several frames of one video')
parser.add_argument(
    '--cfg',
    dest='yaml_file',
    default='',
    help='experiment configure file name, e.g. configs/fcos_detector.yaml',
    type=str)
parser.add_argument('opts',
                    help="Modify config options using the command-line",
                    default=None,
                    nargs=argparse.REMAINDER)
args = parser.parse_args()

if __name__ == '__main__':
    update_config(cfg, args)
    f_track = open('data/tracjectory.txt', 'a')

    # detector
    detector = FCOS(cfg)
    assert cfg.MODEL.DETECTION_WEIGHTS != ''
    load_eval_model(cfg.MODEL.DETECTION_WEIGHTS, detector)
    detector.cuda().eval()

    # appearance embedding
    emb_size = cfg.MODEL.APPEARANCE.EMB_SIZE
    emb = InceptionResnetV1(pretrained='vggface2', classify=False)
    assert cfg.MODEL.APPEARANCE.WEIGHTS != ''
    load_eval_model(cfg.MODEL.APPEARANCE.WEIGHTS, emb)
    emb.cuda().eval()

    # read test frame images
def main_per_worker(process_index, ngpus_per_node, args):
    update_config(cfg, args)

    # torch seed
    torch.cuda.manual_seed(random.random())

    # cudnn
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    # proc_rank
    proc_rank = args.rank * ngpus_per_node + process_index

    # create logger
    logger, output_dir = create_logger(cfg, proc_rank)
    # logger.info(pprint.pformat(args))
    # logger.info(cfg)

    model = get_model(cfg, cfg.MODEL.FILE, cfg.MODEL.NAME)
    emb = InceptionResnetV1(pretrained='vggface2', classify=False)
    assert cfg.MODEL.APPEARANCE.WEIGHTS != ''
    load_eval_model(cfg.MODEL.APPEARANCE.WEIGHTS, emb)
    # TODO change based on the paper
    optimizer = get_optimizer(cfg, model)
    model, optimizer, last_iter = load_checkpoint(cfg, model, optimizer)
    lr_scheduler = get_lr_scheduler(cfg, optimizer, last_iter)
    transform = FacenetInferenceTransform(size=(cfg.TRAIN.INPUT_MIN, cfg.TRAIN.INPUT_MAX))
    train_dataset = TrackletpairDataset(cfg.DATASET.ROOT, transform=transform, is_train=True)
    eval_dataset = TrackletpairDataset(cfg.DATASET.ROOT, transform=transform, is_train=False)

    # distribution
    if args.distributed:
        logger.info(
            f'Init process group: dist_url: {args.dist_url}, '
            f'world_size: {args.world_size}, '
            f'machine: {args.rank}, '
            f'rank:{proc_rank}'
        )
        dist.init_process_group(
            backend=cfg.DIST_BACKEND,
            init_method=args.dist_url,
            world_size=args.world_size,
            rank=proc_rank
        )
        torch.cuda.set_device(process_index)
        model.cuda()
        emb.cuda()
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[process_index]
        )
        emb = torch.nn.parallel.DistributedDataParallel(
            emb, device_ids=[process_index]
        )
        train_sampler = BalancedBatchSampler(train_dataset)
        batch_size = cfg.DATASET.IMG_NUM_PER_GPU
    else:
        assert proc_rank == 0, ('proc_rank != 0, it will influence '
                                'the evaluation procedure')
        model = torch.nn.DataParallel(model).cuda()
        emb = torch.nn.DataParallel(emb).cuda()
        train_sampler = BalancedBatchSampler(train_dataset)
        batch_size = cfg.DATASET.IMG_NUM_PER_GPU * ngpus_per_node

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=(train_sampler is None),
        drop_last=False,
        collate_fn=tracklet_pair_collect,
        num_workers=cfg.WORKERS,
        pin_memory=True,
        sampler=train_sampler
    )
    eval_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        collate_fn=tracklet_pair_collect,
        num_workers=cfg.WORKERS
    )
    criterion = nn.CrossEntropyLoss()
    Trainer = trackletpairConnectTrainer(
        cfg,
        model,
        optimizer,
        lr_scheduler,
        criterion,
        output_dir,
        'acc',
        last_iter,
        proc_rank,
        pre_ap_model=emb,
    )

    while True:
        Trainer.train(train_loader, eval_loader)
        # eval
        Trainer.evaluate(eval_loader)
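# The main_per_worker(process_index, ngpus_per_node, args) variants in this
# file follow the torch.multiprocessing.spawn convention: spawn supplies
# process_index as the first argument. A launcher sketch under that
# assumption (args is expected to come from this repo's parse_args):
def launch_all_workers(args):
    import torch
    import torch.multiprocessing as mp
    ngpus_per_node = torch.cuda.device_count()
    # one worker process per local GPU
    mp.spawn(main_per_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))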
def main_per_worker():
    args = parse_args()
    update_config(cfg, args)
    ngpus_per_node = torch.cuda.device_count()
    device = torch.device(cfg.DEVICE)

    if not os.path.exists(cfg.OUTPUT_ROOT):
        os.makedirs(cfg.OUTPUT_ROOT)
    logging.basicConfig(filename=f'{cfg.OUTPUT_ROOT}/eval.log', level=logging.INFO)

    # model
    module = importlib.import_module(cfg.MODEL.FILE)
    model, criterion, postprocessors = getattr(module, 'build_model')(cfg, device)
    model = torch.nn.DataParallel(model).to(device)
    model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # load model checkpoint
    resume_path = cfg.MODEL.RESUME_PATH
    if os.path.exists(resume_path):
        checkpoint = torch.load(resume_path, map_location='cpu')
        # resume
        if 'state_dict' in checkpoint:
            model.module.load_state_dict(checkpoint['state_dict'], strict=True)
            logging.info(f'==> model pretrained from {resume_path}')

    # get dataset
    module = importlib.import_module(cfg.DATASET.FILE)
    Dataset = getattr(module, cfg.DATASET.NAME)
    data_root = os.path.join(cfg.DATASET.ROOT, 'test')
    if not os.path.exists(data_root):
        logging.info(f'==> Cannot find data: {data_root}')
        raise FileNotFoundError
    eval_transform = EvalTransform(
        mean=cfg.DATASET.MEAN,
        std=cfg.DATASET.STD,
        max_size=cfg.DATASET.MAX_SIZE
    )
    logging.info(f'==> load val sub set: {data_root}')
    eval_dataset = Dataset(cfg, data_root, eval_transform)
    if eval_dataset is not None:
        logging.info(f'==> the size of eval dataset is {len(eval_dataset)}')
    eval_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=1,
        shuffle=False,
        drop_last=False,
        collate_fn=collect,
        num_workers=cfg.WORKERS
    )

    # start evaluation in Trainer
    module = importlib.import_module(cfg.TRAINER.FILE)
    Trainer = getattr(module, cfg.TRAINER.NAME)(
        cfg,
        model,
        criterion=criterion,
        optimizer=None,
        lr_scheduler=None,
        postprocessors=postprocessors,
        log_dir=cfg.OUTPUT_ROOT + '/output',
        performance_indicator=cfg.PI,
        last_iter=-1,
        rank=0,
        device=device,
        max_norm=None
    )
    logging.info(f'==> start eval...')
    assert cfg.TEST.MODE in ['hico']
    Trainer.evaluate(eval_loader, cfg.TEST.MODE)
def main_per_worker(process_index, ngpus_per_node, args):
    update_config(cfg, args)

    # torch seed
    torch.cuda.manual_seed(random.random())

    # cudnn
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    # proc_rank
    proc_rank = args.rank * ngpus_per_node + process_index

    # create logger
    logger, output_dir = create_logger(cfg, proc_rank)
    # logger.info(pprint.pformat(args))
    # logger.info(cfg)

    model = InceptionResnetV1(pretrained='vggface2', classify=False,
                              path=[cfg.MODEL.FEATURE_PATH, cfg.MODEL.LOGITS_PATH])
    optimizer = get_optimizer(cfg, model)
    model, optimizer, last_iter = load_checkpoint(cfg, model, optimizer)
    lr_scheduler = get_lr_scheduler(cfg, optimizer, last_iter)
    train_transform = FacenetTransform(size=(cfg.TRAIN.INPUT_MIN, cfg.TRAIN.INPUT_MAX))
    train_dataset = FacenetTripletDataset(cfg.DATASET.ROOT, transform=train_transform, is_train=True)
    eval_transform = FacenetTransform(size=cfg.TEST.TEST_SIZE)
    eval_dataset = FacenetTripletDataset(cfg.DATASET.ROOT, transform=eval_transform, is_train=False)

    # distribution
    if args.distributed:
        logger.info(
            f'Init process group: dist_url: {args.dist_url}, '
            f'world_size: {args.world_size}, '
            f'machine: {args.rank}, '
            f'rank:{proc_rank}'
        )
        dist.init_process_group(
            backend=cfg.DIST_BACKEND,
            init_method=args.dist_url,
            world_size=args.world_size,
            rank=proc_rank
        )
        torch.cuda.set_device(process_index)
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[process_index]
        )
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        batch_size = cfg.DATASET.IMG_NUM_PER_GPU
    else:
        assert proc_rank == 0, ('proc_rank != 0, it will influence '
                                'the evaluation procedure')
        model = torch.nn.DataParallel(model).cuda()
        train_sampler = None
        batch_size = cfg.DATASET.IMG_NUM_PER_GPU * ngpus_per_node

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=(train_sampler is None),
        drop_last=True,
        collate_fn=facenet_triplet_collect,
        num_workers=cfg.WORKERS,
        pin_memory=True,
        sampler=train_sampler
    )
    eval_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        collate_fn=facenet_triplet_collect,
        num_workers=cfg.WORKERS
    )
    criterion = triplet_loss
    Trainer = get_trainer(
        cfg,
        model,
        optimizer,
        lr_scheduler,
        criterion,
        output_dir,
        last_iter,
        proc_rank,
    )

    while True:
        Trainer.train(train_loader, eval_loader)
        # eval
        Trainer.evaluate(eval_loader)
def run(config_map,
        tf_file_reader=tf.data.TFRecordDataset,
        file_reader=tf.python_io.tf_record_iterator):
    """Load model params, save config file and start trainer.

    Args:
        config_map: Dictionary mapping configuration name to Config object.
        tf_file_reader: The tf.data.Dataset class to use for reading files.
        file_reader: The Python reader to use for reading files.

    Raises:
        ValueError: if required flags are missing or invalid.
    """
    if not FLAGS.run_dir:
        raise ValueError('Invalid run directory: %s' % FLAGS.run_dir)
    run_dir = os.path.expanduser(FLAGS.run_dir)
    train_dir = os.path.join(run_dir, 'train')

    if FLAGS.mode not in ['train', 'eval']:
        raise ValueError('Invalid mode: %s' % FLAGS.mode)

    if FLAGS.config not in config_map:
        raise ValueError('Invalid config: %s' % FLAGS.config)
    config = config_map[FLAGS.config]
    if FLAGS.hparams:
        config.hparams.parse(FLAGS.hparams)
    config_update_map = {}
    if FLAGS.examples_path:
        config_update_map['%s_examples_path' % FLAGS.mode] = os.path.expanduser(FLAGS.examples_path)
    if FLAGS.tfds_name:
        if FLAGS.examples_path:
            raise ValueError(
                'At most one of --examples_path and --tfds_name can be set.')
        config_update_map['tfds_name'] = FLAGS.tfds_name
        config_update_map['eval_examples_path'] = None
        config_update_map['train_examples_path'] = None
    config = configs.update_config(config, config_update_map)
    if FLAGS.num_sync_workers:
        config.hparams.batch_size //= FLAGS.num_sync_workers

    if FLAGS.mode == 'train':
        is_training = True
    elif FLAGS.mode == 'eval':
        is_training = False
    else:
        raise ValueError('Invalid mode: {}'.format(FLAGS.mode))

    def dataset_fn():
        return data.get_dataset(
            config,
            tf_file_reader=tf_file_reader,
            num_threads=FLAGS.num_data_threads,
            is_training=is_training,
            cache_dataset=FLAGS.cache_dataset)

    if is_training:
        train(
            train_dir,
            config=config,
            dataset_fn=dataset_fn,
            checkpoints_to_keep=FLAGS.checkpoints_to_keep,
            keep_checkpoint_every_n_hours=FLAGS.keep_checkpoint_every_n_hours,
            num_steps=FLAGS.num_steps,
            master=FLAGS.master,
            num_sync_workers=FLAGS.num_sync_workers,
            num_ps_tasks=FLAGS.num_ps_tasks,
            task=FLAGS.task)
    else:
        num_batches = FLAGS.eval_num_batches or data.count_examples(
            config.eval_examples_path,
            config.tfds_name,
            config.data_converter,
            file_reader) // config.hparams.batch_size
        eval_dir = os.path.join(run_dir, 'eval' + FLAGS.eval_dir_suffix)
        evaluate(
            train_dir,
            eval_dir,
            config=config,
            dataset_fn=dataset_fn,
            num_batches=num_batches,
            master=FLAGS.master)
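# For context, a sketch of how run() is typically wired up in a TF1-style
# entry point (assuming FLAGS, tf and configs are already imported; this
# wrapper is not part of the excerpt above):
def main(unused_argv):
    run(configs.CONFIG_MAP)


def console_entry_point():
    tf.app.run(main)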
def main_per_worker():
    args = parse_args()
    update_config(cfg, args)
    ngpus_per_node = torch.cuda.device_count()
    print(cfg.OUTPUT_ROOT)

    if 'SLURM_PROCID' in os.environ.keys():
        proc_rank = int(os.environ['SLURM_PROCID'])
        local_rank = proc_rank % ngpus_per_node
        args.world_size = int(os.environ['SLURM_NTASKS'])
    else:
        proc_rank = 0
        local_rank = 0
        args.world_size = 1

    args.distributed = (args.world_size > 1 or args.distributed)

    # create logger
    if proc_rank == 0:
        logger, output_dir = create_logger(cfg, proc_rank)

    # distribution
    if args.distributed:
        dist_url = get_ip(os.environ['SLURM_STEP_NODELIST'])
        if proc_rank == 0:
            logger.info(
                f'Init process group: dist_url: {dist_url}, '
                f'world_size: {args.world_size}, '
                f'proc_rank: {proc_rank}, '
                f'local_rank:{local_rank}'
            )
        dist.init_process_group(
            backend=cfg.DIST_BACKEND,
            init_method=dist_url,
            world_size=args.world_size,
            rank=proc_rank
        )
        torch.distributed.barrier()

        # torch seed
        seed = cfg.SEED + misc.get_rank()
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        torch.cuda.set_device(local_rank)
        device = torch.device(cfg.DEVICE)
        model, criterion, postprocessors = get_model(cfg, device)
        model.to(device)
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=True
        )
        train_dataset, eval_dataset = get_dataset(cfg)
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        batch_size = cfg.DATASET.IMG_NUM_PER_GPU
    else:
        assert proc_rank == 0, ('proc_rank != 0, it will influence '
                                'the evaluation procedure')
        # torch seed
        seed = cfg.SEED
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

        if cfg.DEVICE == 'cuda':
            torch.cuda.set_device(local_rank)
        device = torch.device(cfg.DEVICE)
        model, criterion, postprocessors = get_model(cfg, device)
        model = torch.nn.DataParallel(model).to(device)
        train_dataset, eval_dataset = get_dataset(cfg)
        train_sampler = None
        if ngpus_per_node == 0:
            batch_size = cfg.DATASET.IMG_NUM_PER_GPU
        else:
            batch_size = cfg.DATASET.IMG_NUM_PER_GPU * ngpus_per_node

    model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "rel" in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if "rel" not in n and p.requires_grad],
            "lr": cfg.TRAIN.LR_BACKBONE,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=cfg.TRAIN.LR,
                                  weight_decay=cfg.TRAIN.WEIGHT_DECAY)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, cfg.TRAIN.LR_DROP)
    model, optimizer, lr_scheduler, last_iter = load_checkpoint(cfg, model, optimizer, lr_scheduler, device)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        # shuffle=False,
        shuffle=(train_sampler is None),
        drop_last=True,
        collate_fn=collect,
        num_workers=cfg.WORKERS,
        pin_memory=True,
        sampler=train_sampler
    )
    eval_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=batch_size,
        shuffle=False,
        drop_last=False,
        collate_fn=collect,
        num_workers=cfg.WORKERS
    )
    Trainer = get_trainer(
        cfg,
        model,
        criterion=criterion,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
        postprocessors=postprocessors,
        log_dir='output',
        performance_indicator='mAP',
        last_iter=last_iter,
        rank=proc_rank,
        device=device,
        max_norm=cfg.TRAIN.CLIP_MAX_NORM
    )
    print('start training...')
    while True:
        Trainer.train(train_loader, eval_loader)
def main_per_worker(process_index, ngpus_per_node, args):
    update_config(cfg, args)

    # torch seed
    torch.cuda.manual_seed(random.random())

    # cudnn
    cudnn.benchmark = cfg.CUDNN.BENCHMARK
    torch.backends.cudnn.enabled = cfg.CUDNN.ENABLED

    # proc_rank
    proc_rank = args.rank * ngpus_per_node + process_index

    # create logger
    logger, output_dir = create_logger(cfg, proc_rank)
    # logger.info(pprint.pformat(args))
    # logger.info(cfg)

    model = get_model(cfg, cfg.MODEL.FILE, cfg.MODEL.NAME)
    optimizer = get_optimizer(cfg, model)
    model, optimizer, last_iter = load_checkpoint(cfg, model, optimizer)
    lr_scheduler = get_lr_scheduler(cfg, optimizer, last_iter)
    train_dataset, eval_dataset = get_dataset(cfg)

    # distribution
    if args.distributed:
        logger.info(f'Init process group: dist_url: {args.dist_url}, '
                    f'world_size: {args.world_size}, '
                    f'machine: {args.rank}, '
                    f'rank:{proc_rank}')
        dist.init_process_group(backend=cfg.DIST_BACKEND,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=proc_rank)
        torch.cuda.set_device(process_index)
        model.cuda()
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[process_index])
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
        batch_size = cfg.DATASET.IMG_NUM_PER_GPU
    else:
        assert proc_rank == 0, ('proc_rank != 0, it will influence '
                                'the evaluation procedure')
        model = torch.nn.DataParallel(model).cuda()
        train_sampler = None
        batch_size = cfg.DATASET.IMG_NUM_PER_GPU * ngpus_per_node

    print('BATCH_SIZE: ', batch_size)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=(train_sampler is None),
                                               drop_last=True,
                                               collate_fn=objtrack_collect,
                                               num_workers=cfg.WORKERS,
                                               pin_memory=True,
                                               sampler=train_sampler)
    eval_loader = torch.utils.data.DataLoader(eval_dataset,
                                              batch_size=batch_size,
                                              shuffle=False,
                                              drop_last=False,
                                              collate_fn=objtrack_collect,
                                              num_workers=cfg.WORKERS)
    criterion = get_det_criterion(cfg)
    Trainer = get_trainer(
        cfg,
        model,
        optimizer,
        lr_scheduler,
        criterion,
        output_dir,
        last_iter,
        proc_rank,
    )

    while True:
        Trainer.train(train_loader, eval_loader)