# ----- Example 1 -----
def evaluate(args):
    """Detect one face in ``args.image`` and predict its facial landmarks.

    Loads the snapshot at ``args.model``, runs the face detector and the
    landmark network on GPU, prints each predicted point with its score,
    then shows and saves a visualisation next to the input image.

    Args:
        args: namespace with ``image`` (input picture path), ``model``
            (snapshot path) and ``face_detector`` attributes.
    """
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    print('The image is {:}'.format(args.image))
    print('The model is {:}'.format(args.model))
    snapshot = Path(args.model)
    # BUG FIX: the placeholder was never filled, so failures printed a literal '{:}'
    assert snapshot.exists(), 'The model path {:} does not exist'.format(snapshot)
    facebox = face_detect(args.image, args.face_detector)

    print('The face bounding box is {:}'.format(facebox))
    assert len(facebox) == 4, 'Invalid face input : {:}'.format(facebox)
    snapshot = torch.load(str(snapshot))

    # General data augmentation: ImageNet normalisation statistics
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # the snapshot stores the training-time arguments it was built with
    param = snapshot['args']
    eval_transform = transforms.Compose(
        [transforms.PreCrop(param.pre_crop_expand), transforms.TrainScale2WH((param.crop_width, param.crop_height)),
         transforms.ToTensor(), normalize])
    model_config = load_configure(param.model_config, None)
    dataset = GeneralDataset(eval_transform, param.sigma, model_config.downsample, param.heatmap_type, param.data_indicator)
    dataset.reset(param.num_pts)

    # +1 output channel: the extra point is a background/auxiliary channel
    net = obtain_model(model_config, param.num_pts + 1)
    net = net.cuda()
    weights = remove_module_dict(snapshot['state_dict'])
    net.load_state_dict(weights)
    print('Prepare input data')
    [image, _, _, _, _, _, cropped_size], meta = dataset.prepare_input(args.image, facebox)
    inputs = image.unsqueeze(0).cuda()
    # network forward
    with torch.no_grad():
        batch_heatmaps, batch_locs, batch_scos = net(inputs)
    # obtain the locations on the image in the original size
    cpu = torch.device('cpu')
    np_batch_locs, np_batch_scos, cropped_size = batch_locs.to(cpu).numpy(), batch_scos.to(
        cpu).numpy(), cropped_size.numpy()
    # drop the last (auxiliary) point; give scores a trailing axis for concat
    locations, scores = np_batch_locs[0, :-1, :], np.expand_dims(np_batch_scos[0, :-1], -1)

    scale_h, scale_w = cropped_size[0] * 1. / inputs.size(-2), cropped_size[1] * 1. / inputs.size(-1)

    # map network coordinates back into the original image frame;
    # cropped_size[2:4] presumably holds the crop offsets -- TODO confirm
    locations[:, 0], locations[:, 1] = locations[:, 0] * scale_w + cropped_size[2], locations[:, 1] * scale_h + \
                                       cropped_size[3]
    prediction = np.concatenate((locations, scores), axis=1).transpose(1, 0)

    print('the coordinates for {:} facial landmarks:'.format(param.num_pts))
    for i in range(param.num_pts):
        point = prediction[:, i]
        print('the {:02d}/{:02d}-th point : ({:.1f}, {:.1f}), score = {:.2f}'.format(i + 1, param.num_pts, float(point[0]),
                                                                                     float(point[1]), float(point[2])))
    image = draw_image_by_points(args.image, prediction, 2, (255, 0, 0), facebox, None, None)
    image.show()
    # ROBUSTNESS: strip only the final suffix instead of splitting at the first
    # dot, which broke for dotted directories or multi-dot file names
    image.save(str(Path(args.image).with_suffix('')) + '_result.jpg')
# ----- Example 2 -----
def build_transforms(config):
    """Create the training and test image-transform pipelines.

    Training adds random cropping, horizontal flipping and random erasing;
    test only resizes. Both normalise with ImageNet statistics.

    Returns:
        (transform_train, transform_test) pair of ``T.Compose`` pipelines.
    """
    train_steps = [
        T.RandomCroping(config.DATA.HEIGHT,
                        config.DATA.WIDTH,
                        p=config.AUG.RC_PROB),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        T.RandomErasing(probability=config.AUG.RE_PROB),
    ]

    test_steps = [
        T.Resize((config.DATA.HEIGHT, config.DATA.WIDTH)),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]

    return T.Compose(train_steps), T.Compose(test_steps)
# ----- Example 3 -----
def Generate_transform_Dict(origin_width=256,
                            width=227,
                            ratio=0.16,
                            rot=0,
                            args=None):
    """Build the dict of image-transform pipelines used by the project.

    ResNet-style networks get RGB ImageNet normalisation; every other net
    gets a BGR conversion plus Caffe-style mean subtraction with 1/255 std.

    Args:
        origin_width: resize target applied before cropping.
        width: final crop / resize size.
        ratio: lower bound of the random-resized-crop area scale.
        rot: unused here; kept for interface compatibility.
        args: optional namespace whose ``net`` attribute selects the
            normalisation branch.

    Returns:
        dict with 'rand-crop', 'center-crop' and 'resize' pipelines.
    """
    # NOTE: unused local ``std_value`` removed (was never read)
    if (args is not None) and ("ResNet" in args.net):
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        cc = []
    else:
        # Caffe-style BGR means, expressed in the [0, 1] pixel range
        normalize = transforms.Normalize(
            mean=[104 / 255.0, 117 / 255.0, 128 / 255.0],
            std=[1.0 / 255, 1.0 / 255, 1.0 / 255])
        print("bgr init")
        cc = [transforms.CovertBGR()]

    transform_dict = {
        'rand-crop': transforms.Compose(cc + [
            transforms.Resize((origin_width)),
            transforms.RandomResizedCrop(scale=(ratio, 1), size=width),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]),
        'center-crop': transforms.Compose(cc + [
            transforms.Resize((origin_width)),
            transforms.CenterCrop(width),
            transforms.ToTensor(),
            normalize,
        ]),
        'resize': transforms.Compose(cc + [
            transforms.Resize((width)),
            transforms.ToTensor(),
            normalize,
        ]),
    }
    return transform_dict
    def __init__(
            self,
            file_name,
            sequence_len: int,
            hop: int,
            sr: int = 44100,
            fft_size: int = 4096,
            fft_hop: int = 441,
            n_freq_bins: int = 256,
            freq_compression: str = "linear",
            f_min: int = 200,
            f_max: int = 18000,
            cache_dir=None  # added
    ):
        """Load the audio and build one cached end-to-end spectrogram transform.

        Args:
            file_name: path of the audio file to load.
            sequence_len: length of one sample sequence (in frames).
            hop: hop between consecutive sequences.
            sr: target sample rate for loading.
            fft_size: STFT window size.
            fft_hop: STFT hop length.
            n_freq_bins: number of frequency bins after compression.
            freq_compression: one of "linear", "mel" or "mfcc".
            f_min: lower frequency bound of the compression band.
            f_max: upper frequency bound of the compression band.
            cache_dir: directory used by ``T.CachedSpectrogram``.

        Raises:
            ValueError: if ``freq_compression`` is not recognised.
        """
        self.sequence_len = sequence_len
        self.hop = hop

        self.audio = T.load_audio_file(file_name, sr=sr, mono=True)
        self.n_frames = self.audio.shape[1]

        # pipeline is collected as a list first, then composed and cached below
        self.t = [
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(fft_size, fft_hop, center=False),
        ]

        if freq_compression == "linear":
            self.t.append(T.Interpolate(n_freq_bins, sr, f_min, f_max))
        elif freq_compression == "mel":
            self.t.append(
                T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
        elif freq_compression == "mfcc":
            t_mel = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
            self.t.append(T.Compose(t_mel, T.M2MFCC()))
        else:
            # BUG FIX: raising a plain string is a TypeError in Python 3
            raise ValueError("Undefined frequency compression")
        self.t.append(
            T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"]))
        self.t.append(
            T.Normalize(
                min_level_db=DefaultSpecDatasetOps["min_level_db"],
                ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
            ))

        # NOTE(review): the transform is wrapped in CachedSpectrogram even when
        # cache_dir is None -- confirm CachedSpectrogram tolerates that.
        self.t = T.CachedSpectrogram(
            cache_dir=cache_dir,
            spec_transform=T.Compose(self.t),
            n_fft=fft_size,
            hop_length=fft_hop,
        )
    def __init__(
        self,
        file_name,
        sequence_len: int,
        hop: int,
        sr: int = 44100,
        fft_size: int = 4096,
        fft_hop: int = 441,
        n_freq_bins: int = 256,
        freq_compression: str = "linear",
        f_min: int = 200,
        f_max: int = 18000,
        center=True
    ):
        """Load the audio and build the spectrogram stages as separate attrs.

        Unlike the fully-composed variant, the stages are kept individually
        (``spec_transforms``, ``t_compr_f``, ``t_compr_a``, ``t_norm``) so the
        caller can apply them one at a time.

        Args:
            file_name: path of the audio file to load.
            sequence_len: length of one sample sequence (in frames).
            hop: hop between consecutive sequences.
            sr: target sample rate for loading.
            fft_size: STFT window size.
            fft_hop: STFT hop length.
            n_freq_bins: number of frequency bins after compression.
            freq_compression: one of "linear", "mel" or "mfcc".
            f_min: lower frequency bound of the compression band.
            f_max: upper frequency bound of the compression band.
            center: forwarded to ``T.Spectrogram``.

        Raises:
            ValueError: if ``freq_compression`` is not recognised.
        """

        self.sp = signal.signal_proc()

        self.hop = hop
        self.center = center
        self.filename = file_name
        self.sequence_len = sequence_len
        self.audio = T.load_audio_file(file_name, sr=sr, mono=True)
        self.n_frames = self.audio.shape[1]

        spec_t = [
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(fft_size, fft_hop, center=self.center),
        ]

        self.spec_transforms = T.Compose(spec_t)

        if freq_compression == "linear":
            self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
        elif freq_compression == "mel":
            self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        elif freq_compression == "mfcc":
            t_mel = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
            self.t_compr_f = T.Compose(t_mel, T.M2MFCC())
        else:
            # BUG FIX: raising a plain string is a TypeError in Python 3
            raise ValueError("Undefined frequency compression")

        self.t_compr_a = T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])

        self.t_norm = T.Normalize(
                min_level_db=DefaultSpecDatasetOps["min_level_db"],
                ref_level_db=DefaultSpecDatasetOps["ref_level_db"]
        )
    def __init__(self,
                 file_name,
                 sequence_len: int,
                 hop: int,
                 sr: int = 44100,
                 fft_size: int = 4096,
                 fft_hop: int = 441,
                 n_freq_bins: int = 256,
                 freq_compression: str = "linear",
                 f_min: int = 200,
                 f_max: int = 18000):
        """Load the audio file and compose the full spectrogram transform.

        The final ``self.t`` maps raw audio to a normalised, dB-scaled,
        frequency-compressed spectrogram.

        Args:
            file_name: path of the audio file to load.
            sequence_len: length of one sample sequence (in frames).
            hop: hop between consecutive sequences.
            sr: target sample rate for loading.
            fft_size: STFT window size.
            fft_hop: STFT hop length.
            n_freq_bins: number of frequency bins after compression.
            freq_compression: one of "linear", "mel" or "mfcc".
            f_min: lower frequency bound of the compression band.
            f_max: upper frequency bound of the compression band.

        Raises:
            ValueError: if ``freq_compression`` is not recognised.
        """
        self.sequence_len = sequence_len
        self.hop = hop

        self.audio = T.load_audio_file(file_name, sr=sr, mono=True)
        self.n_frames = self.audio.shape[
            1]  # total num of samples in the audio (transposed mono)

        self.t = [
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(fft_size, fft_hop, center=False),
        ]

        if freq_compression == "linear":
            self.t.append(T.Interpolate(n_freq_bins, sr, f_min, f_max))
        elif freq_compression == "mel":
            self.t.append(
                T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
        elif freq_compression == "mfcc":
            t_mel = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
            self.t.append(T.Compose(t_mel, T.M2MFCC()))
        else:
            # BUG FIX: raising a plain string is a TypeError in Python 3
            raise ValueError("Undefined frequency compression")
        self.t.append(
            T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"]))
        self.t.append(
            T.Normalize(
                min_level_db=DefaultSpecDatasetOps["min_level_db"],
                ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
            ))

        self.t = T.Compose(self.t)
def get_transform(train, dataset_name):
    """Assemble the segmentation transform pipeline for one dataset/split.

    Chooses resize/crop/flip steps from the config and the ``train`` flag,
    remaps labels for the given dataset, converts to tensor and normalises
    according to the configured image mode (BGR or RGB).
    """
    base_size = cfg.DATA_TRANSFORM.LOADSIZE
    crop_size = cfg.DATA_TRANSFORM.CROPSIZE
    ignore_label = cfg.DATASET.IGNORE_LABEL

    is_source = dataset_name == cfg.DATASET.SOURCE
    input_size = (cfg.DATA_TRANSFORM.INPUT_SIZE_S if is_source
                  else cfg.DATA_TRANSFORM.INPUT_SIZE_T)

    # NOTE(review): both branches of the min_size factor are 1.0 -- kept as-is
    min_size = int((1.0 if train else 1.0) * base_size)
    max_size = int((1.3 if train else 1.0) * base_size)

    steps = []
    if cfg.DATA_TRANSFORM.RANDOM_RESIZE_AND_CROP:
        if train:
            steps.extend([
                T.RandomResize(min_size, max_size),
                T.RandomHorizontalFlip(0.5),
                T.RandomCrop(crop_size, ignore_label=ignore_label),
            ])
        else:
            steps.append(T.Resize(cfg.DATA_TRANSFORM.INPUT_SIZE_T, True))
    elif train:
        steps.extend([T.Resize(input_size), T.RandomHorizontalFlip(0.5)])
    else:
        steps.append(T.Resize(input_size, True))

    mapping = get_label_map(cfg.DATASET.SOURCE, cfg.DATASET.TARGET)
    steps.append(T.LabelRemap(mapping[dataset_name]))
    steps.append(T.ToTensor(cfg.DATASET.IMG_MODE))

    if cfg.DATASET.IMG_MODE == "BGR":
        # Caffe-style BGR means, no scaling
        mean = (104.00698793, 116.66876762, 122.67891434)
        std = (1.0, 1.0, 1.0)
    else:
        # ImageNet RGB statistics
        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)
    steps.append(T.Normalize(mean, std))

    return T.Compose(steps)
def get_transform(dataset_name):
    """Build the evaluation transform pipeline for ``dataset_name``.

    Resizes to the configured target input size, remaps labels for the
    dataset, converts to tensor and normalises according to the image mode.
    """
    # NOTE: unused locals (base_size, ignore_label, min_size, max_size) removed
    transforms = []
    transforms.append(T.Resize(cfg.DATA_TRANSFORM.INPUT_SIZE_T, True))

    mapping = get_label_map(cfg.DATASET.SOURCE, cfg.DATASET.TARGET)
    transforms.append(T.LabelRemap(mapping[dataset_name]))
    transforms.append(T.ToTensor(cfg.DATASET.IMG_MODE))
    if cfg.DATASET.IMG_MODE == "BGR":
        # Caffe-style BGR means, no scaling
        mean = (104.00698793, 116.66876762, 122.67891434)
        std = (1.0, 1.0, 1.0)
    else:
        # ImageNet RGB statistics
        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)
    transforms.append(T.Normalize(mean, std))

    return T.Compose(transforms)
# ----- Example 9 -----
def main(args):
    """Train a facial-landmark detector end-to-end and evaluate each epoch.

    Builds augmentation pipelines, datasets/loaders, the network, optimizer
    and scheduler from ``args`` plus the model/optimizer config files,
    optionally resumes from the last checkpoint, then runs the train/eval
    loop, saving a checkpoint after every epoch.
    """
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    prepare_seed(args.rand_seed)
    logstr = 'seed-{:}-time-{:}'.format(args.rand_seed, time_for_file())
    logger = Logger(args.save_path, logstr)
    logger.log('Main Function with logger : {:}'.format(logger))
    logger.log('Arguments : -------------------------------')
    for name, value in args._get_kwargs():
        logger.log('{:16} : {:}'.format(name, value))
    # record the environment for reproducibility
    logger.log("Python  version : {}".format(sys.version.replace('\n', ' ')))
    logger.log("Pillow  version : {}".format(PIL.__version__))
    logger.log("PyTorch version : {}".format(torch.__version__))
    logger.log("cuDNN   version : {}".format(torch.backends.cudnn.version()))

    # General data augmentation: ImageNet statistics; mean_fill pads crop borders
    mean_fill = tuple([int(x * 255) for x in [0.485, 0.456, 0.406]])
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    assert args.arg_flip == False, 'The flip is : {}, rotate is {}'.format(args.arg_flip, args.rotate_max)
    # training pipeline: pre-crop, fixed rescale, then random scale/rotate/crop
    train_transform = [transforms.PreCrop(args.pre_crop_expand)]
    train_transform += [transforms.TrainScale2WH((args.crop_width, args.crop_height))]
    train_transform += [transforms.AugScale(args.scale_prob, args.scale_min, args.scale_max)]
    # if args.arg_flip:
    #  train_transform += [transforms.AugHorizontalFlip()]
    if args.rotate_max:
        train_transform += [transforms.AugRotate(args.rotate_max)]
    train_transform += [transforms.AugCrop(args.crop_width, args.crop_height, args.crop_perturb_max, mean_fill)]
    train_transform += [transforms.ToTensor(), normalize]
    train_transform = transforms.Compose(train_transform)

    # evaluation pipeline: deterministic (no random augmentation)
    eval_transform = transforms.Compose(
        [transforms.PreCrop(args.pre_crop_expand), transforms.TrainScale2WH((args.crop_width, args.crop_height)),
         transforms.ToTensor(), normalize])
    assert (args.scale_min + args.scale_max) / 2 == args.scale_eval, 'The scale is not ok : {},{} vs {}'.format(
        args.scale_min, args.scale_max, args.scale_eval)

    # Model Configure Load
    model_config = load_configure(args.model_config, logger)
    # sigma is defined at the evaluation scale, so rescale it accordingly
    args.sigma = args.sigma * args.scale_eval
    logger.log('Real Sigma : {:}'.format(args.sigma))

    # Training Dataset
    train_data = GeneralDataset(train_transform, args.sigma, model_config.downsample, args.heatmap_type, args.data_indicator)
    train_data.load_list(args.train_lists, args.num_pts, True)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
                                               shuffle=True,num_workers=args.workers,
                                               pin_memory=True)
    # Evaluation Dataloader: one loader per image list; the False flag marks
    # the loader as image-based (not video)
    eval_loaders = []

    if args.eval_ilists is not None:
        for eval_ilist in args.eval_ilists:
            eval_idata = GeneralDataset(eval_transform, args.sigma, model_config.downsample, args.heatmap_type,
                                 args.data_indicator)
            eval_idata.load_list(eval_ilist, args.num_pts, True)
            eval_iloader = torch.utils.data.DataLoader(eval_idata, batch_size=args.batch_size, shuffle=False,
                                                       num_workers=args.workers, pin_memory=True)
            eval_loaders.append((eval_iloader, False))

    # Define network (+1 output channel for the auxiliary/background point)
    logger.log('configure : {:}'.format(model_config))
    net = obtain_model(model_config, args.num_pts + 1)

    assert model_config.downsample == net.downsample, 'downsample is not correct : {} vs {}'.format(
        model_config.downsample, net.downsample)
    logger.log("=> network :\n {}".format(net))

    logger.log('Training-data : {:}'.format(train_data))
    for i, eval_loader in enumerate(eval_loaders):
        eval_loader, is_video = eval_loader
        logger.log('The [{:2d}/{:2d}]-th testing-data [{:}] = {:}'.format(i, len(eval_loaders),
                                                                          'video' if is_video else 'image',
                                                                          eval_loader.dataset))
    logger.log('arguments : {:}'.format(args))
    opt_config = load_configure(args.opt_config, logger)

    # some models expose per-parameter LR/decay groups
    if hasattr(net, 'specify_parameter'):
        net_param_dict = net.specify_parameter(opt_config.LR, opt_config.Decay)
    else:
        net_param_dict = net.parameters()

    optimizer, scheduler, criterion = obtain_optimizer(net_param_dict, opt_config, logger)
    logger.log('criterion : {:}'.format(criterion))
    net, criterion = net.cuda(), criterion.cuda()
    net = torch.nn.DataParallel(net)

    # resume from the last checkpoint if the last-info file exists
    last_info = logger.last_info()
    if last_info.exists():
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(last_info))
        last_info = torch.load(str(last_info))
        start_epoch = last_info['epoch'] + 1
        checkpoint = torch.load(last_info['last_checkpoint'])
        assert last_info['epoch'] == checkpoint['epoch'], 'Last-Info is not right {:} vs {:}'.format(last_info,
                                                                                                     checkpoint[
                                                                                                         'epoch'])
        net.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        logger.log("=> load-ok checkpoint '{:}' (epoch {:}) done".format(logger.last_info(), checkpoint['epoch']))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch = 0

    # evaluation-only mode: run one evaluation pass and exit
    if args.eval_once:
        logger.log("=> only evaluate the model once")
        eval_results = eval_all(args, eval_loaders, net, criterion, 'eval-once', logger, opt_config)
        logger.close()
        return

    # Main Training and Evaluation Loop
    start_time = time.time()
    epoch_time = AverageMeter()
    for epoch in range(start_epoch, opt_config.epochs):
        # NOTE(review): scheduler.step() before train() is the pre-1.1 PyTorch
        # ordering -- confirm it matches the installed torch version.
        scheduler.step()
        need_time = convert_secs2time(epoch_time.avg * (opt_config.epochs - epoch), True)
        epoch_str = 'epoch-{:03d}-{:03d}'.format(epoch, opt_config.epochs)
        LRs = scheduler.get_lr()
        logger.log('\n==>>{:s} [{:s}], [{:s}], LR : [{:.5f} ~ {:.5f}], Config : {:}'.format(time_string(), epoch_str,
                                                                                            need_time, min(LRs),
                                                                                            max(LRs), opt_config))

        # train for one epoch
        train_loss, train_nme = train(args, train_loader, net, criterion,
                                      optimizer, epoch_str, logger, opt_config)
        # log the results
        logger.log(
            '==>>{:s} Train [{:}] Average Loss = {:.6f}, NME = {:.2f}'.format(time_string(), epoch_str, train_loss,
                                                                              train_nme * 100))

        # remember best prec@1 and save checkpoint
        save_path = save_checkpoint({
            'epoch': epoch,
            'args': deepcopy(args),
            'arch': model_config.arch,
            'state_dict': net.state_dict(),
            'scheduler': scheduler.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, str(logger.path('model') / '{:}-{:}.pth'.format(model_config.arch, epoch_str)), logger)

        # the last-info file points at the newest checkpoint for resuming
        last_info = save_checkpoint({
            'epoch': epoch,
            'last_checkpoint': save_path,
        }, str(logger.last_info()), logger)

        eval_results = eval_all(args, eval_loaders, net, criterion, epoch_str, logger, opt_config)

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.close()
# ----- Example 10 -----
    def __init__(self,
                 file_names: Iterable[str],
                 working_dir=None,
                 cache_dir=None,
                 sr=44100,
                 n_fft=4096,
                 hop_length=441,
                 freq_compression="linear",
                 n_freq_bins=256,
                 f_min=0,
                 f_max=18000,
                 seq_len=128,
                 augmentation=False,
                 noise_files=None,
                 min_max_normalize=False,
                 *args,
                 **kwargs):
        """Initialise the audio dataset and its spectrogram transform chain.

        Args:
            file_names: audio file names; 'call'/'noise' in a name labels it.
            working_dir: base directory, forwarded to the parent class.
            cache_dir: if given, spectrograms are cached there on disk.
            sr: target sample rate.
            n_fft: STFT window size.
            hop_length: STFT hop length.
            freq_compression: one of "linear", "mel" or "mfcc".
            n_freq_bins: number of frequency bins after compression.
            f_min: lower frequency bound of the compression band.
            f_max: upper frequency bound of the compression band.
            seq_len: subsequence length sampled from each spectrogram.
            augmentation: enable amplitude/time/pitch/noise augmentation.
            noise_files: optional noise files for additive-noise augmentation.
            min_max_normalize: use min-max instead of dB normalisation.

        Raises:
            ValueError: if ``freq_compression`` is not recognised.
        """
        super().__init__(file_names, working_dir, sr, *args, **kwargs)
        # FIX: mutable default argument replaced by None-sentinel
        noise_files = [] if noise_files is None else noise_files
        if self.dataset_name is not None:
            self._logger.info("Init dataset {}...".format(self.dataset_name))

        self.n_fft = n_fft
        self.hop_length = hop_length
        self.f_min = f_min
        self.f_max = f_max

        valid_freq_compressions = ["linear", "mel", "mfcc"]

        if freq_compression not in valid_freq_compressions:
            # BUG FIX: previously passed two arguments to ValueError and
            # misused the builtin format() -- the message was never formatted
            # and the misuse itself raised TypeError.
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of {}".format(
                    freq_compression, valid_freq_compressions))
        self.freq_compression = freq_compression

        self.possible_call_labels = re.compile("|".join(["call"]))
        self.possible_nocall_labels = re.compile("|".join(["noise"]))

        self._logger.debug("Number of files : {}".format(len(self.file_names)))

        # count labelled calls vs noise for logging only
        _n_calls = 0
        for f in self.file_names:
            if self.is_call(f):
                _n_calls += 1

        self._logger.debug("Number of calls: {}".format(_n_calls))
        self._logger.debug(
            "Number of noise: {}".format(len(self.file_names) - _n_calls))

        self.augmentation = augmentation

        spec_transforms = [
            lambda fn: T.load_audio_file(fn, sr=sr),
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(n_fft, hop_length, center=False),
        ]
        self.file_reader = AsyncFileReader()
        # cache spectrograms on disk only when a cache directory is given
        if cache_dir is None:
            self.t_spectrogram = T.Compose(spec_transforms)
        else:
            self.t_spectrogram = T.CachedSpectrogram(
                cache_dir=cache_dir,
                spec_transform=T.Compose(spec_transforms),
                n_fft=n_fft,
                hop_length=hop_length,
                file_reader=AsyncFileReader(),
            )
        if augmentation:
            self._logger.debug(
                "Init augmentation transforms for time and pitch shift")
            self.t_amplitude = T.RandomAmplitude(3, -6)
            self.t_timestretch = T.RandomTimeStretch()
            self.t_pitchshift = T.RandomPitchSift()
        else:
            self._logger.debug("Running without augmentation")
        if self.freq_compression == "linear":
            self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
        elif self.freq_compression == "mel":
            self.t_compr_f = T.F2M(sr=sr,
                                   n_mels=n_freq_bins,
                                   f_min=f_min,
                                   f_max=f_max)
        elif self.freq_compression == "mfcc":
            self.t_compr_f = T.Compose(
                T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
            self.t_compr_mfcc = T.M2MFCC(n_mfcc=32)
        else:
            # BUG FIX: raising a plain string is a TypeError in Python 3;
            # unreachable after the validation above, kept for safety
            raise ValueError("Undefined frequency compression")
        if augmentation:
            if noise_files:
                self._logger.debug(
                    "Init augmentation transform for random noise addition")
                self.t_addnoise = T.RandomAddNoise(
                    noise_files,
                    self.t_spectrogram,
                    T.Compose(self.t_timestretch, self.t_pitchshift,
                              self.t_compr_f),
                    min_length=seq_len,
                    return_original=True)
            else:
                self.t_addnoise = None
        self.t_compr_a = T.Amp2Db(
            min_level_db=DefaultSpecDatasetOps["min_level_db"])

        if min_max_normalize:
            self.t_norm = T.MinMaxNormalize()
            self._logger.debug("Init min-max-normalization activated")
        else:
            self.t_norm = T.Normalize(
                min_level_db=DefaultSpecDatasetOps["min_level_db"],
                ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
            )
            self._logger.debug("Init 0/1-dB-normalization activated")

        self.t_subseq = T.PaddedSubsequenceSampler(seq_len,
                                                   dim=1,
                                                   random=augmentation)
# CLI: allow resuming from a checkpoint file
parser.add_argument('--resume', type=str, default=None,
                    help='put the path to resuming file if needed')
args = parser.parse_args()

# NOTE(review): 'arc' may be a typo for 'arch' -- confirm against the
# argument registered on the parser above (not visible here).
args.checkname = args.arc

# Define Saver
saver = Saver(args)
saver.save_experiment_config()

# Define Tensorboard Summary
summary = TensorboardSummary(saver.experiment_dir)
writer = summary.create_summary()

# Data: ImageNet normalisation; random crop/flip for train, center crop for val
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
train_trans = transforms.Compose([transforms.Resize(321),
                                  transforms.RandomCrop(224),
                                  transforms.RandomHorizontalFlip(),
                                  transforms.ToTensor(),
                                  normalize,
                                  ])
val_trans = transforms.Compose([transforms.Resize(321),
                                transforms.CenterCrop(224),
                                transforms.ToTensor(),
                                normalize,
                                ])
# VOC + SBD classification dataset; paths are placeholders to be configured
train_ds = VOCSBDClassification('/path/to/VOC',
                                '/path/to/SBD/benchmark_RELEASE/dataset',
                                transform=train_trans, image_set='train')
train_dl = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, num_workers=4, drop_last=True)
    def __init__(
            self,
            file_names: Iterable[str],
            working_dir=None,
            cache_dir=None,
            sr=44100,
            n_fft=2048,  #4096
            hop_length=220,  #441
            freq_compression="linear",
            n_freq_bins=256,  # determines the width of the image
            f_min=0,
            f_max=18000,
            seq_len=128,  # shd be adjusted together with sequence_len in class StridedAudioDataset (called by predict.py)
            augmentation=False,
            noise_files=None,
            *args,
            **kwargs):
        """Initialise the audio dataset and its spectrogram transform chain.

        Args:
            file_names: audio file names; 'call'/'noise' in a name labels it.
            working_dir: base directory, forwarded to the parent class.
            cache_dir: if given, spectrograms are cached there on disk.
            sr: target sample rate.
            n_fft: STFT window size.
            hop_length: STFT hop length.
            freq_compression: one of "linear", "mel" or "mfcc".
            n_freq_bins: number of frequency bins after compression.
            f_min: lower frequency bound of the compression band.
            f_max: upper frequency bound of the compression band.
            seq_len: subsequence length sampled from each spectrogram.
            augmentation: enable amplitude/time/pitch/noise augmentation.
            noise_files: optional noise files for additive-noise augmentation.

        Raises:
            ValueError: if ``freq_compression`` is not recognised.
        """
        super().__init__(file_names, working_dir, sr, *args, **kwargs)
        # FIX: mutable default argument replaced by None-sentinel
        noise_files = [] if noise_files is None else noise_files
        if self.dataset_name is not None:
            self._logger.info("Init dataset {}...".format(self.dataset_name))

        self.n_fft = n_fft
        self.hop_length = hop_length
        self.f_min = f_min
        self.f_max = f_max

        # mel: log transformation of freq (Hz scale to Mel scale)
        # attention: Mel-spectrograms as a network input led to an excessive
        # loss of resolution in higher frequency bands, which was a big problem
        # considering the high-frequency pulsed calls and whistles.
        valid_freq_compressions = ["linear", "mel", "mfcc"]

        if freq_compression not in valid_freq_compressions:
            # BUG FIX: previously passed two arguments to ValueError and
            # misused the builtin format() -- the message was never formatted
            # and the misuse itself raised TypeError.
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of {}".format(
                    freq_compression, valid_freq_compressions))
        self.freq_compression = freq_compression

        # combine a RegExp pattern into pattern objects for pattern matching
        self.possible_call_labels = re.compile("|".join(["call"]))
        self.possible_nocall_labels = re.compile("|".join(["noise"]))

        self._logger.debug("Number of files : {}".format(len(self.file_names)))

        _n_calls = 0
        for f in self.file_names:
            if self.is_call(f):
                _n_calls += 1

        self._logger.debug("Number of calls: {}".format(_n_calls))
        self._logger.debug(
            "Number of noise: {}".format(len(self.file_names) - _n_calls))

        self.augmentation = augmentation

        spec_transforms = [
            lambda fn: T.load_audio_file(fn, sr=sr),  # return: a vector tensor
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(n_fft, hop_length, center=False),
        ]
        self.file_reader = AsyncFileReader()
        # if user chooses to not cache .spec by omitting the directory
        if cache_dir is None:
            self.t_spectrogram = T.Compose(spec_transforms)
        else:
            # where .spec is created and stored
            # n_fft, hop_length: meta in spec_dict
            self.t_spectrogram = T.CachedSpectrogram(
                cache_dir=cache_dir,
                spec_transform=T.Compose(spec_transforms),
                n_fft=n_fft,
                hop_length=hop_length,
                file_reader=AsyncFileReader(),
            )
        if augmentation:
            self._logger.debug(
                "Init augmentation transforms for time and pitch shift")
            self.t_amplitude = T.RandomAmplitude(3, -6)
            self.t_timestretch = T.RandomTimeStretch()
            self.t_pitchshift = T.RandomPitchSift()
        else:
            self._logger.debug("Running without augmentation")
        if self.freq_compression == "linear":
            self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
        elif self.freq_compression == "mel":
            self.t_compr_f = T.F2M(sr=sr,
                                   n_mels=n_freq_bins,
                                   f_min=f_min,
                                   f_max=f_max)
        elif self.freq_compression == "mfcc":
            self.t_compr_f = T.Compose(
                T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max),
                T.M2MFCC())
        else:
            # BUG FIX: raising a plain string is a TypeError in Python 3;
            # unreachable after the validation above, kept for safety
            raise ValueError("Undefined frequency compression")
        if augmentation:
            if noise_files:
                self._logger.debug(
                    "Init augmentation transform for random noise addition")
                self.t_addnoise = T.RandomAddNoise(
                    noise_files,
                    self.t_spectrogram,
                    T.Compose(self.t_timestretch, self.t_pitchshift,
                              self.t_compr_f),
                    min_length=seq_len,
                    return_original=True
                )  # if return_original = True, both augmented and original specs are returned
            else:
                self.t_addnoise = None
        self.t_compr_a = T.Amp2Db(
            min_level_db=DefaultSpecDatasetOps["min_level_db"])
        self.t_norm = T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        )
        self.t_subseq = T.PaddedSubsequenceSampler(seq_len,
                                                   dim=1,
                                                   random=augmentation)
# ----- Example 13 -----
def main():
    """Train a CNN denoiser on simulated-noise knee MRI data.

    Runs a train/validation loop for ``num_epochs`` epochs, logs losses and
    example images to TensorBoard, and writes a checkpoint whenever the
    validation loss improves on the best value seen so far.
    """
    print('starting denoising')

    noise_sigma = 4e-5  # sigma for the noise simulation
    batch_size = 8  # number of images to run for each minibatch
    num_epochs = 200  # number of epochs to train
    validation_seed = 15  # rng seed for validation loop
    log_dir = 'logs/denoise/'  # log dir for models and tensorboard
    device = torch.device('cpu')  # model will run on this device
    dtype = torch.float  # dtype for data and model

    # set up tensorboard
    writer = SummaryWriter(log_dir=log_dir)

    # checkpoint file name
    # bug fix: pass the path components as separate os.path.join arguments
    # instead of string-concatenating them into one argument
    checkpoint_file = os.path.join(log_dir, 'best_model.pt')

    # -------------------------------------------------------------------------
    # NOISE SIMULATION SETUP
    transform_list = [
        transforms.AddNoise(target_op=False, sigma=noise_sigma),
        transforms.Ifft(norm='ortho'),
        transforms.SquareRootSumSquare(),
        transforms.Normalize(),
        transforms.ToTensor(dat_complex=False, target_complex=False)
    ]

    # -------------------------------------------------------------------------
    # DATALOADER SETUP
    train_dataset = KneeDataSet('pytorch_tutorial_data/',
                                'train',
                                transform=transforms.Compose(transform_list))
    print('data set information:')
    print(train_dataset)
    val_dataset = KneeDataSet('pytorch_tutorial_data/',
                              'val',
                              transform=transforms.Compose(transform_list))
    # convert to a PyTorch dataloader
    # this handles batching, random shuffling, parallelization
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=True,
    )
    # fixed validation example used for the per-epoch image visualizations
    display_dat = val_dataset[15]['dat'].unsqueeze(0).to(device=device,
                                                         dtype=dtype)
    display_target = val_dataset[15]['target'].unsqueeze(0).to(device=device,
                                                               dtype=dtype)
    display_vmax = np.max(np.squeeze(display_dat.cpu().numpy()))

    # -------------------------------------------------------------------------
    # MODEL SETUP
    model = DenoiseCnn(num_chans=64,
                       num_layers=4,
                       magnitude_input=True,
                       magnitude_output=True)
    model = model.to(device)
    model = model.train()
    print('CNN model information:')
    print(model)

    # -------------------------------------------------------------------------
    # OPTIMIZER SETUP
    optimizer = torch.optim.Adam(model.parameters())
    loss_fn = torch.nn.MSELoss()

    # -------------------------------------------------------------------------
    # LOAD PREVIOUS STATE
    start_epoch, model, optimizer, min_val_loss = load_checkpoint(
        checkpoint_file, model, optimizer)
    current_seed = 20

    # -------------------------------------------------------------------------
    # NETWORK TRAINING
    for epoch_index in range(start_epoch, num_epochs):
        print('epoch {} of {}'.format(epoch_index + 1, num_epochs))

        # ---------------------------------------------------------------------
        # TRAINING LOOP
        model = model.train()

        # rng seed for noise generation
        torch.manual_seed(current_seed)
        np.random.seed(current_seed)
        torch.cuda.manual_seed(current_seed)

        # batch loop
        losses = []
        for batch in train_loader:
            target = batch['target'].to(device=device, dtype=dtype)
            dat = batch['dat'].to(device=device, dtype=dtype)

            est = model(dat)  # forward propagation
            loss = loss_fn(est, target)  # calculate the loss
            optimizer.zero_grad()  # clear out old gradients
            loss.backward()  # back propagation
            optimizer.step()  # update the CNN weights

            # keep last 10 minibatches to compute training loss
            losses.append(loss.item())
            losses = losses[-10:]

        print('trailing training loss: {}'.format(np.mean(losses)))

        # ---------------------------------------------------------------------
        # EVALUATION LOOP
        model = model.eval()

        # rng seed for noise generation; stash the training rng position so
        # the next epoch's noise differs while validation noise stays fixed
        current_seed = np.random.get_state()[1][0]
        torch.manual_seed(validation_seed)
        np.random.seed(validation_seed)
        torch.cuda.manual_seed(validation_seed)

        # batch loop
        val_losses = []
        with torch.no_grad():
            for batch in val_loader:
                target = batch['target'].to(device=device, dtype=dtype)
                dat = batch['dat'].to(device=device, dtype=dtype)

                est = model(dat)
                loss = loss_fn(est, target)

                val_losses.append(loss.item())

        mean_val_loss = np.mean(val_losses)
        print('validation loss: {}'.format(mean_val_loss))

        # ---------------------------------------------------------------------
        # VISUALIZATIONS AND CHECKPOINTS
        if mean_val_loss < min_val_loss:
            # bug fix: remember the new best loss; the original never updated
            # min_val_loss, so any epoch beating only the *initial* minimum
            # would overwrite the best checkpoint with a worse model
            min_val_loss = mean_val_loss
            save_checkpoint(epoch_index, model, optimizer, mean_val_loss,
                            checkpoint_file)

        # write the losses
        writer.add_scalar('loss/train', np.mean(losses), epoch_index + 1)
        writer.add_scalar('loss/validation', mean_val_loss,
                          epoch_index + 1)

        # show an example image from the validation data
        model = model.eval()
        with torch.no_grad():
            display_est = model(display_dat)

        writer.add_image('validation/dat',
                         display_dat[0] / display_vmax,
                         global_step=epoch_index + 1)
        writer.add_image('validation/cnn',
                         display_est[0] / display_vmax,
                         global_step=epoch_index + 1)
        writer.add_image('validation/target',
                         display_target[0] / display_vmax,
                         global_step=epoch_index + 1)

    writer.close()
示例#14
0
    saver.save_experiment_config()

    # Define Tensorboard Summary
    summary = TensorboardSummary(saver.experiment_dir)
    args.exp = saver.experiment_dir.split('_')[-1]

    if args.train_dataset == 'cityscapes':
        # Data
        train_trans = transforms.Compose([
            transforms.ToPILImage(),
            # transforms.RandomResizedCrop((args.image_size, args.image_size), scale=(0.2, 2)),
            transforms.Resize((args.image_size, args.image_size)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomAffine(22, scale=(0.75, 1.25)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[123.675, 116.28, 103.53],
                                 std=[58.395, 57.12, 57.375])
            # transforms.NormalizeInstance()
        ])
        val_trans = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((args.image_size, args.image_size),
                              do_mask=False),
            transforms.ToTensor(),
            transforms.Normalize(mean=[123.675, 116.28, 103.53],
                                 std=[58.395, 57.12, 57.375])
            # transforms.NormalizeInstance()
        ])

        if args.ann_type == 'comp':
            train_ds = CityscapesInstances_comp(args.inst_path,
                                                args.ann_train,
示例#15
0
def main():
    """Entry point for single-stage FPN action-detection training/evaluation.

    Parses command-line options, builds the train/validation ``VideoDataset``s
    and (when needed) the RetinaNet-style model, then dispatches on ``--MODE``
    to train(), val(), gen_dets(), eval_framewise_dets() or build_eval_tubes().
    """
    parser = argparse.ArgumentParser(
        description='Training single stage FPN with OHEM, resnet as backbone')
    parser.add_argument('DATA_ROOT',
                        help='Location to root directory for dataset reading'
                        )  # /mnt/mars-fast/datasets/
    parser.add_argument(
        'SAVE_ROOT',
        help='Location to root directory for saving checkpoint models'
    )  # /mnt/mars-alpha/
    parser.add_argument(
        'MODEL_PATH',
        help=
        'Location to root directory where kinetics pretrained models are stored'
    )

    parser.add_argument(
        '--MODE',
        default='train',
        help=
        'MODE can be train, gen_dets, eval_frames, eval_tubes define SUBSETS accordingly, build tubes'
    )
    # Name of backbone network, e.g. resnet18, resnet34, resnet50, resnet101 resnet152 are supported
    parser.add_argument('--ARCH',
                        default='resnet50',
                        type=str,
                        help=' base arch')
    parser.add_argument('--MODEL_TYPE',
                        default='I3D',
                        type=str,
                        help=' base model')
    parser.add_argument('--ANCHOR_TYPE',
                        default='RETINA',
                        type=str,
                        help='type of anchors to be used in model')

    parser.add_argument('--SEQ_LEN',
                        default=8,
                        type=int,
                        help='NUmber of input frames')
    parser.add_argument('--TEST_SEQ_LEN',
                        default=8,
                        type=int,
                        help='NUmber of input frames')
    parser.add_argument(
        '--MIN_SEQ_STEP',
        default=1,
        type=int,
        help='DIFFERENCE of gap between the frames of sequence')
    parser.add_argument(
        '--MAX_SEQ_STEP',
        default=1,
        type=int,
        help='DIFFERENCE of gap between the frames of sequence')
    # if output heads are have shared features or not: 0 is no-shareing else sharining enabled
    # parser.add_argument('--MULIT_SCALE', default=False, type=str2bool,help='perfrom multiscale training')
    parser.add_argument('--HEAD_LAYERS',
                        default=3,
                        type=int,
                        help='0 mean no shareding more than 0 means shareing')
    parser.add_argument('--NUM_FEATURE_MAPS',
                        default=5,
                        type=int,
                        help='0 mean no shareding more than 0 means shareing')
    parser.add_argument('--CLS_HEAD_TIME_SIZE',
                        default=3,
                        type=int,
                        help='Temporal kernel size of classification head')
    parser.add_argument('--REG_HEAD_TIME_SIZE',
                        default=3,
                        type=int,
                        help='Temporal kernel size of regression head')

    #  Name of the dataset only voc or coco are supported
    parser.add_argument('--DATASET',
                        default='road',
                        type=str,
                        help='dataset being used')
    parser.add_argument('--TRAIN_SUBSETS',
                        default='train_3,',
                        type=str,
                        help='Training SUBSETS seprated by ,')
    parser.add_argument('--VAL_SUBSETS',
                        default='',
                        type=str,
                        help='Validation SUBSETS seprated by ,')
    parser.add_argument('--TEST_SUBSETS',
                        default='',
                        type=str,
                        help='Testing SUBSETS seprated by ,')
    # Input size of image only 600 is supprted at the moment
    parser.add_argument('--MIN_SIZE',
                        default=512,
                        type=int,
                        help='Input Size for FPN')

    #  data loading argumnets
    parser.add_argument('-b',
                        '--BATCH_SIZE',
                        default=4,
                        type=int,
                        help='Batch size for training')
    parser.add_argument('--TEST_BATCH_SIZE',
                        default=1,
                        type=int,
                        help='Batch size for testing')
    # Number of worker to load data in parllel
    parser.add_argument('--NUM_WORKERS',
                        '-j',
                        default=8,
                        type=int,
                        help='Number of workers used in dataloading')
    # optimiser hyperparameters
    parser.add_argument('--OPTIM',
                        default='SGD',
                        type=str,
                        help='Optimiser type')
    parser.add_argument('--RESUME',
                        default=0,
                        type=int,
                        help='Resume from given epoch')
    parser.add_argument('--MAX_EPOCHS',
                        default=30,
                        type=int,
                        help='Number of training epoc')
    parser.add_argument('-l',
                        '--LR',
                        '--learning-rate',
                        default=0.004225,
                        type=float,
                        help='initial learning rate')
    parser.add_argument('--MOMENTUM', default=0.9, type=float, help='momentum')
    parser.add_argument('--MILESTONES',
                        default='20,25',
                        type=str,
                        help='Chnage the lr @')
    parser.add_argument('--GAMMA',
                        default=0.1,
                        type=float,
                        help='Gamma update for SGD')
    parser.add_argument('--WEIGHT_DECAY',
                        default=1e-4,
                        type=float,
                        help='Weight decay for SGD')

    # Freeze layers or not
    parser.add_argument(
        '--FBN',
        '--FREEZE_BN',
        default=True,
        type=str2bool,
        help='freeze bn layers if true or else keep updating bn layers')
    parser.add_argument(
        '--FREEZE_UPTO',
        default=1,
        type=int,
        help='layer group number in ResNet up to which needs to be frozen')

    # Loss function matching threshold
    parser.add_argument('--POSTIVE_THRESHOLD',
                        default=0.5,
                        type=float,
                        help='Min threshold for Jaccard index for matching')
    parser.add_argument('--NEGTIVE_THRESHOLD',
                        default=0.4,
                        type=float,
                        help='Max threshold Jaccard index for matching')
    # Evaluation hyperparameters
    parser.add_argument(
        '--EVAL_EPOCHS',
        default='30',
        type=str,
        help=
        'eval epochs to test network on these epoch checkpoints usually the last epoch is used'
    )
    parser.add_argument('--VAL_STEP',
                        default=2,
                        type=int,
                        help='Number of training epoch before evaluation')
    parser.add_argument(
        '--IOU_THRESH',
        default=0.5,
        type=float,
        help='Evaluation threshold for validation and for frame-wise mAP')
    parser.add_argument(
        '--CONF_THRESH',
        default=0.025,
        type=float,
        help='Confidence threshold for to remove detection below given number')
    parser.add_argument(
        '--NMS_THRESH',
        default=0.5,
        type=float,
        help='NMS threshold to apply nms at the time of validation')
    parser.add_argument('--TOPK',
                        default=10,
                        type=int,
                        help='topk detection to keep for evaluation')
    parser.add_argument(
        '--GEN_CONF_THRESH',
        default=0.025,
        type=float,
        help='Confidence threshold at the time of generation and dumping')
    parser.add_argument('--GEN_TOPK',
                        default=100,
                        type=int,
                        help='topk at the time of generation')
    parser.add_argument('--GEN_NMS',
                        default=0.5,
                        type=float,
                        help='NMS at the time of generation')
    parser.add_argument('--CLASSWISE_NMS',
                        default=False,
                        type=str2bool,
                        help='apply classwise NMS/no tested properly')
    parser.add_argument(
        '--JOINT_4M_MARGINALS',
        default=False,
        type=str2bool,
        help=
        'generate score of joints i.e. duplexes or triplet by marginals like agents and actions scores'
    )

    ## paths hyper parameters
    parser.add_argument(
        '--COMPUTE_PATHS',
        default=False,
        type=str2bool,
        help=' COMPUTE_PATHS if set true then it overwrite existing ones')
    parser.add_argument(
        '--PATHS_IOUTH',
        default=0.5,
        type=float,
        help='Iou threshold for building paths to limit neighborhood search')
    parser.add_argument(
        '--PATHS_COST_TYPE',
        default='score',
        type=str,
        help=
        'cost function type to use for matching, other options are scoreiou, iou'
    )
    parser.add_argument(
        '--PATHS_JUMP_GAP',
        default=4,
        type=int,
        help=
        'GAP allowed for a tube to be kept alive after no matching detection found'
    )
    parser.add_argument('--PATHS_MIN_LEN',
                        default=6,
                        type=int,
                        help='minimum length of generated path')
    parser.add_argument(
        '--PATHS_MINSCORE',
        default=0.1,
        type=float,
        help='minimum score a path should have over its length')

    ## paths hyper parameters
    parser.add_argument('--COMPUTE_TUBES',
                        default=False,
                        type=str2bool,
                        help='if set true then it overwrite existing tubes')
    parser.add_argument('--TUBES_ALPHA',
                        default=0,
                        type=float,
                        help='alpha cost for changeing the label')
    parser.add_argument('--TRIM_METHOD',
                        default='none',
                        type=str,
                        help='other one is indiv which works for UCF24')
    parser.add_argument('--TUBES_TOPK',
                        default=10,
                        type=int,
                        help='Number of labels to assign for a tube')
    parser.add_argument('--TUBES_MINLEN',
                        default=5,
                        type=int,
                        help='minimum length of a tube')
    parser.add_argument(
        '--TUBES_EVAL_THRESHS',
        default='0.2,0.5',
        type=str,
        help=
        'evaluation threshold for checking tube overlap at evaluation time, one can provide as many as one wants'
    )
    # parser.add_argument('--TRAIL_ID', default=0,
    #                     type=int, help='eval TUBES_Thtrshold at evaluation time')

    ###
    parser.add_argument('--LOG_START',
                        default=10,
                        type=int,
                        help='start loging after k steps for text/tensorboard')
    parser.add_argument('--LOG_STEP',
                        default=10,
                        type=int,
                        help='Log every k steps for text/tensorboard')
    parser.add_argument(
        '--TENSORBOARD',
        default=1,
        type=str2bool,
        help='Use tensorboard for loss/evalaution visualization')

    # Program arguments
    parser.add_argument('--MAN_SEED',
                        default=123,
                        type=int,
                        help='manualseed for reproduction')
    parser.add_argument(
        '--MULTI_GPUS',
        default=True,
        type=str2bool,
        help=
        'If  more than 0 then use all visible GPUs by default only one GPU used '
    )

    # Use CUDA_VISIBLE_DEVICES=0,1,4,6 to select GPUs to use

    ## Parse arguments
    args = parser.parse_args()

    args = utils.set_args(args)  # set directories and SUBSETS fo datasets
    # DataParallel is pointless with a single-sample batch
    args.MULTI_GPUS = False if args.BATCH_SIZE == 1 else args.MULTI_GPUS
    ## set random seeds and global settings
    np.random.seed(args.MAN_SEED)
    torch.manual_seed(args.MAN_SEED)
    # torch.cuda.manual_seed_all(args.MAN_SEED)
    torch.set_default_tensor_type('torch.FloatTensor')

    args = utils.create_exp_name(args)

    utils.setup_logger(args)
    logger = utils.get_logger(__name__)
    logger.info(sys.version)

    assert args.MODE in [
        'train', 'val', 'gen_dets', 'eval_frames', 'eval_tubes'
    ], 'MODE must be from ' + ','.join(['train', 'test', 'tubes'])

    # keep the train/test sequence lengths consistent for the chosen mode
    if args.MODE == 'train':
        args.TEST_SEQ_LEN = args.SEQ_LEN
    else:
        args.SEQ_LEN = args.TEST_SEQ_LEN

    if args.MODE in ['train', 'val']:
        # args.CONF_THRESH = 0.05
        args.SUBSETS = args.TRAIN_SUBSETS
        train_transform = transforms.Compose([
            vtf.ResizeClip(args.MIN_SIZE, args.MAX_SIZE),
            vtf.ToTensorStack(),
            vtf.Normalize(mean=args.MEANS, std=args.STDS)
        ])

        # train_skip_step = args.SEQ_LEN
        # if args.SEQ_LEN>4 and args.SEQ_LEN<=10:
        #     train_skip_step = args.SEQ_LEN-2
        if args.SEQ_LEN > 10:
            train_skip_step = args.SEQ_LEN + (args.MAX_SEQ_STEP - 1) * 2 - 2
        else:
            train_skip_step = args.SEQ_LEN

        train_dataset = VideoDataset(args,
                                     train=True,
                                     skip_step=train_skip_step,
                                     transform=train_transform)
        logger.info('Done Loading Dataset Train Dataset')
        ## For validation set
        full_test = False
        args.SUBSETS = args.VAL_SUBSETS
        skip_step = args.SEQ_LEN * 8
    else:
        args.SEQ_LEN = args.TEST_SEQ_LEN
        args.MAX_SEQ_STEP = 1
        args.SUBSETS = args.TEST_SUBSETS
        full_test = True  #args.MODE != 'train'
        # number of boundary frames to drop from each clip at test time;
        # depends on the temporal receptive field of the model type
        args.skip_beggning = 0
        args.skip_ending = 0
        if args.MODEL_TYPE == 'I3D':
            args.skip_beggning = 2
            args.skip_ending = 2
        elif args.MODEL_TYPE != 'C2D':
            args.skip_beggning = 2

        skip_step = args.SEQ_LEN - args.skip_beggning

    val_transform = transforms.Compose([
        vtf.ResizeClip(args.MIN_SIZE, args.MAX_SIZE),
        vtf.ToTensorStack(),
        vtf.Normalize(mean=args.MEANS, std=args.STDS)
    ])

    val_dataset = VideoDataset(args,
                               train=False,
                               transform=val_transform,
                               skip_step=skip_step,
                               full_test=full_test)
    logger.info('Done Loading Dataset Validation Dataset')

    # copy dataset-derived metadata onto args so downstream code sees it
    args.num_classes = val_dataset.num_classes
    # one for objectness
    args.label_types = val_dataset.label_types
    args.num_label_types = val_dataset.num_label_types
    args.all_classes = val_dataset.all_classes
    args.num_classes_list = val_dataset.num_classes_list
    args.num_ego_classes = val_dataset.num_ego_classes
    args.ego_classes = val_dataset.ego_classes
    args.head_size = 256

    # the network is only needed for modes that run the model
    if args.MODE in ['train', 'val', 'gen_dets']:
        net = build_retinanet(args).cuda()
        if args.MULTI_GPUS:
            logger.info('\nLets do dataparallel\n')
            net = torch.nn.DataParallel(net)

    # log the full resolved configuration
    for arg in sorted(vars(args)):
        logger.info(str(arg) + ': ' + str(getattr(args, arg)))

    # dispatch to the requested routine
    if args.MODE == 'train':
        if args.FBN:
            if args.MULTI_GPUS:
                net.module.backbone.apply(utils.set_bn_eval)
            else:
                net.backbone.apply(utils.set_bn_eval)
        train(args, net, train_dataset, val_dataset)
    elif args.MODE == 'val':
        val(args, net, val_dataset)
    elif args.MODE == 'gen_dets':
        gen_dets(args, net, val_dataset)
        eval_framewise_dets(args, val_dataset)
        build_eval_tubes(args, val_dataset)
    elif args.MODE == 'eval_frames':
        eval_framewise_dets(args, val_dataset)
    elif args.MODE == 'eval_tubes':
        build_eval_tubes(args, val_dataset)
示例#16
0
    def __init__(
        self,
        file_names: Iterable[str],
        working_dir=None,
        cache_dir=None,
        sr=44100,
        n_fft=4096,
        hop_length=441,
        freq_compression="linear",
        n_freq_bins=256,
        f_min=0,
        f_max=18000,
        seq_len=128,
        augmentation=False,
        noise_files_train=None,
        noise_files_val=None,
        noise_files_test=None,
        random=False,
        *args,
        **kwargs
    ):
        """Build the spectrogram and augmentation pipeline for one split.

        Depending on ``self.dataset_name`` ("train"/"val"/"test") and the
        ``augmentation`` flag, sets up spectrogram computation (optionally
        cached), frequency compression, intensity/time/pitch augmentation,
        and real-world noise addition for noise2noise training.

        Args:
            file_names: Audio files belonging to this split.
            working_dir: Data root passed to the base class.
            cache_dir: If given, spectrograms are cached on disk here.
            sr: Audio sample rate.
            n_fft: STFT window size.
            hop_length: STFT hop length.
            freq_compression: "linear", "mel" or "mfcc".
            n_freq_bins: Number of frequency bins after compression.
            f_min: Lower frequency bound.
            f_max: Upper frequency bound.
            seq_len: Target subsequence length in time frames.
            augmentation: Enable intensity/time/pitch augmentation (training).
            noise_files_train: Noise files for the train split.
            noise_files_val: Noise files for the val split.
            noise_files_test: Noise files for the test split.
            random: Stored flag controlling random subsequence sampling.

        Raises:
            ValueError: If ``freq_compression`` is invalid, or no noise files
                are configured for this split.
        """
        super().__init__(file_names, working_dir, sr, *args, **kwargs)
        if self.dataset_name is not None:
            self._logger.info("Init dataset {}...".format(self.dataset_name))

        self.sp = signal.signal_proc()

        # distribution parameters for synthetic noise generation
        self.df = 15.0
        self.exp_e = 0.1
        self.bin_pow = 2.0
        self.gaus_mean = 0.0
        self.gaus_stdv = 12.5
        self.poisson_lambda = 15.0
        self.orig_noise_value = -5

        self.f_min = f_min
        self.f_max = f_max
        self.n_fft = n_fft
        self.random = random
        self.hop_length = hop_length
        self.augmentation = augmentation
        self.file_reader = AsyncFileReader()
        # bug fix: the noise-file parameters used mutable default arguments
        # ([]), which are shared across all calls; use None sentinels instead
        self.noise_files_val = [] if noise_files_val is None else noise_files_val
        self.noise_files_test = [] if noise_files_test is None else noise_files_test
        self.freq_compression = freq_compression
        self.noise_files_train = [] if noise_files_train is None else noise_files_train

        valid_freq_compressions = ["linear", "mel", "mfcc"]

        if self.freq_compression not in valid_freq_compressions:
            # bug fix: the original passed the template and a bogus
            # `format(self.freq_compressio, ...)` call (misspelled attribute,
            # builtin format instead of str.format) as two ValueError args
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of {}".format(
                    self.freq_compression, valid_freq_compressions
                )
            )

        self._logger.debug(
            "Number of files to denoise : {}".format(len(self.file_names))
        )

        # load audio -> pre-emphasis -> STFT magnitude spectrogram
        spec_transforms = [
            lambda fn: T.load_audio_file(fn, sr=sr),
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(n_fft, hop_length, center=False),
        ]

        if cache_dir is None:
            self.t_spectrogram = T.Compose(spec_transforms)
        else:
            # cache computed spectrograms on disk to avoid re-computation
            self.t_spectrogram = T.CachedSpectrogram(
                cache_dir=cache_dir,
                spec_transform=T.Compose(spec_transforms),
                n_fft=n_fft,
                hop_length=hop_length,
                file_reader=AsyncFileReader())

        if self.augmentation:
            self._logger.debug("Init augmentation transforms for intensity, time, and pitch shift")
            self.t_amplitude = T.RandomAmplitude(3, -6)
            self.t_timestretch = T.RandomTimeStretch()
            self.t_pitchshift = T.RandomPitchSift()
        else:
            # only for noise augmentation during validation phase - intensity,
            # time and pitch augmentation is not used during validation/test
            self.t_timestretch = T.RandomTimeStretch()
            self.t_pitchshift = T.RandomPitchSift()
            self._logger.debug("Running without intensity, time, and pitch augmentation")

        if self.freq_compression == "linear":
            self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
        elif self.freq_compression == "mel":
            self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        elif self.freq_compression == "mfcc":
            self.t_compr_f = T.Compose(T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
            self.t_compr_mfcc = T.M2MFCC(n_mfcc=32)
        else:
            # bug fix: raising a plain string is a TypeError in Python 3
            raise ValueError("Undefined frequency compression")

        # pick the real-world noise files matching this split
        # NOTE(review): min_snr=-2 > max_snr=-8 looks inverted — confirm
        # against RandomAddNoise's parameter semantics
        if self.augmentation and self.noise_files_train and self.dataset_name == "train":
            self._logger.debug("Init training real-world noise files for noise2noise adding")
            self.t_addnoise = T.RandomAddNoise(
                self.noise_files_train,
                self.t_spectrogram,
                T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
                min_length=seq_len,
                min_snr=-2,
                max_snr=-8,
                return_original=True  # return both augmented and original specs
            )
        elif not self.augmentation and self.noise_files_val and self.dataset_name == "val":
            self._logger.debug("Init validation real-world noise files for noise2noise adding")
            self.t_addnoise = T.RandomAddNoise(
                self.noise_files_val,
                self.t_spectrogram,
                T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
                min_length=seq_len,
                min_snr=-2,
                max_snr=-8,
                return_original=True
            )
        elif not self.augmentation and self.noise_files_test and self.dataset_name == "test":
            self._logger.debug("Init test real-world noise files for noise2noise adding")
            self.t_addnoise = T.RandomAddNoise(
                self.noise_files_test,
                self.t_spectrogram,
                T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
                min_length=seq_len,
                min_snr=-2,
                max_snr=-8,
                return_original=True
            )
        else:
            self.t_addnoise = None
            # bug fix: raising a plain string is a TypeError in Python 3
            raise ValueError(
                "ERROR: Init noise files for noise adding does not have a proper setup per split!"
            )

        self.t_compr_a = T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])

        self.t_norm = T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        )

        self.t_subseq = T.PaddedSubsequenceSampler(seq_len, dim=1, random=augmentation)
示例#17
0
    def __init__(
        self,
        file_names: Iterable[str],
        working_dir=None,
        cache_dir=None,
        sr=44100,
        n_fft=1024,
        hop_length=512,
        freq_compression="linear",
        n_freq_bins=256,
        f_min=None,
        f_max=18000,
        *args,
        **kwargs
    ):
        super().__init__(file_names, working_dir, sr, *args, **kwargs)
        if self.dataset_name is not None:
            self._logger.info("Init dataset {}...".format(self.dataset_name))

        self.sp = signal.signal_proc()

        self.sr = sr
        self.f_min = f_min
        self.f_max = f_max
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.sp = signal.signal_proc()
        self.freq_compression = freq_compression

        valid_freq_compressions = ["linear", "mel", "mfcc"]

        if self.freq_compression not in valid_freq_compressions:
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of {}",
               format(self.freq_compression, valid_freq_compressions),
            )

        self._logger.debug(
            "Number of test files: {}".format(len(self.file_names))
        )

        spec_transforms = [
            lambda fn: T.load_audio_file(fn, sr=sr),
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(n_fft, hop_length, center=False)
        ]

        self.file_reader = AsyncFileReader()

        if cache_dir is None:
            self.t_spectrogram = T.Compose(spec_transforms)
        else:
            self.t_spectrogram = T.CachedSpectrogram(
                cache_dir=cache_dir,
                spec_transform=T.Compose(spec_transforms),
                n_fft=n_fft,
                hop_length=hop_length,
                file_reader=AsyncFileReader(),
            )

        if self.freq_compression == "linear":
            self.t_compr_f = T.Interpolate(
                n_freq_bins, sr, f_min, f_max
            )
        elif self.freq_compression == "mel":
            self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        elif self.freq_compression == "mfcc":
            self.t_compr_f = T.Compose(
                T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max), T.M2MFCC()
            )
        else:
            raise "Undefined frequency compression"

        self.t_compr_a = T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])

        self.t_norm = T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        )