Example #1
def cmd_analyze(hyp, ref, val_config, vocab, lang, detailed):
	vocab = set(map(str.strip, open(vocab))) if os.path.exists(vocab) else set()
	if lang is not None:
		import datasets
		import ru

		labels = {
			'ru': lambda: datasets.Labels(ru)
		}
		postprocess_transcript = labels[lang]().postprocess_transcript
	else:
		postprocess_transcript = False
	if os.path.exists(val_config):
		val_config = json.load(open(val_config))
		analyzer_configs = val_config['error_analyzer']
		word_tags = val_config['word_tags']
	else:
		analyzer_configs = {}
		word_tags = {}

	word_tagger = WordTagger(word_tags = word_tags, vocab = vocab)
	error_tagger = ErrorTagger()
	analyzer = ErrorAnalyzer(word_tagger = word_tagger, error_tagger = error_tagger, configs = analyzer_configs)
	report = analyzer.analyze(hyp = hyp, ref = ref, postprocess_transcript = postprocess_transcript, detailed=detailed)
	print(json.dumps(report, ensure_ascii = False, indent = 2, sort_keys = True))
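A minimal invocation sketch; the example does not show ErrorAnalyzer.analyze, so whether hyp and ref are file paths or already-loaded transcripts is an assumption here, and every value below is a hypothetical placeholder.

# val_config and vocab silently fall back to empty defaults when the files
# are missing; lang = 'ru' enables datasets.Labels(ru) postprocessing,
# lang = None skips it.
cmd_analyze(
	hyp = 'data/hyp.json',
	ref = 'data/ref.json',
	val_config = 'configs/val.json',
	vocab = 'data/vocab.txt',
	lang = 'ru',
	detailed = False
)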
Example #2
def tabulate(
	experiments_dir,
	experiment_id,
	entropy,
	loss,
	cer10,
	cer15,
	cer20,
	cer30,
	cer40,
	cer50,
	per,
	wer,
	json_,
	bpe,
	der,
	lang
):
	# TODO: bring back custom name to the filtration process, or remove filtration by labels_name entirely.
	labels = datasets.Labels(lang=datasets.Language(lang), name='char')

	res = collections.defaultdict(list)
	experiment_dir = os.path.join(experiments_dir, experiment_id)
	for f in sorted(glob.glob(os.path.join(experiment_dir, 'transcripts_*.json'))):
		eidx = f.find('epoch')
		iteration = f[eidx:].replace('.json', '')
		val_dataset_name = f[f.find('transcripts_') + len('transcripts_'):eidx]
		checkpoint = os.path.join(experiment_dir, 'checkpoint_' + f[eidx:].replace('.json', '.pt')) if not json_ else f
		metric = 'wer' if wer else 'entropy' if entropy else 'loss' if loss else 'per' if per else 'der' if der else 'cer'
		val = torch.tensor([j[metric] for j in json.load(open(f)) if j['labels_name'] == labels.name] or [0.0])
		val = val[~(torch.isnan(val) | torch.isinf(val))]

		if cer10 or cer20 or cer30 or cer40 or cer50:
			# turn CER values into a pass rate: the set flag's position in the list
			# selects the threshold (cer10 -> 0.1, cer20 -> 0.2, ..., cer50 -> 0.5)
			val = (val < 0.1 * [False, cer10, cer20, cer30, cer40, cer50].index(True)).float()
		if cer15:
			val = (val < 0.15).float()
		res[iteration].append((val_dataset_name, float(val.mean()), checkpoint))
	val_dataset_names = sorted(set(val_dataset_name for r in res.values() for val_dataset_name, cer, checkpoint in r))
	print('iteration\t' + '\t'.join(val_dataset_names))
	for iteration, r in res.items():
		cers = {val_dataset_name: f'{cer:.04f}' for val_dataset_name, cer, checkpoint in r}
		print(
			f'{iteration}\t' + '\t'.join(cers.get(val_dataset_name, '')
											for val_dataset_name in val_dataset_names) + f'\t{r[-1][-1]}'
		)
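A sketch of how tabulate might be invoked, assuming the experiments/<id>/transcripts_<dataset>epoch<N>.json layout implied by the glob above; the experiment id and flag values are placeholders.

# Print a per-iteration WER table, one column per validation dataset,
# with the matching checkpoint path appended to each row.
tabulate(
	experiments_dir = 'experiments',
	experiment_id = '2020-01-01_my_model',
	entropy = False, loss = False,
	cer10 = False, cer15 = False, cer20 = False, cer30 = False,
	cer40 = False, cer50 = False,
	per = False, wer = True, json_ = False, bpe = False,
	der = False, lang = 'ru'
)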
Example #3
def normalize(input_path, lang, dry = True):
	lang = datasets.Language(lang)
	labels = datasets.Labels(lang)
	for transcript_path in input_path:
		with open(transcript_path) as f:
			transcript = json.load(f)
		for t in transcript:
			if 'ref' in t:
				t['ref'] = labels.postprocess_transcript(lang.normalize_text(t['ref']))
			if 'hyp' in t:
				t['hyp'] = labels.postprocess_transcript(lang.normalize_text(t['hyp']))

			if 'ref' in t and 'hyp' in t:
				t['cer'] = t['cer'] if 'cer' in t else metrics.cer(t['hyp'], t['ref'])
				t['wer'] = t['wer'] if 'wer' in t else metrics.wer(t['hyp'], t['ref'])

		if not dry:
			json.dump(transcript, open(transcript_path, 'w'), ensure_ascii = False, indent = 2, sort_keys = True)
		else:
			return transcript
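Note the asymmetry in the code above: with dry = True the function returns after normalizing only the first file in memory, while dry = False rewrites every listed file in place. A usage sketch with hypothetical paths:

# Preview: normalize the first file, write nothing back.
preview = normalize(['transcripts/val.json'], lang = 'ru', dry = True)

# Apply: rewrite both files with normalized ref/hyp and cer/wer filled in.
normalize(['transcripts/val.json', 'transcripts/test.json'], lang = 'ru', dry = False)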
Example #4
def setup(args):
    torch.set_grad_enabled(False)
    checkpoint = torch.load(args.checkpoint, map_location='cpu')
    args.sample_rate, args.window_size, args.window_stride, args.window, args.num_input_features = map(
        checkpoint['args'].get, [
            'sample_rate', 'window_size', 'window_stride', 'window',
            'num_input_features'
        ])
    frontend = models.LogFilterBankFrontend(args.num_input_features,
                                            args.sample_rate,
                                            args.window_size,
                                            args.window_stride,
                                            args.window,
                                            eps=1e-6)
    labels = datasets.Labels(datasets.Language(checkpoint['args']['lang']),
                             name='char')
    model = getattr(models, args.model or checkpoint['args']['model'])(
        args.num_input_features, [len(labels)],
        frontend=frontend,
        dict=lambda logits, log_probs, olen, **kwargs: (logits[0], olen[0]))
    model.load_state_dict(checkpoint['model_state_dict'], strict=False)
    model = model.to(args.device)
    model.eval()
    model.fuse_conv_bn_eval()
    if args.device != 'cpu':
        model, *_ = models.data_parallel_and_autocast(model,
                                                      opt_level=args.fp16)
    decoder = decoders.GreedyDecoder(
    ) if args.decoder == 'GreedyDecoder' else decoders.BeamSearchDecoder(
        labels,
        lm_path=args.lm,
        beam_width=args.beam_width,
        beam_alpha=args.beam_alpha,
        beam_beta=args.beam_beta,
        num_workers=args.num_workers,
        topk=args.decoder_topk)
    return labels, frontend, model, decoder
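setup only reads plain attributes from args, so an argparse.Namespace is enough to drive it. A minimal CPU sketch; the checkpoint path is hypothetical, and the beam-search arguments are present only because the decoders.BeamSearchDecoder call above expects them.

import argparse

args = argparse.Namespace(
    checkpoint='experiments/best/checkpoint_epoch02_iter0010000.pt',
    model=None,                # fall back to the architecture stored in the checkpoint
    device='cpu', fp16=None,   # the CPU path skips data_parallel_and_autocast
    decoder='GreedyDecoder',   # beam args below are unused for the greedy decoder
    lm=None, beam_width=500, beam_alpha=0.4, beam_beta=1.0,
    num_workers=0, decoder_topk=1)
labels, frontend, model, decoder = setup(args)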
Example #5
def logits(lang, logits, audio_name = None, MAX_ENTROPY = 1.0):
	good_audio_name = set(map(str.strip, open(audio_name[0])) if os.path.exists(audio_name[0]) else audio_name) if audio_name is not None else []
	labels = datasets.Labels(datasets.Language(lang))
	decoder = decoders.GreedyDecoder()
	tick_params = lambda ax, labelsize = 2.5, length = 0, **kwargs: ax.tick_params(axis = 'both', which = 'both', labelsize = labelsize, length = length, **kwargs) or [ax.set_linewidth(0) for ax in ax.spines.values()]
	logits_path = logits + '.html'
	html = open(logits_path, 'w')
	html.write('<html><head>' + meta_charset + f'</head><body><script>{play_script}{onclick_img_script}</script>')

	for i, t in enumerate(torch.load(logits)):
		# rebinds `logits` (the dump path) to the per-utterance tensor;
		# torch.load above has already consumed the path
		audio_path, logits = t['audio_path'], t['logits']
		words = t.get('words', [t])
		y = t.get('y', torch.zeros(1, 0, dtype = torch.long))
		begin = t.get('begin', '')
		end = t.get('end', '')
		audio_name = transcripts.audio_name(audio_path)
		extra_metrics = dict(cer = t['cer']) if 'cer' in t else {}

		if good_audio_name and audio_name not in good_audio_name:
			continue

		log_probs = F.log_softmax(logits, dim = 0)
		entropy = models.entropy(log_probs, dim = 0, sum = False)
		log_probs_ = F.log_softmax(logits[:-1], dim = 0)
		entropy_ = models.entropy(log_probs_, dim = 0, sum = False)
		margin = models.margin(log_probs, dim = 0)
		#energy = features.exp().sum(dim = 0)[::2]

		plt.figure(figsize = (6, 2))
		ax = plt.subplot(211)
		plt.imshow(logits, aspect = 'auto')
		plt.xlim(0, logits.shape[-1] - 1)
		#plt.yticks([])
		plt.axis('off')
		tick_params(plt.gca())
		#plt.subplots_adjust(left = 0, right = 1, bottom = 0.12, top = 0.95)

		plt.subplot(212, sharex = ax)
		prob_top1, prob_top2 = log_probs.exp().topk(2, dim = 0).values
		plt.hlines(1.0, 0, entropy.shape[-1] - 1, linewidth = 0.2)
		artist_prob_top1, = plt.plot(prob_top1, 'b', linewidth = 0.3)
		artist_prob_top2, = plt.plot(prob_top2, 'g', linewidth = 0.3)
		artist_entropy, = plt.plot(entropy, 'r', linewidth = 0.3)
		artist_entropy_, = plt.plot(entropy_, 'yellow', linewidth = 0.3)
		plt.legend([artist_entropy, artist_entropy_, artist_prob_top1, artist_prob_top2],
					['entropy', 'entropy, no blank', 'top1 prob', 'top2 prob'],
					loc = 1,
					fontsize = 'xx-small',
					frameon = False)
		
		for b, e, v in zip(*models.rle1d(entropy > MAX_ENTROPY)):
			if bool(v):
				plt.axvspan(int(b), int(e), color='red', alpha=0.2)

		plt.ylim(0, 3.0)
		plt.xlim(0, entropy.shape[-1] - 1)

		decoded = decoder.decode(log_probs.unsqueeze(0), K = 5)[0]
		xlabels = list(
			map(
				'\n'.join,
				zip(
					*[
						labels.decode(d, replace_blank = '.', replace_space = '_', replace_repeat = False, strip = False)
						for d in decoded
					]
				)
			)
		)
		plt.xticks(torch.arange(entropy.shape[-1]), xlabels, fontfamily = 'monospace')
		tick_params(plt.gca())

		if y.numel() > 0:
			alignment = ctc.alignment(
				log_probs.unsqueeze(0).permute(2, 0, 1),
				y.unsqueeze(0).long(),
				torch.LongTensor([log_probs.shape[-1]]),
				torch.LongTensor([len(y)]),
				blank = len(log_probs) - 1
			).squeeze(0)
			
			ax = plt.gca().secondary_xaxis('top')
			ref, ref_ = labels.decode(y.tolist(), replace_blank = '.', replace_space = '_', replace_repeat = False, strip = False), alignment
			ax.set_xticklabels(ref)
			ax.set_xticks(ref_)
			tick_params(ax, colors = 'red')

		#k = 0
		#for i, c in enumerate(ref + ' '):
		#	if c == ' ':
		#		plt.axvspan(ref_[k] - 1, ref_[i - 1] + 1, facecolor = 'gray', alpha = 0.2)
		#		k = i + 1

		plt.subplots_adjust(left = 0, right = 1, bottom = 0.12, top = 0.95)

		buf = io.BytesIO()
		plt.savefig(buf, format = 'jpg', dpi = 600)
		plt.close()

		html.write(f'<h4>{audio_name}')
		if extra_metrics:
			html.write(' | ' + ' | '.join(f'{k}: {v:.02f}' for k, v in extra_metrics.items()))
		html.write('</h4>')
		html.write(fmt_alignment(words))
		html.write('<img data-begin="{begin}" data-end="{end}" data-channel="{channel}" onclick="onclick_img(event)" style="width:100%" src="data:image/jpeg;base64,{encoded}"></img>\n'.format(channel = i, begin = begin, end = end, encoded = base64.b64encode(buf.getvalue()).decode()))
		html.write(fmt_audio(audio_path = audio_path, channel = i))
		html.write('<hr/>')
	html.write('</body></html>')
	html.close()
	return logits_path
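The function renders one figure per utterance into <logits>.html and returns that path; a usage sketch, with the dump and filter-list paths hypothetical:

# Frames whose entropy exceeds MAX_ENTROPY are shaded red in the plots;
# audio_name optionally restricts rendering to the names listed in the file.
html_path = logits(
	lang = 'ru',
	logits = 'experiments/best/logits_val.pt',
	audio_name = ['good_audio_names.txt'],
	MAX_ENTROPY = 1.0
)
print(html_path)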
Example #6
def main(args):
    checkpoints = [
        torch.load(checkpoint_path, map_location='cpu')
        for checkpoint_path in args.checkpoint
    ]
    checkpoint = (checkpoints + [{}])[0]
    if len(checkpoints) > 1:
        checkpoint['model_state_dict'] = {
            k: sum(c['model_state_dict'][k]
                   for c in checkpoints) / len(checkpoints)
            for k in checkpoint['model_state_dict']
        }

    if args.frontend_checkpoint:
        frontend_checkpoint = torch.load(args.frontend_checkpoint,
                                         map_location='cpu')
        frontend_extra_args = frontend_checkpoint['args']
        frontend_checkpoint = frontend_checkpoint['model']
    else:
        frontend_extra_args = None
        frontend_checkpoint = None

    args.experiment_id = args.experiment_id.format(
        model=args.model,
        frontend=args.frontend,
        train_batch_size=args.train_batch_size,
        optimizer=args.optimizer,
        lr=args.lr,
        weight_decay=args.weight_decay,
        time=time.strftime('%Y-%m-%d_%H-%M-%S'),
        experiment_name=args.experiment_name,
        bpe='bpe' if args.bpe else '',
        train_waveform_transform=
        f'aug{args.train_waveform_transform[0]}{args.train_waveform_transform_prob or ""}'
        if args.train_waveform_transform else '',
        train_feature_transform=
        f'aug{args.train_feature_transform[0]}{args.train_feature_transform_prob or ""}'
        if args.train_feature_transform else '').replace('e-0',
                                                         'e-').rstrip('_')
    if checkpoint and 'experiment_id' in checkpoint['args'] and not args.experiment_name:
        args.experiment_id = checkpoint['args']['experiment_id']
    args.experiment_dir = args.experiment_dir.format(
        experiments_dir=args.experiments_dir, experiment_id=args.experiment_id)

    os.makedirs(args.experiment_dir, exist_ok=True)

    if args.log_json:
        args.log_json = os.path.join(args.experiment_dir, 'log.json')

    if checkpoint:
        args.lang, args.model, args.num_input_features, args.sample_rate, args.window, args.window_size, args.window_stride = map(
            checkpoint['args'].get, [
                'lang', 'model', 'num_input_features', 'sample_rate', 'window',
                'window_size', 'window_stride'
            ])
        utils.set_up_root_logger(os.path.join(args.experiment_dir, 'log.txt'),
                                 mode='a')
        logfile_sink = JsonlistSink(args.log_json, mode='a')
    else:
        utils.set_up_root_logger(os.path.join(args.experiment_dir, 'log.txt'),
                                 mode='w')
        logfile_sink = JsonlistSink(args.log_json, mode='w')

    _print = utils.get_root_logger_print()
    _print('\n', 'Arguments:', args)
    _print(
        f'"CUDA_VISIBLE_DEVICES={os.environ.get("CUDA_VISIBLE_DEVICES", default = "")}"'
    )
    _print(
        f'"CUDA_LAUNCH_BLOCKING={os.environ.get("CUDA_LAUNCH_BLOCKING", default="")}"'
    )
    _print('Experiment id:', args.experiment_id, '\n')
    if args.dry:
        return
    utils.set_random_seed(args.seed)
    if args.cudnn == 'benchmark':
        torch.backends.cudnn.benchmark = True

    lang = datasets.Language(args.lang)
    #TODO: , candidate_sep = datasets.Labels.candidate_sep
    normalize_text_config = json.load(open(
        args.normalize_text_config)) if os.path.exists(
            args.normalize_text_config) else {}
    labels = [
        datasets.Labels(
            lang, name='char', normalize_text_config=normalize_text_config)
    ] + [
        datasets.Labels(lang,
                        bpe=bpe,
                        name=f'bpe{i}',
                        normalize_text_config=normalize_text_config)
        for i, bpe in enumerate(args.bpe)
    ]
    frontend = getattr(models,
                       args.frontend)(out_channels=args.num_input_features,
                                      sample_rate=args.sample_rate,
                                      window_size=args.window_size,
                                      window_stride=args.window_stride,
                                      window=args.window,
                                      dither=args.dither,
                                      dither0=args.dither0,
                                      stft_mode='conv' if args.onnx else None,
                                      extra_args=frontend_extra_args)
    model = getattr(models, args.model)(
        num_input_features=args.num_input_features,
        num_classes=list(map(len, labels)),
        dropout=args.dropout,
        decoder_type='bpe' if args.bpe else None,
        frontend=frontend if args.onnx or args.frontend_in_model else None,
        **(dict(inplace=False,
                dict=lambda logits, log_probs, olen, **kwargs: logits[0])
           if args.onnx else {}))

    _print('Model capacity:', int(models.compute_capacity(model, scale=1e6)),
           'million parameters\n')

    if checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'], strict=False)

    if frontend_checkpoint:
        frontend_checkpoint = {
            'model.' + name: weight
            for name, weight in frontend_checkpoint.items()
        }  ##TODO remove after save checkpoint naming fix
        frontend.load_state_dict(frontend_checkpoint)

    if args.onnx:
        torch.set_grad_enabled(False)
        model.eval()
        model.to(args.device)
        model.fuse_conv_bn_eval()

        if args.fp16:
            model = models.InputOutputTypeCast(model.to(torch.float16),
                                               dtype=torch.float16)

        waveform_input = torch.rand(args.onnx_sample_batch_size,
                                    args.onnx_sample_time,
                                    device=args.device)
        logits = model(waveform_input)

        torch.onnx.export(model, (waveform_input, ),
                          args.onnx,
                          opset_version=args.onnx_opset,
                          export_params=args.onnx_export_params,
                          do_constant_folding=True,
                          input_names=['x'],
                          output_names=['logits'],
                          dynamic_axes=dict(x={
                              0: 'B',
                              1: 'T'
                          },
                                            logits={
                                                0: 'B',
                                                2: 't'
                                            }))
        onnxruntime_session = onnxruntime.InferenceSession(args.onnx)
        if args.verbose:
            onnxruntime.set_default_logger_severity(0)
        (logits_, ) = onnxruntime_session.run(
            None, dict(x=waveform_input.cpu().numpy()))
        assert torch.allclose(logits.cpu(),
                              torch.from_numpy(logits_),
                              rtol=1e-02,
                              atol=1e-03)

        #model_def = onnx.load(args.onnx)
        #import onnx.tools.net_drawer # import GetPydotGraph, GetOpNodeProducer
        #pydot_graph = GetPydotGraph(model_def.graph, name=model_def.graph.name, rankdir="TB", node_producer=GetOpNodeProducer("docstring", color="yellow", fillcolor="yellow", style="filled"))
        #pydot_graph.write_dot("pipeline_transpose2x.dot")
        #os.system('dot -O -Gdpi=300 -Tpng pipeline_transpose2x.dot')
        # add metadata to model
        return

    perf.init_default(loss=dict(K=50, max=1000),
                      memory_cuda_allocated=dict(K=50),
                      entropy=dict(K=4),
                      time_ms_iteration=dict(K=50, max=10_000),
                      lr=dict(K=50, max=1))

    val_config = json.load(open(args.val_config)) if os.path.exists(
        args.val_config) else {}
    word_tags = json.load(open(args.word_tags)) if os.path.exists(
        args.word_tags) else {}
    for word_tag, words in val_config.get('word_tags', {}).items():
        word_tags[word_tag] = word_tags.get(word_tag, []) + words
    vocab = set(map(str.strip, open(args.vocab))) if os.path.exists(
        args.vocab) else set()
    error_analyzer = metrics.ErrorAnalyzer(
        metrics.WordTagger(lang, vocab=vocab, word_tags=word_tags),
        metrics.ErrorTagger(), val_config.get('error_analyzer', {}))

    def make_transform(name_args, prob):
        # instantiate transforms.<Name>(*args), optionally gated by a probability
        if not name_args:
            return None
        if prob is None:
            return getattr(transforms, name_args[0])(*name_args[1:])
        return getattr(transforms, name_args[0])(prob, *name_args[1:]) if prob > 0 else None
    val_frontend = models.AugmentationFrontend(
        frontend,
        waveform_transform=make_transform(args.val_waveform_transform,
                                          args.val_waveform_transform_prob),
        feature_transform=make_transform(args.val_feature_transform,
                                         args.val_feature_transform_prob))

    if args.val_waveform_transform_debug_dir:
        args.val_waveform_transform_debug_dir = os.path.join(
            args.val_waveform_transform_debug_dir,
            str(val_frontend.waveform_transform) if isinstance(
                val_frontend.waveform_transform, transforms.RandomCompose) else
            val_frontend.waveform_transform.__class__.__name__)
        os.makedirs(args.val_waveform_transform_debug_dir, exist_ok=True)

    val_data_loaders = {
        os.path.basename(val_data_path): torch.utils.data.DataLoader(
            val_dataset,
            num_workers=args.num_workers,
            collate_fn=val_dataset.collate_fn,
            pin_memory=True,
            shuffle=False,
            batch_size=args.val_batch_size,
            worker_init_fn=datasets.worker_init_fn,
            timeout=args.timeout if args.num_workers > 0 else 0)
        for val_data_path in args.val_data_path for val_dataset in [
            datasets.AudioTextDataset(
                val_data_path,
                labels,
                args.sample_rate,
                frontend=val_frontend if not args.frontend_in_model else None,
                waveform_transform_debug_dir=args.
                val_waveform_transform_debug_dir,
                min_duration=args.min_duration,
                time_padding_multiple=args.batch_time_padding_multiple,
                pop_meta=True,
                _print=_print)
        ]
    }
    decoder = [
        decoders.GreedyDecoder() if args.decoder == 'GreedyDecoder' else
        decoders.BeamSearchDecoder(labels[0],
                                   lm_path=args.lm,
                                   beam_width=args.beam_width,
                                   beam_alpha=args.beam_alpha,
                                   beam_beta=args.beam_beta,
                                   num_workers=args.num_workers,
                                   topk=args.decoder_topk)
    ] + [decoders.GreedyDecoder() for bpe in args.bpe]

    model.to(args.device)

    if not args.train_data_path:
        model.eval()
        if not args.adapt_bn:
            model.fuse_conv_bn_eval()
        if args.device != 'cpu':
            model, *_ = models.data_parallel_and_autocast(
                model,
                opt_level=args.fp16,
                keep_batchnorm_fp32=args.fp16_keep_batchnorm_fp32)
        evaluate_model(args, val_data_loaders, model, labels, decoder,
                       error_analyzer)
        return

    model.freeze(backbone=args.freeze_backbone,
                 decoder0=args.freeze_decoder,
                 frontend=args.freeze_frontend)

    train_frontend = models.AugmentationFrontend(
        frontend,
        waveform_transform=make_transform(args.train_waveform_transform,
                                          args.train_waveform_transform_prob),
        feature_transform=make_transform(args.train_feature_transform,
                                         args.train_feature_transform_prob))
    tic = time.time()
    train_dataset = datasets.AudioTextDataset(
        args.train_data_path,
        labels,
        args.sample_rate,
        frontend=train_frontend if not args.frontend_in_model else None,
        min_duration=args.min_duration,
        max_duration=args.max_duration,
        time_padding_multiple=args.batch_time_padding_multiple,
        bucket=lambda example: int(
            math.ceil(((example[0]['end'] - example[0]['begin']) / args.
                       window_stride + 1) / args.batch_time_padding_multiple)),
        pop_meta=True,
        _print=_print)

    _print('Time train dataset created:', time.time() - tic, 'sec')
    train_dataset_name = '_'.join(map(os.path.basename, args.train_data_path))
    tic = time.time()
    sampler = datasets.BucketingBatchSampler(
        train_dataset,
        batch_size=args.train_batch_size,
    )
    _print('Time train sampler created:', time.time() - tic, 'sec')

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        num_workers=args.num_workers,
        collate_fn=train_dataset.collate_fn,
        pin_memory=True,
        batch_sampler=sampler,
        worker_init_fn=datasets.worker_init_fn,
        timeout=args.timeout if args.num_workers > 0 else 0)
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
        nesterov=args.nesterov
    ) if args.optimizer == 'SGD' else torch.optim.AdamW(
        model.parameters(),
        lr=args.lr,
        betas=args.betas,
        weight_decay=args.weight_decay
    ) if args.optimizer == 'AdamW' else optimizers.NovoGrad(
        model.parameters(),
        lr=args.lr,
        betas=args.betas,
        weight_decay=args.weight_decay
    ) if args.optimizer == 'NovoGrad' else apex.optimizers.FusedNovoGrad(
        model.parameters(),
        lr=args.lr,
        betas=args.betas,
        weight_decay=args.weight_decay
    ) if args.optimizer == 'FusedNovoGrad' else None

    if checkpoint and checkpoint['optimizer_state_dict'] is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        if not args.skip_optimizer_reset:
            optimizers.reset_options(optimizer)

    scheduler = optimizers.MultiStepLR(
        optimizer, gamma=args.decay_gamma, milestones=args.decay_milestones
    ) if args.scheduler == 'MultiStepLR' else optimizers.PolynomialDecayLR(
        optimizer,
        power=args.decay_power,
        decay_steps=len(train_data_loader) * args.decay_epochs,
        end_lr=args.decay_lr
    ) if args.scheduler == 'PolynomialDecayLR' else optimizers.NoopLR(
        optimizer)
    epoch, iteration = 0, 0
    if checkpoint:
        epoch, iteration = checkpoint['epoch'], checkpoint['iteration']
        if args.train_data_path == checkpoint['args']['train_data_path']:
            sampler.load_state_dict(checkpoint['sampler_state_dict'])
            if args.iterations_per_epoch and iteration and iteration % args.iterations_per_epoch == 0:
                sampler.batch_idx = 0
                epoch += 1
        else:
            epoch += 1
    if args.iterations_per_epoch:
        epoch_skip_fraction = 1 - args.iterations_per_epoch / len(
            train_data_loader)
        assert epoch_skip_fraction < args.max_epoch_skip_fraction, \
         f'args.iterations_per_epoch must not skip more than {args.max_epoch_skip_fraction:.1%} of each epoch'

    if args.device != 'cpu':
        model, optimizer = models.data_parallel_and_autocast(
            model,
            optimizer,
            opt_level=args.fp16,
            keep_batchnorm_fp32=args.fp16_keep_batchnorm_fp32)
    if checkpoint and args.fp16 and checkpoint['amp_state_dict'] is not None:
        apex.amp.load_state_dict(checkpoint['amp_state_dict'])

    model.train()

    tensorboard_dir = os.path.join(args.experiment_dir, 'tensorboard')
    if checkpoint and args.experiment_name:
        tensorboard_dir_checkpoint = os.path.join(
            os.path.dirname(args.checkpoint[0]), 'tensorboard')
        if os.path.exists(tensorboard_dir_checkpoint
                          ) and not os.path.exists(tensorboard_dir):
            shutil.copytree(tensorboard_dir_checkpoint, tensorboard_dir)
    tensorboard = torch.utils.tensorboard.SummaryWriter(tensorboard_dir)
    tensorboard_sink = TensorboardSink(tensorboard)

    with open(os.path.join(args.experiment_dir, args.args), 'w') as f:
        json.dump(vars(args), f, sort_keys=True, ensure_ascii=False, indent=2)

    with open(os.path.join(args.experiment_dir, args.dump_model_config),
              'w') as f:
        model_config = dict(
            init_params=models.master_module(model).init_params,
            model=repr(models.master_module(model)))
        json.dump(model_config,
                  f,
                  sort_keys=True,
                  ensure_ascii=False,
                  indent=2)

    tic, toc_fwd, toc_bwd = time.time(), time.time(), time.time()

    oom_handler = utils.OomHandler(max_retries=args.oom_retries)
    for epoch in range(epoch, args.epochs):
        sampler.shuffle(epoch + args.seed_sampler)
        time_epoch_start = time.time()
        for batch_idx, (meta, s, x, xlen, y,
                        ylen) in enumerate(train_data_loader,
                                           start=sampler.batch_idx):
            toc_data = time.time()
            if batch_idx == 0:
                time_ms_launch_data_loader = (toc_data - tic) * 1000
                _print('Time data loader launch @ ', epoch, ':',
                       time_ms_launch_data_loader / 1000, 'sec')

            lr = optimizer.param_groups[0]['lr']
            perf.update(dict(lr=lr))

            x, xlen, y, ylen = x.to(args.device, non_blocking=True), xlen.to(
                args.device, non_blocking=True), y.to(
                    args.device, non_blocking=True), ylen.to(args.device,
                                                             non_blocking=True)
            try:
                #TODO check nan values in tensors, they can break running_stats in bn
                log_probs, olen, loss = map(
                    model(x, xlen, y=y, ylen=ylen).get,
                    ['log_probs', 'olen', 'loss'])
                oom_handler.reset()
            except Exception:
                if oom_handler.try_recover(model.parameters(), _print=_print):
                    continue
                else:
                    raise
            example_weights = ylen[:, 0]
            loss, loss_cur = (loss * example_weights).mean(
            ) / args.train_batch_accumulate_iterations, float(loss.mean())

            perf.update(dict(loss_BT_normalized=loss_cur))

            entropy = float(
                models.entropy(log_probs[0], olen[0], dim=1).mean())
            toc_fwd = time.time()
            #TODO: inf/nan still corrupts BN stats
            if not (torch.isinf(loss) or torch.isnan(loss)):
                if args.fp16:
                    with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                if iteration % args.train_batch_accumulate_iterations == 0:
                    torch.nn.utils.clip_grad_norm_(
                        apex.amp.master_params(optimizer)
                        if args.fp16 else model.parameters(), args.max_norm)
                    optimizer.step()

                    if iteration > 0 and iteration % args.log_iteration_interval == 0:
                        perf.update(utils.compute_memory_stats(),
                                    prefix='performance')
                        tensorboard_sink.perf(perf.default(), iteration,
                                              train_dataset_name)
                        tensorboard_sink.weight_stats(
                            iteration, model, args.log_weight_distribution)
                        logfile_sink.perf(perf.default(), iteration,
                                          train_dataset_name)

                    optimizer.zero_grad()
                    scheduler.step(iteration)
                perf.update(dict(entropy=entropy))
            toc_bwd = time.time()

            time_ms_data, time_ms_fwd, time_ms_bwd, time_ms_model = map(
                lambda sec: sec * 1000, [
                    toc_data - tic, toc_fwd - toc_data, toc_bwd - toc_fwd,
                    toc_bwd - toc_data
                ])
            perf.update(dict(time_ms_data=time_ms_data,
                             time_ms_fwd=time_ms_fwd,
                             time_ms_bwd=time_ms_bwd,
                             time_ms_iteration=time_ms_data + time_ms_model),
                        prefix='performance')
            perf.update(dict(input_B=x.shape[0], input_T=x.shape[-1]),
                        prefix='performance')
            print_left = f'{args.experiment_id} | epoch: {epoch:02d} iter: [{batch_idx: >6d} / {len(train_data_loader)} {iteration: >6d}] {"x".join(map(str, x.shape))}'
            print_right = 'ent: <{avg_entropy:.2f}> loss: {cur_loss_BT_normalized:.2f} <{avg_loss_BT_normalized:.2f}> time: {performance_cur_time_ms_data:.2f}+{performance_cur_time_ms_fwd:4.0f}+{performance_cur_time_ms_bwd:4.0f} <{performance_avg_time_ms_iteration:.0f}> | lr: {cur_lr:.5f}'.format(
                **perf.default())
            _print(print_left, print_right)
            iteration += 1
            sampler.batch_idx += 1

            if iteration > 0 and (iteration % args.val_iteration_interval == 0
                                  or iteration == args.iterations):
                evaluate_model(args, val_data_loaders, model, labels, decoder,
                               error_analyzer, optimizer, sampler,
                               tensorboard_sink, logfile_sink, epoch,
                               iteration)

            if iteration and args.iterations and iteration >= args.iterations:
                return

            if args.iterations_per_epoch and iteration > 0 and iteration % args.iterations_per_epoch == 0:
                break

            tic = time.time()

        sampler.batch_idx = 0
        _print('Epoch time', (time.time() - time_epoch_start) / 60, 'minutes')
        if not args.skip_on_epoch_end_evaluation:
            evaluate_model(args, val_data_loaders, model, labels, decoder,
                           error_analyzer, optimizer, sampler,
                           tensorboard_sink, logfile_sink, epoch + 1,
                           iteration)
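One technique worth isolating from main above is the checkpoint averaging at its top: when several args.checkpoint paths are given, the model weights become the key-wise mean of all the state dicts. A standalone sketch of just that step (not this codebase's API):

import torch

def average_state_dicts(paths):
    # Key-wise arithmetic mean, mirroring the averaging block in main();
    # assumes all checkpoints share one architecture and key set.
    checkpoints = [torch.load(p, map_location='cpu') for p in paths]
    keys = checkpoints[0]['model_state_dict']
    return {
        k: sum(c['model_state_dict'][k] for c in checkpoints) / len(checkpoints)
        for k in keys
    }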
Example #7
parser.add_argument('-B', type = int, default = 256)
parser.add_argument('-T', type = float, default = 5.12)
parser.add_argument('--profile-cuda', action = 'store_true')
parser.add_argument('--profile-pyprof', action = 'store_true')
parser.add_argument('--profile-autograd')
parser.add_argument('--data-parallel', action = 'store_true')
parser.add_argument('--backward', action = 'store_true')
args = parser.parse_args()

checkpoint = torch.load(args.checkpoint, map_location = 'cpu') if args.checkpoint else None
if checkpoint:
	args.model, args.lang, args.sample_rate, args.window_size, args.window_stride, args.window, args.num_input_features = map(checkpoint['args'].get, ['model', 'lang', 'sample_rate', 'window_size', 'window_stride', 'window', 'num_input_features'])

use_cuda = 'cuda' in args.device

labels = datasets.Labels(datasets.Language(args.lang))

if args.onnx:
	onnxruntime_session = onnxruntime.InferenceSession(args.onnx)
	model = lambda x: onnxruntime_session.run(None, dict(x = x))
	load_batch = lambda x: x.numpy()

else:
	frontend = models.LogFilterBankFrontend(
		args.num_input_features,
		args.sample_rate,
		args.window_size,
		args.window_stride,
		args.window,
		stft_mode = args.stft_mode
	) if args.frontend else None
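The fragment ends before the timing loop; presumably the benchmark feeds -B random waveforms of -T seconds each through the model. A sketch of the ONNX path only, since the non-ONNX model construction is not shown in this fragment:

# Hypothetical synthetic batch at the checkpoint's sample rate.
x = torch.rand(args.B, int(args.T * args.sample_rate))
if args.onnx:
	logits = model(load_batch(x))  # the onnxruntime session expects numpy input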
Example #8
def logits(logits, audio_name, MAX_ENTROPY=1.0):
    good_audio_name = set(
        map(str.strip, open(audio_name[0]))
        if os.path.exists(audio_name[0]) else audio_name)
    labels = datasets.Labels(ru)
    decoder = decoders.GreedyDecoder()
    tick_params = lambda ax, labelsize=2.5, length=0, **kwargs: ax.tick_params(
        axis='both',
        which='both',
        labelsize=labelsize,
        length=length,
        **kwargs) or [ax.set_linewidth(0) for ax in ax.spines.values()]
    logits_path = logits + '.html'
    html = open(logits_path, 'w')
    html.write('''<html><meta charset="utf-8"/><body><script>
		function onclick_(evt)
		{
			const img = evt.target;
			const dim = img.getBoundingClientRect();
			const t = (evt.clientX - dim.left) / dim.width;
			const audio = img.nextSibling;
			audio.currentTime = t * audio.duration;
			audio.play();
		}
	</script>''')
    for r in torch.load(logits):
        logits = r['logits']
        if good_audio_name and r['audio_name'] not in good_audio_name:
            continue

        ref_aligned, hyp_aligned = r['alignment']['ref'], r['alignment']['hyp']

        log_probs = F.log_softmax(logits, dim=0)
        entropy = models.entropy(log_probs, dim=0, sum=False)
        log_probs_ = F.log_softmax(logits[:-1], dim=0)
        entropy_ = models.entropy(log_probs_, dim=0, sum=False)
        margin = models.margin(log_probs, dim=0)
        #energy = features.exp().sum(dim = 0)[::2]

        alignment = ctc.alignment(log_probs.unsqueeze(0).permute(2, 0, 1),
                                  r['y'].unsqueeze(0).long(),
                                  torch.LongTensor([log_probs.shape[-1]]),
                                  torch.LongTensor([len(r['y'])]),
                                  blank=len(log_probs) - 1).squeeze(0)

        plt.figure(figsize=(6, 2))

        prob_top1, prob_top2 = log_probs.exp().topk(2, dim=0).values
        plt.hlines(1.0, 0, entropy.shape[-1] - 1, linewidth=0.2)
        artist_prob_top1, = plt.plot(prob_top1, 'b', linewidth=0.3)
        artist_prob_top2, = plt.plot(prob_top2, 'g', linewidth=0.3)
        artist_entropy, = plt.plot(entropy, 'r', linewidth=0.3)
        artist_entropy_, = plt.plot(entropy_, 'yellow', linewidth=0.3)
        plt.legend([
            artist_entropy, artist_entropy_, artist_prob_top1, artist_prob_top2
        ], ['entropy', 'entropy, no blank', 'top1 prob', 'top2 prob'],
                   loc=1,
                   fontsize='xx-small',
                   frameon=False)
        bad = (entropy > MAX_ENTROPY).tolist()
        #runs = []
        #for i, b in enumerate(bad):
        #	if b:
        #		if not runs or not bad[i - 1]:
        #			runs.append([i, i])
        #		else:
        #			runs[-1][1] += 1
        #for begin, end in runs:
        #	plt.axvspan(begin, end, color='red', alpha=0.2)

        plt.ylim(0, 3.0)
        plt.xlim(0, entropy.shape[-1] - 1)

        decoded = decoder.decode(log_probs.unsqueeze(0), K=5)[0]
        xlabels = list(
            map(
                '\n'.join,
                zip(*[
                    labels.decode(d,
                                  replace_blank='.',
                                  replace_space='_',
                                  replace_repeat=False) for d in decoded
                ])))
        #xlabels_ = labels.decode(log_probs.argmax(dim = 0).tolist(), blank = '.', space = '_', replace2 = False)
        plt.xticks(torch.arange(entropy.shape[-1]),
                   xlabels,
                   fontfamily='monospace')
        tick_params(plt.gca())

        ax = plt.gca().secondary_xaxis('top')
        ref, ref_ = labels.decode(r['y'].tolist(),
                                  replace_blank='.',
                                  replace_space='_',
                                  replace_repeat=False), alignment
        ax.set_xticklabels(ref)
        ax.set_xticks(ref_)
        tick_params(ax, colors='red')

        #k = 0
        #for i, c in enumerate(ref + ' '):
        #	if c == ' ':
        #		plt.axvspan(ref_[k] - 1, ref_[i - 1] + 1, facecolor = 'gray', alpha = 0.2)
        #		k = i + 1

        plt.subplots_adjust(left=0, right=1, bottom=0.12, top=0.95)

        buf = io.BytesIO()
        plt.savefig(buf, format='jpg', dpi=600)
        plt.close()

        html.write('<h4>{audio_name} | cer: {cer:.02f}</h4>'.format(**r))
        html.write(word_alignment(r['words']))
        html.write(
            '<img onclick="onclick_(event)" style="width:100%" src="data:image/jpeg;base64,{encoded}"></img>'
            .format(encoded=base64.b64encode(buf.getvalue()).decode()))
        html.write(
            f'<audio style="width:100%" controls src="{audio_data_uri(r["audio_path"])}"></audio><hr/>'
        )
    html.write('</body></html>')
    html.close()
    print('\n', logits_path)
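This variant hard-codes the Russian label set (datasets.Labels(ru)) and expects the y tensors and alignments to be precomputed in the dump; invocation mirrors Example #5, with paths hypothetical:

# Writes experiments/best/logits_val.pt.html with per-utterance diagnostics.
logits('experiments/best/logits_val.pt', audio_name=['good_audio_names.txt'])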