split='train', batch_size=batch_size, shuffle=True, downsampling_factor=ds_factor_text), val=D.flickr8k_loader(split='val', batch_size=batch_size)) else: data_asr = data if args.asr_model_dir: net = torch.load(os.path.join(args.asr_model_dir, 'net.best.pt')) else: logging.info('Building ASR/SLT model') config = M1.get_default_config() net = M1.SpeechTranscriber(config) run_config = dict(max_norm=2.0, max_lr=2 * 1e-4, epochs=32) logging.info('Training ASR/SLT') if data_asr['train'].dataset.is_slt(): M1.experiment(net, data_asr, run_config, slt=True) copy_best('.', 'result.json', 'asr.best.pt', experiment_type='slt') else: M1.experiment(net, data_asr, run_config) copy_best('.', 'result.json', 'asr.best.pt', experiment_type='asr') copyfile('result.json', 'result_asr.json') net = torch.load('asr.best.pt') logging.info('Extracting ASR/SLT transcriptions') for set_name in ['train', 'val']: ds = data[set_name].dataset hyp_asr, ref_asr = extract_trn(net, ds, use_beam_decoding=True) # Replacing original transcriptions with ASR/SLT's output for i in range(len(hyp_asr)): item = ds.split_data[i] if item[2] == ref_asr[i]:
num_layers=6, bidirectional=True, dropout=dropout), rnn_layer_type=nn.GRU), TextDecoder=dict( emb=dict(num_embeddings=fd.vocabulary_size(), embedding_dim=hidden_size), drop=dict(p=dropout), att=dict(in_size_enc=hidden_size * 2, in_size_state=hidden_size, hidden_size=hidden_size), rnn=dict(input_size=hidden_size * 3, hidden_size=hidden_size, num_layers=1, dropout=dropout), out=dict(in_features=hidden_size * 3, out_features=fd.vocabulary_size()), rnn_layer_type=nn.GRU, max_output_length=400, # max length for flickr annotations is 199 sos_id=fd.get_token_id(fd.sos), eos_id=fd.get_token_id(fd.eos), pad_id=fd.get_token_id(fd.pad)), inverse_transform_fn=fd.get_label_encoder().inverse_transform) logging.info('Building model') net = M.SpeechTranscriber(config) run_config = dict(max_norm=2.0, max_lr=2 * 1e-4, epochs=32, opt='adam') logging.info('Training') M.experiment(net, data, run_config)
# Experiment entry point: parse CLI arguments, seed the RNGs, build the
# Flickr8k train/val data loaders, construct a SpeechTranscriber, and train.
args.enable_help()
args.parse()

# Seed both torch and the stdlib RNG so runs are reproducible.
torch.manual_seed(args.seed)
random.seed(args.seed)

# Record the full configuration in the log for later inspection.
logging.info('Arguments: {}'.format(args))

batch_size = 8

logging.info('Loading data')
# The loader takes the corpus location arguments positionally; factor them
# out once so train and val are guaranteed to point at the same corpus.
corpus = (args.flickr8k_root, args.flickr8k_meta, args.flickr8k_language,
          args.audio_features_fn)
data = {
    'train': D.flickr8k_loader(
        *corpus, split='train', batch_size=batch_size, shuffle=True,
        downsampling_factor=args.downsampling_factor),
    'val': D.flickr8k_loader(
        *corpus, split='val', batch_size=batch_size, shuffle=False),
}

logging.info('Building model')
net = M.SpeechTranscriber(M.get_default_config())
run_config = {'max_norm': 2.0, 'max_lr': 2 * 1e-4, 'epochs': args.epochs}

logging.info('Training')
# slt=True switches the experiment to speech-to-translation scoring when the
# loaded dataset reports translated (rather than verbatim) transcriptions.
M.experiment(net, data, run_config, slt=data['train'].dataset.is_slt())