def main(_):
    # Download the dataset; by default the THCHS-30 (Tsinghua) corpus is fetched.
    download_and_untar(FLAGS.data_url.split(','), FLAGS.data_dir)

    # Scan the training set.
    train_wav_files = load_wav_file(FLAGS.data_dir + 'data_thchs30/train')
    train_labels_dict = load_label_file(FLAGS.data_dir +
                                        'resource/trans/train.word.txt')

    # Extract MFCC features, build the lexicon, and vectorize the labels.
    train_sample_files = preapre_wav_list(train_wav_files,
                                          FLAGS.dct_coefficient_count,
                                          FLAGS.mfcc_dir + 'train/')
    lexicon, train_labels, train_sample_files = prepare_label_list(
        train_sample_files, train_labels_dict)
    train_vector_labels = labels_to_vector(train_labels, lexicon)

    # Repeat for the test set, reusing the lexicon built from the training data.
    test_wav_files = load_wav_file(FLAGS.data_dir + 'data_thchs30/test')
    test_labels_dict = load_label_file(FLAGS.data_dir +
                                       'resource/trans/test.word.txt')
    test_sample_files = preapre_wav_list(test_wav_files,
                                         FLAGS.dct_coefficient_count,
                                         FLAGS.mfcc_dir + 'test/')
    _, test_labels, test_sample_files = prepare_label_list(
        test_sample_files, test_labels_dict)
    test_vector_labels = labels_to_vector(test_labels, lexicon)

    # Start training.
    train(train_sample_files, train_vector_labels, test_sample_files,
          test_vector_labels, lexicon, FLAGS.dct_coefficient_count,
          FLAGS.num_contexts, FLAGS.how_many_training_steps,
          FLAGS.learning_rate, FLAGS.batch_size, FLAGS.summaries_dir,
          FLAGS.train_dir, FLAGS.eval_step_interval, FLAGS.model_architecture,
          FLAGS.model_size_info)
def main(_):
    # Download the dataset; by default the THCHS-30 (Tsinghua) corpus is fetched.
    maybe_download_and_untar(FLAGS.data_url.split(','), FLAGS.data_dir)

    # Scan the dataset, extract MFCC features, build the lexicon, and
    # vectorize the labels.
    audio_processer = AudioPorcesser(FLAGS.data_dir, FLAGS.num_filters,
                                     FLAGS.downsampling_ratio,
                                     FLAGS.num_contexts, FLAGS.output_dir)
    lexicon = audio_processer.prepare(
        os.path.basename(FLAGS.data_url).split('.')[0])

    # Dump the lexicon as a tab-separated symbol table, one entry per line.
    with open(FLAGS.output_dir + 'symbol_table.txt', 'w') as f:
        f.write(
            re.sub(r'[\s\'{}]', '',
                   str(lexicon)).replace(',', '\n').replace(':', '\t'))

    # Each input vector is the current frame's features plus `num_contexts`
    # frames of context on each side.
    num_inputs = FLAGS.num_filters + 2 * FLAGS.num_filters * FLAGS.num_contexts

    # Start training, then decode the training set as a sanity check.
    train(audio_processer, num_inputs, len(lexicon), FLAGS.model_architecture,
          FLAGS.model_size_info, FLAGS.learning_rate, FLAGS.training_steps,
          FLAGS.batch_size, FLAGS.aligning, FLAGS.eval_step_interval,
          FLAGS.output_dir)
    decoder(audio_processer, FLAGS.output_dir + 'train/', lexicon)
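# Both main() variants above read their configuration from a global FLAGS
# object whose definitions are not part of this listing. The block below is
# only a minimal sketch of how such flags might be declared with absl.flags
# (the scripts could equally use tf.app.flags); the flag names follow the
# usages above, but the defaults and help strings are illustrative
# assumptions, not the repos' actual values.
from absl import app, flags

flags.DEFINE_string('data_url',
                    'http://www.openslr.org/resources/18/data_thchs30.tgz',
                    'Comma-separated list of dataset archives to download.')
flags.DEFINE_string('data_dir', './data/', 'Directory the corpus is unpacked into.')
flags.DEFINE_string('output_dir', './output/', 'Features, symbol table, and checkpoints.')
flags.DEFINE_integer('num_filters', 26, 'Filterbank/MFCC features per frame.')
flags.DEFINE_integer('num_contexts', 9, 'Context frames on each side of the current frame.')
flags.DEFINE_float('learning_rate', 1e-4, 'Optimizer learning rate.')
flags.DEFINE_integer('batch_size', 8, 'Utterances per training batch.')

FLAGS = flags.FLAGS

if __name__ == '__main__':
    app.run(main)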
    collate_fn=collate_function)

if args.model == "Conv":
    model = ConvModel(args.conv_channels, "ReLU", pos_emb=args.conv_pos_emb)
elif args.model == "TransformerEncoder":
    model = TransformerEncoder(args, 100)
elif args.model == "ConvTransformerEncoder":
    model = ConvTransformerEncoder(args, 21 * 2)
elif args.model == "TransformerEnc":
    model = TransformerEnc(ninp=12 * 2, nhead=4, nhid=100, nout=21 * 2,
                           nlayers=4, dropout=0.0)
else:
    raise ValueError("Unknown model: " + args.model)

print(args.resume)
if not args.resume:
    if os.path.isdir(args.exp):
        raise Exception("Experiment name " + args.exp + " already exists.")
    os.mkdir(args.exp)
    os.mkdir(args.exp + "/models")
    with open(args.exp + "/args.pckl", "wb") as f:
        pickle.dump(args, f)

train(model, train_dataloader, valid_dataloader, args)
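# The DataLoader above relies on a custom collate_function that is not shown
# in this listing. The sketch below is a hypothetical stand-in for a typical
# padding collate, assuming each dataset item is a variable-length
# (seq_len, feature_dim) tensor paired with a target tensor.
import torch
from torch.nn.utils.rnn import pad_sequence

def collate_function(batch):
    # Pad every sequence to the longest one in the batch and keep the
    # original lengths so the model can mask the padded frames.
    inputs, targets = zip(*batch)
    lengths = torch.tensor([x.shape[0] for x in inputs])
    padded_inputs = pad_sequence(list(inputs), batch_first=True)
    padded_targets = pad_sequence(list(targets), batch_first=True)
    return padded_inputs, padded_targets, lengths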
    drop_last=True,
    shuffle=True,
    num_workers=args.workers,
    collate_fn=pad_collate_t,
    worker_init_fn=worker_init_fn)
val_loader = torch.utils.data.DataLoader(dataset_test,
                                         batch_size=args.batch_size,
                                         drop_last=True,
                                         shuffle=False,
                                         num_workers=args.workers,
                                         collate_fn=pad_collate)

# Build the image, audio, cross-modal transform, classifier, and discriminator models.
image_model = ImageModels.Resnet101(args)
audio_model = AudioModels.Resnet(args)
trans_model = AudioModels.Linear_transform(args)
class_model = classification.CLASSIFIER(args)
discr_model = classification.DISCRIMINATOR(args)

train(image_model, audio_model, trans_model, class_model, discr_model,
      train_loader, val_loader, args)

# Evaluation path (kept commented out): load saved weights and run the
# attention-based evaluation instead of training.
'''
aweight = torch.load('asl_audio.pth')
iweight = torch.load('asl_image.pth')
tweight = torch.load('asl_trans.pth')
audio_model.load_state_dict(aweight)
image_model.load_state_dict(iweight)
trans_model.load_state_dict(tweight)
eva.att(audio_model, image_model, trans_model, val_loader, args)
'''
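# The training DataLoader above passes a worker_init_fn that is not defined in
# this listing. The sketch below is an assumption of a common implementation:
# reseed NumPy inside each worker process so that random augmentations are not
# duplicated across DataLoader workers.
import numpy as np
import torch

def worker_init_fn(worker_id):
    # torch.initial_seed() already differs per worker; fold it into a 32-bit
    # value that np.random.seed accepts.
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)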
def main(argv):
    tf.random.set_seed(FLAGS.seed)

    if FLAGS.tbdir is not None:
        summary_writers = utils.create_summary_writers(
            utils.fix_path(FLAGS.tbdir))

    # prepare dataset
    dataset = datasets.get_dataset()()
    input_shape = dataset.get_input_shape()

    # Create Nets and Optimizers
    encoder_decoder = nets.encoder_decoder(
        input_shape=input_shape,
        msg_length=FLAGS.msg_length,
        noise_layers=FLAGS.noise_layers,
        n_convbnrelu_encoder=FLAGS.n_convbnrelu_encoder,
        n_convbnrelu_decoder=FLAGS.n_convbnrelu_decoder)
    discriminator = nets.discriminator(
        input_shape=input_shape,
        n_convbnrelu=FLAGS.n_convbnrelu_discriminator)
    optimizer_encoder_decoder = tf.keras.optimizers.Adam(1e-3)
    optimizer_discriminator = tf.keras.optimizers.Adam(1e-3)

    # global step / epoch variables
    step = tf.Variable(0, dtype=tf.int64)
    epoch = tf.Variable(0, dtype=tf.int64)

    # prepare checkpointer
    ckpt = tf.train.Checkpoint(
        step=step,
        epoch=epoch,
        optimizer_encoder_decoder=optimizer_encoder_decoder,
        optimizer_discriminator=optimizer_discriminator,
        encoder_decoder=encoder_decoder,
        discriminator=discriminator)
    ckpt_manager = tf.train.CheckpointManager(ckpt,
                                              utils.fix_path(FLAGS.ckptdir),
                                              max_to_keep=FLAGS.keep_ckpts)
    if ckpt_manager.latest_checkpoint is not None:
        if FLAGS.load_from_ckpt:
            ckpt.restore(ckpt_manager.latest_checkpoint)
            logging.info("Loading model from checkpoint: {}".format(
                ckpt_manager.latest_checkpoint))

    # Metrics Tracker
    metrics_train = metrics.MetricsTracker()
    metrics_val = metrics.MetricsTracker()

    while epoch < FLAGS.epochs:
        dataset_train = dataset.create_train_dataset()
        for epoch_step, cover_images in enumerate(dataset_train):
            # Sample a fresh random bit string for every batch.
            messages = tf.random.uniform([FLAGS.batch_size, FLAGS.msg_length],
                                         minval=0, maxval=2, dtype=tf.int32)
            messages = tf.cast(messages, dtype=tf.float32)

            time_start = time.time()
            outputs = steps.train(
                cover_images=cover_images,
                messages=messages,
                encoder_decoder=encoder_decoder,
                discriminator=discriminator,
                training=True,
                optimizer_encoder_decoder=optimizer_encoder_decoder,
                optimizer_discriminator=optimizer_discriminator)
            ms_per_step = (time.time() - time_start) * 1000.0
            ms_per_sample = ms_per_step / FLAGS.batch_size

            # Write step summaries
            is_summary_step = (step.numpy() % FLAGS.summary_freq) == 0
            if is_summary_step:
                step_losses = losses.step_loss(
                    cover_images,
                    messages,
                    encoder_decoder_output=outputs['encoder_decoder'],
                    discriminator_on_cover=outputs['discriminator_on_cover'],
                    discriminator_on_encoded=outputs[
                        'discriminator_on_encoded'])
                metrics_train.update(
                    step_losses,
                    messages,
                    encoder_decoder_output=outputs['encoder_decoder'],
                    discriminator_on_cover=outputs['discriminator_on_cover'],
                    discriminator_on_encoded=outputs[
                        'discriminator_on_encoded'])
                metrics_train_results = metrics_train.results()
                metrics_train.reset()
                with summary_writers['train'].as_default():
                    for _name, _value in metrics_train_results.items():
                        tf.summary.scalar(_name, _value, step=step)
                    tf.summary.scalar('ms_per_step', ms_per_step, step=step)
                    tf.summary.scalar('ms_per_sample', ms_per_sample, step=step)

            step.assign_add(1)

        ckpt_save_path = ckpt_manager.save()
        logging.info("Saved model after epoch {} to {}".format(
            epoch.numpy(), ckpt_save_path))

        # Training Loss
        logging.info("Epoch {} Stats".format(epoch.numpy()))
        logging.info("Training Stats ===========================")
        for _name, _value in metrics_train_results.items():
            logging.info("{}: {:.4f}".format(_name, _value))

        # Evaluate
        dataset_val = dataset.create_val_dataset()
        for cover_images in dataset_val:
            messages = utils.create_messages(batch_size=cover_images.shape[0],
                                             msg_length=FLAGS.msg_length)
            # messages = tf.random.uniform(
            #     [FLAGS.batch_size, FLAGS.msg_length],
            #     minval=0, maxval=2, dtype=tf.int32)
            # messages = tf.cast(messages, dtype=tf.float32)
            outputs = steps.train(cover_images=cover_images,
                                  messages=messages,
                                  encoder_decoder=encoder_decoder,
                                  discriminator=discriminator,
                                  training=False)
            losses_val_step = losses.step_loss(
                cover_images,
                messages,
                encoder_decoder_output=outputs['encoder_decoder'],
                discriminator_on_cover=outputs['discriminator_on_cover'],
                discriminator_on_encoded=outputs['discriminator_on_encoded'])
            metrics_val.update(
                losses_val_step,
                messages,
                encoder_decoder_output=outputs['encoder_decoder'],
                discriminator_on_cover=outputs['discriminator_on_cover'],
                discriminator_on_encoded=outputs['discriminator_on_encoded'])

        metrics_val_results = metrics_val.results()
        metrics_val.reset()
        logging.info("Validation Stats ===========================")
        with summary_writers['val'].as_default():
            for _name, _value in metrics_val_results.items():
                tf.summary.scalar(_name, _value, step=step)
                logging.info("{}: {:.4f}".format(_name, _value))

        # Write example images for the last validation batch to the summaries.
        messages = utils.create_messages(batch_size=cover_images.shape[0],
                                         msg_length=FLAGS.msg_length)
        encoder_decoder_output = encoder_decoder(inputs={
            'cover_image': cover_images,
            'message': messages
        }, training=False)
        with summary_writers['val'].as_default():
            transform_fn = None
            if FLAGS.to_yuv:
                transform_fn = tf.image.yuv_to_rgb
            utils.summary_images(
                cover=cover_images,
                encoded=encoder_decoder_output['encoded_image'],
                transmitted_encoded=encoder_decoder_output[
                    'transmitted_encoded_image'],
                transmitted_cover=encoder_decoder_output[
                    'transmitted_cover_image'],
                step=step,
                transform_fn=transform_fn)

        epoch.assign_add(1)
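# utils.create_messages is not defined in this listing. Judging from the
# commented-out sampling code it replaced in the validation loop, a plausible
# implementation (an assumption, not the repo's actual helper) is:
import tensorflow as tf

def create_messages(batch_size, msg_length):
    # Draw a uniform random bit string per image and cast it to float32,
    # matching the message tensor the encoder_decoder expects.
    messages = tf.random.uniform([batch_size, msg_length],
                                 minval=0, maxval=2, dtype=tf.int32)
    return tf.cast(messages, dtype=tf.float32)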
print(args)

para = {"num_workers": 8, "pin_memory": True} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(
    dataloaders.ImageCaptionDataset(args.data_train),
    batch_size=args.batch_size, shuffle=True, **para)
val_loader = torch.utils.data.DataLoader(
    dataloaders.ImageCaptionDataset(args.data_val,
                                    image_conf={'center_crop': True}),
    batch_size=args.batch_size, shuffle=False, **para)

audio_model = models.Davenet()
image_model = models.VGG16(pretrained=args.pretrained_image_model)

if not bool(args.exp_dir):
    print("exp_dir not specified, automatically creating one...")
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
    args.exp_dir = "exp/Data-%s/AudioModel-%s_ImageModel-%s_Optim-%s_LR-%s_Epochs-%s_%s" % (
        os.path.basename(args.data_train), args.audio_model, args.image_model,
        args.optim, args.lr, args.n_epochs, timestamp)

if not args.resume:
    print("\nexp_dir: %s" % args.exp_dir)
    os.makedirs("%s/models" % args.exp_dir)
    with open("%s/args.pkl" % args.exp_dir, "wb") as f:
        pickle.dump(args, f)

train(audio_model, image_model, train_loader, val_loader, args)
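# When --resume is set, the experiment directory already holds the pickled
# arguments written by a previous run. A resuming invocation (an assumed usage
# pattern, not shown in this listing) could restore them like this:
import pickle

with open("%s/args.pkl" % args.exp_dir, "rb") as f:
    args = pickle.load(f)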