Exemplo n.º 1
0
                    type=float,
                    default=1.0,
                    help="Margin paramater for triplet loss")
# Number of audio frames per example; only two lengths are supported.
parser.add_argument("--input-length", "-L", type=int, default=2048,
                    choices=[1024, 2048], help="number of input frames")

args = parser.parse_args()
print(args)

# Setup loaders, models and loss
# Training split: shuffled, with worker processes and pinned memory so
# host-to-GPU copies are fast.
train_set = dataloaders.ImageCaptionDataset(
    args.data_train, audio_conf={'target_length': args.input_length})
train_loader = torch.utils.data.DataLoader(
    train_set, batch_size=args.batch_size, shuffle=True,
    num_workers=8, pin_memory=True)

# Validation split: deterministic order and center-cropped images for
# reproducible evaluation.
val_set = dataloaders.ImageCaptionDataset(
    args.data_val,
    audio_conf={'target_length': args.input_length},
    image_conf={'center_crop': True})
val_loader = torch.utils.data.DataLoader(
    val_set, batch_size=args.batch_size, shuffle=False,
    num_workers=8, pin_memory=True)

audio_model = models.DaveNet(embedding_dim=args.input_length)
Exemplo n.º 2
0
                    help="audio model architecture", choices=["Davenet"])
parser.add_argument("--image-model", type=str, default="VGG16",
                    help="image model architecture", choices=["VGG16"])
parser.add_argument("--pretrained-image-model", action="store_true",
                    dest="pretrained_image_model", help="Use an image network pretrained on ImageNet")
# Fixed typo in the user-facing help string ("paramater" -> "parameter").
parser.add_argument("--margin", type=float, default=1.0, help="Margin parameter for triplet loss")
parser.add_argument("--input-length", "-L", type=int, default=2048,
                    help="number of input frames", choices=[1024, 2048])

args = parser.parse_args()

print(args)

# Setup loaders, models and loss
# Training split is shuffled; both splits center-crop images and pad/trim
# audio to args.input_length frames.
train_loader = torch.utils.data.DataLoader(
    dataloaders.ImageCaptionDataset(args.data_train, audio_conf={'target_length': args.input_length}, image_conf={'center_crop': True}),
    batch_size=args.batch_size, shuffle=True, num_workers=8, pin_memory=True)

val_loader = torch.utils.data.DataLoader(
    dataloaders.ImageCaptionDataset(args.data_val, audio_conf={'target_length': args.input_length}, image_conf={'center_crop': True}),
    batch_size=args.batch_size, shuffle=False, num_workers=8, pin_memory=True)

audio_model = models.ConvX3AudioNet(input_length=args.input_length)
image_model = models.VGG16()

# Optionally warm-start the audio model from a previous run.
# An empty/None --train-path is falsy, so test it directly.
if args.train_path:
    # strict=False: presumably tolerates checkpoints with extra/missing keys
    # (e.g. a different head) — confirm against the ConvX3AudioNet definition.
    audio_model.load_state_dict(torch.load("%s/models/best_audio_model.pth" % args.train_path), strict=False)

criterion = DotLoss()

# Set up the optimizer
Exemplo n.º 3
0
# Use CUDA only when not explicitly disabled AND a device is available.
args.cuda = not args.no_cuda and torch.cuda.is_available()

# Remember the command-line flag: loading pickled args below replaces the
# whole namespace and would otherwise clobber it.
resume = args.resume

if args.resume:
    # `assert` is stripped under `python -O`; validate explicitly instead.
    if not args.exp_dir:
        raise ValueError("--resume requires exp_dir to locate args.pkl")
    # NOTE(review): pickle.load is only acceptable because args.pkl was
    # written by this experiment itself — never point exp_dir at an
    # untrusted directory.
    with open("%s/args.pkl" % args.exp_dir, "rb") as f:
        args = pickle.load(f)
args.resume = resume

print(args)

# Extra workers / pinned memory only pay off when feeding a GPU.
para = {"num_workers": 8, "pin_memory": True} if args.cuda else {}
train_loader = torch.utils.data.DataLoader(
    dataloaders.ImageCaptionDataset(args.data_train),
    batch_size=args.batch_size, shuffle=True, **para)

val_loader = torch.utils.data.DataLoader(
    dataloaders.ImageCaptionDataset(args.data_val, image_conf={'center_crop':True}),
    batch_size=args.batch_size, shuffle=False, **para)

audio_model = models.Davenet()
image_model = models.VGG16(pretrained=args.pretrained_image_model)
if not bool(args.exp_dir):
    print("exp_dir not specified, automatically creating one...")
    now = datetime.datetime.now(dateutil.tz.tzlocal())
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')
    args.exp_dir = "exp/Data-%s/AudioModel-%s_ImageModel-%s_Optim-%s_LR-%s_Epochs-%s_%s" % (
        os.path.basename(args.data_train), args.audio_model, args.image_model, args.optim,
Exemplo n.º 4
0
                    help="matchmap similarity function",
                    choices=["SISA", "MISA", "SIMA"])

args = parser.parse_args()

# Remember the command-line flag: loading pickled args below replaces the
# whole namespace and would otherwise clobber it.
resume = args.resume

if args.resume:
    # `assert` is stripped under `python -O`; validate explicitly instead.
    if not args.exp_dir:
        raise ValueError("--resume requires exp_dir to locate args.pkl")
    # NOTE(review): pickle.load is only acceptable because args.pkl was
    # written by this experiment itself — never point exp_dir at an
    # untrusted directory.
    with open("%s/args.pkl" % args.exp_dir, "rb") as f:
        args = pickle.load(f)
args.resume = resume

print(args)

# Training split is shuffled; validation uses a fixed order and
# center-cropped images.
train_loader = torch.utils.data.DataLoader(
    dataloaders.ImageCaptionDataset(args.data_train),
    batch_size=args.batch_size, shuffle=True,
    num_workers=8, pin_memory=True)

val_loader = torch.utils.data.DataLoader(
    dataloaders.ImageCaptionDataset(args.data_val, image_conf={'center_crop': True}),
    batch_size=args.batch_size, shuffle=False,
    num_workers=8, pin_memory=True)

# Pick right model based on string input
import importlib  # importlib.import_module is the documented replacement for __import__

models_module = importlib.import_module("models")
audio_class = getattr(models_module, args.audio_model)