def extract_feats(args):
    params = args
    if params['model'] == 'inception_v3':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv3(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif params['model'] == 'resnet152':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet152(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif params['model'] == 'inception_v4':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv4(num_classes=1000, pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    else:
        print("doesn't support %s" % (params['model']))
        return  # bail out early: `model` is undefined past this point
    # swap the classifier head for an identity so the forward pass yields features
    model.last_linear = utils.Identity()
    model = nn.DataParallel(model)
    model = model.cuda()
    prepro_feats.extract_feats(params, model, load_image_fn)
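# --- Usage sketch (added for illustration; not from the original source) ---
# Only the 'model' key is read by the dispatch above; any further keys are
# assumptions about what prepro_feats.extract_feats consumes downstream.
#
#   params = {'model': 'resnet152'}   # or 'inception_v3' / 'inception_v4'
#   extract_feats(params)             # downloads ImageNet weights on first use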
def extract_image_feats(video_path):
    print('extracting image features...')
    model = resnet152(pretrained='imagenet')
    model = model.cuda()
    # model = nn.DataParallel(model)
    model.last_linear = utils.Identity()
    model.eval()
    C, H, W = 3, 224, 224
    load_image_fn = utils.LoadTransformImage(model)
    dst = os.path.join(video_path.split('/')[0], 'info')
    os.makedirs(dst, exist_ok=True)  # ffmpeg will not create the output directory itself
    with open(os.devnull, "w") as ffmpeg_log:
        command = ('ffmpeg -i ' + video_path + ' -vf scale=400:300 '
                   + '-qscale:v 2 ' + '{0}/%06d.jpg'.format(dst))
        subprocess.call(command, shell=True, stdout=ffmpeg_log, stderr=ffmpeg_log)
    # sample 80 frames uniformly across the whole video
    image_list = sorted(glob.glob(os.path.join(dst, '*.jpg')))
    samples = np.round(np.linspace(0, len(image_list) - 1, 80))
    image_list = [image_list[int(sample)] for sample in samples]
    images = torch.zeros((len(image_list), C, H, W))
    for i in range(len(image_list)):
        img = load_image_fn(image_list[i])
        images[i] = img
    with torch.no_grad():
        # the original called .squeeze() here, which breaks a single-frame
        # batch by dropping the batch dimension; pass the 4-D batch as-is
        image_feats = model(images.cuda())
    image_feats = image_feats.cpu().numpy()
    # clean up the extracted frames
    for file in os.listdir(dst):
        if file.endswith('.jpg'):
            os.remove(os.path.join(dst, file))
    return image_feats
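# A hedged usage sketch (my addition): extract_image_feats derives the frame
# dump directory from the first path component, so call it with a relative
# path, and make sure ffmpeg is on PATH. The file name below is hypothetical.
#
#   feats = extract_image_feats('data/video0.mp4')
#   print(feats.shape)   # (80, 2048) for the resnet152 backbone used above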
def generate_C2D_model(opt):
    if opt.c2d_model_name == 'inception_v3':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv3(num_classes=1000, pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt.c2d_model_name == 'resnet152':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet152(num_classes=1000, pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt.c2d_model_name == 'inception_v4':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv4(num_classes=1000, pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt.c2d_model_name == 'inceptionresnetv2':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionresnetv2(num_classes=1000, pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    else:
        # fail fast instead of hitting a NameError on `model` below
        raise ValueError("doesn't support %s" % opt.c2d_model_name)
    # strip the classifier so the model returns pooled features
    model.last_linear = utils.Identity()
    if not opt.no_cuda:
        model = model.to(opt.device)
    return load_image_fn, model, (C, H, W)
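# Sketch of a typical call site (an assumption, not from the original repo):
# generate_C2D_model reads attribute-style options, e.g. an argparse.Namespace.
#
#   import argparse, torch
#   opt = argparse.Namespace(
#       c2d_model_name='resnet152',
#       no_cuda=not torch.cuda.is_available(),
#       device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
#   )
#   load_image_fn, c2d_model, (C, H, W) = generate_C2D_model(opt)
#   c2d_model.eval()   # worth doing before inference; the function itself does not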
def extract_image_feats(video_path):
    hasilPred.configure(text="Making prediction....")
    model = resnet152(pretrained='imagenet')
    model = model.cuda()
    model.last_linear = utils.Identity()
    model.eval()
    C, H, W = 3, 224, 224
    load_image_fn = utils.LoadTransformImage(model)
    dst = os.path.join(video_path.split('\\')[0], 'info')
    if os.path.exists(dst):
        print(" Removing directory: " + dst + "\\")
        shutil.rmtree(dst)
    os.makedirs(dst)
    with open(os.devnull, "w") as ffmpeg_log:
        command = ('ffmpeg -i ' + video_path + ' -vf scale=400:300 '
                   + '-qscale:v 2 ' + '{0}/%06d.jpg'.format(dst))
        subprocess.call(command, shell=True, stdout=ffmpeg_log, stderr=ffmpeg_log)
    # sample 80 frames uniformly across the whole video
    list_image = sorted(glob.glob(os.path.join(dst, '*.jpg')))
    samples = np.round(np.linspace(0, len(list_image) - 1, 80))
    list_image = [list_image[int(sample)] for sample in samples]
    images = torch.zeros((len(list_image), C, H, W))
    for i in range(len(list_image)):
        img = load_image_fn(list_image[i])
        images[i] = img
    with torch.no_grad():
        # as in the variant above, the original .squeeze() would drop the
        # batch dimension for a single frame; pass the 4-D batch as-is
        image_feats = model(images.cuda())
    image_feats = image_feats.cpu().numpy()
    # clean up the extracted frames
    for file in os.listdir(dst):
        if file.endswith('.jpg'):
            os.remove(os.path.join(dst, file))
    return image_feats
def fix_frame_extract(frame_path, feats_path, frames_num, model, video_name):
    # load model
    C, H, W = 3, 224, 224
    if model == 'resnet152':
        model = pretrainedmodels.resnet152(pretrained='imagenet')
    elif model == 'vgg16':
        model = pretrainedmodels.vgg16(pretrained='imagenet')
    elif model == 'inception_v4':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv4(pretrained='imagenet')
    else:
        raise ValueError("unsupported model: %s" % model)
    model.last_linear = utils.Identity()
    model = model.to(device)
    model.eval()
    load_image_fn = utils.LoadTransformImage(model)
    # load data
    img_list = sorted(frame_path.glob('*.jpg'))
    # get index: frames_num frames sampled uniformly over the video
    samples_ix = np.linspace(0, len(img_list) - 1, frames_num).astype(int)
    img_list = [img_list[i] for i in samples_ix]
    # build tensor
    imgs = torch.zeros([len(img_list), C, H, W])
    for i in range(len(img_list)):
        img = load_image_fn(img_list[i])
        imgs[i] = img
    imgs = imgs.to(device)
    with torch.no_grad():
        feats = model(imgs)
    feats = feats.cpu().numpy()
    # save
    np.save(os.path.join(feats_path, video_name + ".npy"), feats)
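# Hypothetical driver (my addition): fix_frame_extract assumes frames were
# already dumped to <frame_path>/*.jpg, that `feats_path` exists, and that a
# module-level `device` is defined.
#
#   from pathlib import Path
#   fix_frame_extract(Path('frames/video0'), 'feats', frames_num=40,
#                     model='resnet152', video_name='video0')
#   # -> writes feats/video0.npy with shape (40, 2048) for resnet152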
def generate_2D_model(opt):
    if opt['model'] == 'inception_v3':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv3(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'vgg16':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.vgg16(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'vgg19':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.vgg19(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'resnet50':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet50(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'resnet101':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet101(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'resnet152':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet152(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'inception_v4':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv4(num_classes=1000, pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'nasnet':
        C, H, W = 3, 331, 331
        model = pretrainedmodels.nasnetalarge(num_classes=1001, pretrained='imagenet+background')
        load_image_fn = utils.LoadTransformImage(model)
    else:
        print("doesn't support %s" % (opt['model']))
        return None  # avoid a NameError on the undefined `model` below
    model.last_linear = utils.Identity()
    model = nn.DataParallel(model)
    # if opt['saved_model'] != '':
    #     model.load_state_dict(torch.load(opt['saved_model']), strict=False)
    model = model.cuda()
    return model
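# Feature-dimension note with a usage sketch (my addition). Since the head is
# replaced by an Identity, the output width is the backbone's pooled feature
# size: 2048 for the ResNets and inception_v3, 1536 for inception_v4, 4096
# for the VGGs, 4032 for nasnetalarge.
#
#   model = generate_2D_model({'model': 'resnet152'})
#   with torch.no_grad():
#       feats = model(torch.zeros(8, 3, 224, 224).cuda())   # -> (8, 2048)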
def extract_feats(frame_path, feats_path, interval, model, video_name):
    """
    Extract features from the frames of one video.
    :param frame_path: path of frames
    :param feats_path: path to store results
    :param interval: (int) sampling interval, in frames, between extracted frames
    :param model: name of model
    :param video_name: name of the video (used for the output .npy file)
    :return: None
    """
    # load model
    C, H, W = 3, 224, 224
    if model == 'resnet152':
        model = pretrainedmodels.resnet152(pretrained='imagenet')
    elif model == 'vgg16':
        model = pretrainedmodels.vgg16(pretrained='imagenet')
    elif model == 'inception_v4':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv4(pretrained='imagenet')
    else:
        raise ValueError("unsupported model: %s" % model)
    model.last_linear = utils.Identity()
    model = model.to(device)
    model.eval()
    load_image_fn = utils.LoadTransformImage(model)
    # load data
    img_list = sorted(frame_path.glob('*.jpg'))
    # get index: take every `interval`-th frame
    samples_ix = np.arange(0, len(img_list), interval)
    img_list = [img_list[int(i)] for i in samples_ix]
    # build tensor
    imgs = torch.zeros([len(img_list), C, H, W])
    for i in range(len(img_list)):
        img = load_image_fn(img_list[i])
        imgs[i] = img
    imgs = imgs.to(device)
    with torch.no_grad():
        feats = model(imgs)
    feats = feats.cpu().numpy()
    # save
    np.save(os.path.join(feats_path, video_name + ".npy"), feats)
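# Usage sketch contrasting the two samplers (my addition): extract_feats takes
# every `interval`-th frame, so the number of features varies with video
# length, while fix_frame_extract above always returns `frames_num` rows.
#
#   from pathlib import Path
#   extract_feats(Path('frames/video0'), 'feats', interval=5,
#                 model='resnet152', video_name='video0')
#   # a 400-frame video -> feats/video0.npy with shape (80, 2048)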
elif params['model'] == 'resnet34':
    C, H, W = 3, 224, 224
    model = pretrainedmodels.resnet34(pretrained='imagenet')
elif params['model'] == 'inceptionresnetv2':
    C, H, W = 3, 299, 299
    model = pretrainedmodels.inceptionresnetv2(
        num_classes=1001, pretrained='imagenet+background')
elif params['model'] == 'googlenet':
    C, H, W = 3, 224, 224
    model = googlenet(pretrained=True)
    print(model)
else:
    print("doesn't support %s" % (params['model']))

# googlenet comes from torchvision, so it needs its own transform; the
# pretrainedmodels backbones share LoadTransformImage and an Identity head
if params['model'] != 'googlenet':
    load_image_fn = utils.LoadTransformImage(model)
    model.last_linear = utils.Identity()
else:
    load_image_fn = google_load()
model = model.cuda()
# summary(model, (C, H, W))

if params['test_latency']:
    test_latency(params, model, load_image_fn, C, H, W)
else:
    extract_feats(params, model, load_image_fn, C, H, W)

'''
python extract_image_feats_from_frames.py \
    --frame_path "/home/yangbang/VideoCaptioning/MSRVTT/all_frames/" \
    --feat_path "/home/yangbang/VideoCaptioning/MSRVTT/feats/" \
    --feat_name msrvtt_R152 \
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--video_dir', type=str, default='../msrvtt_2017/train-video',
        help='The video dir that one would like to extract audio files from')
    parser.add_argument(
        '--output_dir', type=str, default='../msrvtt_2017/preprocessed',
        help='The file output directory')
    parser.add_argument(
        '--output_channels', type=int, default=1,
        help='The number of output audio channels, defaults to 1')
    parser.add_argument(
        '--output_frequency', type=int, default=16000,
        help='The output audio frequency in Hz, defaults to 16000')
    parser.add_argument(
        '--band_width', type=int, default=160,
        help='Bandwidth used to sample the audio (in kbps), defaults to 160')
    parser.add_argument(
        '--model', type=str, default='resnet152',
        help='The pretrained model to use for extracting image features, defaults to resnet152')
    parser.add_argument(
        '--gpu', type=str, default='0',
        help='The CUDA_VISIBLE_DEVICES argument, defaults to 0')
    parser.add_argument(
        '--n_frame_steps', type=int, default=80,
        help='The number of frames to extract from a single video')
    opt = parser.parse_args()
    opt = vars(opt)

    if not os.path.exists(opt['output_dir']):
        os.mkdir(opt['output_dir'])
    # extract audio tracks from the videos, then split them into clips
    vToA(opt)
    split_audio(opt)
    print('cleaning up original .wav files...')
    for file in os.listdir(opt['output_dir']):
        if file.endswith('.wav'):
            os.remove(os.path.join(opt['output_dir'], file))

    os.environ['CUDA_VISIBLE_DEVICES'] = opt['gpu']
    if opt['model'] == 'resnet152':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.resnet152(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'inception_v3':
        C, H, W = 3, 299, 299
        model = pretrainedmodels.inceptionv3(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    elif opt['model'] == 'vgg16':
        C, H, W = 3, 224, 224
        model = pretrainedmodels.vgg16(pretrained='imagenet')
        load_image_fn = utils.LoadTransformImage(model)
    else:
        print('The image model is not supported')
        return  # `model` would be undefined below
    model.last_linear = utils.Identity()
    model = nn.DataParallel(model)
    model = model.cuda()
    extract_image_feats(opt, model, load_image_fn)
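# Example invocation (hypothetical script name and paths; the flags are the
# ones defined in main() above):
#
#   python extract_feats.py \
#       --video_dir ../msrvtt_2017/train-video \
#       --output_dir ../msrvtt_2017/preprocessed \
#       --model resnet152 --gpu 0 --n_frame_steps 80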