def annotate_video(file_path, coordinates):
    """
    Annotates supplied video from predicted coordinates.

    Args:
        file_path: path
            System path of video to annotate
        coordinates: list
            Predicted body part coordinates for each frame in the video
    """

    # Load raw video
    from skvideo.io import vreader, ffprobe, FFmpegWriter
    videogen = vreader(file_path)
    video_metadata = ffprobe(file_path)['video']
    fps = video_metadata['@r_frame_rate']
    frame_height, frame_width = next(vreader(file_path)).shape[:2]
    frame_side = frame_width if frame_width >= frame_height else frame_height

    # Initialize annotated video
    vcodec = 'libvpx-vp9'  # alternative: 'libx264'
    writer = FFmpegWriter(normpath(file_path.split('.')[0] + '_tracked.mp4'),
                          inputdict={'-r': fps},
                          outputdict={'-r': fps,
                                      '-bitrate': '-1',
                                      '-vcodec': vcodec,
                                      '-pix_fmt': 'yuv420p',
                                      '-lossless': '1'})

    # Annotate video
    from PIL import Image, ImageDraw
    i = 0
    while True:
        try:
            frame = next(videogen)
            image = Image.fromarray(frame)
            image_draw = ImageDraw.Draw(image)
            image_coordinates = coordinates[i]
            image = helpers.display_body_parts(image, image_draw, image_coordinates,
                                               image_height=frame_height,
                                               image_width=frame_width,
                                               marker_radius=int(frame_side / 150))
            image = helpers.display_segments(image, image_draw, image_coordinates,
                                             image_height=frame_height,
                                             image_width=frame_width,
                                             segment_width=int(frame_side / 100))
            writer.writeFrame(np.array(image))
            i += 1
        except (StopIteration, IndexError):
            # stop at end of video or when coordinates run out
            break

    writer.close()
def generate_dataset(video_dir):
    if not os.path.exists(video_dir):
        raise ValueError("video dir does not exist")
    video_files = os.listdir(video_dir)
    image_index = 1
    for video in video_files:
        if 'rmvb' in video:
            continue
        print(video)
        video_path = os.path.join(video_dir, video)
        metadata = ffprobe(video_path)
        frame_info = metadata["video"]["@avg_frame_rate"].split('/')
        rate = int(frame_info[0]) // int(frame_info[1])
        try:
            frame_array = vreader(video_path)
            frame_index = 0
            for frame in frame_array:
                if frame_index % (rate * interval) == 0:
                    image = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                    # note: cv2.resize expects the size as (width, height)
                    image = cv2.resize(image, (image_height, image_width))
                    cv2.imwrite(test_raw_file + str(image_index) + '.jpg', image)
                    cv2.imwrite(test_copy_file + str(image_index) + '.jpg', image)
                    print("write image %d" % image_index)
                    image_index += 1
                elif frame_index % (rate * interval) == 1:
                    # overwrite the copy with the immediately following frame
                    image = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                    image = cv2.resize(image, (image_height, image_width))
                    cv2.imwrite(test_copy_file + str(image_index - 1) + '.jpg', image)
                frame_index += 1
        except RuntimeError:
            continue
def detect_video(conf, video_file, out_path, yolo, level=0):
    """Use YOLOv3 to detect objects in a video.

    # Arguments:
        conf: configuration object.
        video_file: video file.
        out_path: path of the output video.
        yolo: YOLO, yolo model.
        level: which resolution to run detection on, range [1-7],
            default is 416; the resolution list is in conf.resolutions.
    """
    videogen = io.vreader(video_file)
    metadata = io.ffprobe(video_file)
    num, den = metadata['video']['@avg_frame_rate'].split('/')
    frame_rate = int(int(num) / int(den))
    frame = next(videogen)
    shape = (frame.shape[1], frame.shape[0])  # (width, height)
    video_writer = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*'XVID'),
                                   frame_rate, shape)
    for frame in tqdm(videogen, total=int(metadata['video']['@nb_frames'])):
        # detect_on_img returns an RGB image; reverse channels for OpenCV (BGR)
        detected_frame = np.array(
            yolo.detect_on_img(conf, Image.fromarray(frame), level=level))[..., ::-1]
        video_writer.write(detected_frame)
    video_writer.release()
    videogen.close()
def convert_vid(path):
    # assumes module-level imports: h5py, numpy as np, skvideo.io.vreader,
    # and a resize(img, (h, w)) helper such as skimage.transform.resize
    out_path = path.split('.')[0] + '.hdf5'
    cap = vreader(path)
    with h5py.File(out_path, 'w') as f:
        needs_resize = False
        first_frame = next(cap)
        # NOTE this assumes the original aspect ratio is 16:9
        if first_frame.shape[0] != 320:
            needs_resize = True
            first_frame = np.swapaxes(resize(first_frame, (320, 180)), 0, 1)
        f.create_dataset('vid_frames',
                         data=np.expand_dims(first_frame, 0),
                         maxshape=(None, first_frame.shape[0],
                                   first_frame.shape[1], first_frame.shape[2]),
                         compression='gzip')
        for ind, frame in enumerate(cap):
            if needs_resize:
                frame = np.swapaxes(resize(frame, (320, 180)), 0, 1)
            # grow the dataset by one frame and write into the new slot
            f['vid_frames'].resize(f['vid_frames'].shape[0] + 1, axis=0)
            f['vid_frames'][-1] = frame
    print('Wrote', out_path)
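# Usage sketch (not from the original source; 'clip.hdf5' is a hypothetical
# path): frames written by convert_vid can be read back lazily with h5py.
import h5py

with h5py.File('clip.hdf5', 'r') as f:
    frames = f['vid_frames']   # HDF5 dataset of shape (num_frames, H, W, C)
    print(frames.shape)
    first = frames[0]          # materializes a single frame as a numpy array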
def write_frames(flags, rgb_out_dir):
    rgb_video = os.path.join(flags.dataset, 'rgb.mp4')
    video = io.vreader(rgb_video)
    for i, frame in enumerate(video):
        print(f"Writing rgb frame {i:06}" + " " * 10, end='\r')
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        frame = cv2.resize(frame, (OUT_WIDTH, OUT_HEIGHT))
        frame_path = os.path.join(rgb_out_dir, f"{i:06}.jpg")
        params = [int(cv2.IMWRITE_JPEG_QUALITY), 90]
        cv2.imwrite(frame_path, frame, params)
def extract_frames_from_vid(vid_path):
    fourcc = cv2.VideoWriter_fourcc(*'MJPG')
    data = io.ffprobe(vid_path)['video']
    num, den = data['@r_frame_rate'].split('/')
    rate = int(round(int(num) / int(den)))  # handles rates such as 30000/1001
    out = None
    for frame in tqdm(io.vreader(vid_path), unit=' frame'):
        if out is None:
            # VideoWriter expects the frame size as (width, height)
            out = cv2.VideoWriter('out_' + vid_path, fourcc, rate,
                                  frame.shape[1::-1])
        frame = frame_face_blur(frame)
        out.write(frame)
    out.release()
def process_video(self, video_p: Path, output_p: Path, reduce_rate: int = 1):
    meta = ffprobe(video_p)
    nb_frames = int(meta["video"]["@nb_frames"])
    frames = vreader(str(video_p))
    # assumes a 30 fps source; keep every reduce_rate-th frame
    writer = FFmpegWriter(str(output_p),
                          outputdict={"-r": str(int(30 / reduce_rate))})
    for i, frame in enumerate(tqdm(frames, total=nb_frames)):
        if i % reduce_rate == 0:
            frame = self.process_frame(frame)
            writer.writeFrame(frame)
    writer.close()
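# Usage sketch (assumption: process_video above is bound as a method of a class
# that supplies process_frame). A minimal no-op host, halving the frame rate:
from pathlib import Path

class IdentityProcessor:
    process_video = process_video  # bind the function above as a method

    def process_frame(self, frame):
        return frame  # replace with real per-frame processing

IdentityProcessor().process_video(Path('in.mp4'), Path('out.mp4'), reduce_rate=2)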
def __init__(self, file_paths, transform=False, resize=(256, 256)):
    """
    file_paths: a list of length batch_size containing paths to video files
    transform: whether to crop/resize the frame_tensor
        Note: this must currently be set to True
    resize: resize dimensions for the frames (h, w)
    """
    self.frame_generators = []
    for file_path in file_paths:
        assert os.path.exists(file_path), \
            'Video file path ' + file_path + ' does not exist.'
        self.frame_generators.append(vid.vreader(str(file_path)))
    assert transform == True, 'Non-transformed batch not implemented.'
    self.transform = transform
    self.resize = resize
def generate_batches(video_path, batch_size=64, video_options=None):
    vid = vreader(str(video_path), outputdict={'-s': '224x224'})
    batch = []
    for frame in vid:
        batch.append(frame)
        if len(batch) == batch_size:
            yield np.array(batch)
            batch = []
    if batch:  # yield any trailing partial batch
        yield np.array(batch)
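# Usage sketch ('clip.mp4' is a hypothetical path): ffmpeg rescales every frame
# to 224x224 via the '-s' output option, so batches have shape (n, 224, 224, 3).
for batch in generate_batches('clip.mp4', batch_size=32):
    print(batch.shape)  # (32, 224, 224, 3); the final batch may hold fewer frames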
def get_video_info(video_path):
    cap = sk.vreader(video_path)
    seg_l = 4
    metadata = sk.ffprobe(video_path)
    # fps: @r_frame_rate, length: @duration, frames: @nb_frames
    length = float(metadata["video"]["@duration"])
    num, den = metadata["video"]["@r_frame_rate"].split('/')
    fps = float(num) / float(den)
    fnum = int(np.ceil(length * fps))
    print('length : %.5f / frames : %d / fps : %.2f' % (length, fnum, fps))

    img_id = []
    frame_list = []
    id = 0
    for frame in cap:
        frame = cv2.resize(frame, dsize=(224, 224))
        frame_list.append(frame)
        img_id.append(id)
        id += 1

    # sliding windows of seg_l consecutive frame ids, flattened into one list
    segs = [img_id[i:i + seg_l] for i in range(len(img_id) - seg_l + 1)]
    segs = reduce(lambda x, y: x + y, segs)  # needs functools.reduce on Python 3
    feat = [frame_list[seg] for seg in segs]

    # one sample index per second of video
    idx = np.floor(np.arange(fps, fnum, fps)).astype(int).tolist()
    return feat, fnum, fps, length, img_id, idx
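# Usage sketch (hypothetical path): unpack the resized frames and metadata.
feat, fnum, fps, length, img_id, idx = get_video_info('clip.mp4')
print(len(feat), fnum, fps, length)  # idx lists one frame index per second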
def write_images():
    root = '../UCF-14/'
    classes = open('../dataset/classInd_14.txt', 'r')
    var1 = {}
    for line in classes:
        words = line.split(" ")
        var1[words[1].split("\n")[0]] = words[0]
    for path, subdirs, files in os.walk(root):
        for filename in files:
            print(filename)
            if ".DS_Store" not in filename:
                folder = 'images' + '/' + filename.split('.')[0] + '/'
                if not os.path.isdir(folder):
                    os.mkdir(folder)
                else:
                    shutil.rmtree(folder)
                    os.mkdir(folder)
                try:
                    cnt = 0
                    full_path = path + '/' + filename
                    cap = vreader(full_path)
                    fcnt = 1
                    for frame in cap:
                        vid_name = filename.split('.')[0]
                        img_path = folder + vid_name + '_{}.jpg'.format(cnt + 1)
                        img_name = vid_name + '_{}'.format(cnt + 1)
                        if fcnt % 5 == 0:  # keep every 5th frame
                            vwrite(img_path, frame)
                            cnt = cnt + 1
                        fcnt += 1
                    if cnt:
                        with open("count.txt", "w") as txt:
                            text = str(cnt) + " " + img_name.split('.')[0] + "\n"
                            txt.write(text)
                except (RuntimeError, TypeError, NameError):
                    print("Some Error happened")
def make_video_frames(datadir, outdir):
    # train video data
    for i in range(10000):
        if not os.path.exists(os.path.join(outdir, 'video%s' % i)):
            os.makedirs(os.path.join(outdir, 'video%s' % i))
        filename = ('video%s.mp4' % i)
        videopath = os.path.join(datadir, filename)
        cap = sk.vreader(videopath)
        metadata = sk.ffprobe(videopath)
        # fps: @r_frame_rate, length: @duration, frames: @nb_frames
        length = float(metadata["video"]["@duration"])
        frames = float(metadata["video"]["@nb_frames"])
        fps = int(frames / length)
        print('%sth video' % i)
        print('length : %d / frames : %d / fps : %d' % (length, frames, fps))

        # sample 5 evenly spaced timestamps, excluding the endpoints
        cent = np.linspace(0, length, 7)[1:-1].astype(int)
        sample_frames = cent * fps
        idx = 0
        filenum = 0
        for frame in cap:
            if idx in sample_frames:
                frame = cv2.resize(frame, dsize=(224, 224))
                sk.vwrite(outdir + '/video%s/frame%s.png' % (i, filenum), frame)
                filenum += 1
            idx += 1
        if i % 1000 == 0:
            print('%sth video processed...' % i)
def extract_keyframes_ffmpeg(video_path: str, output_path: str) -> None:
    """
    Extract all keyframes from a video file and save them to disk
    :param video_path: Absolute path to the input video
    :param output_path: Absolute path where all keyframes will be saved
    """
    assert os.path.exists(video_path)
    video_name = os.path.splitext(os.path.basename(video_path))[0]
    # ffmpeg keeps only intra-coded (I) frames; '-vsync vfr' drops the rest
    video_data = io.vreader(video_path,
                            outputdict={
                                '-vf': r'select=eq(pict_type\,PICT_TYPE_I)',
                                '-vsync': 'vfr'
                            })
    cnt = 0
    for kframe in video_data:
        cv2.imwrite(join(output_path, f'{video_name}_{cnt:03d}.png'),
                    cv2.cvtColor(kframe, code=cv2.COLOR_RGB2BGR))
        cnt += 1
    print(f'EXTK> Extracted {cnt} keyframes from '
          f'{os.path.basename(video_path)} to {output_path}')
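# Usage sketch (hypothetical paths): writes clip_000.png, clip_001.png, ...
# into the output directory, one file per I-frame of the input video.
extract_keyframes_ffmpeg('/data/clip.mp4', '/data/keyframes')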
def extract_frames(d_path, fps=25):
    """
    extract frames from video files, 1 frame per second by default
    :param d_path: data path
    :param fps: frame rate of the source videos; one frame is kept per fps frames
    :return: data and labels
    """
    for n in range(1, 81):
        video_id = str(n).zfill(2)
        video_name = '{}videos/video{}.mp4'.format(d_path, video_id)
        video_reader = vreader(video_name)
        for i, frame in enumerate(video_reader):
            if i % fps == 0:
                # remove black borders
                # frame = frame[:, 40:-55, :]
                # downscale frame to (224, 224, 3)
                frame = resize(frame, (224, 224, 3))
                # extract frame to a different folder
                frame_path = '{}frames/video{}-frame{}.png'.format(
                    d_path, video_id, i)
                plt.imsave(arr=frame, fname=frame_path)
def main(argv):
    pycaffe_dir = os.path.dirname(__file__)

    parser = argparse.ArgumentParser()
    # Required arguments: input file.
    parser.add_argument("input_file", help="Path to the input image file")
    # Optional arguments.
    parser.add_argument("--model_def", help="Model definition file.")
    parser.add_argument("--pretrained_model", help="Trained model weights file.")
    args = parser.parse_args()

    metadata = ffprobe(args.input_file)
    avg_frame_rate = metadata["video"]["@avg_frame_rate"].split('/')
    rate = int(avg_frame_rate[0]) // int(avg_frame_rate[1])
    video = vreader(args.input_file)

    if not os.path.exists('./temp/image'):
        os.makedirs('temp/image')
    image_files = os.listdir('./temp/image')
    if len(image_files) > 0:
        for filename in image_files:
            os.remove('./temp/image/' + filename)

    index = 0
    for frame in video:
        if index % (rate * interval) == 0:
            img = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            # file name encodes the second the frame was taken at
            cv2.imwrite('temp/image/' + str(index // rate) + '.jpg', img)
        index += 1

    # Pre-load caffe model.
    nsfw_net = caffe.Net(
        args.model_def,  # pylint: disable=invalid-name
        args.pretrained_model,
        caffe.TEST)

    # Load transformer
    # Note that the parameters are hard-coded for best results
    caffe_transformer = caffe.io.Transformer(
        {'data': nsfw_net.blobs['data'].data.shape})
    caffe_transformer.set_transpose('data', (2, 0, 1))  # move image channels to outermost
    caffe_transformer.set_mean('data', np.array([104, 117, 123]))  # subtract the dataset-mean value in each channel
    caffe_transformer.set_raw_scale('data', 255)  # rescale from [0, 1] to [0, 255]
    caffe_transformer.set_channel_swap('data', (2, 1, 0))  # swap channels from RGB to BGR

    # fetch all image scores.
    scores_list = np.array([])
    safe_count = 0
    unsafe_count = 0
    middle_count = 0
    danger_list = []
    image_files = os.listdir('./temp/image')
    for filename in image_files:
        with open('./temp/image/' + filename, 'rb') as f:
            image_data = f.read()
        scores = caffe_preprocess_and_compute(
            image_data,
            caffe_transformer=caffe_transformer,
            caffe_net=nsfw_net,
            output_layers=['prob'])
        # scores holds the SFW / NSFW probabilities; scores[1] is the NSFW probability
        if scores[1] > 0.8:
            unsafe_count += 1
            danger_list.append(filename)
        elif scores[1] < 0.2:
            safe_count += 1
        else:
            middle_count += 1
        scores_list = np.append(scores_list, scores[1])

    print("total: %d, safe: %d, unsafe: %d, middle: %d" %
          (scores_list.shape[0], safe_count, unsafe_count, middle_count))
    print(danger_list)
def main(video_dict):
    model_def = 'nsfw_model/deploy.prototxt'
    pretrained_model = 'nsfw_model/resnet_50_1by2_nsfw.caffemodel'
    pycaffe_dir = os.path.dirname(__file__)

    # Pre-load caffe model.
    nsfw_net = caffe.Net(
        model_def,  # pylint: disable=invalid-name
        pretrained_model,
        caffe.TEST)

    # Load transformer
    # Note that the parameters are hard-coded for best results
    caffe_transformer = caffe.io.Transformer(
        {'data': nsfw_net.blobs['data'].data.shape})
    caffe_transformer.set_transpose('data', (2, 0, 1))  # move image channels to outermost
    caffe_transformer.set_mean('data', np.array([104, 117, 123]))  # subtract the dataset-mean value in each channel
    caffe_transformer.set_raw_scale('data', 255)  # rescale from [0, 1] to [0, 255]
    caffe_transformer.set_channel_swap('data', (2, 1, 0))  # swap channels from RGB to BGR

    if not os.path.exists(image_temp):
        os.mkdir(image_temp)
    if not os.path.exists(video_temp):
        os.mkdir(video_temp)

    conclusion_dict = {}
    for video_name in video_dict:
        video_path = os.path.join(video_temp, video_name)
        if not os.path.exists(video_path):
            continue
        image_files = os.listdir(image_temp)
        if len(image_files) > 0:
            for filename in image_files:
                filepath = os.path.join(image_temp, filename)
                os.remove(filepath)

        metadata = ffprobe(video_path)
        avg_frame_rate = metadata["video"]["@avg_frame_rate"].split('/')
        rate = int(avg_frame_rate[0]) // int(avg_frame_rate[1])
        video = vreader(video_path)
        index = 0
        for frame in video:
            if index % (rate * interval) == 0:
                img = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                # file name encodes the second the frame was taken at
                cv2.imwrite(image_temp + '/' + str(index // rate) + '.jpg', img)
            index += 1

        # fetch all image scores.
        safe_count = 0
        danger_count = 0
        warning_count = 0
        middle_count = 0
        danger_list = []
        warning_list = []
        scores_list = np.array([])
        image_files = os.listdir(image_temp)
        for image_name in image_files:
            second = image_name.split('.')[0]
            image_path = os.path.join(image_temp, image_name)
            with open(image_path, 'rb') as f:
                image_data = f.read()
            scores = caffe_preprocess_and_compute(
                image_data,
                caffe_transformer=caffe_transformer,
                caffe_net=nsfw_net,
                output_layers=['prob'])
            # scores holds the SFW / NSFW probabilities; scores[1] is the NSFW probability
            if scores[1] > 0.8:
                danger_count += 1
                danger_list.append(second)
            elif scores[1] > 0.5:
                warning_count += 1
                middle_count += 1
                warning_list.append(second)
            elif scores[1] > 0.2:
                middle_count += 1
            else:
                safe_count += 1
            scores_list = np.append(scores_list, scores[1])

        conclusion_dict[video_name] = {
            'url': video_dict[video_name],
            'name': video_name.split('.')[0],
            'extension': video_name.split('.')[-1],
            'total_count': scores_list.shape[0],
            'danger_count': danger_count,
            'warning_count': warning_count,
            'danger_second': danger_list,
            'warning_second': warning_list,
        }
        print("video name: %s, total: %d, danger_count: %d, warning_count: %d" %
              (video_name, scores_list.shape[0], danger_count, warning_count))
        os.remove(video_path)
    return conclusion_dict
def tagVideo(modelpath, videopath, outputPath=None):
    """ detect if persons in video are wearing masks or not
    """
    result = -1  # -1: no face found, 0: face without mask, 1: mask found
    model = MaskDetector()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.load_state_dict(torch.load(modelpath, map_location=device)['state_dict'],
                          strict=False)
    model = model.to(device)
    model.eval()

    faceDetector = FaceDetector(
        prototype='/var/www/covosk-cv/covid-mask-detector/models/deploy.prototxt.txt',
        model='/var/www/covosk-cv/covid-mask-detector/models/res10_300x300_ssd_iter_140000.caffemodel',
    )

    transformations = Compose([
        ToPILImage(),
        Resize((100, 100)),
        ToTensor(),
    ])
    if outputPath:
        writer = FFmpegWriter(str(outputPath))

    font = cv2.FONT_HERSHEY_SIMPLEX
    labels = ['No mask', 'Mask']
    labelColor = [(10, 0, 255), (10, 255, 0)]
    for frame in vreader(str(videopath)):
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        faces = faceDetector.detect(frame)
        for face in faces:
            xStart, yStart, width, height = face
            # clamp coordinates that are outside of the image
            xStart, yStart = max(xStart, 0), max(yStart, 0)

            # image is 640x640: how much of the detected box lies inside it?
            right = min(xStart + width, 639)
            bottom = min(yStart + height, 639)
            area = width * height
            inarea = (right - xStart) * (bottom - yStart)
            areaperc = inarea / area

            # predict mask label on extracted face
            faceImg = frame[yStart:yStart + height, xStart:xStart + width]
            output = model(transformations(faceImg).unsqueeze(0).to(device))
            _, predicted = torch.max(output.data, 1)

            # center text according to the face frame
            textSize = cv2.getTextSize(labels[predicted], font, 1, 2)[0]
            textX = xStart + width // 2 - textSize[0] // 2

            # only trust detections that lie mostly inside the image
            cc = (126, 65, 64)
            if areaperc > 0.75:
                if predicted:
                    result = 1
                elif result == -1:
                    result = 0
            else:
                cc = (200, 200, 200)

            # draw face frame and prediction label
            cv2.rectangle(frame,
                          (xStart, yStart),
                          (xStart + width, yStart + height),
                          cc,
                          thickness=2)
            cv2.putText(frame, labels[predicted], (textX, yStart - 20),
                        font, 1, labelColor[predicted], 2)
        if outputPath:
            writer.writeFrame(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    if outputPath:
        writer.close()
    # result: 1 = mask found, 0 = no mask found, -1 = no face in video
    return result
def tagVideo(modelpath=None, videopath=None, outputPath=None, outputPathMask=None):
    """ detect if persons in video are wearing masks or not
    """
    modelpath = "./mask-detection/models/face_mask.ckpt"  # hard-coded checkpoint overrides the argument
    model = MaskDetector()
    model.load_state_dict(torch.load(modelpath, map_location='cpu')['state_dict'],
                          strict=False)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    faceDetector = FaceDetector(
        prototype='mask-detection/models/deploy.prototxt.txt',
        model='mask-detection/models/res10_300x300_ssd_iter_140000.caffemodel',
    )

    transformations = Compose([
        ToPILImage(),
        Resize((100, 100)),
        ToTensor(),
    ])
    if outputPath:
        writer = FFmpegWriter(str(outputPath))

    font = cv2.FONT_HERSHEY_SIMPLEX
    labels = ['No mask', 'Mask']
    labelColor = [(10, 0, 255), (10, 255, 0)]
    try:
        for frame in vreader(str(videopath)):
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            faces = faceDetector.detect(frame)
            for face in faces:
                xStart, yStart, width, height = face
                # clamp coordinates that are outside of the image
                xStart, yStart = max(xStart, 0), max(yStart, 0)

                # predict mask label on extracted face
                faceImg = frame[yStart:yStart + height, xStart:xStart + width]
                output = model(transformations(faceImg).unsqueeze(0).to(device))
                _, predicted = torch.max(output.data, 1)

                cv2.rectangle(frame,
                              (xStart, yStart),
                              (xStart + width, yStart + height),
                              (126, 65, 64),
                              thickness=2)

                # center text according to the face frame
                textSize = cv2.getTextSize(labels[predicted], font, 1, 2)[0]
                textX = xStart + width // 2 - textSize[0] // 2

                # draw prediction label
                cv2.putText(frame, labels[predicted], (textX, yStart - 20),
                            font, 1, labelColor[predicted], 2)

                if outputPath:
                    try:
                        # report on the first detected face and stop
                        if labels[predicted] == "No mask":
                            writer.writeFrame(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                            writer.close()
                            os.remove(videopath)
                            print("Person without mask detected!")
                            return "No mask"
                        elif labels[predicted] == "Mask":
                            os.remove(videopath)
                            print("Person with mask detected!")
                            return "Face detected with Mask"
                    except Exception:
                        os.remove(videopath)
                        print("No face detected!")
                        return "No face detected!"
            # cv2.imshow('main', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
        # cv2.destroyAllWindows()
    except Exception:
        os.remove(videopath)
        print("Image could not be opened!")
def video_to_frames(video_path: str, frame_template: str):
    for i, frame in enumerate(vreader(video_path)):
        frame = resize(frame, (240, 320))
        frame = frame[10:170]
        frame = round_(frame * 255).astype("uint8")
        imsave(frame_template.format(i), frame, check_contrast=False)
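# Usage sketch (hypothetical paths): the template needs one format slot for
# the frame index; output frames are 160x320 uint8 images after the row crop.
video_to_frames('clip.mp4', 'frames/frame_{:05d}.png')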
path = os.path.join('checkpoints', model_name + '.meta')
meta_graph = tf.train.import_meta_graph(path)
session = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
meta_graph.restore(sess=session,
                   save_path=os.path.join('checkpoints', model_name))

# load placeholders
c_input = tf.get_collection('c_input')[0]
c_state_input = tf.get_collection('c_state_input')[0]
c_fb_input = tf.get_collection('c_fb_input')[0]
c_output = tf.get_collection('c_output')[0]
c_state_output = tf.get_collection('c_state_output')[0]

# load video frames
reader = io.vreader(video_path)

# fill buffer
buffer = []
buffer_hr = []
for i in range(buffer_len):
    add_image = next(reader)
    buffer_hr.append(add_image)
    # downscale by factor 4 with gaussian smoothing
    # (gf is assumed to be scipy.ndimage.gaussian_filter)
    if downscale:
        s = 1.5
        add_image = gf(add_image, sigma=[s, s, 0])[0::4, 0::4, :]
        add_image = np.rint(np.clip(add_image, 0, 255)).astype(np.uint8)
src_det_file_path = '/home/jinchoi/src/rehab/dataset/action/kinetics/detectron_results/{}/full_kinetics_detection_{}_rearranged_org_spatial_dim.npy'.format(split, split)
# src_det_file_path = '/home/jinchoi/src/rehab/dataset/action/kinetics/detectron_results/{}/full_kinetics_detection_{}_rearranged.npy'.format(split, split)
videos_root = '/home/jinchoi/src/rehab/dataset/action/kinetics/videos/{}'.format(split)
height_in_tgt_dets = 256

# read the original detection file
dets = np.load(src_det_file_path, allow_pickle=True).item()

for i, (cur_cls, vid_datas) in enumerate(dets['dets'].items()):
    cur_cls = cur_cls.replace(' ', '_')
    for k, v in vid_datas.items():
        # get the width and height of the video
        filelist = gb.glob(os.path.join(videos_root, cur_cls, k) + '*')
        if len(filelist) > 0:
            vidfile_name = filelist[0].split('/')[-1]
            input_video_path = os.path.join(videos_root, cur_cls, vidfile_name)
            videogen = vreader(input_video_path)
            vis_vid = []
            for idx, frame in enumerate(videogen):
                # assumption: rescale each frame so its height matches
                # height_in_tgt_dets (256) while preserving the aspect ratio
                h, w = frame.shape[:2]
                resize_dim = (int(round(w * height_in_tgt_dets / h)),
                              height_in_tgt_dets)  # (width, height) for cv2
                vis_vid.append(cv2.resize(frame, resize_dim))
def analyze_video(file_path, model, framework, resolution, lite):
    """
    Predict pose coordinates on supplied video.

    Args:
        file_path: path
            System path of video to analyze
        model: deep learning model
            Initialized EfficientPose model to utilize (RT, I, II, III, IV, RT_Lite, I_Lite or II_Lite)
        framework: string
            Deep learning framework to use (Keras, TensorFlow, TensorFlow Lite or PyTorch)
        resolution: int
            Input height and width of model to utilize
        lite: boolean
            Defines if EfficientPose Lite model is used

    Returns:
        Predicted pose coordinates in all frames of the supplied video.
    """

    # Define batch size and number of batches in each part
    batch_size = 1 if framework in ['tensorflowlite', 'tflite'] else 49
    part_size = 490 if framework in ['tensorflowlite', 'tflite'] else 10

    # Load video
    from skvideo.io import vreader, ffprobe
    start_time = time.time()
    try:
        videogen = vreader(file_path)
        video_metadata = ffprobe(file_path)['video']
        num_video_frames = int(video_metadata['@nb_frames'])
        num_batches = int(np.ceil(num_video_frames / batch_size))
        frame_height, frame_width = next(vreader(file_path)).shape[:2]
    except Exception:
        print('\n##########################################################################################################')
        print('Video "{0}" could not be loaded. Please verify that the file is working.'.format(file_path))
        print('##########################################################################################################\n')
        return False

    # Operate on batches
    coordinates = []
    batch_num = 1
    part_start_time = time.time()
    print('\n##########################################################################################################')
    while True:

        # Fetch batch of frames
        batch = [next(videogen, None) for _ in range(batch_size)]
        if not type(batch[0]) == np.ndarray:
            break
        elif not type(batch[-1]) == np.ndarray:
            # pad an incomplete final batch with black frames
            batch = [frame if type(frame) == np.ndarray
                     else np.zeros((frame_height, frame_width, 3))
                     for frame in batch]

        # Preprocess batch
        batch = helpers.preprocess(batch, resolution, lite)

        # Perform inference
        batch_outputs = infer(batch, model, lite, framework)

        # Extract coordinates for batch
        batch_coordinates = [helpers.extract_coordinates(batch_outputs[n, ...],
                                                         frame_height, frame_width)
                             for n in range(batch_size)]
        coordinates += batch_coordinates

        # Print partial processing time
        if batch_num % part_size == 0:
            print('{0} of {1}: Part processed in {2} seconds | Video processed for {3} seconds'.format(
                int(batch_num / part_size),
                int(np.ceil(num_batches / part_size)),
                '%.3f' % (time.time() - part_start_time),
                '%.3f' % (time.time() - start_time)))
            part_start_time = time.time()

        batch_num += 1

    # Print total processing time
    print('{0} of {0}: Video processed in {1} seconds'.format(
        int(np.ceil(num_batches / part_size)),
        '%.3f' % (time.time() - start_time)))
    print('##########################################################################################################\n')

    return coordinates[:num_video_frames]
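# Usage sketch (hypothetical file and settings; `model` must be an initialized
# EfficientPose model): chain analyze_video with annotate_video defined above.
coords = analyze_video('dance.mp4', model, framework='keras',
                       resolution=368, lite=False)
if coords:
    annotate_video('dance.mp4', coords)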
def tagVideo(modelpath, videopath, outputPath=None):
    """ detect if persons in video are wearing masks or not
    """
    model = MaskDetector()
    model.load_state_dict(torch.load(modelpath)['state_dict'], strict=False)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    faceDetector = FaceDetector(
        prototype='./models/deploy.prototxt.txt',
        model='./models/res10_300x300_ssd_iter_140000.caffemodel',
    )

    transformations = Compose([
        ToPILImage(),
        Resize((100, 100)),
        ToTensor(),
    ])
    if outputPath:
        writer = FFmpegWriter(str(outputPath))

    font = cv2.FONT_HERSHEY_SIMPLEX
    cv2.namedWindow('main', cv2.WINDOW_NORMAL)
    labels = ['No mask', 'Mask']
    # can have a different color for predicted with mask or without
    labelColor = [(255, 255, 255), (10, 255, 0)]
    for frame in vreader(str(videopath)):
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        faces = faceDetector.detect(frame)
        for face in faces:
            xStart, yStart, width, height = face
            # clamp coordinates that are outside of the image
            xStart, yStart = max(xStart, 0), max(yStart, 0)

            # predict mask label on extracted face
            faceImg = frame[yStart:yStart + height, xStart:xStart + width]
            output = model(transformations(faceImg).unsqueeze(0).to(device))
            _, predicted = torch.max(output.data, 1)

            # draw face frame
            cv2.rectangle(frame,
                          (xStart, yStart),
                          (xStart + width, yStart + height),
                          (255, 255, 255),
                          thickness=2)

            # draw the prediction label in Chinese ("没有" = no mask, "有" = mask)
            imgNoMask = np.zeros([20, 40, 3], dtype=np.uint8)
            imgMask = np.zeros([20, 20, 3], dtype=np.uint8)
            imgNoMask.fill(255)
            imgMask.fill(255)
            b, g, r, a = 0, 0, 0, 0
            if predicted == 0:
                img = cv2ImgAddText(imgNoMask, "没有", 3, 3, (b, g, r), 15)
            else:
                img = cv2ImgAddText(imgMask, "有", 3, 3, (b, g, r), 15)
            img_height, img_width, _ = img.shape
            # replace the top-left corner of the face box with the Chinese label
            frame[yStart:yStart + img_height, xStart:xStart + img_width] = img

            # add the prediction label in English, centered on the face frame
            textSize = cv2.getTextSize(labels[predicted], font, 1, 2)[0]
            textX = xStart + width // 2 - textSize[0] // 2
            cv2.putText(frame, labels[predicted], (textX + 40, yStart + 20),
                        font, 0.5, labelColor[predicted], 1)
        if outputPath:
            writer.writeFrame(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
        cv2.imshow('main', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    if outputPath:
        writer.close()
    cv2.destroyAllWindows()
def extract_db(config, dir_meta, cameras):
    """
    :param config: config from config.yaml
    :param dir_meta: determine extraction of train or test
    :param cameras: camera calibration parameters, one per view
    :return: list of per-frame data items
    """
    dataset = []  # all images of train or test
    all_joints = dt.vicon_joints
    for dir, meta in dir_meta:  # one action contains 8 cam views
        meta_sub = meta['subject']
        meta_act = config['action_reverse_map'][meta['action']]  # action string name
        meta_subact = meta['subaction']
        gt_pos_path = os.path.join(dir, 'gt_skel_gbl_pos.txt')
        gt_ori_path = os.path.join(dir, 'gt_skel_gbl_ori.txt')
        calib_imu_bone_path = os.path.join(
            dir, 's{}_{}{}_calib_imu_bone.txt'.format(meta_sub, meta_act, meta_subact))
        calib_imu_ref_path = os.path.join(
            dir, 's{}_{}{}_calib_imu_ref.txt'.format(meta_sub, meta_act, meta_subact))
        imu_data_path = os.path.join(
            dir, 's{}_{}{}_Xsens.sensors'.format(meta_sub, meta_act, meta_subact))
        bvh_path = os.path.join(
            dir, '{}{}_BlenderZXY_YmZ.bvh'.format(meta_act, meta_subact))
        gt_pos = dt.parse_vicon_gt_pos(gt_pos_path)
        gt_ori = dt.parse_vicon_gt_ori(gt_ori_path)
        imu_data = dt.parse_sensor_6axis(imu_data_path)
        calib_imu_bone = dt.parse_calib_imu_bone(calib_imu_bone_path)
        calib_imu_ref = dt.parse_calib_imu_ref(calib_imu_ref_path)
        bone_info = dt.parse_imu_bone_info(bvh_path)

        canvas_size = (1079., 1919.)  # height, width
        filtered_joints = config['joints_filter']

        # bone vector / orientation, not camera related
        bones = ['Head', 'Sternum', 'Pelvis', 'L_UpArm', 'R_UpArm', 'L_LowArm',
                 'R_LowArm', 'L_UpLeg', 'R_UpLeg', 'L_LowLeg', 'R_LowLeg']
        # obtain ref for all bones
        bone_refs = dict()
        for bone in bones:
            joint_p = bone_info[bone][0]
            joint_c = bone_info[bone][1]
            bone_vec = np.array(bone_info[bone][2]) * 25.4
            q_TI = Quaternion(calib_imu_ref[bone])
            q_bi = Quaternion(calib_imu_bone[bone])
            q_ib = q_bi.conjugate
            bone_refs[bone] = {'joint_p': joint_p,
                               'joint_c': joint_c,
                               'bone_vec': bone_vec,
                               'q_TI': q_TI,
                               'q_ib': q_ib}
        bone_vectors = dict()  # of all frames

        for c in range(8):
            mp4_file_name = 'TC_S{}_{}{}_cam{}.mp4'.format(
                meta_sub, meta_act, meta_subact, c + 1)
            mp4_file_path = os.path.join(dir, mp4_file_name)
            cam = cameras[c]
            vid_info = ffprobe(mp4_file_path)
            vid_frame_num = int(vid_info['video']['@nb_frames'])

            out_path = os.path.join(config['db_out_dir'], 'marked')
            out_path = os.path.join(
                out_path, 'sub{}_{}_{}_cam{}'.format(meta_sub, meta_act,
                                                     meta_subact, c + 1))
            if config['save_visualization']:
                if not os.path.exists(out_path):
                    os.makedirs(out_path)

            # where to save extracted frames
            seq_dir_name = 's_{:0>2}_act_{:0>2}_subact_{:0>2}_ca_{:0>2}'.format(
                meta_sub, meta['action'], meta_subact, c + 1)
            seq_dir_path = os.path.join(config['db_out_dir'], seq_dir_name)
            if config['save_frame']:
                if not os.path.exists(seq_dir_path):
                    os.makedirs(seq_dir_path)

            vid_ff = vreader(mp4_file_path)
            min_frame_to_iter = min(vid_frame_num, len(gt_pos), len(gt_ori),
                                    len(imu_data))
            for idx in tqdm(range(min_frame_to_iter)):
                pose3d = np.zeros([3, len(all_joints)])
                for idx_j, j in enumerate(all_joints):
                    pose3d[:, idx_j] = gt_pos[idx][j]
                pose3d = pose3d * 0.0254  # inch to meter
                pose2d = project_pose3d_to_2d(pose3d, cam, do_distor_corr=True)

                if config['save_visualization'] or config['save_frame']:
                    aframe = next(vid_ff)
                if config['save_visualization']:  # skeleton visualization save to disk
                    out_file_path = os.path.join(out_path, '{:0>6d}.jpg'.format(idx))
                    marked_img = _visualize_one_frame(aframe, pose2d)  # todo vis box on image
                    img_4save = cv2.cvtColor(marked_img, cv2.COLOR_RGB2BGR)
                    cv2.imwrite(out_file_path, img_4save)

                # cropping box information
                p2d, p3d_cam, p3d, vis = filter_and_project_2d_pose(
                    gt_pos[idx], filtered_joints, cam, canvas_size,
                    do_distor_corr=True)
                # vis follows the coco protocol: divide by 2 and copy 3 times to follow mvpose
                mvpose_vis = np.reshape([vis / 2., vis / 2., vis / 2.], (3, -1))

                root_joint = project_pose3d_to_cam(
                    np.reshape(gt_pos[idx]['Hips'], (3, -1)) * 0.0254, cam)
                tl_joint = np.copy(root_joint)  # shape (3, 1)
                br_joint = np.copy(root_joint)
                tl_joint[0, 0] -= 1.0000
                tl_joint[1, 0] -= 0.9000
                br_joint[0, 0] += 1.0000
                br_joint[1, 0] += 1.1000
                bbox_25d = np.concatenate((root_joint, tl_joint, br_joint), axis=1)
                # contains 3 points: center, tl, br
                bbox = project_cam_to_uv(bbox_25d, cam, do_distor_corr=True)
                box_center = tuple(bbox[:, 0])  # (x, y)
                box_scale = tuple((bbox[:, 2] - bbox[:, 1]) / 200.)
                box = tuple(np.concatenate([bbox[:, 2], bbox[:, 1]]))  # (x_tl, y_tl, x_br, y_br)

                frame_file_name = '{:0>6d}.jpg'.format(idx)
                frame_file_path = os.path.join(seq_dir_path, frame_file_name)
                if config['save_frame']:  # save video frame to disk
                    frame_to_cv = cv2.cvtColor(aframe, cv2.COLOR_RGB2BGR)
                    cv2.imwrite(frame_file_path, frame_to_cv)

                # notice: Difference between totalcapture and h36m project,
                # (1) joints_3d in mm
                # (2) camera['T'] in mm
                # (3) in totalcapture: point_Camera = R.dot(point_Tracking) + T (point and T in m);
                #     in h36m: point_Camera = R.dot(point_Tracking - T) (point and T in mm)
                #     aka in h36m: point_Tracking = R^{-1}.dot(point_Camera) + T
                # (4) coordinates shape is (num_cords, 3), aka row vector, but I like col vector more
                cam_in_h36m_format = copy.deepcopy(cam)  # R is unchanged
                cam_in_h36m_format['T'] = cam_in_h36m_format['T'] * 1000. * (-1.)
                cam_in_h36m_format['T'] = cam_in_h36m_format['R'].T.dot(
                    cam_in_h36m_format['T'])
                del cam_in_h36m_format['intri_mat']
                del cam_in_h36m_format['extri_mat']

                # bone vector: avoid parsing in each view, only in first view
                if idx not in bone_vectors:
                    bone_vector_of_one_frame = dict()
                    for bone in bones:
                        q_TI = bone_refs[bone]['q_TI']
                        q_ib = bone_refs[bone]['q_ib']
                        bone_vec = bone_refs[bone]['bone_vec']
                        ori = imu_data[idx][bone][0]
                        q_Ii = Quaternion(ori)
                        q_Tb = q_TI * q_Ii * q_ib
                        rotated_bone_vec = q_Tb.rotate(bone_vec)
                        bone_vector_of_one_frame[bone] = rotated_bone_vec
                    bone_vectors[idx] = bone_vector_of_one_frame

                dataitem = {
                    'image': os.path.join(seq_dir_name, '{:0>6d}.jpg'.format(idx)),
                    'joints_2d': p2d.T,
                    'joints_3d': (p3d_cam * 1000.).T,  # 3d pose in camera frame, for psm evaluation
                    'joints_vis': mvpose_vis.T,  # 0: invisible, 1: visible
                    'center': box_center,
                    'scale': box_scale,
                    'box': box,
                    'video_id': mp4_file_name,  # mp4 file name  # todo
                    'image_id': idx,
                    'subject': meta['subject'],
                    'action': meta['action'],
                    'subaction': meta['subaction'],
                    'camera_id': c,  # starts from 0
                    'camera': cam_in_h36m_format,
                    'source': 'totalcapture',
                    'bone_vec': bone_vectors[idx],
                    'joints_gt': p3d.T * 1000.  # ground truth in tracking frame
                }
                dataset.append(dataitem)
    return dataset
# assumption: the fragment starts mid-call; the surrounding loop iterates over
# the source files passed on the command line with a tqdm progress bar
for _, source in zip(trange(len(argv) - 2, desc='Files'), argv[2:]):
    # Loading source video
    VIDEO_PATH = abspath(source)
    if not os.access(VIDEO_PATH, os.R_OK):
        print('Internal error, source file is not available')
        exit(1)
    METADATA = get_metdata(source)
    VIDEO_FRAMES_COUNT = int(METADATA["@nb_frames"])
    VIDEO_TIME_BASE = eval(METADATA["@codec_time_base"])
    cap = vreader(VIDEO_PATH)
    L = []
    diffs = [0]
    frames = []

    # Hashing: per-frame perceptual hashes and hamming distances between neighbors
    for _, frame in zip(trange(VIDEO_FRAMES_COUNT, leave=False), cap):
        hash_img = int(phash64(frame))
        if len(L):
            diffs.append(hamming(hash_img, L[-1]))
        L.append(hash_img)
        frames.append(frame)

    # Get scene segmentation and their hash
    scenes = get_joly_scenes_sementation(frames, nb_std_above_mean_th=2.)
    # alternative: get_scenes_segmentation(diffs, nb_std_above_mean_th=2.5)
import skvideo.io as skv
import numpy as np
import os

assert os.path.isfile('computation_recorded.mp4')
video = skv.vreader('computation_recorded.mp4')
writer = skv.FFmpegWriter('video_corrected.mp4')
for frame in video:
    # reverse the channel axis (RGB <-> BGR) to correct the recorded colors
    writer.writeFrame(np.flip(frame, axis=-1))
writer.close()  # finalize the output file
def run_demo(cuda, record, vfile):
    model = 'models/21styles.params'
    ngf = 128
    style_size = 512
    style_folder = 'images/styles/'
    mirror = False
    vDir = './video/'
    vPath = vDir + vfile
    oFile = 'output21-' + vfile
    wM, hM = 640, 480
    if cuda:
        ctx = mx.gpu(0)
        os.environ['MXNET_CUDNN_AUTOTUNE_DEFAULT'] = '0'
    else:
        ctx = mx.cpu(0)
    style_loader = StyleLoader(style_folder, style_size, ctx)
    style_model = Net(ngf=ngf)
    style_model.load_parameters(model, ctx=ctx)

    metadata = ffprobe(vPath)
    fps = metadata["video"]["@avg_frame_rate"]
    w, h = int(metadata["video"]["@width"]), int(metadata["video"]["@height"])
    downsize = h > hM
    if downsize:
        w = 2 * int(w * hM / h / 2)
        h = hM
    # downsize = w > wM
    # if downsize:
    #     h = 2 * int(h * wM / w / 2); w = wM
    swidth = int(w / 4)
    sheight = int(h / 4)
    wName = vfile + ' STYLIZED VIDEO fps:' + fps + ' W:' + str(w) + ' H:' + str(h)
    if record:
        out = FFmpegWriter(vDir + oFile,
                           inputdict={'-r': str(fps),
                                      '-s': '{}x{}'.format(2 * w, h)},
                           outputdict={'-r': str(fps), '-c:v': 'h264'})
    key, idx = 0, 0
    cv2.namedWindow(wName, cv2.WINDOW_NORMAL)
    cv2.resizeWindow(wName, 2 * w, h)
    for img in vreader(vPath):
        idx += 1
        if downsize:
            img = cv2.resize(img, (w, h), interpolation=cv2.INTER_AREA)
        if mirror:
            img = cv2.flip(img, 1)
        cimg = img.copy()
        img = np.array(img).transpose(2, 0, 1).astype(float)
        img = F.expand_dims(mx.nd.array(img, ctx=ctx), 0)
        # changing styles
        if idx % 50 == 1:
            style_v = style_loader.get(int(idx / 20))
            style_model.set_target(style_v)
        img = style_model(img)

        simg = np.squeeze(style_v.asnumpy())
        simg = simg.transpose(1, 2, 0).astype('uint8')
        img = F.clip(img[0], 0, 255).asnumpy()
        img = img.transpose(1, 2, 0).astype('uint8')

        # display
        simg = cv2.resize(simg, (swidth, sheight), interpolation=cv2.INTER_CUBIC)
        cimg[0:sheight, 0:swidth, :] = simg
        img = np.concatenate((cimg, cv2.cvtColor(img, cv2.COLOR_BGR2RGB)), axis=1)
        if record:
            out.writeFrame(img)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        cv2.imshow(wName, img)
        key = cv2.waitKey(1)
        if key == 27:  # Esc
            break
    if record:
        out.close()
        transferAudio(vPath, vDir, oFile)
        print("Done OK. Created Stylised Video file", vDir + oFile)
        print("fps :", fps, " W:", w, " H:", h)
    cv2.destroyAllWindows()
def video_classify(video_dict):
    if not os.path.exists(image_temp):
        os.mkdir(image_temp)
    if not os.path.exists(video_temp):
        os.mkdir(video_temp)

    conclusion_dict = {}
    for video_name in video_dict:
        video_id = video_name.split('.')[0]
        video_path = os.path.join(video_temp, video_name)
        if not os.path.exists(video_path):
            continue
        image_video_temp = os.path.join(image_temp, video_id)
        if not os.path.exists(image_video_temp):
            os.mkdir(image_video_temp)
        else:
            shutil.rmtree(image_video_temp)
            os.mkdir(image_video_temp)

        metadata = ffprobe(video_path)
        avg_frame_rate = metadata["video"]["@avg_frame_rate"].split('/')
        rate = int(avg_frame_rate[0]) // int(avg_frame_rate[1])
        video = vreader(video_path)
        index = 0
        for frame in video:
            if index % (rate * interval) == 0:
                img = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                # file name encodes the second the frame was taken at
                img_name = str(index // rate) + '.jpg'
                img_path = os.path.join(image_video_temp, img_name)
                cv2.imwrite(img_path, img)
            index += 1

        # fetch all image scores.
        safe_count = 0
        danger_count = 0
        warning_count = 0
        middle_count = 0
        danger_list = []
        warning_list = []
        scores_list = np.array([])
        image_files = os.listdir(image_video_temp)
        for image_name in image_files:
            second = image_name.split('.')[0]
            image_path = os.path.join(image_video_temp, image_name)
            with open(image_path, 'rb') as f:
                image_data = f.read()
            scores = caffe_preprocess_and_compute(
                image_data,
                caffe_transformer=caffe_transformer,
                caffe_net=nsfw_net,
                output_layers=['prob'])
            # scores holds the SFW / NSFW probabilities; scores[1] is the NSFW probability
            if scores[1] > 0.8:
                danger_count += 1
                danger_list.append(second)
            elif scores[1] > 0.5:
                warning_count += 1
                middle_count += 1
                warning_list.append(second)
            elif scores[1] > 0.2:
                middle_count += 1
            else:
                safe_count += 1
            scores_list = np.append(scores_list, scores[1])

        conclusion_dict[video_name] = {
            'url': video_dict[video_name],
            'name': video_id,
            'extension': video_name.split('.')[-1],
            'total_count': scores_list.shape[0],
            'danger_count': danger_count,
            'warning_count': warning_count,
            'danger_second': danger_list,
            'warning_second': warning_list,
        }
        print("video name: %s, total: %d, danger_count: %d, warning_count: %d" %
              (video_name, scores_list.shape[0], danger_count, warning_count))
        os.remove(video_path)
        shutil.rmtree(image_video_temp)
    return conclusion_dict