def __init__(self, models=None, total_norm_weights=None, score_name='fc-action', dev_id=0):
    """Construct an action classifier.

    Args:
        models: iterable of tuples in the form (model_proto, model_params,
            model_fusion_weight, input_type, conv_support, input_size).
            input_type is: 0-RGB, 1-Optical flow.
            conv_support indicates whether the network supports convolutional
            testing, which is faster; if not supported, oversampling is used
            instead. Defaults to no models.
        total_norm_weights: when not None, acts as a switch that normalizes
            the fusion weights (the normalizer is the sum of the configured
            weights).
        score_name: name of the blob in the proto holding the action scores.
        dev_id: GPU device id passed to CaffeNet / FlowExtractor.
    """
    # Fix: the original default was the mutable `models=list()`; a None
    # sentinel avoids the shared-mutable-default pitfall and is backward
    # compatible (callers passing nothing observe the same behavior).
    models = list(models) if models is not None else []

    self.__net_vec = [
        CaffeNet(x[0], x[1], dev_id,
                 input_size=(340, 256) if x[4] else None)
        for x in models
    ]
    self.__net_weights = [float(x[2]) for x in models]
    if total_norm_weights is not None:
        s = sum(self.__net_weights)
        self.__net_weights = [x / s for x in self.__net_weights]

    self.__input_type = [x[3] for x in models]
    self.__conv_support = [x[4] for x in models]
    self.__num_net = len(models)

    # the input size of each network
    self.__input_size = [x[5] for x in models]

    # whether we should prepare a flow stack; `any` (unlike the original
    # `max(...) > 0`) does not raise ValueError when `models` is empty,
    # so constructing with the default argument now works
    self.__need_flow = any(t > 0 for t in self.__input_type)

    # the name in the proto for action classes
    self.__score_name = score_name

    # the video downloader (disabled in this variant)
    # self.__video_dl = youtube_dl.YoutubeDL({'outtmpl': '%(id)s.%(ext)s'})

    if self.__need_flow:
        self.__flow_extractor = FlowExtractor(dev_id)
def __init__(self, models=None, total_norm_weights=None, score_name='fc-action', dev_id=0):
    """Construct an action classifier.

    Args:
        models: iterable of tuples in the form (model_proto, model_params,
            model_fusion_weight, input_type, conv_support, input_size).
            input_type is: 0-RGB, 1-Optical flow.
            conv_support indicates whether the network supports convolutional
            testing, which is faster; if not supported, oversampling is used
            instead. Defaults to no models.
        total_norm_weights: when not None, acts as a switch that normalizes
            the fusion weights (the normalizer is the sum of the configured
            weights).
        score_name: name of the blob in the proto holding the action scores.
        dev_id: GPU device id passed to CaffeNet / FlowExtractor.
    """
    # Fix: the original default was the mutable `models=list()`; a None
    # sentinel avoids the shared-mutable-default pitfall and is backward
    # compatible (callers passing nothing observe the same behavior).
    models = list(models) if models is not None else []

    self.__net_vec = [
        CaffeNet(x[0], x[1], dev_id,
                 input_size=(340, 256) if x[4] else None)
        for x in models
    ]
    self.__net_weights = [float(x[2]) for x in models]
    if total_norm_weights is not None:
        s = sum(self.__net_weights)
        self.__net_weights = [x / s for x in self.__net_weights]

    self.__input_type = [x[3] for x in models]
    self.__conv_support = [x[4] for x in models]
    self.__num_net = len(models)

    # the input size of each network
    self.__input_size = [x[5] for x in models]

    # whether we should prepare a flow stack; `any` (unlike the original
    # `max(...) > 0`) does not raise ValueError when `models` is empty,
    # so constructing with the default argument now works
    self.__need_flow = any(t > 0 for t in self.__input_type)

    # the name in the proto for action classes
    self.__score_name = score_name

    # the video downloader
    self.__video_dl = youtube_dl.YoutubeDL({'outtmpl': '%(id)s.%(ext)s'})

    if self.__need_flow:
        self.__flow_extractor = FlowExtractor(dev_id)
class ActionClassifier(object):
    """
    This class provides an end-to-end interface to classifying videos into
    activity classes.

    A bank of Caffe networks (RGB and/or optical-flow based) is run over
    frame stacks sampled from the video; per-model scores are aggregated
    over time and fused with per-model weights into the final prediction.
    Intermediate frame/flow stacks can be cached through an optional cache
    manager to speed up repeated runs on the same video.
    """

    def __init__(self, models=list(), total_norm_weights=None, score_name='fc-action', dev_id=0):
        """
        Construct an action classifier.

        Args:
            models: list of tuples in the form of (model_proto, model_params,
                model_fusion_weight, input_type, conv_support, input_size).
                input_type is: 0-RGB, 1-Optical flow.
                conv_support indicates whether the network supports
                convolutional testing, which is faster. If this is not
                supported, we will use oversampling instead.
            total_norm_weights: sum of all model_fusion_weights when
                normalization is wanted, otherwise use None.
            score_name: name of the blob in the proto for action scores.
            dev_id: GPU device id handed to CaffeNet and FlowExtractor.
        """
        self.__net_vec = [
            CaffeNet(x[0], x[1], dev_id,
                     input_size=(340, 256) if x[4] else None)
            for x in models
        ]
        self.__net_weights = [float(x[2]) for x in models]
        if total_norm_weights is not None:
            # total_norm_weights only acts as an on/off switch here; the
            # actual normalizer is the sum of the configured weights
            s = sum(self.__net_weights)
            self.__net_weights = [x / s for x in self.__net_weights]
        self.__input_type = [x[3] for x in models]
        self.__conv_support = [x[4] for x in models]
        self.__num_net = len(models)
        # the input size of the network
        self.__input_size = [x[5] for x in models]
        # whether we should prepare flow stack
        # NOTE(review): max() raises ValueError if `models` is empty
        self.__need_flow = max(self.__input_type) > 0
        # the name in the proto for action classes
        self.__score_name = score_name
        # the video downloader
        self.__video_dl = youtube_dl.YoutubeDL({'outtmpl': '%(id)s.%(ext)s'})
        if self.__need_flow:
            self.__flow_extractor = FlowExtractor(dev_id)

    def classify(self, video, model_mask=None, cache_manager=None):
        """
        Classify a video given as a local file path or a downloadable URL.

        Args:
            video: path of an existing file, or a URL with a scheme.
            model_mask: optional per-model boolean mask; False disables a model.
            cache_manager: optional cache for intermediate frame/flow stacks.

        Returns:
            scores: fused classification scores.
            frm_scores: frame-wise classification scores.
            total_time: wall time spent, in seconds.

        Raises:
            ValueError: if `video` is neither an existing file nor a URL.
        """
        import urlparse  # Python 2 stdlib module; `urllib.parse` in Python 3
        if os.path.isfile(video):
            return self._classify_from_file(video, model_mask, cache_manager)
        elif urlparse.urlparse(video).scheme != "":
            return self._classify_from_url(video, model_mask, cache_manager)
        raise ValueError("Unknown input data type")

    def _classify_from_file(self, filename, model_mask, cache_manager=None):
        """
        Classify a video file on the hard disk.

        Args:
            filename: path of the video file.
            model_mask: optional per-model boolean mask.
            cache_manager: caches intermediate results and reuses previously
                cached intermediate results when possible.

        Returns:
            final_scores: fused classification scores.
            all_scores: frame-wise classification scores.
            total_time: wall time spent, in seconds.
        """
        vid_info = _dummy_vid_info()
        vid_info.path = filename
        video_proc = VideoProc(vid_info)
        video_proc.open_video(True)

        # here we use interval of 30, roughly 1FPS
        frm_it = None
        cached_flow = None
        if cache_manager is not None:
            # try to reuse previously extracted frame and flow stacks
            frm_it = cache_manager.load(videoname=filename, type="framestack")
            cached_flow = cache_manager.load(videoname=filename, type="flowstack")
        if frm_it is None:
            # a stack of 6 frames is needed when optical flow must be computed
            frm_it = video_proc.frame_iter(timely=False, ignore_err=True,
                                           interval=30,
                                           length=6 if self.__need_flow else 1,
                                           new_size=(340, 256))

        all_scores = []
        all_start = time.clock()

        # process model mask
        mask = [True] * self.__num_net
        n_model = self.__num_net
        if model_mask is not None:
            for i in xrange(len(model_mask)):
                mask[i] = model_mask[i]
                if not mask[i]:
                    n_model -= 1

        frame_cache = []
        flow_cache = []
        cnt = 0
        for frm_stack in frm_it:
            if cache_manager is not None:
                frame_cache.append(frm_stack)
            start = time.clock()
            cnt += 1
            frm_scores = []
            # optical flow is extracted once per frame stack and shared
            # between all flow-based models
            flow_stack = None
            for net, run, in_type, conv_support, net_input_size in \
                    zip(self.__net_vec, mask, self.__input_type,
                        self.__conv_support, self.__input_size):
                if not run:
                    continue
                # scale the 340x256 sampling size proportionally to the
                # network input size
                # NOTE(review): Python 2 integer division; under Python 3
                # this yields floats -- confirm downstream expects ints
                frame_size = (340 * net_input_size / 224,
                              256 * net_input_size / 224)
                if in_type == 0:
                    # RGB input
                    frm_scores.append(
                        net.predict_single_frame(
                            frm_stack[:1], self.__score_name,
                            over_sample=not conv_support,
                            frame_size=None if net_input_size == 224 else frame_size))
                elif in_type == 1:
                    # Flow input
                    if flow_stack is None:
                        # Extract flow if necessary
                        if cached_flow is not None:
                            flow_stack = cached_flow[cnt - 1]
                        else:
                            flow_stack = self.__flow_extractor.extract_flow(
                                frm_stack, frame_size)
                            # only freshly extracted flow is collected for dumping
                            if cache_manager is not None:
                                flow_cache.append(flow_stack)
                    frm_scores.append(
                        net.predict_single_flow_stack(
                            flow_stack, self.__score_name,
                            over_sample=not conv_support))
            all_scores.append(frm_scores)
            end = time.clock()
            elapsed = end - start
            # print "frame sample {}: {} second".format(cnt, elapsed)

        # persist whatever was freshly computed for the next run
        if cache_manager is not None:
            if len(frame_cache) != 0:
                cache_manager.dump(frame_cache, filename, "framestack")
            if len(flow_cache) != 0:
                cache_manager.dump(flow_cache, filename, "flowstack")

        # aggregate frame-wise scores
        agg_scores = []
        for i in xrange(n_model):
            model_scores = sliding_window_aggregation_func(
                np.array([x[i] for x in all_scores]), norm=False)
            agg_scores.append(model_scores)
        # fuse per-model scores using only the weights of the enabled models
        final_scores = default_fusion_func(
            np.zeros_like(agg_scores[0]), agg_scores,
            [w for w, m in zip(self.__net_weights, mask) if m])
        all_end = time.clock()
        total_time = all_end - all_start
        # print "total time: {} second".format(total_time)
        print('{0} processed.'.format(filename))
        return final_scores, all_scores, total_time

    def _classify_from_url(self, url, model_mask, cache_manager=None):
        """
        Classify a video given by a URL.

        It will first use Youtube-dl to download the video, then run the
        classification on the downloaded file.  The downloaded file is
        removed afterwards.

        Returns:
            The same (scores, frm_scores, total_time) triple as
            `_classify_from_file`.
        """
        file_info = self.__video_dl.extract_info(
            url)  # it also downloads the video file
        filename = file_info['id'] + '.' + file_info['ext']
        scores, frm_scores, total_time = self._classify_from_file(
            filename, model_mask, cache_manager)
        import os
        os.remove(filename)
        return scores, frm_scores, total_time
class ActionClassifier(object):
    """
    This class provides an end-to-end interface to classifying videos into
    activity classes.

    This variant extracts deep features instead of class scores: RGB frames
    go through a ResNet (blob 'caffe.Flatten_673') and optical-flow stacks
    through a BN-Inception network (blob 'global_pool').  The collected
    features are also saved to .npy files next to the input video.
    """

    def __init__(self, models=list(), total_norm_weights=None, score_name='', dev_id=0):
        """
        Construct an action classifier / feature extractor.

        Args:
            models: list of tuples in the form of (model_proto, model_params,
                model_fusion_weight, input_type, conv_support, input_size).
                input_type is: 0-RGB, 1-Optical flow.
                conv_support indicates whether the network supports
                convolutional testing, which is faster. If this is not
                supported, we will use oversampling instead.
            total_norm_weights: sum of all model_fusion_weights when
                normalization is wanted, otherwise use None.
            score_name: unused in this variant; the feature blob names are
                hard-coded below.
            dev_id: GPU device id handed to CaffeNet and FlowExtractor.
        """
        self.__net_vec = [CaffeNet(x[0], x[1], dev_id,
                                   input_size=(340, 256) if x[4] else None)
                          for x in models]
        self.__net_weights = [float(x[2]) for x in models]
        if total_norm_weights is not None:
            # total_norm_weights only acts as an on/off switch here; the
            # actual normalizer is the sum of the configured weights
            s = sum(self.__net_weights)
            self.__net_weights = [x / s for x in self.__net_weights]
        self.__input_type = [x[3] for x in models]
        self.__conv_support = [x[4] for x in models]
        self.__num_net = len(models)
        # the input size of the network
        self.__input_size = [x[5] for x in models]
        # whether we should prepare flow stack
        # NOTE(review): max() raises ValueError if `models` is empty
        self.__need_flow = max(self.__input_type) > 0
        # the blob names holding the features for each stream
        self.__score_name_resnet = 'caffe.Flatten_673'
        self.__score_name_bn = 'global_pool'
        # the video downloader
        self.__video_dl = youtube_dl.YoutubeDL({'outtmpl': '%(id)s.%(ext)s'})
        if self.__need_flow:
            self.__flow_extractor = FlowExtractor(dev_id)

    def classify(self, video, model_mask=None):
        """
        Extract features for a video given as a file path or a URL.

        Args:
            video: path of an existing file, or a URL with a scheme.
            model_mask: optional per-model boolean mask; False disables a model.

        Returns:
            all_features: dict with 'resnet' (RGB) and 'bn' (flow) feature
            arrays -- see `_classify_from_file`.

        Raises:
            ValueError: if `video` is neither an existing file nor a URL.
        """
        import urlparse  # Python 2 stdlib module; `urllib.parse` in Python 3
        if os.path.isfile(video):
            return self._classify_from_file(video, model_mask)
        elif urlparse.urlparse(video).scheme != "":
            return self._classify_from_url(video, model_mask)
        raise ValueError("Unknown input data type")

    def _classify_from_file(self, filename, model_mask):
        """
        Extract features from a video file on the hard disk.

        Args:
            filename: path of the video file.
            model_mask: optional per-model boolean mask.

        Returns:
            all_features: dict with RGB ResNet features ('resnet', N x 2048)
            and optical-flow BN-Inception features ('bn', N x 1024).
        """
        # parse the video duration from the string returned by getLength()
        # NOTE(review): the fixed slices assume a "Duration: HH:MM:SS.ss"
        # style layout and only read minutes/seconds -- confirm against
        # getLength()'s actual output format
        duration = getLength(filename)
        duration_in_second = float(duration[0][15:17]) * 60 + float(
            duration[0][18:23])
        info_dict = {
            'annotations': list(),
            'url': '',
            'duration': duration_in_second,
            'subset': 'testing'
        }
        vid_info = Video('0', info_dict)
        # update dummy video info...
        vid_info.path = filename
        video_proc = VideoProc(vid_info)
        video_proc.open_video(True)

        # sample every 0.5 second; a stack of 6 frames is needed when
        # optical flow must be computed
        frm_it = video_proc.frame_iter(timely=True, ignore_err=True,
                                       interval=0.5,
                                       length=6 if self.__need_flow else 1,
                                       new_size=(340, 256))

        # feature accumulators, grown one row per processed frame stack
        all_features = {
            'resnet': np.empty(shape=(0, 2048)),
            'bn': np.empty(shape=(0, 1024))
        }
        all_start = time.clock()
        cnt = 0

        # process model mask
        mask = [True] * self.__num_net
        n_model = self.__num_net
        if model_mask is not None:
            for i in xrange(len(model_mask)):
                mask[i] = model_mask[i]
                if not mask[i]:
                    n_model -= 1

        for frm_stack in frm_it:
            start = time.clock()
            cnt += 1
            # optical flow is extracted once per frame stack and shared
            flow_stack = None
            for net, run, in_type, conv_support, net_input_size in \
                    zip(self.__net_vec, mask, self.__input_type,
                        self.__conv_support, self.__input_size):
                if not run:
                    continue
                # NOTE(review): Python 2 integer division; under Python 3
                # this yields floats -- confirm downstream expects ints
                frame_size = (340 * net_input_size / 224,
                              256 * net_input_size / 224)
                if in_type == 0:
                    # RGB input
                    # TODO for now we only sample one frame w/o applying mean-pooling
                    all_features['resnet'] = np.concatenate(
                        (all_features['resnet'],
                         net.predict_single_frame(
                             frm_stack[:1], self.__score_name_resnet,
                             over_sample=not conv_support,
                             frame_size=None if net_input_size == 224 else frame_size)),
                        axis=0)
                elif in_type == 1:
                    # Flow input
                    if flow_stack is None:
                        # Extract flow if necessary
                        # we disabled spatial data aug
                        # the size for flow frames are 224 x 224, hard coded
                        flow_frame_size = (224, 224)
                        flow_stack = self.__flow_extractor.extract_flow(
                            frm_stack, flow_frame_size)
                    # store all the optical flow features
                    # all_features['bn'] = np.concatenate((all_features['bn'], np.squeeze(net.predict_single_flow_stack(flow_stack, self.__score_name_bn,
                    #                                      over_sample=not conv_support))), axis=0)
                    # store only the optical flow feature for the center crop
                    bn_aug = np.squeeze(
                        net.predict_single_flow_stack(flow_stack,
                                                      self.__score_name_bn,
                                                      over_sample=False))
                    # over_sample=not conv_support))
                    # bn_aug = np.squeeze(bn_aug)
                    # bn_center = bn_aug[5]
                    bn_center = bn_aug
                    bn_center = np.reshape(bn_center, (1, bn_center.shape[0]))
                    all_features['bn'] = np.concatenate(
                        (all_features['bn'], bn_center), axis=0)
            end = time.clock()
            elapsed = end - start
            print "frame sample {}: {} second".format(cnt, elapsed)

        print all_features['resnet'].shape, all_features['bn'].shape
        # persist features next to the video
        # NOTE(review): filename[:-4] assumes a 3-character extension
        np.save(filename[:-4] + "_resnet", all_features['resnet'])
        np.save(filename[:-4] + "_bn", all_features['bn'])
        return all_features

    def _classify_from_url(self, url, model_mask):
        """
        Extract features for a video given by a URL.

        It will first use Youtube-dl to download the video, then run the
        feature extraction on the downloaded file and remove the file.

        Returns:
            RGB ResNet feature and optical flow BN-Inception feature.

        NOTE(review): `_classify_from_file` returns a single dict, so the
        3-way unpacking below would raise ValueError at runtime -- this
        path looks stale; verify before relying on it.
        """
        file_info = self.__video_dl.extract_info(
            url)  # it also downloads the video file
        filename = file_info['id'] + '.' + file_info['ext']
        scores, all_features, total_time = self._classify_from_file(
            filename, model_mask)
        import os
        os.remove(filename)
        return scores, all_features, total_time
class ActionClassifier(object):
    """
    This class provides an end-to-end interface to classifying videos into
    activity classes.

    A bank of Caffe networks (RGB and/or optical-flow based) is run over
    frame stacks sampled from the video; per-model scores are aggregated
    over time and fused with per-model weights into the final prediction.
    """

    def __init__(self, models=list(), total_norm_weights=None, score_name='fc-action', dev_id=0):
        """
        Construct an action classifier.

        Args:
            models: list of tuples in the form of (model_proto, model_params,
                model_fusion_weight, input_type, conv_support, input_size).
                input_type is: 0-RGB, 1-Optical flow.
                conv_support indicates whether the network supports
                convolutional testing, which is faster. If this is not
                supported, we will use oversampling instead.
            total_norm_weights: sum of all model_fusion_weights when
                normalization is wanted, otherwise use None.
            score_name: name of the blob in the proto for action scores.
            dev_id: GPU device id handed to CaffeNet and FlowExtractor.
        """
        self.__net_vec = [CaffeNet(x[0], x[1], dev_id,
                                   input_size=(340, 256) if x[4] else None)
                          for x in models]
        self.__net_weights = [float(x[2]) for x in models]
        if total_norm_weights is not None:
            # total_norm_weights only acts as an on/off switch here; the
            # actual normalizer is the sum of the configured weights
            s = sum(self.__net_weights)
            self.__net_weights = [x/s for x in self.__net_weights]
        self.__input_type = [x[3] for x in models]
        self.__conv_support = [x[4] for x in models]
        self.__num_net = len(models)
        # the input size of the network
        self.__input_size = [x[5] for x in models]
        # whether we should prepare flow stack
        # NOTE(review): max() raises ValueError if `models` is empty
        self.__need_flow = max(self.__input_type) > 0
        # the name in the proto for action classes
        self.__score_name = score_name
        # the video downloader
        self.__video_dl = youtube_dl.YoutubeDL(
            {
                'outtmpl': '%(id)s.%(ext)s'
            }
        )
        if self.__need_flow:
            self.__flow_extractor = FlowExtractor(dev_id)

    def classify(self, video, model_mask=None):
        """
        Classify a video given as a local file path or a downloadable URL.

        Args:
            video: path of an existing file, or a URL with a scheme.
            model_mask: optional per-model boolean mask; False disables a model.

        Returns:
            scores: fused classification scores.
            frm_scores: frame-wise classification scores.
            total_time: wall time spent, in seconds.

        Raises:
            ValueError: if `video` is neither an existing file nor a URL.
        """
        import urlparse  # Python 2 stdlib module; `urllib.parse` in Python 3
        if os.path.isfile(video):
            return self._classify_from_file(video, model_mask)
        elif urlparse.urlparse(video).scheme != "":
            return self._classify_from_url(video, model_mask)
        raise ValueError("Unknown input data type")

    def _classify_from_file(self, filename, model_mask):
        """
        Classify a video file on the hard disk.

        Args:
            filename: path of the video file.
            model_mask: optional per-model boolean mask.

        Returns:
            final_scores: fused classification scores.
            all_scores: frame-wise classification scores.
            total_time: wall time spent, in seconds.
        """
        vid_info = _dummy_vid_info()
        vid_info.path = filename
        video_proc = VideoProc(vid_info)
        video_proc.open_video(True)

        # here we use interval of 30, roughly 1FPS; a stack of 6 frames is
        # needed when optical flow must be computed
        frm_it = video_proc.frame_iter(timely=False, ignore_err=True,
                                       interval=30,
                                       length=6 if self.__need_flow else 1,
                                       new_size=(340, 256))

        all_scores = []
        all_start = time.clock()
        cnt = 0

        # process model mask
        mask = [True] * self.__num_net
        n_model = self.__num_net
        if model_mask is not None:
            for i in xrange(len(model_mask)):
                mask[i] = model_mask[i]
                if not mask[i]:
                    n_model -= 1

        for frm_stack in frm_it:
            start = time.clock()
            cnt += 1
            frm_scores = []
            # optical flow is extracted once per frame stack and shared
            # between all flow-based models
            flow_stack = None
            for net, run, in_type, conv_support, net_input_size in \
                    zip(self.__net_vec, mask, self.__input_type,
                        self.__conv_support, self.__input_size):
                if not run:
                    continue
                # scale the 340x256 sampling size proportionally to the
                # network input size
                # NOTE(review): Python 2 integer division; under Python 3
                # this yields floats -- confirm downstream expects ints
                frame_size = (340 * net_input_size / 224,
                              256 * net_input_size / 224)
                if in_type == 0:
                    # RGB input
                    frm_scores.append(
                        net.predict_single_frame(
                            frm_stack[:1], self.__score_name,
                            over_sample=not conv_support,
                            frame_size=None if net_input_size == 224 else frame_size))
                elif in_type == 1:
                    # Flow input
                    if flow_stack is None:
                        # Extract flow if necessary
                        flow_stack = self.__flow_extractor.extract_flow(
                            frm_stack, frame_size)
                    frm_scores.append(
                        net.predict_single_flow_stack(
                            flow_stack, self.__score_name,
                            over_sample=not conv_support))
            all_scores.append(frm_scores)
            end = time.clock()
            elapsed = end - start
            print "frame sample {}: {} second".format(cnt, elapsed)

        # aggregate frame-wise scores
        agg_scores = []
        for i in xrange(n_model):
            model_scores = sliding_window_aggregation_func(
                np.array([x[i] for x in all_scores]), norm=False)
            agg_scores.append(model_scores)
        # fuse per-model scores using only the weights of the enabled models
        final_scores = default_fusion_func(
            np.zeros_like(agg_scores[0]), agg_scores,
            [w for w, m in zip(self.__net_weights, mask) if m])
        all_end = time.clock()
        total_time = all_end - all_start
        print "total time: {} second".format(total_time)
        return final_scores, all_scores, total_time

    def _classify_from_url(self, url, model_mask):
        """
        Classify a video given by a URL.

        It will first use Youtube-dl to download the video, then run the
        classification on the downloaded file.  The downloaded file is
        removed afterwards.

        Returns:
            The same (scores, frm_scores, total_time) triple as
            `_classify_from_file`.
        """
        file_info = self.__video_dl.extract_info(url)  # it also downloads the video file
        filename = file_info['id']+'.'+file_info['ext']
        scores, frm_scores, total_time = self._classify_from_file(filename, model_mask)
        import os
        os.remove(filename)
        return scores, frm_scores, total_time