def process(video):
    id, platform = video
    try:
        cap = cv2.VideoCapture(video_helper.get_path(platform, id))
        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        if frame_count < NUM_FRAMES * NUM_SEGMENTS:
            cap.release()
            return "Too few frames", id, platform, None
        # Divide the video into NUM_SEGMENTS segments and take the center NUM_FRAMES
        # frames of each segment for analysis.
        padding_frames = int((frame_count / NUM_SEGMENTS - NUM_FRAMES) / 2)
        segments = np.zeros((NUM_SEGMENTS, NUM_FRAMES, FRAME_WIDTH, FRAME_HEIGHT, NUM_RGB_CHANNELS))
        for i in range(NUM_SEGMENTS):
            # Skip ahead padding_frames
            for _ in range(padding_frames):
                cap.read()
            # Take NUM_FRAMES frames
            for j in range(NUM_FRAMES):
                _, frame = cap.read()
                segments[i][j] = cv2.resize(
                    video_helper.crop_center_square(frame),
                    (FRAME_WIDTH, FRAME_HEIGHT))
            # Again, skip ahead padding_frames to reach the start of the next segment
            for _ in range(padding_frames):
                cap.read()
        cap.release()
        # Batch size 5 allows for 4 workers on a 12GB GPU
        prediction = rgb_model.predict(segments, batch_size=5)
        # The model averages next, so I do the same; all NUM_SEGMENTS outputs are
        # then averaged again.
        mean = prediction.mean(axis=1).mean(axis=0)[0][0]
        # Compression reduces the memory footprint of the sparse vectors
        return "Success", id, platform, zlib.compress(mean, 9)
    except Exception as e:
        return str(e), id, platform, None
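# A minimal sketch (not part of the pipeline) of how a stored feature vector could be
# restored. It assumes the compressed bytes wrap a float32 vector, as Keras predictions
# usually are; the dtype must match whatever rgb_model.predict() actually produced.
def decompress_features(blob, dtype=np.float32):
    # zlib.decompress reverses zlib.compress; np.frombuffer reinterprets the raw bytes
    return np.frombuffer(zlib.decompress(blob), dtype=dtype)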
def process(video):
    # Takes a video and returns every nth frame, preprocessed, as a numpy array
    id, platform = video
    images = []
    cap = cv2.VideoCapture(video_helper.get_path(platform, id))
    count = 0
    while True:
        success, image = cap.read()
        if success:
            if count % EVERY_FRAME == 0:
                x = resize(image, (224, 224), mode='constant') * 255
                x = preprocess_input(x)
                images.append(x)
            count += 1
        else:
            # Reached the end of the video
            cap.release()
            break
    if len(images) > MIN_IMAGES:
        # Batch predict
        frame_results = model.predict(np.array(images))
        # The shape is (n_frames, 1, 1, layer_output)
        frame_results = frame_results.reshape(-1, frame_results.shape[-1])
        # Mean pooling over all sampled frames
        mean = np.mean(frame_results, axis=0)
        # Compression reduces the memory footprint of the sparse vectors
        return "Success", id, platform, zlib.compress(mean, 9)
    else:
        return "Too few frames", id, platform, None
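# A hedged sketch (not from the repo) of how process() could be fanned out over many
# videos: the (id, platform) tuple argument and the (status, id, platform, features)
# return shape suggest a worker-pool design. `videos` (a list of (id, platform) tuples)
# and the worker count are assumptions, and each worker process is assumed to load its
# own copy of the model.
from multiprocessing import Pool

def run_all(videos, workers=4):
    with Pool(workers) as pool:
        for status, id, platform, features in pool.imap_unordered(process, videos):
            # Persisting `features` (e.g. to the videos table) would happen here
            print(status, platform, id)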
def display_video(platform="facebook", id="CadburyBournvita/1937970696267088"):
    path = video_helper.get_path(platform=platform, id=id)
    # path = os.path.relpath(path, os.getcwd())
    video_encoded = base64.b64encode(open(path, "rb").read())
    display(HTML(data='''<video alt="test" controls>
                             <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                         </video>'''.format(video_encoded.decode('ascii'))))
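# The commented-out os.path.relpath() line above hints at an alternative that avoids
# base64-embedding the whole file into the notebook output: referencing the video by a
# path relative to the notebook server instead. A minimal sketch of that variant,
# assuming the file lives below the notebook's working directory:
def display_video_by_path(platform="facebook", id="CadburyBournvita/1937970696267088"):
    path = os.path.relpath(video_helper.get_path(platform=platform, id=id), os.getcwd())
    display(HTML(data='<video controls><source src="%s" type="video/mp4" /></video>' % path))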
def download(youtube_video_id):
    ret = dict()
    try:
        video_path = video_helper.get_path("youtube")
        video_file = video_path + youtube_video_id + ".mp4"
        ydl_opts = {
            # Download the smallest file that is at least 240p (so not 144p, for example)
            'format': 'worst[height>=240][ext=mp4]/worst[height>=240]/worst',
            'outtmpl': video_file,
            'quiet': True,
            'logger': QuietLogger()
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(youtube_video_id, download=False)
            if "_type" not in info or info["_type"] != "playlist":
                # Likes might be disabled
                ret["likes"] = info["like_count"] if "like_count" in info else -1
                ret["views"] = info["view_count"] if "view_count" in info else -1
                ret["duration"] = info["duration"] * 1000
                # youtube-dl does not extract these at this point; neither does pytube
                ret["comments"] = -1
                ret["shares"] = -1
                if ret["duration"] <= video_helper.LENGTH_CUTOFF:
                    # Only download if it's not too long
                    ydl.extract_info(youtube_video_id, download=True)
                    ffprobe = video_helper.get_ffprobe_json(video_file)
                    size = int(ffprobe['format']['size'])
                    if size <= video_helper.SIZE_CUTOFF:
                        ret["crawling_status"] = "Success"
                    else:
                        # File is too big.
                        os.remove(video_file)
                        ret["crawling_status"] = "Too big"
                else:
                    # Video is too long.
                    ret["crawling_status"] = "Too long"
            else:
                ret["crawling_status"] = "Is stream recording"
    except Exception as e:
        # traceback.print_exc()
        # Truncate the message to prevent filling the db with stack traces
        ret["crawling_status"] = str(e)[:100]
    return ret
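# QuietLogger is a project helper not shown in this excerpt. youtube-dl's documented
# logger interface only requires debug/warning/error methods, so a minimal sketch that
# suppresses all output could look like this (the real class may differ):
class QuietLogger:
    def debug(self, msg):
        pass

    def warning(self, msg):
        pass

    def error(self, msg):
        pass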
def process(video):
    # Takes a video, extracts its audio track and returns a compressed SoundNet feature vector
    id, platform = video
    try:
        path = extract_audio(video_helper.get_path(platform, id))
        audio, _ = librosa.load(path, dtype='float32', sr=22050, mono=True)
        # SoundNet needs the range to be between -256 and 256.
        # In addition to the research this is based on, the amplitude is scaled.
        maximum = max(audio.max(), -audio.min())
        if maximum != 0.0:
            audio *= 256.0 / maximum
            # Reshape the audio data so it fits into the graph:
            # (batch_size, num_samples, num_filter_channels)
            audio = np.reshape(audio, (1, -1, 1))
            prediction = model.predict(audio)
            os.remove(path)
            prediction = prediction.mean(axis=1)[0]
            return "Success", id, platform, zlib.compress(prediction, 9)
        else:
            # A maximum amplitude of zero means the track is silent
            os.remove(path)
            return "No Audio", id, platform, None
    except Exception as e:
        return str(e), id, platform, None
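# extract_audio() is a project helper not shown in this excerpt. A plausible sketch,
# assuming ffmpeg is on PATH (the real helper may differ): -vn drops the video stream,
# and mono/22050 Hz matches the librosa.load() call above.
import os
import subprocess
import tempfile

def extract_audio(video_file):
    fd, audio_file = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    subprocess.call(["ffmpeg", "-y", "-i", video_file, "-vn", "-ac", "1",
                     "-ar", "22050", audio_file],
                    stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return audio_file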
def run():
    MODEL = "yolov3"  # Postfix -tiny for the tiny variant
    net, meta = darknet_wrapper.initialize_classifier(
        config="cfg/%s.cfg" % MODEL,
        weights="weights/%s.weights" % MODEL,
        data="cfg/coco.data")
    conn = psycopg2.connect(database="video_article_retrieval", user="******")
    c = conn.cursor()
    # Just classifying facebook videos for now
    c.execute(
        "SELECT id, platform FROM videos WHERE object_detection_yolo_status<>'Success' AND platform = 'facebook'")
    videos = c.fetchall()
    print("%d videos left to analyze" % len(videos))
    crawling_progress = StatusVisualization(len(videos), update_every=10)
    for id, platform in videos:
        # We need to extract the images first
        images = []
        cap = cv2.VideoCapture(video_helper.get_path(platform, id))
        count = 0
        while True:
            success, image = cap.read()
            if success:
                # Sample every 30th frame (~1 per second at 30 fps); the sample index
                # is stored in the `second` column below
                if count % 30 == 0:
                    path = tempfile.gettempdir() + "/%05d.jpg" % count
                    cv2.imwrite(path, image)
                    images.append(path)
                count += 1
            else:
                # Reached the end of the video
                break
        cap.release()
        for index, image in enumerate(images):
            try:
                result = darknet_wrapper.detect(net, meta, image)
                for entity in result:
                    # The format is (class, probability, (x, y, width, height)),
                    # where the box is anchored at its center
                    (label, probability, (x, y, width, height)) = entity
                    # x, y, width and height are not saved for now
                    c.execute(
                        "INSERT INTO object_detection_yolo(id,platform,second,class,probability) VALUES (%s,%s,%s,%s,%s)",
                        [id, platform, index, str(label, "utf-8"), probability])
                    conn.commit()
            except Exception as e:
                print(e)
        # Update the classification status
        c.execute(
            "UPDATE videos SET object_detection_yolo_status = 'Success' WHERE id=%s AND platform=%s",
            [id, platform])
        conn.commit()
        crawling_progress.inc()
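# The object_detection_yolo table is assumed to already exist. A hypothetical schema
# matching the INSERT above (the actual DDL is not part of this excerpt):
#
#   CREATE TABLE object_detection_yolo (
#       id          TEXT,             -- video id
#       platform    TEXT,             -- e.g. 'facebook'
#       second      INTEGER,          -- index of the sampled frame (~1 per second)
#       class       TEXT,             -- COCO class label
#       probability DOUBLE PRECISION  -- detection confidence
#   );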
def download(tweet_id):
    """
    :param tweet_id:
    :return: dict: crawling_status, views, duration (ms), comments (= replies),
             shares (= retweets), likes (= favorites)
    """
    ret = dict()
    try:
        video_path = video_helper.get_path("twitter")
        # Get an authorization token by extracting it from the Twitter video player source
        video_player_url = 'https://twitter.com/i/videos/tweet/' + tweet_id
        video_player_response = requests.get(video_player_url)
        video_player_soup = BeautifulSoup(video_player_response.text, 'lxml')
        js_file_url = video_player_soup.find('script')['src']
        js_file_response = requests.get(js_file_url)
        bearer_token_pattern = re.compile('Bearer ([a-zA-Z0-9%-])+')
        bearer_token = bearer_token_pattern.search(js_file_response.text)
        bearer_token = bearer_token.group(0)
        # For now the guest token is obtained manually by going to
        # https://twitter.com/i/videos/tweet/1041730759613046787 and looking at the
        # request headers of the config request.
        guest_token = "1049750915719340034"
        # Ask the API for the m3u8 URL, using the tokens just extracted
        player_config_url = 'https://api.twitter.com/1.1/videos/tweet/config/%s.json' % tweet_id
        player_config_response = requests.get(player_config_url,
                                              headers={'Authorization': bearer_token,
                                                       'x-guest-token': guest_token})
        if player_config_response.status_code == 200:
            player_config = json.loads(player_config_response.text)
            if player_config['track']['contentType'] == 'media_entity':
                m3u8_url = player_config['track']['playbackUrl']
                ret["views"] = util.convert_si_to_number(player_config['track']['viewCount'])
                ret["duration"] = int(player_config['track']['durationMs'])
                # Get some more information by scraping the website embedding the tweet
                status_url = "http://twitter.com/i/status/" + tweet_id
                status_response = requests.get(status_url)
                status_soup = BeautifulSoup(status_response.text, 'lxml')
                stats = status_soup.find("div", {'class': "permalink-tweet-container"})
                # Sometimes comments are disabled; then this is just 0
                ret["comments"] = int(stats.find("span", {'class': "ProfileTweet-action--reply"})
                                      .find("span", {'class': "ProfileTweet-actionCount"})['data-tweet-stat-count'])
                ret["shares"] = int(stats.find("span", {'class': "ProfileTweet-action--retweet"})
                                    .find("span", {'class': "ProfileTweet-actionCount"})['data-tweet-stat-count'])
                ret["likes"] = int(stats.find("span", {'class': "ProfileTweet-action--favorite"})
                                   .find("span", {'class': "ProfileTweet-actionCount"})['data-tweet-stat-count'])
                # Get the m3u8 playlist
                m3u8_response = requests.get(m3u8_url, headers={'Authorization': bearer_token})
                m3u8_url_parse = urllib.parse.urlparse(m3u8_url)
                video_host = m3u8_url_parse.scheme + '://' + m3u8_url_parse.hostname
                m3u8_parse = m3u8.loads(m3u8_response.text)
                if m3u8_parse.is_variant:
                    # Find the video with 480p resolution or higher
                    # (or the highest available if none reaches 480p)
                    sorted_by_res = sorted(m3u8_parse.playlists,
                                           key=lambda video: video.stream_info.resolution[0])
                    correct_res = None
                    for video in sorted_by_res:
                        if video.stream_info.resolution[0] >= 480:
                            correct_res = video
                            break
                    if correct_res is None:
                        # No video with a resolution >= 480p found
                        correct_res = sorted_by_res[-1]
                    ts_m3u8_response = requests.get(video_host + correct_res.uri)
                    ts_m3u8_parse = m3u8.loads(ts_m3u8_response.text)
                    video_file = os.path.join(video_path, tweet_id + ".ts")
                    with open(video_file, 'ab+') as wfd:
                        for ts_uri in ts_m3u8_parse.segments.uri:
                            ts_file = requests.get(video_host + ts_uri)
                            wfd.write(ts_file.content)
                    ffprobe = video_helper.get_ffprobe_json(video_file)
                    duration = int(float(ffprobe['format']['duration']) * 1000)
                    size = int(ffprobe['format']['size'])
                    if duration <= video_helper.LENGTH_CUTOFF:
                        if size <= video_helper.SIZE_CUTOFF:
                            ret["crawling_status"] = "Success"
                        else:
                            # File is too big.
                            os.remove(video_file)
                            ret["crawling_status"] = "Too big"
                    else:
                        # Video is too long.
                        os.remove(video_file)
                        ret["crawling_status"] = "Too long"
                else:
                    # No playlists are contained in the response
                    ret["crawling_status"] = "Not is_variant"
            else:
                # The playable media is not a video (e.g. it's a gif)
                ret["crawling_status"] = "Content Type: %s" % player_config['track']['contentType']
        else:
            # The server returned an error (usually a 404, meaning the tweet
            # has no playable media attached)
            ret["crawling_status"] = "Player Config: %d" % player_config_response.status_code
    except Exception as e:
        traceback.print_exc()
        ret["crawling_status"] = str(e)
    return ret
def download(facebook_video_id):
    """
    :param facebook_video_id: Combination of the username and the actual video id,
                              user_name + "/" + video_id
    :return:
    """
    ret = dict()
    user_name, video_id = facebook_video_id.split("/")
    try:
        video_path = os.path.join(video_helper.get_path("facebook"), user_name)
        url = "https://www.facebook.com/%s/videos/%s" % (user_name, video_id)
        res = requests.get(url, timeout=5, allow_redirects=True)
        if res.status_code == 200:
            # Alternatively, there's also hd_src, and both exist with a _no_ratelimit
            # postfix (but if one doesn't exist, neither does the other)
            mp4_url_occurrences = re.findall("sd_src:\"(.*?)\",", res.text)
            if len(mp4_url_occurrences) > 0:
                ret["comments"] = int(re.findall("commentcount:([0-9]*),", res.text)[0])
                ret["shares"] = int(re.findall("sharecount:([0-9]*),", res.text)[0])
                ret["likes"] = int(re.findall("likecount:([0-9]*),", res.text)[0])
                view_count = re.findall("viewCount:\"([0-9,]*)\",", res.text)
                # The number of views is not always present
                if len(view_count) == 1:
                    ret["views"] = int(view_count[0].replace(",", ""))
                else:
                    ret["views"] = -1
                r = requests.get(mp4_url_occurrences[0], stream=True)
                if not os.path.exists(video_path):
                    # Every user gets their own directory
                    os.makedirs(video_path)
                video_file = video_path + "/" + video_id + ".mp4"
                with open(video_file, 'wb+') as file:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:
                            file.write(chunk)
                ffprobe = video_helper.get_ffprobe_json(video_file)
                ret["duration"] = int(float(ffprobe['format']['duration']) * 1000)
                size = int(ffprobe['format']['size'])
                if ret["duration"] <= video_helper.LENGTH_CUTOFF:
                    if size <= video_helper.SIZE_CUTOFF:
                        ret["crawling_status"] = "Success"
                    else:
                        # File is too big.
                        os.remove(video_file)
                        ret["crawling_status"] = "Too big"
                else:
                    # Video is too long.
                    os.remove(video_file)
                    ret["crawling_status"] = "Too long"
            else:
                ret["crawling_status"] = "Video not available"
        else:
            ret["crawling_status"] = str(res.status_code)
    except (HTTPError, ConnectionError):
        ret["crawling_status"] = "Invalid URL"
    except Exception as e:
        traceback.print_exc()
        ret["crawling_status"] = str(e)
    return ret
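# A hypothetical driver (not part of this module) showing the uniform return contract:
# every platform downloader returns a dict with crawling_status plus engagement
# metadata, so callers can treat all platforms the same way.
if __name__ == "__main__":
    result = download("CadburyBournvita/1937970696267088")
    print(result["crawling_status"], result.get("views"), result.get("duration"))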