def test_determine_ext(self):
    """Check that determine_ext extracts the extension from a URL path,
    ignoring query strings and trailing slashes, and returns the supplied
    default when no usable extension is present."""
    cases = [
        (('http://example.com/foo/bar.mp4/?download',), 'mp4'),
        (('http://example.com/foo/bar/?download', None), None),
        (('http://example.com/foo/bar.nonext/?download', None), None),
        (('http://example.com/foo/bar/mp4?download', None), None),
        (('http://example.com/foo/bar.m3u8//?download',), 'm3u8'),
    ]
    for call_args, expected in cases:
        self.assertEqual(determine_ext(*call_args), expected)
def _real_extract(self, url):
    """Extract title and formats for a Canvas/VRT video page.

    Scrapes the data-video id embedded in the page, fetches the asset
    description from the mediazone API and turns each target URL into a
    format dict (HLS targets are expanded via the m3u8 helper).
    """
    video_id = self._match_id(url)
    webpage = self._download_webpage(url, video_id)
    title = self._search_regex(
        r'<h1 class="video__body__header__title">(.+?)</h1>', webpage, 'title')
    data_video = self._html_search_regex(
        r'data-video=(["\'])(?P<id>.+?)\1', webpage, 'data-video', group='id')
    json_url = 'https://mediazone.vrt.be/api/v1/canvas/assets/' + data_video
    data = self._download_json(json_url, video_id)

    # Non-HLS target types map directly onto a download protocol.
    protocol_by_type = {
        'PROGRESSIVE_DOWNLOAD': 'http',
        'HDS': 'HDS',
        'RTMP': 'rtmp',
        'RTSP': 'rtsp',
    }
    formats = []
    for target in data['targetUrls']:
        # BUGFIX: the original condition was `'type' and 'url' in target`,
        # which only checked for 'url' because the literal 'type' is truthy.
        if 'type' not in target or 'url' not in target:
            continue
        extension = utils.determine_ext(target['url'])
        if target['type'] == 'HLS':
            formats.extend(self._extract_m3u8_formats(
                target['url'], video_id, entry_protocol='m3u8_native',
                ext='mp4', preference=0, fatal=False, m3u8_id='hls'))
        elif target['type'] in protocol_by_type:
            formats.append({
                'format_id': extension,
                'url': target['url'],
                'protocol': protocol_by_type[target['type']],
            })
    self._sort_formats(formats)

    # BUGFIX: int_or_none returns None for a missing duration and
    # `None / 1000` raised TypeError; only convert when present.
    duration = utils.int_or_none(data.get('duration'))
    if duration is not None:
        duration = duration / 1000  # the API reports milliseconds

    return {
        'id': video_id,
        'title': title,
        'formats': formats,
        'duration': duration,
    }
def run(self, dl_url: str, add_webcam: bool, add_annotations: bool,
        add_cursor: bool, keep_tmp_files: bool, filename: str) -> None:
    """Download a BigBlueButton recording and mux it into one slideshow video.

    Fetches the recording's metadata/shapes/cursor XML, downloads all slide
    and bonus images plus (best effort) the webcams and deskshare streams,
    renders the slideshow and muxes everything with ffmpeg.

    :param dl_url: playback URL; must match self._VALID_URL
    :param add_webcam: overlay the webcam stream onto the slideshow
    :param add_annotations: burn whiteboard annotations into the slides
    :param add_cursor: burn the presenter cursor into the slides
    :param keep_tmp_files: keep the temporary working directory afterwards
    :param filename: output file name; when None one is derived from the
        recording date and meeting title
    """
    m_obj = re.match(self._VALID_URL, dl_url)
    video_id = m_obj.group('id')
    video_website = m_obj.group('website')
    self.to_screen("Downloading meta informations")
    # Make sure the lesson exists
    self._download_webpage(dl_url, video_id)
    self._create_tmp_dir(video_id)

    # Extract basic metadata (metadata.xml, shapes.svg, cursor.xml)
    metadata_url = video_website + '/presentation/' + video_id + '/metadata.xml'
    metadata = self._download_xml(metadata_url, video_id)
    shapes_url = video_website + '/presentation/' + video_id + '/shapes.svg'
    shapes = self._download_xml(shapes_url, video_id)
    cursor_url = video_website + '/presentation/' + video_id + '/cursor.xml'
    cursor_infos = self._download_xml(cursor_url, video_id)

    # Parse metadata.xml
    meta = metadata.find('./meta')
    start_time = xpath_text(metadata, 'start_time')
    recording_duration = float(xpath_text(
        metadata, './playback/duration')) / 1000.0  # in seconds
    title = xpath_text(meta, 'meetingName')
    # The bbb-origin-version tag is optional; the version is only informational.
    try:
        bbb_origin_version = xpath_text(meta, 'bbb-origin-version')
        if bbb_origin_version is not None:
            bbb_version = bbb_origin_version.split(' ')[0]
            self.to_screen("BBB version: " + bbb_version)
    except Exception:
        pass

    # Downloading Slides: collect every svg:image recursively; images with
    # class="slide" become timed slides, everything else is a "bonus" image.
    images = list()
    self.xml_find_rec(shapes, _s('svg:image'), images)
    # images = shapes.findall(_s("./svg:image[@class='slide']"))
    slides_infos = []
    bonus_images = []
    img_path_to_filename = {}  # dedup: one local file per distinct image path
    counter = 0
    for image in images:
        img_path = image.get(_x('xlink:href'))
        image_url = video_website + '/presentation/' + video_id + '/' + img_path
        # shapes.svg may carry float dimensions such as "960.0"
        image_width = int(float(image.get('width')))
        image_height = int(float(image.get('height')))
        if not image.get('class') or image.get('class') != 'slide':
            # Not a slide: keep it as a bonus image only.
            image_filename = image_url.split('/')[-1]
            image_path = video_id + '/' + image_filename
            bonus_images.append(
                BonusImage(
                    image_url,
                    image_filename,
                    image_path,
                    image_width,
                    image_height,
                ))
            continue
        image_id = image.get('id')
        # Annotations for this slide live in the svg:g group keyed by image id.
        slide_annotations = shapes.find(
            _s("./svg:g[@image='{}']".format(image_id)))
        if img_path.endswith('deskshare.png'):
            # The deskshare placeholder slide is replaced by the actual
            # deskshare video stream.
            image_url = video_website + '/presentation/' + video_id + '/deskshare/deskshare.webm'
            slide_filename = 'deskshare.webm'
        else:
            # The same image may appear several times; reuse its local name.
            if img_path not in img_path_to_filename:
                slide_filename = 'slide-{:03d}'.format(
                    counter) + '.' + determine_ext(img_path)
                img_path_to_filename[img_path] = slide_filename
                counter += 1
            else:
                slide_filename = img_path_to_filename[img_path]
        slide_path = video_id + '/' + slide_filename
        slide_ts_in = float(image.get('in'))
        slide_ts_out = float(image.get('out'))
        # Clamp the slide duration to the recording length and to >= 0.
        slide_ts_duration = max(
            0.0,
            min(recording_duration - slide_ts_in,
                slide_ts_out - slide_ts_in))
        slides_infos.append(
            Slide(
                image_id,
                image_url,
                slide_filename,
                slide_path,
                image_width,
                image_height,
                slide_ts_in,
                slide_ts_out,
                slide_ts_duration,
                slide_annotations,
            ))

    # We now change the xml tree, all hrefs of all images now point to local files
    for image in images:
        image.attrib[_x('xlink:href')] = video_id + '/' + image.attrib[_x(
            'xlink:href')].split('/')[-1]

    self.to_screen("Downloading slides")
    self._write_slides(slides_infos, self.ydl)
    self._write_slides(bonus_images, self.ydl)
    if add_annotations:
        slides_infos = self._add_annotations(slides_infos)
    if add_cursor:
        slides_infos = self._add_cursor(slides_infos, cursor_infos)

    # Downloading Webcam / Deskshare (best effort: webm first, mp4 fallback)
    video_base_url = video_website + '/presentation/' + video_id
    if not self.verbose:
        # Silence ydl's stderr while probing for optional streams;
        # restored after the downloads below.
        self.ydl.to_stderr_backup = self.ydl.to_stderr
        self.ydl.to_stderr = types.MethodType(dummy_to_stderr, self.ydl)
    webcams_path = video_id + '/webcams.webm'
    try:
        self.to_screen("Downloading webcams.webm")
        webcams_dl = {
            'id': video_id,
            'title': title,
            'url': video_base_url + '/video/webcams.webm',
            'timestamp': int(start_time),
        }
        self.ydl.params['outtmpl'] = webcams_path
        self.ydl.process_ie_result(webcams_dl)
    except DownloadError:
        self.to_screen(
            "Downloading webcams.webm failed! Downloading webcams.mp4 instead"
        )
        webcams_path = video_id + '/webcams.mp4'
        try:
            webcams_dl = {
                'id': video_id,
                'title': title,
                'url': video_base_url + '/video/webcams.mp4',
                'timestamp': int(start_time),
            }
            self.ydl.params['outtmpl'] = webcams_path
            self.ydl.process_ie_result(webcams_dl)
        except DownloadError:
            # No webcam stream at all; mux without it.
            webcams_path = None
            self.to_screen("Error: Downloading webcams.mp4 failed!")
    deskshare_path = video_id + '/deskshare.webm'
    try:
        self.to_screen("Downloading deskshare.webm")
        deskshare_dl = {
            'id': video_id,
            'title': title,
            'url': video_base_url + '/deskshare/deskshare.webm',
            'timestamp': int(start_time),
        }
        self.ydl.params['outtmpl'] = deskshare_path
        self.ydl.process_ie_result(deskshare_dl)
    except DownloadError:
        self.to_screen(
            "Downloading deskshare.webm failed! Downloading deskshare.mp4 instead"
        )
        deskshare_path = video_id + '/deskshare.mp4'
        try:
            deskshare_dl = {
                'id': video_id,
                'title': title,
                'url': video_base_url + '/deskshare/deskshare.mp4',
                'timestamp': int(start_time),
            }
            self.ydl.params['outtmpl'] = deskshare_path
            self.ydl.process_ie_result(deskshare_dl)
        except DownloadError:
            # No desk was shared in this session.
            deskshare_path = None
            self.to_screen(
                "Warning: Downloading deskshare.mp4 failed - No desk was likely shared in this session."
            )
    if not self.verbose:
        # Restore the original stderr writer.
        self.ydl.to_stderr = self.ydl.to_stderr_backup

    # Post processing: render slides into a video and mux with webcam audio.
    slideshow_w, slideshow_h = self._rescale_slides(slides_infos)
    slideshow_path = self._create_slideshow(slides_infos, video_id,
                                            slideshow_w, slideshow_h)
    # start_time is in milliseconds since the epoch.
    formatted_date = datetime.fromtimestamp(
        int(start_time) / 1000).strftime('%Y-%m-%dT%H-%M-%S')
    if filename is not None:
        result_path = filename
    else:
        result_path = formatted_date + '_' + title.replace(
            '/', '_', title.count('/')) + '.mp4'
    self.to_screen("Mux Slideshow")
    webcam_w, webcam_h = self._get_webcam_size(slideshow_w, slideshow_h)
    if os.path.isfile(result_path):
        # Never overwrite an existing result file.
        self.report_warning("Final Slideshow already exists. Abort!")
        return
    if add_webcam:
        self.ffmpeg.mux_slideshow_with_webcam(slideshow_path, webcams_path,
                                              webcam_w, webcam_h, result_path)
    else:
        self.ffmpeg.mux_slideshow(slideshow_path, webcams_path, result_path)
    if not keep_tmp_files:
        self.to_screen("Cleanup")
        self._remove_tmp_dir(video_id)
def test_determine_ext(self):
    """Exercise determine_ext on URLs with and without a recognisable extension."""
    _MISSING = object()  # marks calls that rely on determine_ext's own default
    for url, default, expected in (
        ('http://example.com/foo/bar.mp4/?download', _MISSING, 'mp4'),
        ('http://example.com/foo/bar/?download', None, None),
        ('http://example.com/foo/bar.nonext/?download', None, None),
        ('http://example.com/foo/bar/mp4?download', None, None),
        ('http://example.com/foo/bar.m3u8//?download', _MISSING, 'm3u8'),
    ):
        if default is _MISSING:
            self.assertEqual(determine_ext(url), expected)
        else:
            self.assertEqual(determine_ext(url, default), expected)
def _do_download(self, filename, info_dict):
    """Download info_dict['url'] to *filename*, dispatching on protocol.

    rtmp URLs go to rtmpdump, mms/rtsp to mplayer, m3u8 manifests to
    ffmpeg; everything else is fetched over HTTP with resume/retry
    support, progress hooks and rate limiting.

    :param filename: final destination path (a temp name is used while
        downloading, then renamed)
    :param info_dict: must contain 'url'; may carry 'user_agent',
        'urlhandle' and the rtmp-specific keys read below
    :returns: True on success (or already-downloaded), False on failure
    :raises ContentTooShortError: if fewer bytes arrived than announced
    """
    url = info_dict['url']

    # Check file already present
    if self.params.get('continuedl', False) and os.path.isfile(encodeFilename(filename)) and not self.params.get('nopart', False):
        self.report_file_already_downloaded(filename)
        self._hook_progress({
            'filename': filename,
            'status': 'finished',
            'total_bytes': os.path.getsize(encodeFilename(filename)),
        })
        return True

    # Attempt to download using rtmpdump
    if url.startswith('rtmp'):
        return self._download_with_rtmpdump(filename, url,
                                            info_dict.get('player_url', None),
                                            info_dict.get('page_url', None),
                                            info_dict.get('play_path', None),
                                            info_dict.get('tc_url', None),
                                            info_dict.get('rtmp_live', False),
                                            info_dict.get('rtmp_conn', None))

    # Attempt to download using mplayer
    if url.startswith('mms') or url.startswith('rtsp'):
        return self._download_with_mplayer(filename, url)

    # m3u8 manifest are downloaded with ffmpeg
    if determine_ext(url) == u'm3u8':
        return self._download_m3u8_with_ffmpeg(filename, url)

    tmpfilename = self.temp_name(filename)
    stream = None  # opened lazily on the first received block

    # Do not include the Accept-Encoding header
    headers = {'Youtubedl-no-compression': 'True'}
    if 'user_agent' in info_dict:
        headers['Youtubedl-user-agent'] = info_dict['user_agent']
    # basic_request stays Range-free so it can be reused when the server
    # rejects the resume range (HTTP 416) below.
    basic_request = compat_urllib_request.Request(url, None, headers)
    request = compat_urllib_request.Request(url, None, headers)

    if self.params.get('test', False):
        # Test mode: only fetch the first ~10 KiB.
        request.add_header('Range','bytes=0-10240')

    # Establish possible resume length
    if os.path.isfile(encodeFilename(tmpfilename)):
        resume_len = os.path.getsize(encodeFilename(tmpfilename))
    else:
        resume_len = 0

    open_mode = 'wb'
    if resume_len != 0:
        if self.params.get('continuedl', False):
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)
            open_mode = 'ab'  # append to the partial file
        else:
            # Resuming disabled: start over and overwrite the partial file.
            resume_len = 0

    count = 0
    retries = self.params.get('retries', 0)
    while count <= retries:
        # Establish connection
        try:
            if count == 0 and 'urlhandle' in info_dict:
                # NOTE(review): this pre-supplied handle is immediately
                # overwritten by the urlopen() below — presumably dead code
                # or a latent bug; confirm against callers before changing.
                data = info_dict['urlhandle']
            data = compat_urllib_request.urlopen(request)
            break
        except (compat_urllib_error.HTTPError, ) as err:
            if (err.code < 500 or err.code >= 600) and err.code != 416:
                # Unexpected HTTP error
                raise
            elif err.code == 416:
                # Unable to resume (requested range not satisfiable)
                try:
                    # Open the connection again without the range header
                    data = compat_urllib_request.urlopen(basic_request)
                    content_length = data.info()['Content-Length']
                except (compat_urllib_error.HTTPError, ) as err:
                    if err.code < 500 or err.code >= 600:
                        raise
                else:
                    # Examine the reported length
                    if (content_length is not None and
                            (resume_len - 100 < int(content_length) < resume_len + 100)):
                        # The file had already been fully downloaded.
                        # Explanation to the above condition: in issue #175 it was revealed that
                        # YouTube sometimes adds or removes a few bytes from the end of the file,
                        # changing the file size slightly and causing problems for some users. So
                        # I decided to implement a suggested change and consider the file
                        # completely downloaded if the file size differs less than 100 bytes from
                        # the one in the hard drive.
                        self.report_file_already_downloaded(filename)
                        self.try_rename(tmpfilename, filename)
                        self._hook_progress({
                            'filename': filename,
                            'status': 'finished',
                        })
                        return True
                    else:
                        # The length does not match, we start the download over
                        self.report_unable_to_resume()
                        open_mode = 'wb'
                        break
        # Retry
        count += 1
        if count <= retries:
            self.report_retry(count, retries)

    if count > retries:
        self.report_error(u'giving up after %s retries' % retries)
        return False

    data_len = data.info().get('Content-length', None)
    if data_len is not None:
        # Total size = announced remainder + what we already have on disk.
        data_len = int(data_len) + resume_len
        min_data_len = self.params.get("min_filesize", None)
        max_data_len = self.params.get("max_filesize", None)
        if min_data_len is not None and data_len < min_data_len:
            self.to_screen(u'\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
            return False
        if max_data_len is not None and data_len > max_data_len:
            self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
            return False

    data_len_str = format_bytes(data_len)
    byte_counter = 0 + resume_len
    block_size = self.params.get('buffersize', 1024)
    start = time.time()
    # NOTE(review): loop condition is self._go_on — presumably an external
    # abort flag that defaults to True; confirm where it is set.
    while self._go_on:
        # Download and write
        before = time.time()
        data_block = data.read(block_size)
        after = time.time()
        if len(data_block) == 0:
            break  # EOF
        byte_counter += len(data_block)

        # Open file just in time
        if stream is None:
            try:
                (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
                assert stream is not None
                filename = self.undo_temp_name(tmpfilename)
                self.report_destination(filename)
            except (OSError, IOError) as err:
                self.report_error(u'unable to open for writing: %s' % str(err))
                return False
        try:
            stream.write(data_block)
        except (IOError, OSError) as err:
            self.to_stderr(u"\n")
            self.report_error(u'unable to write data: %s' % str(err))
            return False
        if not self.params.get('noresizebuffer', False):
            # Adapt the read size to the measured throughput.
            block_size = self.best_block_size(after - before, len(data_block))

        # Progress message
        speed = self.calc_speed(start, time.time(), byte_counter - resume_len)
        if data_len is None:
            eta = percent = None
        else:
            percent = self.calc_percent(byte_counter, data_len)
            eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
        self.report_progress(percent, data_len_str, speed, eta)

        self._hook_progress({
            'downloaded_bytes': byte_counter,
            'total_bytes': data_len,
            'tmpfilename': tmpfilename,
            'filename': filename,
            'status': 'downloading',
            'eta': eta,
            'speed': speed,
        })

        # Apply rate limit
        self.slow_down(start, byte_counter - resume_len)

    if stream is None:
        # The loop never received a single block.
        self.to_stderr(u"\n")
        self.report_error(u'Did not get any data blocks')
        return False
    stream.close()
    self.report_finish(data_len_str, (time.time() - start))
    if data_len is not None and byte_counter != data_len:
        raise ContentTooShortError(byte_counter, int(data_len))
    self.try_rename(tmpfilename, filename)

    # Update file modification time
    if self.params.get('updatetime', True):
        info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))

    self._hook_progress({
        'downloaded_bytes': byte_counter,
        'total_bytes': byte_counter,
        'filename': filename,
        'status': 'finished',
    })

    return True
def run(self, dl_url: str, add_webcam: bool, add_annotations: bool,
        add_cursor: bool, keep_tmp_files: bool, filename: str) -> None:
    """Download a BigBlueButton recording and mux it into one slideshow video.

    Fetches the recording's metadata/shapes/cursor XML, downloads the slide
    images and (best effort) the webcams and deskshare streams, renders the
    slideshow and muxes everything with ffmpeg.

    :param dl_url: playback URL; must match self._VALID_URL_RE
    :param add_webcam: overlay the webcam stream onto the slideshow
    :param add_annotations: burn whiteboard annotations into the slides
    :param add_cursor: burn the presenter cursor into the slides
    :param keep_tmp_files: keep the temporary working directory afterwards
    :param filename: output file name; when None one is derived from the
        recording date and meeting title
    """
    m_obj = self._VALID_URL_RE.match(dl_url)
    video_id = m_obj.group('id')
    video_website = m_obj.group('website')
    self.to_screen("Downloading meta informations")
    # Make sure the lesson exists
    self._download_webpage(dl_url, video_id)
    self._create_tmp_dir(video_id)

    # Extract basic metadata
    metadata_url = video_website + '/presentation/' + video_id + '/metadata.xml'
    metadata = self._download_xml(metadata_url, video_id)
    shapes_url = video_website + '/presentation/' + video_id + '/shapes.svg'
    shapes = self._download_xml(shapes_url, video_id)
    cursor_url = video_website + '/presentation/' + video_id + '/cursor.xml'
    cursor_infos = self._download_xml(cursor_url, video_id)

    # Parse metadata.xml
    meta = metadata.find('./meta')
    start_time = xpath_text(metadata, 'start_time')
    title = xpath_text(meta, 'meetingName')
    # BUGFIX: bbb-origin-version is optional; xpath_text returns None for a
    # missing tag and None.split(' ') raised AttributeError here.
    bbb_origin_version = xpath_text(meta, 'bbb-origin-version')
    if bbb_origin_version is not None:
        bbb_version = bbb_origin_version.split(' ')[0]
        self.to_screen("BBB version: " + bbb_version)

    # Downloading Slides
    images = shapes.findall(_s("./svg:image[@class='slide']"))
    slides_infos = []
    img_path_to_filename = {}  # dedup: one local file per distinct image path
    counter = 0
    for image in images:
        img_path = image.get(_x('xlink:href'))
        image_id = image.get('id')
        image_url = video_website + '/presentation/' + video_id + '/' + img_path
        # BUGFIX: shapes.svg may carry float dimensions such as "960.0";
        # int() on such strings raised ValueError, so go through float().
        image_width = int(float(image.get('width')))
        image_height = int(float(image.get('height')))
        # Annotations for this slide live in the svg:g group keyed by image id.
        slide_annotations = shapes.find(_s("./svg:g[@image='{}']".format(image_id)))
        if img_path.endswith('deskshare.png'):
            # The deskshare placeholder slide is replaced by the actual
            # deskshare video stream.
            image_url = video_website + '/presentation/' + video_id + '/deskshare/deskshare.webm'
            slide_filename = 'deskshare.webm'
        else:
            # The same image may be shown several times; reuse its local name.
            if img_path not in img_path_to_filename:
                slide_filename = 'slide-{:03d}'.format(counter) + '.' + determine_ext(img_path)
                img_path_to_filename[img_path] = slide_filename
                counter += 1
            else:
                slide_filename = img_path_to_filename[img_path]
        slide_path = video_id + '/' + slide_filename
        slide_ts_in = float(image.get('in'))
        slide_ts_out = float(image.get('out'))
        slides_infos.append(
            Slide(
                image_id,
                image_url,
                slide_filename,
                slide_path,
                image_width,
                image_height,
                slide_ts_in,
                slide_ts_out,
                max(0, slide_ts_out - slide_ts_in),
                slide_annotations,
            )
        )

    self.to_screen("Downloading slides")
    self._write_slides(slides_infos, self.ydl)
    if add_annotations:
        slides_infos = self._add_annotations(slides_infos)
    if add_cursor:
        slides_infos = self._add_cursor(slides_infos, cursor_infos)

    # Downloading Webcam / Deskshare (both optional, best effort)
    video_base_url = video_website + '/presentation/' + video_id
    webcams_path = video_id + '/webcams.webm'
    try:
        self.to_screen("Downloading webcams.webm")
        webcams_dl = {
            'id': video_id,
            'title': title,
            'url': video_base_url + '/video/webcams.webm',
            'timestamp': int(start_time),
        }
        self.ydl.params['outtmpl'] = webcams_path
        self.ydl.process_ie_result(webcams_dl)
    except DownloadError:
        # Recording has no webcam stream; continue without it.
        pass
    deskshare_path = video_id + '/deskshare.webm'
    try:
        self.to_screen("Downloading deskshare.webm")
        deskshare_dl = {
            'id': video_id,
            'title': title,
            'url': video_base_url + '/deskshare/deskshare.webm',
            'timestamp': int(start_time),
        }
        self.ydl.params['outtmpl'] = deskshare_path
        self.ydl.process_ie_result(deskshare_dl)
    except DownloadError:
        # No desk was shared in this session; continue without it.
        pass

    # Post processing
    slideshow_w, slideshow_h = self._rescale_slides(slides_infos)
    slideshow_path = self._create_slideshow(slides_infos, video_id, slideshow_w, slideshow_h)
    # start_time is in milliseconds since the epoch.
    formatted_date = datetime.fromtimestamp(int(start_time) / 1000).strftime('%Y-%m-%dT%H-%M-%S')
    if filename is not None:
        result_path = filename
    else:
        result_path = formatted_date + '_' + title.replace('/', '_', title.count('/')) + '.mp4'
    self.to_screen("Mux Slideshow")
    webcam_w, webcam_h = self._get_webcam_size(slideshow_w, slideshow_h)
    if os.path.isfile(result_path):
        # Never overwrite an existing result file.
        self.report_warning("Final Slideshow already exists. Abort!")
        return
    if add_webcam:
        self.ffmpeg.mux_slideshow_with_webcam(slideshow_path, webcams_path, webcam_w, webcam_h, result_path)
    else:
        self.ffmpeg.mux_slideshow(slideshow_path, webcams_path, result_path)
    if not keep_tmp_files:
        self.to_screen("Cleanup")
        self._remove_tmp_dir(video_id)
def test_determine_ext(self):
    """determine_ext should recognise real extensions and otherwise return the given default."""
    expectations = {
        ("http://example.com/foo/bar.mp4/?download",): "mp4",
        ("http://example.com/foo/bar/?download", None): None,
        ("http://example.com/foo/bar.nonext/?download", None): None,
        ("http://example.com/foo/bar/mp4?download", None): None,
        ("http://example.com/foo/bar.m3u8//?download",): "m3u8",
    }
    for call_args, want in expectations.items():
        self.assertEqual(determine_ext(*call_args), want)
def _parse_mediapackage(self, video):
    """Turn an Opencast media-package dict into an info-dict.

    Manifest-style tracks (DASH/HLS/HDS/Smooth/SMIL) are expanded through
    the matching extractor helper; plain tracks (including RTMP) become
    single format dicts. Top-level metadata fields are copied over only
    when present.
    """
    video_id = video.get('id')
    formats = []
    for track in video.get('media', {}).get('track', []):
        href = track['url']
        ext = determine_ext(href, None)
        transport = track.get('transport')
        fmt = {'url': href}
        if transport == 'DASH' or ext == 'mpd':
            formats.extend(self._extract_mpd_formats(href, video_id, mpd_id='dash', fatal=False))
        elif transport == 'HLS' or ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                href, video_id, m3u8_id='hls', entry_protocol='m3u8_native', fatal=False))
        elif transport == 'HDS' or ext == 'f4m':
            formats.extend(self._extract_f4m_formats(href, video_id, f4m_id='hds', fatal=False))
        elif transport == 'SMOOTH':
            formats.extend(self._extract_ism_formats(href, video_id, ism_id='smooth', fatal=False))
        elif ext == 'smil':
            formats.extend(self._extract_smil_formats(href, video_id, fatal=False))
        else:
            # Plain track: build one format dict by hand.
            if transport is not None:
                fmt['format_note'] = track.get('transport')
            if transport == 'RTMP':
                m_obj = re.search(r'^(?:rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', href)
                if not m_obj:
                    # Unparseable RTMP URL: skip this track entirely.
                    continue
                fmt['app'] = m_obj.group('app')
                fmt['play_path'] = m_obj.group('playpath')
                fmt['rtmp_live'] = True
                fmt['preference'] = -2
                # A play path such as "mp4:dir/file" encodes the container.
                playpath_parts = m_obj.group('playpath').split(':')
                if len(playpath_parts) > 1:
                    fmt['ext'] = playpath_parts[0]
            audio_info = track.get('audio')
            if audio_info is not None:
                if 'bitrate' in audio_info:
                    fmt['abr'] = int_or_none(audio_info.get('bitrate'), 1000)
                if 'samplingrate' in audio_info:
                    fmt['asr'] = int_or_none(audio_info.get('samplingrate'))
                audio_encoder = audio_info.get('encoder', {})
                if 'type' in audio_encoder:
                    fmt['acodec'] = audio_encoder.get('type')
            video_info = track.get('video')
            if video_info is not None:
                if 'resolution' in video_info:
                    fmt['resolution'] = video_info.get('resolution')
                    fmt.update(parse_resolution(video_info.get('resolution')))
                if 'framerate' in video_info:
                    fmt['fps'] = int_or_none(video_info.get('framerate'))
                if 'bitrate' in video_info:
                    fmt['vbr'] = int_or_none(video_info.get('bitrate'), 1000)
                video_encoder = video_info.get('encoder', {})
                if 'type' in video_encoder:
                    fmt['vcodec'] = video_encoder.get('type')
            formats.append(fmt)
    self._sort_formats(formats)

    result_obj = {'formats': formats}
    # Copy over whichever top-level metadata fields are actually present.
    for key, value in (
        ('id', video_id),
        ('title', video.get('title')),
        ('series', video.get('seriestitle')),
        ('season_id', video.get('series')),
        ('creator', video.get('creators', {}).get('creator')),
        ('timestamp', parse_iso8601(video.get('start'))),
    ):
        if value is not None:
            result_obj[key] = value
    attachments = video.get('attachments', {}).get('attachment', [])
    if attachments:
        # The first attachment serves as the thumbnail.
        result_obj['thumbnail'] = attachments[0].get('url')
    return result_obj