def test_get_transcripts__continue_on_error(self, mock_get_transcript):
    """get_transcripts(continue_after_error=True) attempts every video id."""
    first, second = 'video_id_1', 'video_id_2'

    YouTubeTranscriptApi.get_transcripts([first, second],
                                         continue_after_error=True)

    # Both ids must have been fetched despite any per-video failure.
    mock_get_transcript.assert_any_call(first, ('en',), None, None)
    mock_get_transcript.assert_any_call(second, ('en',), None, None)
def test_get_transcripts(self, mock_get_transcript):
    """get_transcripts makes exactly one get_transcript call per video id."""
    requested_ids = ['video_id_1', 'video_id_2']
    requested_languages = ['de', 'en']

    YouTubeTranscriptApi.get_transcripts(requested_ids,
                                         languages=requested_languages)

    for requested_id in requested_ids:
        mock_get_transcript.assert_any_call(requested_id,
                                            requested_languages, None, None)
    self.assertEqual(mock_get_transcript.call_count, len(requested_ids))
def test_get_transcripts__continue_on_error(self):
    """With continue_after_error=True, every id is attempted even though
    each underlying get_transcript call raises.

    Fix: the original replaced ``YouTubeTranscriptApi.get_transcript`` with
    a MagicMock and never restored it, leaking the mock into every test
    that runs afterwards.  The try/finally below undoes the patch.
    """
    video_id_1 = 'video_id_1'
    video_id_2 = 'video_id_2'
    original_get_transcript = YouTubeTranscriptApi.get_transcript
    YouTubeTranscriptApi.get_transcript = MagicMock(
        side_effect=Exception('Error'))
    try:
        YouTubeTranscriptApi.get_transcripts(['video_id_1', 'video_id_2'],
                                             continue_after_error=True)
        YouTubeTranscriptApi.get_transcript.assert_any_call(
            video_id_1, ('en', ), None)
        YouTubeTranscriptApi.get_transcript.assert_any_call(
            video_id_2, ('en', ), None)
    finally:
        # Restore the real method so other tests are unaffected.
        YouTubeTranscriptApi.get_transcript = original_get_transcript
def obtener_transcripciones_videos(ids_videos):
    """Fetch the Spanish transcripts of several videos, given their ids.

    Args:
        ids_videos: iterable of YouTube video ids.

    Returns:
        dict mapping each video id to its transcript as a single string;
        every snippet is followed by one space (including a trailing
        space), byte-identical to the old concatenation loop.
    """
    # get_transcripts() returns (found_dict, failed_ids); element [0] maps
    # video id -> list of {'text', 'start', 'duration'} snippet dicts.
    subtitles_by_id = YouTubeTranscriptApi.get_transcripts(
        ids_videos, languages=['es'])[0]

    # Join each snippet's text with a trailing space per piece.  The old
    # code shadowed the builtin `id` and used quadratic `+=` concatenation.
    return {
        video_id: ''.join(snippet['text'] + ' ' for snippet in snippets)
        for video_id, snippets in subtitles_by_id.items()
    }
def get_youtube_cc(url):
    """Return the closed captions for a YouTube watch URL.

    Fix: the old ``url.split('?v=')[1]`` grabbed everything after ``?v=``,
    so URLs with extra query parameters (``...?v=ID&t=5``) produced a
    broken video id.  Parsing the ``v`` query parameter handles those.

    Returns:
        (captions, True) on success — captions starts with a space and
        concatenates every caption line, as before;
        ("Can't fetch from youtube captions", False) on any failure.
    """
    from urllib.parse import parse_qs, urlparse

    try:
        # KeyError here (no ?v= parameter) is caught below, just as the
        # old IndexError from split() was.
        video_id = parse_qs(urlparse(url).query)['v'][0]
        cc = YouTubeTranscriptApi.get_transcripts([video_id],
                                                  languages=['de', 'en'])
        # cc[0] maps video id -> list of {'text', 'start', 'duration'}.
        captions = ''.join(' ' + line['text'] for line in cc[0][video_id])
        return (captions, True)
    except Exception:
        # Best-effort helper: collapse every failure to one error tuple.
        return ("Can't fetch from youtube captions", False)
def test_get_transcript__with_proxies(self):
    """get_transcript works through proxies, and get_transcripts forwards
    the proxies mapping to get_transcript.

    Fix: the original installed a MagicMock on
    ``YouTubeTranscriptApi.get_transcript`` and never removed it, leaking
    the mock into later tests.  The try/finally restores the real method.
    """
    proxies = {'http': '', 'https:': ''}
    transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8',
                                                     proxies=proxies)
    self.assertEqual(transcript, [{
        'text': 'Hey, this is just a test',
        'start': 0.0,
        'duration': 1.54
    }, {
        'text': 'this is not the original transcript',
        'start': 1.54,
        'duration': 4.16
    }, {
        'text': 'just something shorter, I made up for testing',
        'start': 5.7,
        'duration': 3.239
    }])
    original_get_transcript = YouTubeTranscriptApi.get_transcript
    YouTubeTranscriptApi.get_transcript = MagicMock()
    try:
        YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'],
                                             proxies=proxies)
        YouTubeTranscriptApi.get_transcript.assert_any_call(
            'GJLlxj_dtq8', ('en', ), proxies)
    finally:
        # Restore the real method so later tests are unaffected.
        YouTubeTranscriptApi.get_transcript = original_get_transcript
def download(self):
    """Download the video and its English transcript, caching both on disk.

    Side effects: sets ``self.title`` and ``self.subtitles``, writes the
    video to ``self.dl_path`` (as "vid") and a pickled transcript to
    ``self.sub_path``.

    Fixes: the bare ``except:`` (which also swallowed KeyboardInterrupt)
    is narrowed to ``except Exception``, and the pickle file handles are
    closed deterministically via ``with``.
    """
    print("Downloading...")
    try:
        self.title = YouTube(self.url).title
    except Exception:
        # The title is cosmetic; keep going if it cannot be fetched.
        pass
    if not Path.exists(self.vid_path):
        print("Downloading Video...")
        while True:
            try:
                YouTube(self.url).streams.first().download(self.dl_path, "vid")
                break
            except KeyError:
                # NOTE(review): retries forever on a persistent KeyError —
                # consider capping the number of attempts.
                print("Download failed. Retry...")
    if not Path.exists(self.sub_path):
        print("Downloading Transcript...")
        sub = YouTubeTranscriptApi.get_transcripts([self.video_id],
                                                   languages=['en'])
        with open(self.sub_path, "wb") as sub_file:
            pickle.dump(sub, sub_file)
    with open(self.sub_path, "rb") as sub_file:
        self.subtitles = pickle.load(sub_file)
def caption_data(video_ids):
    """Fetch transcripts for video_ids and persist one Caption row per video.

    Videos whose transcript cannot be fetched are skipped silently
    (continue_after_error=True).  Returns a {"status": "success"} dict.
    """
    # get_transcripts returns (found_dict, failed_ids); [0] is the dict
    # mapping video id -> list of {'text', 'start', 'duration'} snippets.
    transcript_data = YouTubeTranscriptApi.get_transcripts(
        video_ids=video_ids, continue_after_error=True)
    for vid in transcript_data[0]:
        text_list = []
        counter = 0
        for trans_dict in transcript_data[0][vid]:
            #I think this is where they are getting concatenateds
            # Debug aid: echo only the first two snippets of each video.
            if counter < 2:
                print(trans_dict['text'])
                counter += 1
            # Append the snippet plus a separator; joined below into one
            # space-separated caption string (with a trailing space).
            text_list.append(trans_dict['text'])
            text_list.append(' ')
        caption_text = "".join(text_list)
        # Caption and db come from the enclosing Flask-SQLAlchemy app —
        # TODO confirm; they are not defined in this file as shown.
        caption = Caption(body=caption_text, video_id=vid)
        db.session.add(caption)
        # NOTE(review): commits once per video; a single commit after the
        # loop would be cheaper if partial persistence is acceptable.
        db.session.commit()
    return_JSON = {"status": 'success'}
    return return_JSON
def download_transcripts_of_playlist(playlist_id, transcripts_dir):
    """Retrieves transcripts of individual youtube videos from playlist
    and writes to json files.

    params:
        playlist_id: youtube videos playlist id
        transcripts_dir: path to write transcripts json

    Fixes: the json output handle is now closed via ``with`` even if
    ``json.dump`` raises; the manual file counter became ``enumerate``.
    """
    # create transcripts directory if not exist
    if not os.path.exists(transcripts_dir):
        os.makedirs(transcripts_dir, exist_ok=True)

        # get video ids from playlist id
        video_ids = get_video_ids(playlist_id)

        # get transcripts; ids that fail are collected instead of raising
        transcripts, unretrieved_videos = YouTubeTranscriptApi.get_transcripts(
            video_ids, languages=['en'], continue_after_error=True)

        # write each transcript to "<count><video_id>.json"
        for count, (key, transcript) in enumerate(transcripts.items(),
                                                  start=1):
            file_path = os.path.join(transcripts_dir,
                                     str(count) + key + ".json")
            print(file_path)
            with open(file_path, "w") as out_file:
                # dump to json
                json.dump(transcript, out_file, indent=3)

        print("unretrieved_videos: ", unretrieved_videos)
    else:
        # skip downloading transcripts if already exists
        print(
            "playlist with id {0} transcripts already downloaded, skipping...".
            format(playlist_id))
def test_get_transcripts__stop_on_error(self, mock_get_transcript):
    """Without continue_after_error, the first failure propagates."""
    self.assertRaises(
        Exception,
        YouTubeTranscriptApi.get_transcripts,
        ['video_id_1', 'video_id_2'],
    )
from youtube_transcript_api import YouTubeTranscriptApi

# Fetch both transcripts in one call; the second tuple element (the list
# of ids that could not be retrieved) is deliberately ignored.
video_ids = ['5-yxXzLX2QY', '9UuFTwUxkLw']
out, _ = YouTubeTranscriptApi.get_transcripts(video_ids)

# Collapse each video's snippet dicts into one space-joined string.
text_out = {
    video_id: ' '.join(snippet.get('text', '') for snippet in snippets)
    for video_id, snippets in out.items()
}
print(text_out)
from lxml.etree import tostring
from youtube_transcript_api import YouTubeTranscriptApi

#Totally useless file tbh


def get_video_captions(video_id):
    """Scrape the auto-generated captions for video_id from diycaptions.com.

    NOTE(review): relies on `requests` and `html` (lxml.html) being in
    scope; neither is imported in this file as shown — confirm.
    """
    CAPTION_URL = 'https://www.diycaptions.com/php/get-automatic-captions-as-txt.php?id=' + video_id + '&language=asr'
    captionPage = requests.get(CAPTION_URL)
    captionTree = html.fromstring(captionPage.content)
    caption = captionTree.xpath('//div[@contenteditable="true"]/text()')
    return (str(caption))


if (__name__ == '__main__'):
    video_id = 'Cjim2F5Kk38'
    transcript_data = YouTubeTranscriptApi.get_transcripts(
        ['Cjim2F5Kk38', 'DtdRCCMvllo'])
    for vid in transcript_data[0]:
        text_list = []
        counter = 0
        for trans_dict in transcript_data[0][vid]:
            if counter < 3:
                print(trans_dict['text'])
                # BUG FIX: counter was never incremented, so the `< 3`
                # guard never engaged and every snippet was printed.
                counter += 1
            text_list.append(trans_dict['text'])
        caption_text = "".join(text_list)
        # print(caption_text)
# NOTE(review): bare string used as a pseudo-comment by the original author.
'load a df with video ids (which will be used for the youtube api to download the transcripts: and later on for extracting the labels '

# Read the list of video ids, one id per line.
# NOTE(review): readlines() keeps trailing newlines — presumably the API
# tolerates them or the file has none; verify.
with open('IdList_selfWachtedYoutubeVids.txt', encoding="utf-8") as f:
    idList = f.readlines()  #txt file with the ids:
#alternatively:
#dic = dict_oldDf  # a dictionary where the keys correspond the the youtubeIDs

#========================================================================
# ' downloading the transcripts by their ids '
#========================================================================
# from youtube_transcript_api import YouTubeTranscriptApi
import time  # just to record how long it takes to download the transcripts

STARTTIME = time.time()  #plus counting the time
# get_transcripts returns (found_dict, failed_ids); keep only the dict of
# successfully downloaded transcripts.
Transcripts_w_timestamps = YouTubeTranscriptApi.get_transcripts(video_ids=idList, continue_after_error=True)
Transcripts_w_timestamps = Transcripts_w_timestamps[0]
print('time it took:', time.time() - STARTTIME)
print('len trans', len(Transcripts_w_timestamps))  # see how many could be downloaded

# =============================================================================
# transcripts that were unable to be extraced:
# =============================================================================
# NOTE(review): set_originalId and set_downloadedtransIds are not defined in
# this chunk — they must come from elsewhere in the file.
ids_thatcouldnotbedownloaded = list(set_originalId - set_downloadedtransIds)
# NOTE(review): label says "downloaded" but this prints the ids that could
# NOT be downloaded.
print('len downloaded trans:', ids_thatcouldnotbedownloaded)

# =============================================================================
# # creating a dict with transcripts, and writing to string files to (re)create the transcripts
#This is to overcome redundant information. df1.sort_values(['ViewCount', 'likeCount', 'CommentCount'], ascending=[False, False, False], inplace=True) df1.reset_index(drop=True, inplace=True) df_red = df1 #Step 6: df_red.loc[:, "emv_video"] = df_red.apply( lambda row: int(row.ViewCount) * 0.14 + int(row.CommentCount) * 8.20 + int( row.likeCount) * 0.72, axis=1) videoid = list(df_red['VideoId']) x = YouTubeTranscriptApi.get_transcripts(videoid, continue_after_error=True) vids_with_sub = x[0] vids_without_sub = x[1] df_trans = pd.DataFrame(list(vids_with_sub.keys()), columns=['VideoId']) # In[117]: result2 = [] for i in range(0, len(vids_with_sub)): print(i) result1 = [] list_con = list(vids_with_sub.values())[i] for j in list_con: text_proc = j['text'] if (re.findall('[a-zA-Z]', text_proc) == []): text_proc_fin = text_proc
def test_get_transcripts__with_cookies(self, mock_get_transcript):
    """get_transcripts forwards the cookies path to get_transcript."""
    cookie_path = '/example_cookies.txt'

    YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'],
                                         cookies=cookie_path)

    mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en', ), None,
                                        cookie_path)
from youtube_transcript_api import YouTubeTranscriptApi
import json

# Demo walk-through of the youtube_transcript_api surface.

# Fetch and print the default transcript for one video.
print(YouTubeTranscriptApi.get_transcript("P_6vDLq64gE"))

# A TranscriptList describes every transcript available for the video.
transcript_list = YouTubeTranscriptApi.list_transcripts("P_6vDLq64gE")
print(transcript_list)

# Dump the Ukrainian transcript of another video to JSON.
# get_transcripts returns (found_dict, failed_ids); both are serialized.
with open('examples/file1.json', 'w', encoding='utf-8') as f:
    json.dump(YouTubeTranscriptApi.get_transcripts(["iCvmsMzlF7o"], languages=['uk']), f, ensure_ascii=False)

# find_transcript picks the first available match in preference order.
transcript = transcript_list.find_transcript(['de', 'en'])
print(transcript)

# Restrict the lookup to human-made (non-auto-generated) transcripts.
transcript = transcript_list.find_manually_created_transcript(['uk'])
print(transcript.is_generated)
print(
    transcript.video_id,
    transcript.language,
    transcript.language_code,
    # whether it has been manually created or generated by YouTube
    transcript.is_generated,
    # whether this transcript can be translated or not
    transcript.is_translatable,
    # a list of languages the transcript can be translated to
    transcript.translation_languages,
)
# Download the actual snippet data (result discarded in this demo).
transcript.fetch()
def test_get_transcripts__stop_on_error(self):
    """Without continue_after_error, a failing video aborts the batch.

    Fix: the original installed a MagicMock on
    ``YouTubeTranscriptApi.get_transcript`` and never removed it, leaking
    the mock into later tests.  The try/finally restores the real method.
    """
    original_get_transcript = YouTubeTranscriptApi.get_transcript
    YouTubeTranscriptApi.get_transcript = MagicMock(
        side_effect=Exception('Error'))
    try:
        with self.assertRaises(Exception):
            YouTubeTranscriptApi.get_transcripts(
                ['video_id_1', 'video_id_2'])
    finally:
        # Restore the real method so other tests are unaffected.
        YouTubeTranscriptApi.get_transcript = original_get_transcript
def test_get_transcripts__with_proxies(self, mock_get_transcript):
    """get_transcripts forwards the proxies mapping to get_transcript."""
    # NOTE: 'https:' (with colon) is kept exactly as the code under test
    # is exercised with it.
    proxy_config = {'http': '', 'https:': ''}

    YouTubeTranscriptApi.get_transcripts(['GJLlxj_dtq8'],
                                         proxies=proxy_config)

    mock_get_transcript.assert_any_call('GJLlxj_dtq8', ('en', ),
                                        proxy_config, None)
# NOTE(review): this chunk is truncated — the dict this "}" closes
# (presumably the `proxies = {...}` used below) opens outside the visible
# source.
}
video_detail_df = pd.read_csv(video_details_file, index_col=0)
# Keep only videos whose metadata says captions exist.
video_ids = list(
    video_detail_df[video_detail_df['caption'] == True]['video'])

### extract the video ids appear as the file name in a folder
# Ids whose transcripts were already pickled — skip these.
video_set_with_transcripts = get_all_video_ids_with_transcripts(
    video_caption_pickle_folder)
# Ids previously recorded as unfindable.
unfound_video = set(
    pd.read_csv(os.path.join(CUR_FILE_DIR, '../Data', 'unfound.csv'),
                header=None)[0])
video_ids = list(
    set(video_ids) - video_set_with_transcripts - unfound_video)
# video_ids = ['-TIkkGSHWeM']
print(len(video_ids))

# Fetch and persist transcripts in batches of 50 ids per API call.
for i in range(0, len(video_ids), 50):
    print('start to extract videos {0!r}'.format(str(i)))
    video_ids_sub = video_ids[i:i + 50]
    print(video_ids_sub)
    # persist_transcript receives the whole (found_dict, failed_ids) tuple.
    transcripts = YouTubeTranscriptApi.get_transcripts(
        video_ids_sub, languages=['en'], continue_after_error=True,
        proxies=proxies)
    persist_transcript(transcripts)
    print(str(i))
    # time.sleep(600)