def getCaptions(self, videoId):
    # Get captions: prefer the manually created transcript
    try:
        captions = YouTubeTranscriptApi.list_transcripts(
            videoId).find_manually_created_transcript(['en']).fetch()
    except Exception:
        try:
            # fall back to the auto-generated one
            captions = YouTubeTranscriptApi.list_transcripts(
                videoId).find_generated_transcript(['en']).fetch()
        except Exception:
            # no English captions at all, including auto-generated ones
            captions = [{'text': '', 'start': -1, 'duration': -1}]
    return captions
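# A minimal usage sketch for getCaptions (not from the source): the method
# falls back from manual to generated captions, and signals "no captions" with
# a single sentinel entry whose start is -1. `downloader` is a hypothetical
# instance of the class that defines getCaptions.
def demo_get_captions(downloader, video_id):
    captions = downloader.getCaptions(video_id)
    if captions and captions[0]['start'] == -1:
        print('No English captions (manual or generated) for', video_id)
    return captions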
def clean_transcript(link):
    video_id = gen_id(link)
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    transcript = transcript_list.find_generated_transcript(['en']).fetch()
    profanity.load_censor_words()
    for line in transcript:
        line['text'] = profanity.censor(line['text'])
    title = get_title(link)
    out_path = os.path.join(SAVE_TRANSCRIPT_PATH, title) + '.txt'
    if not os.path.exists(out_path):
        with open(out_path, 'a') as out_file:
            for line in transcript:
                info = [line['text'], line['start'], line['duration']]
                out_file.write(str(info) + '\n')
        print('Transcript saved to', out_path)
    else:
        print('File Already Exists!')
    print()
    return transcript
def fetch(Vid_ID):
    # Load all available transcripts for the video. Auto-generated transcripts
    # seem to avoid repeats better, at least for 'CKZ58bXtQnU'.
    possible_scripts = YTScript.list_transcripts(Vid_ID)
    script_type = ''
    script = None
    for scripts in possible_scripts:
        if scripts.is_generated:
            script_type = "generated"
            script = possible_scripts.find_generated_transcript(
                ['en', 'en-US', 'en-GB']).fetch()
        else:
            script_type = "manual"
            script = possible_scripts.find_manually_created_transcript(
                ['en', 'en-US', 'en-GB']).fetch()
    return script, script_type
def transcribe(video_id):
    try:
        # retrieve the available transcripts
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        # iterate over all available transcripts
        for transcript in transcript_list:
            transcript = transcript.fetch()
            subtitles = ""
            for subtitle in transcript:
                subtitles += subtitle['text'] + ' '
            r = requests.post('http://bark.phon.ioc.ee/punctuator',
                              data={'text': subtitles})
            r2 = requests.get("https://www.youtube.com/watch?v=" + video_id)
            soup = bs(r2.content, 'html.parser')
            title = soup.find("meta", {"name": "title"})['content']
            clean_string = re.sub(r'\W+', ' ', title)
            # writeToFile("./subtitles.txt", r.text)
            d = dict()
            d['subtitle'] = (
                r.text
                + "<h4>URL: https://www.youtube.com/watch?v=" + video_id
                + " </h4><iframe width='420' height='345' "
                + "src='https://www.youtube.com/embed/" + video_id
                + "'></iframe>"
            )
            d['title'] = clean_string
            return d
    except Exception:
        err = dict()
        err['subtitle'] = (
            "<h4>URL: https://www.youtube.com/watch?v=" + video_id
            + " </h4><iframe width='420' height='345' "
            + "src='https://www.youtube.com/embed/" + video_id
            + "'></iframe>"
        )
        err['title'] = "No subtitle found for video"
        return err
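# A sketch of the imports transcribe appears to rely on; `bs` is assumed to be
# the alias the original author uses for BeautifulSoup.
import re

import requests
from bs4 import BeautifulSoup as bs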
def getTranscriptByLink(self, video_id):
    vectorOfWords = ""
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        transcript_list_Raw = YouTubeTranscriptApi.get_transcript(
            video_id, languages=['en'])
        for transcript in transcript_list_Raw:
            vectorOfWords = vectorOfWords + " " + transcript['text']
        return [video_id, vectorOfWords]
    except Exception:
        print("No English Transcript. For future use add British English")
def test_translate_transcript(self):
    transcript = YouTubeTranscriptApi.list_transcripts(
        'GJLlxj_dtq8').find_transcript(['en'])
    translated_transcript = transcript.translate('af')
    self.assertEqual(translated_transcript.language_code, 'af')
    self.assertIn('&tlang=af', translated_transcript._url)
def get_youtube_subtitle(self, youtube_id, generated_mode, language_code):
    '''
    Get youtube subtitle using youtube_transcript_api

    Parameters
    ----------
    youtube_id : string
        youtube video ID
    generated_mode : boolean
        Generated or manual subtitle (True or False)
    language_code : list
        list of strings containing subtitle language codes, e.g. ['en', 'fr', 'ar']

    Returns
    -------
    list
        list containing youtube subtitle information
        [{text, start, duration}, ...]
    '''
    transcript_list = YouTubeTranscriptApi.list_transcripts(youtube_id)
    if generated_mode:
        return transcript_list.find_generated_transcript(
            language_code).fetch()
    return transcript_list.find_manually_created_transcript(
        language_code).fetch()
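# A minimal usage sketch for get_youtube_subtitle (not from the source;
# `client` is a hypothetical instance of the class that defines the method,
# and the language order is just an example):
def demo_get_youtube_subtitle(client, youtube_id):
    cues = client.get_youtube_subtitle(youtube_id, True, ['en', 'fr', 'ar'])
    for cue in cues:
        print(cue['start'], cue['text'])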
def get_english_subs(x_list):
    '''Given a list of video_ids, get auto-generated subtitles using a
    YouTube api'''
    failed_list = []
    success_list = []
    print(f'Extracting cc will take at least {(len(x_list) * 3) / 60} minutes.')
    for vid in x_list:
        vid_id = vid[2]
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(vid_id)
            tra = transcript_list.find_transcript(['en'])
            subs = tra.fetch()
            success_list.append((vid_id, subs))  # modified
            sleep(3)
        except Exception:
            failed_list.append(vid)
    if len(failed_list) > 0:
        print('_' * 50)
        print(f'{len(failed_list)} transcripts out of {len(x_list)} '
              f'couldn\'t be extracted')
        print('This could be because auto-generated cc wasn\'t enabled,\n'
              'Or English couldn\'t be recognized as the language of the video.')
        print('\n')
        print('The following videos couldn\'t be processed:')
        for i in failed_list:
            print(i)
        print('*' * 50)
    else:
        print('All videos transcripts were extracted')
    print('\n')
    print(f'Only {len(success_list)} transcripts were extracted.')
    return success_list
def get_form(request):
    # if this is a POST request we need to process the form data
    if request.method == 'POST':
        # create a form instance and populate it with data from the request:
        form = YoutubeForm(request.POST)
        # check whether it's valid:
        if form.is_valid():
            # process the data in form.cleaned_data as required
            # ...
            # redirect to a new URL:
            if request.POST['video_link']:
                # grab video transcript
                video_param = request.POST['video_link']
                parsed_url = urlparse(video_param)
                if not parsed_url.query:
                    video_id = video_param
                else:
                    url_params = parse_qs(parsed_url.query)
                    video_id = url_params.get('v')[0]
                print(video_id)
                transcript_list = YouTubeTranscriptApi.list_transcripts(
                    video_id)
                transcript = transcript_list.find_transcript(['en-US', 'en'])
                full_transcript = transcript.fetch()
                return render(request, 'transcript.html',
                              {'transcript_items': full_transcript})
            # return template with transcript and copiable json object?
            return HttpResponseRedirect('/thanks')
    # if a GET (or any other method) we'll create a blank form
    else:
        form = YoutubeForm()
    return render(request, 'form.html', {'form': form})
def download_auto_generated_transcript(target_url, output_path):
    """
    Downloads the auto-generated transcript that Google creates for YouTube.

    :param target_url: Video url
    :param output_path: Name and path where to save the transcript in JSON format.

    Example:
        download_auto_generated_transcript(
            "https://www.youtube.com/watch?v=vJ6MrDO0kgY",
            "Democratic Presidential Debate - June 26.json")
        OUT: /path/to/Democratic Presidential Debate - June 26.json
    """
    video_id = target_url.replace("https://www.youtube.com/watch?v=", '')
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    auto_generated_transcript = None
    for transcript in transcript_list:
        if transcript.language_code == 'en':
            auto_generated_transcript = transcript.fetch()
    if auto_generated_transcript:
        with open(output_path, 'w') as f:
            json.dump(auto_generated_transcript, f, indent=4)
        return os.path.abspath(output_path)
    else:
        print("Couldn't find english transcript")
def test_translate_transcript__not_translatable(self):
    transcript = YouTubeTranscriptApi.list_transcripts(
        'GJLlxj_dtq8').find_transcript(['en'])
    transcript.translation_languages = []
    with self.assertRaises(NotTranslatable):
        transcript.translate('af')
def get_transcription(url):
    """
    Takes a YouTube video URL and extracts the automatically-generated
    transcript from it.
    """
    # Checks the format of the URL
    if "https://www.youtube.com/watch?v=" in url:
        input_url_id = url.replace("https://www.youtube.com/watch?v=", "")
    elif "https://youtu.be/" in url:
        input_url_id = url.replace("https://youtu.be/", "")
    else:
        # Assume a bare video id was passed
        input_url_id = url
    # Creates a blank list to iterate over
    text_parts = []
    # Gets a list of all available transcripts
    try:
        list_of_transcripts = YouTubeTranscriptApi.list_transcripts(
            input_url_id)
        print("Checking for Transcriptions...")
        # Prefer a manual transcript; fall back to a generated one
        if 'en-US' in list_of_transcripts._manually_created_transcripts:
            print("Manual Transcription Found.")
            transcript = list_of_transcripts.find_manually_created_transcript(
                ['en-US'])
        elif 'en' in list_of_transcripts._manually_created_transcripts:
            print("Manual Transcription Found.")
            transcript = list_of_transcripts.find_manually_created_transcript(
                ['en'])
        elif 'en' in list_of_transcripts._generated_transcripts:
            print("Auto-Generated Transcription Found.")
            transcript = list_of_transcripts.find_generated_transcript(['en'])
        # Saves the transcript into a variable to iterate over
        raw_transcription = transcript.fetch()
        # Extracts the 'text' key of each entry
        for entry in raw_transcription:
            text_parts.append(entry['text'])
        # Joins the parts with a single space and returns the cleaned transcript
        clean_transcription = " ".join(text_parts)
        return clean_transcription
    except Exception:
        print("No Transcriptions Found")
        return "No Transcriptions Found"
def _has_manually_created_transcript(self, youtube_video_id):
    transcript_list = YouTubeTranscriptApi.list_transcripts(youtube_video_id)
    try:
        transcript_list.find_manually_created_transcript(["en"])
    except Exception:
        return False
    return True
def get_transcript(videoId, language):
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(videoId)
        transcript = transcript_list.find_manually_created_transcript(
            [language])
        return transcript.fetch()
    except Exception:
        return None
def fetch_man_chosen(video_id, lang):
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    target = transcript_list.find_manually_created_transcript(
        language_codes=[lang])
    with open('ccaptions.txt', 'w', encoding='utf-8') as f:
        for snippet in target.fetch():
            pprint.pprint(snippet)
            f.write(snippet['text'] + ' ')
    return 'ccaptions.txt'
def video_has_en_ts(self, video: str) -> bool:
    try:
        res = tsApi.list_transcripts(video)
    except (TranscriptsDisabled, KeyError):
        return False
    try:
        res.find_generated_transcript(["en"])
        return True
    except NoTranscriptFound:
        return False
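# The imports video_has_en_ts relies on, spelled out for completeness: the
# `tsApi` alias matches the call site above, and TranscriptsDisabled /
# NoTranscriptFound are real youtube_transcript_api exception classes.
from youtube_transcript_api import YouTubeTranscriptApi as tsApi
from youtube_transcript_api import TranscriptsDisabled, NoTranscriptFound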
def get_available_lang(video_id):
    codes = []
    manual = []
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    for transcript in transcript_list:
        if not transcript.is_generated:
            manual.append(transcript.language)
        codes.append(transcript.language_code)
    return manual, codes
def test_list_transcripts__find_generated(self):
    transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8')
    with self.assertRaises(NoTranscriptFound):
        transcript_list.find_generated_transcript(['cs'])
    transcript = transcript_list.find_generated_transcript(['en'])
    self.assertTrue(transcript.is_generated)
def ATL(self):
    atl = YTA.list_transcripts(self.video_id)
    # Prefer a manually created transcript; fall back to a generated one
    try:
        ret = atl.find_manually_created_transcript(self.LANGUAGE_LIST)
    except Exception:
        ret = atl.find_generated_transcript(self.LANGUAGE_LIST)
    return ret
def test_list_transcripts(self):
    transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8')
    language_codes = {
        transcript.language_code for transcript in transcript_list
    }
    self.assertEqual(
        language_codes,
        {'zh', 'de', 'en', 'hi', 'ja', 'ko', 'es', 'cs'})
def fetch_auto_chosen(video_id, lang):
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    target = transcript_list.find_generated_transcript(['en'])
    if target:
        with open('tcaptions.txt', 'w', encoding='utf-8') as f:
            for snippet in target.translate('uk').fetch():
                pprint.pprint(snippet)
                f.write(snippet['text'] + '\n')
        return 'tcaptions.txt'
    else:
        return None
def get_trans(video_url, language):
    trans = ''
    try:
        trans = YouTubeTranscriptApi.get_transcript(
            video_url, languages=[language])
    except Exception:
        transcript_languages = YouTubeTranscriptApi.list_transcripts(video_url)
        for transcript in transcript_languages:
            try:
                trans = transcript.translate(language).fetch()
                break
            except Exception:
                pass
    return trans
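# Hedged usage sketch for get_trans (the video id is an illustrative
# placeholder, not from the source): ask for German directly, falling back to
# a translation of whatever transcript the video actually has.
def demo_get_trans():
    transcript = get_trans('dQw4w9WgXcQ', 'de')
    if transcript:
        for cue in transcript:
            print(cue['text'])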
def get_manually_created_transcript(self, video_id):
    '''
    Gets the transcript of a youtube video.

    Args:
        video_id: a string representing the id of a youtube video

    Returns:
        An array of dictionaries, each containing duration, start, and text
        key pairs that cover the manually created transcript for the youtube
        video identified by video_id. If none exists, an error is raised.
    '''
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    transcript = transcript_list.find_manually_created_transcript(['en'])
    return transcript.fetch()
def __init__(self, video_id):
    self.video_id = video_id
    self.lang = ['en']
    try:
        transcription_list = YouTubeTranscriptApi.list_transcripts(
            self.video_id)
        gencc = transcription_list.find_generated_transcript(['en'])
        self.content = gencc.fetch()
    except Exception:
        print("Error!")
def labelVideo(conn_dest, vid, best, transcript, filledIn, autogen, verbose):
    if filledIn:
        # Must have been a manual transcript, so we need the autogen one
        transcript_list = YouTubeTranscriptApi.list_transcripts(vid)
        transcript_auto = transcript_list.find_generated_transcript(
            ["en"]).fetch()
    # Stitch together the transcript into a single string.
    # Use the tokenized string to label each word as sponsor or not.
    seq = []
    full_text = ""
    segList = best.copy()
    for t in transcript:
        tStart = t["start"]
        tEnd = tStart + t["duration"]
        if filledIn:
            # iterate over a snapshot, since segList is mutated in the loop
            for b in list(segList):
                if b[0] <= tStart:
                    string, totalNumWords = extractText(b, transcript_auto)
                    full_text, seq = appendData(full_text, seq, string, tStart,
                                                tEnd, best, 1, verbose)
                    segList.remove((b[0], b[1], b[2]))
        raw_text = t["text"].replace("\n", " ")
        # strip out this unicode
        raw_text = re.sub(" +", " ", raw_text.replace(r"\u200b", " "))
        full_text, seq = appendData(full_text, seq, raw_text, tStart, tEnd,
                                    best, 0, verbose)
    if filledIn:
        for b in segList:
            if b[0] > transcript[-1]["start"]:
                tStart = transcript[-1]["start"]
                tEnd = tStart + transcript[-1]["duration"]
                string, totalNumWords = extractText(b, transcript_auto)
                full_text, seq = appendData(full_text, seq, string, tStart,
                                            tEnd, best, 1, verbose)
    # format text
    full_text = re.sub(" +", " ", full_text).replace("'", "''")
    # insert text and labels into db
    cursor = conn_dest.cursor()
    cursor.execute(
        f"insert into SponsorStream values ('{vid}', '{full_text}', '{seq}', "
        f"{autogen}, {filledIn}, 1, current_date)"
    )
    conn_dest.commit()
    return
def transcribe_video(video_id, translate, language_from, translate_to):
    if translate:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        transcript = transcript_list.find_transcript([language_from])
        translated_transcript = transcript.translate(translate_to)
        transcript_fetched = translated_transcript.fetch()
    else:
        transcript_fetched = YouTubeTranscriptApi.get_transcript(video_id)
    total_text = ""
    for t in transcript_fetched:
        total_text += t["text"] + " "
    print(total_text)
    pc.copy(total_text)
    print("text copied")
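# Hedged example call for transcribe_video (video id and language codes are
# illustrative placeholders, not from the source): fetch the French
# transcript, translate it to English, and let the function copy the joined
# text to the clipboard via `pc` (presumably pyperclip).
def demo_transcribe_video():
    transcribe_video('dQw4w9WgXcQ', True, 'fr', 'en')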
def text_download(self):
    captions = self.myVideo.captions.get_by_language_code('en')
    abs_path = self.video_path / self.filename
    self.abs_path_suffix = abs_path.with_suffix('.txt')
    if captions:
        transcript_list = YouTubeTranscriptApi.list_transcripts(
            self.myVideo.video_id)
        transcript = transcript_list.find_transcript(['en'])
        with open(self.abs_path_suffix, 'w') as f:
            for line in transcript.fetch():
                f.write('%s\n' % line)
    else:
        print("No captions available in English")
        with open(self.abs_path_suffix, 'w') as f:
            f.write("No captions available in English")
    return True
def main(args):
    os.makedirs(args.transcript_folder, exist_ok=True)
    # Open the main ids file
    with open(args.youtube_ids_file, "r") as f:
        for line in f:
            video_id = line.strip()
            print('Processing', video_id)
            # retrieve the available transcripts
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            # you can also directly filter for the language you are looking
            # for, using the transcript list
            transcript = transcript_list.find_transcript(['en'])
            transcript = transcript.fetch()
            with open(path.join(args.transcript_folder,
                                video_id + '.json'), "w") as o:
                json.dump(transcript, o)
def returnVideoLangages(vidID):
    try:
        languages = YTTranscriptAPI.list_transcripts(vidID)
    except Exception as e:
        print(e)
        print("VidID: " + vidID)
        return -1
    ret = []
    for lang in languages:
        ret.append({
            'code': lang.language_code,
            'name': lang.language,
            'isGenerated': lang.is_generated,
            'isTranslatable': lang.is_translatable,
        })
    return ret
def fetch_transcript(vid_id, **kwargs) -> Tuple[str, List[Dict[str, Any]]]:
    params = {"format": "json",
              "url": f"https://www.youtube.com/watch?v={vid_id}"} | kwargs
    url = "https://www.youtube.com/oembed"
    query_string = parse.urlencode(params)
    url = url + "?" + query_string
    print(f"Fetching from {url}")
    with urllib.request.urlopen(url) as response:
        response_text = response.read()
        title = json.loads(response_text.decode())["title"]
    # retrieve the available transcripts
    transcript_list = YouTubeTranscriptApi.list_transcripts(vid_id)
    return title, transcript_list.find_transcript(['en']).fetch()
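# A sketch of the imports fetch_transcript assumes (all standard library apart
# from youtube_transcript_api); the typing names match the annotation above.
import json
import urllib.request
from typing import Any, Dict, List, Tuple
from urllib import parse

from youtube_transcript_api import YouTubeTranscriptApi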