def Get_Transcript(video_id):
    """Fetch the English transcript for *video_id* and return it as plain text.

    Each caption entry's text is stripped of non-ASCII characters (lossy
    ``ascii`` encode with ``ignore`` — behavior preserved from the original)
    and terminated with a newline.
    """
    data = yta.get_transcript(video_id, languages=['en'])
    pieces = []
    for entry in data:
        text = entry.get("text")
        if text is not None:
            # Preserve the original lossy ASCII filtering of each line.
            pieces.append((text + "\n").encode("ascii", "ignore").decode())
    # Join once instead of quadratic += concatenation in the loop.
    return ''.join(pieces)
def download_subs_from_youtube_video(video_url):
    """Fetch subtitles for *video_url*; return a Subtitles object, or None
    when subtitles are disabled or the video is unavailable."""
    log = logging.getLogger()
    try:
        data = YouTubeTranscriptApi.get_transcript(video_url)
    except TranscriptsDisabled:
        log.warning(f'skip {video_url} - no subtitles')
        return None
    except VideoUnavailable:
        log.warning(f'skip {video_url} - video is unavailable')
        return None
    return Subtitles(video_id=video_url, subtitles_data=data)
def test_list_transcripts(self):
    """All available transcript language codes for the test video are listed."""
    transcript_list = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8')
    language_codes = {
        transcript.language_code for transcript in transcript_list
    }
    # The original expected-set literal contained 'en' twice; a set literal
    # collapses duplicates, so listing it once is equivalent and clearer.
    self.assertEqual(
        language_codes,
        {'zh', 'de', 'en', 'hi', 'ja', 'ko', 'es', 'cs'})
def load_data(video_id, keyword):
    """Print transcript lines containing *keyword*, their start times, and a
    timestamped youtu.be link for each match."""
    entries = YouTubeTranscriptApi.get_transcript(video_id)
    for entry in entries:
        if keyword not in entry['text']:
            continue
        print(entry['text'])
        start = entry['start']
        print(start)
        print(f'https://youtu.be/{video_id}?t={int(start)}')
def ATL(self):
    # Fetch the set of transcripts available for this video.
    atl = YTA.list_transcripts(self.video_id)
    ret = None
    # NOTE(review): the lookup below only runs when DEBUG_MODE is truthy,
    # so in non-debug runs this method always returns None -- confirm this
    # gate is intentional and not a debugging leftover.
    if DEBUG_MODE:
        try:
            # Prefer a human-made transcript in one of the preferred languages.
            ret = atl.find_manually_created_transcript(self.LANGUAGE_LIST)
        except Exception as e:
            # Fall back to an auto-generated transcript.
            ret = atl.find_generated_transcript(self.LANGUAGE_LIST)
    return ret
def get_transcript(video_url):
    """Extract the video id from a full YouTube URL and fetch its transcript.

    Returns:
        (transcript, video_id) on success, or ("Invalid URL", False) when
        the URL cannot be parsed or no English transcript is available
        (return contract kept for existing callers).
    """
    url_data = urlparse.urlparse(video_url)
    try:
        query = urlparse.parse_qs(url_data.query)
        video_id = query["v"][0]
        print(video_id)
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        return transcript, video_id
    # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
    # still propagate.
    except Exception:
        return "Invalid URL", False
def get_subtitle_info(self):
    """Return the raw transcript entries for this video, or None when no
    transcript exists in the preferred languages."""
    try:
        entries = YTA.get_transcript(self.video_id, languages=self.LANGUAGE_LIST)
        if self.DEBUG_MODE:
            for entry in entries:
                print(entry)
        return entries
    except Exception:
        # No transcript available -- report and fall through to None.
        print('자막이 없습니다')
def get_transcript(video_id, percent):
    """Fetch the English transcript of *video_id* and summarize it.

    Returns:
        The summary from ``get_summary``, or a dict with an "error" key
        when no transcript can be fetched.
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
    # Narrowed from a bare ``except:`` -- don't swallow KeyboardInterrupt.
    except Exception:
        # Fixed typo in the error message ("Trasncript" -> "Transcript").
        return {"error": "No Transcript found"}
    # Join once instead of quadratic += concatenation; trailing space per
    # line matches the original output exactly.
    transcript_string = "".join(line["text"] + " " for line in transcript)
    return get_summary(transcript_string, percent)
def getJSON(videoID, query):
    """Return a JSON array of {timestamp, phrase} entries for caption lines
    of *videoID* that contain *query* (case-insensitive). Stop-word queries
    yield an empty JSON array."""
    # Queries that are stop words are not worth searching.
    if query in final_stopWords:
        return json.dumps([])
    needle = query.lower()
    matches = [
        {"timestamp": str(entry['start']) + 's', "phrase": entry['text']}
        for entry in YouTubeTranscriptApi.get_transcript(videoID)
        if needle in entry['text'].lower()
    ]
    return json.dumps(matches)
def TL(self):
    """Return the full transcript as one space-joined string, or None.

    Prints the configured not-found message and returns None (implicitly)
    when no transcript exists in the preferred languages.
    """
    try:
        transcript_list = YTA.get_transcript(self.video_id,
                                             languages=self.LANGUAGE_LIST)
        if self.DEBUG_MODE:
            for subtitle in transcript_list:
                print(subtitle)
        # BUG FIX: get_transcript returns a list of dicts, so the original
        # ' '.join(transcript_list) always raised TypeError (silently caught
        # below). Join the 'text' field of each entry instead.
        return ' '.join(item['text'] for item in transcript_list)
    except Exception as e:
        print('자막이 없음.')
def get_youtube_cc(url):
    """Fetch German/English closed captions for a ``?v=`` style YouTube URL.

    Returns:
        (captions_text, True) on success, or an error message with False.
    """
    try:
        video_ids = [url.split('?v=')[1]]
        # Renamed from ``id`` to avoid shadowing the builtin.
        video_id = video_ids[0]
        cc = (YouTubeTranscriptApi.get_transcripts(video_ids,
                                                   languages=['de', 'en']))
        # Join once instead of quadratic += in a loop; each line keeps its
        # leading space, matching the original output exactly.
        captions = ''.join(' ' + line['text'] for line in cc[0][video_id])
        return (captions, True)
    except Exception as e:
        return ("Can't fetch from youtube captions", False)
def get_transcripts(video_url):
    """Return the English transcript of *video_url* as period-joined text.

    The URL's query string is assumed to look like ``v=<video_id>``.
    """
    # Parsing the URL to find the VIDEO ID.
    url_data = urlparse(video_url)
    video_id = url_data.query[2:]
    # The original fetched the transcript once and discarded the result --
    # that redundant network round-trip is removed.
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    transcript = transcript_list.find_transcript(['en'])
    trans_list = transcript.fetch()
    # Join once instead of quadratic += concatenation.
    return ''.join(d['text'] + '. ' for d in trans_list)
def add_picture(url):
    """Build 'brevis.docx' (and file1.txt) by pairing summary paragraphs
    with screenshot images whose filenames encode a capture time.

    NOTE(review): relies on a module-level ``directory`` and on
    'summary.txt' existing in the working directory -- confirm both are
    prepared by the caller before this runs.
    """
    print("adding picture")
    # Video id is whatever follows the watch-URL prefix.
    urlID = url.partition('https://www.youtube.com/watch?v=')[-1]
    transcript = YouTubeTranscriptApi.get_transcript(urlID)
    f1 = open('summary.txt')
    summary = f1.read()
    f1.close()
    f = open("file1.txt", "w")
    s = set()  # NOTE(review): assigned but never used -- candidate for removal.
    # Each entry: [paragraph text, list of image filenames matched to it].
    paras = [[i, []] for i in summary.split('\n')]
    j = 0
    for filename in os.listdir(directory):
        l = filename.split(".")
        # Filename convention: characters after the first 5 of the stem are
        # a capture time, presumably in milliseconds -- TODO confirm against
        # whatever produces the files in ``directory``.
        time = float(l[0][5:])
        j = 0
        while (j < len(transcript)):
            data = transcript[j]
            index = 0
            # Match the image to the caption whose (ms-scaled) time window
            # contains it, with a 2-second grace period at the end.
            if (time >= (data['start'] * 1000) and time < (((data['start'] + data['duration']) * 1000) + 2000)):
                text = data['text'].replace('\n', ' ')
                t = (text, time)  # NOTE(review): unused -- candidate for removal.
                # Attach the image to the first paragraph containing the text.
                while index < (len(paras)):
                    if (text in paras[index][0]):
                        if (filename not in paras[index][1]):
                            paras[index][1].append(filename)
                        break
                    index += 1
            j += 1
    document = Document()
    p = document.add_paragraph()
    r = p.add_run()
    # Emit every non-empty paragraph (to both the text file and the docx run),
    # followed by its matched images.
    for para in paras:
        if (para[0]):
            f.write(para[0])
            r.add_text(para[0])
            f.write("\n\n")
            r.add_text("\n\n")
        if (para[1]):
            for name in para[1]:
                r.add_picture(directory + '/' + name, width=Inches(3.0))
                f.write(name)
                f.write("\n")
                r.add_text('\n')
    document.save('brevis.docx')
    print("writing doc")
    f.close()
def getTranscript(videoID):
    """Return the English transcript of *videoID* translated to Braille.

    Returns the string "Video Link Invalid" when the transcript cannot be
    fetched (return contract kept for existing callers).
    """
    try:
        # Query youtube for captions.
        transcription = YouTubeTranscriptApi.get_transcript(videoID, languages=["en"])
    # Narrowed from a bare ``except:`` so system exits still propagate.
    except Exception:
        return "Video Link Invalid"
    # Translate each caption to Braille; join once instead of quadratic +=.
    return "".join(translateBraille(u'{}'.format(i["text"])) + "\n"
                   for i in transcription)
def trans():
    """POST endpoint: fetch and return the transcript for the submitted 'url'
    form argument (treated as a video id)."""
    if request.method == 'POST':
        parser = reqparse.RequestParser()
        parser.add_argument('url', type=str)
        url = parser.parse_args()['url']
        print("your url")
        print(url)
        print("your url")
        result = YouTubeTranscriptApi.get_transcript(url)
        print(result)
        return str(result)
def get_list(self):
    # Code from https://github.com/shahjaidev/NLP_Radicalization_detection/blob/master/get_and_parse_transcript_and_comments.py
    """Scrape the transcript for the video specified by ``self.video_id``.

    Returns:
        A list of caption text strings, or None when the transcript cannot
        be fetched (failure contract unchanged for callers).
    """
    try:
        output = YouTubeTranscriptApi.get_transcript(self.video_id)
        # Comprehension instead of a manual append loop.
        return [e['text'] for e in output]
    # Narrowed from a bare ``except:`` -- a bare except also swallows
    # KeyboardInterrupt/SystemExit.
    except Exception:
        return None
def get_transcript(vid_ID):
    """Return the caption texts of *vid_ID* as a list of strings.

    Raises:
        Exception: when the video has no captions transcript (type kept so
        existing ``except Exception`` callers still work).
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(vid_ID)
    # Narrowed from a bare ``except:`` and chained so the original cause
    # is not discarded.
    except Exception as err:
        raise Exception("This video doesn't have a captions transcript") from err
    # Takes the captions from the transcript and puts them into a list,
    # making a list of sentences.
    return [sub['text'] for sub in transcript]
def parse(videoid):
    """Fetch the transcript of *videoid* and return it as one string, each
    caption followed by a space (matching the original output)."""
    result = YouTubeTranscriptApi.get_transcript(videoid)
    print(result[0]['text'])
    # Join once instead of quadratic += concatenation; commented-out dead
    # code from the original removed.
    parsedContent = "".join(data['text'] + " " for data in result)
    print(parsedContent)
    return parsedContent
def build_captions_list(video_ids):
    """Fetch captions for each id in *video_ids*.

    Returns:
        A list of {'caption': ..., 'video_id': ...} dicts; videos whose
        transcript cannot be fetched are skipped.
    """
    result = []
    for step, video_id in enumerate(video_ids):
        print(step, video_id, len(video_ids))
        try:
            captions = YouTubeTranscriptApi.get_transcript(video_id)
        # Narrowed from a bare ``except:`` so Ctrl-C still interrupts the loop.
        except Exception:
            continue
        result.append({'caption': captions, 'video_id': video_id})
        print(len(captions))
        print('---*---')
    return result
def compareVideo(VideoOne, VideoTwo):
    """Score transcript similarity between two videos via shared token counts.

    Returns:
        (score, common) where ``common`` maps each shared token to its
        minimum count across the two videos, or (False, False) when either
        transcript cannot be fetched.
    """
    try:
        resOne = YouTubeTranscriptApi.get_transcript(VideoOne)
        print(resOne)
        resTwo = YouTubeTranscriptApi.get_transcript(VideoTwo)
    # Narrowed from a bare ``except:``; failure contract unchanged.
    except Exception:
        return False, False
    # Join once instead of quadratic += concatenation.
    VideoOneTxt = "".join(output['text'] + " " for output in resOne)
    VideoTwoTxt = "".join(output['text'] + " " for output in resTwo)
    VideoOneBucket = getToken(VideoOneTxt)
    VideoTwoBucket = getToken(VideoTwoTxt)
    score = 0
    Common = {}
    for word in VideoOneBucket:
        if word in VideoTwoBucket:
            # Compute the overlap once instead of twice.
            overlap = min(VideoOneBucket[word], VideoTwoBucket[word])
            score += overlap
            Common[word] = overlap
    return score, Common
def get_transcript_from_vidids(video_ids):
    """Fetch English transcripts for all ids and dump their caption text to
    'sample_caption.txt' (space-separated)."""
    # Iterate the ids directly instead of range(len(...)).
    script = [
        YouTubeTranscriptApi.get_transcript(vid, languages=['en'])
        for vid in video_ids
    ]
    with open('sample_caption.txt', "w") as filehandle:
        for transcript in script:
            for listitem in transcript:
                # removing punctuation and numbers and brackets
                filehandle.write(listitem.get('text') + " ")
def test_get_transcript__with_cookies(self):
    """Transcript fetch succeeds when a cookies file is supplied."""
    dirname, filename = os.path.split(os.path.abspath(__file__))
    cookie_path = dirname + '/example_cookies.txt'
    expected = [
        {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
        {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16},
        {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
    ]
    transcript = YouTubeTranscriptApi.get_transcript('GJLlxj_dtq8', cookies=cookie_path)
    self.assertEqual(transcript, expected)
def test_get_transcript__with_proxy(self):
    """Transcript fetch succeeds when a proxy mapping is supplied."""
    # BUG FIX: the original dict used the key 'https:' (stray colon), which
    # requests ignores -- the https proxy setting was never applied.
    proxies = {'http': '', 'https': ''}
    transcript = YouTubeTranscriptApi.get_transcript(
        'GJLlxj_dtq8', proxies=proxies
    )
    self.assertEqual(
        transcript,
        [
            {'text': 'Hey, this is just a test', 'start': 0.0, 'duration': 1.54},
            {'text': 'this is not the original transcript', 'start': 1.54, 'duration': 4.16},
            {'text': 'just something shorter, I made up for testing', 'start': 5.7, 'duration': 3.239}
        ]
    )
def __init__(self, url, id, API_KEY):
    """Store url/id and eagerly fetch caption data (None when unavailable).

    API_KEY is only needed if the Data API lookup below is re-enabled.
    """
    # You can use the youtube data api v3 by uncommenting the following 3
    # lines and passing an API key to the class:
    # callUrl = f"https://www.googleapis.com/youtube/v3/videos?part=snippet&id={id}&key={API_KEY}"
    # self.respose = requests.get(callUrl)
    # self.data = json.loads(self.respose.text)
    try:
        self.captionData = YouTubeTranscriptApi.get_transcript(id)
    # Narrowed from a bare ``except:``; missing captions still map to None.
    except Exception:
        self.captionData = None
    self.url = url
    self.id = id
    self.title = self.getVideoTitle()
def get_video_subtitles(self, video_id, subtitles_langs):
    """Fetch subtitles for each requested language.

    Returns:
        dict mapping language code -> list of formatted subtitle entries;
        languages with no available transcript are omitted.
    """
    subtitles = {}
    for lang in subtitles_langs:
        try:
            subs = YouTubeTranscriptApi.get_transcript(video_id, languages=[lang])
            formatted_subs = [
                self._make_right_subtitles_format(s, video_id) for s in subs
            ]
            # BUG FIX: the original stored under the literal string key
            # 'lang', so every language overwrote the same dict entry.
            # Key by the actual language code instead.
            subtitles[lang] = formatted_subs
        except Exception:
            # Best-effort: skip languages without transcripts.
            pass
    return subtitles
def fetch_auto_chosen(video_id, lang):
    # NOTE(review): the ``lang`` parameter is never used -- the generated
    # transcript is always looked up as 'en' and translated to 'uk'.
    # Confirm whether ``lang`` was meant to drive one of those choices.
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    target = transcript_list.find_generated_transcript(['en'])
    # if target:
    # print(target)
    # NOTE(review): ``find_generated_transcript`` raises when nothing is
    # found rather than returning a falsy value, so the ``else`` branch
    # below looks unreachable -- verify against the library version in use.
    if target:
        with open('tcaptions.txt', 'w', encoding='utf-8') as f:
            # Translate each snippet to Ukrainian; one caption per line.
            for snippet in target.translate('uk').fetch():
                pprint.pprint(snippet)
                f.write(snippet['text'] + '\n')
        return 'tcaptions.txt'
    else:
        return None
def get_transcript(self, video_id):
    """Get the transcript of a youtube video.

    Args:
        video_id: a string representing the id of a youtube video.

    Returns:
        An array of dictionaries, each containing duration, start, and text
        key pairs that cover the transcription of the video identified by
        ``video_id``. A manually created transcript is used if available,
        otherwise the auto-generated one.
    """
    return YouTubeTranscriptApi.get_transcript(video_id)
def yt_trans(url, is_id=False):
    """Return the concatenated transcript text for a video URL or bare id.

    Args:
        url: a video id (when ``is_id`` is True) or a youtube.com /
            youtu.be URL.
        is_id: treat ``url`` as a video id directly.

    Returns:
        All caption texts concatenated, each prefixed with a single space
        (matching the original output exactly).
    """
    # Resolve the video id; the two URL branches of the original differed
    # only in how the id was extracted, so they are unified here. The dead
    # ``url = 'https://youtu.be/' + id`` reassignments are removed.
    if is_id:
        video_id = url
    elif 'v=' in str(url):
        video_id = str(url).split('v=')[-1]
    else:
        video_id = str(url).split('/')[-1]
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    # Join once instead of quadratic += concatenation.
    return "".join(" " + str(x['text']) for x in transcript)
def findEnd(videoId):
    """Return the timestamp (seconds) where the video's outro likely begins.

    Scans captions after the 490-second mark for outro keywords
    (shout-outs, comments, ...); falls back to 570 seconds when no
    transcript exists or no keyword is found.
    """
    keywords = [
        "school", "high school", "shout outs", "shoutout", "shout out",
        "shoutouts", "comments"
    ]
    try:
        transcript = YouTubeTranscriptApi.get_transcript(videoId)
    # Narrowed from a bare ``except:``; fallback value unchanged.
    except Exception:
        return 570
    for line in transcript:
        if line['start'] > 490 and checkKeyWords(keywords, line['text']):
            return line['start']
    return 570
def __init__(self, video_id):
    """Store the video id and eagerly fetch its auto-generated English captions."""
    self.video_id = video_id
    self.lang = ['en']
    try:
        transcription_list = YouTubeTranscriptApi.list_transcripts(
            self.video_id)
        gencc = transcription_list.find_generated_transcript(['en'])
        self.content = gencc.fetch()
    # Narrowed from a bare ``except:``. Note self.content stays unset on
    # failure, so later attribute access can raise AttributeError.
    except Exception:
        print("Error!")
def trans_to_file(hlink, fi):
    """Write a line-wrapped transcript (with optional timestamps) for the
    video at *hlink* into file *fi*.

    NOTE(review): relies on module-level ``id_from_url``, ``youtube``
    (Data API client), ``flag_start``/``flag_end`` (timestamp toggles) and
    ``max_line`` (wrap width) -- confirm these are initialized before call.
    """
    length_of_line = 0
    ids = id_from_url(hlink)
    q = YouTubeTranscriptApi.get_transcript(ids)
    # Look up the video title via the YouTube Data API.
    results = youtube.videos().list(id=ids, part='snippet').execute()
    for result in results.get('items', []):
        title = "Title: " + result['snippet']['title']
    # NOTE(review): ``title`` is unbound if the Data API returns no items,
    # making the format() below raise NameError.
    line_string = "Transcript for YouTube video at https://www.youtube.com/watch?v={:s}\n\n{:s}\n\n".format(hlink, title)
    for q0 in q:
        a = q0['text']
        st = q0['start']
        en = q0['start'] + q0['duration']
        # Minute/second components for mm:ss.ss formatting.
        stm = int(st) // 60
        sts = st % 60
        enm = int(en) // 60
        ens = en % 60
        # Prefix the caption with start/end stamps per the global flags.
        if flag_start and flag_end:
            a = "({:02d}:{:05.2f}-{:02d}:{:05.2f}) {:s}".format(stm, sts, enm, ens, a)
        elif flag_start:
            a = "({:05.2f}) {:s}".format(st, a)
        elif flag_end:
            a = "(-{:05.2f}) {:s}".format(en, a)
        # A blank caption becomes a paragraph break and resets the line.
        if not a.strip():
            line_string += "\n"
            length_of_line = 0
        # Wrap when the caption would overflow the current output line.
        if len(a) + length_of_line >= max_line and length_of_line > 0:
            line_string += "\n" + a
            length_of_line = len(a)
        else:
            if length_of_line:
                line_string += " "
            line_string += a
            length_of_line += len(a) + 1
    print("Writing to", fi)
    f = open(fi, "w")
    f.write(line_string)
    f.close()
sys.exit("{:d} youtube video{:s} to files.".format(valid, 's' if valid == 1 else '')) video_id = re.sub(".*=", "", video_id) ids = video_id results = youtube.videos().list(id=ids, part='snippet').execute() title = "" for result in results.get('items', []): title = "Title: " + result['snippet']['title'] if not video_id: sys.exit("Specify video id or use -c for clipboard.") if not (print_output or write_output): sys.exit("Need to specify print or write output on. To launch, just use -jl.") q = YouTubeTranscriptApi.get_transcript(video_id) line_string = "Transcript for YouTube video at https://www.youtube.com/watch?v={:s}\n\n{:s}\n\n".format(video_id, title) for q0 in q: a = q0['text'] st = q0['start'] en = q0['start'] + q0['duration'] stm = int(st) // 60 sts = st % 60 enm = int(en) // 60 ens = en % 60 if flag_start and flag_end: a = "({:02d}:{:05.2f}-{:02d}:{:05.2f}) {:s}".format(stm, sts, enm, ens, a) elif flag_start: a = "({:05.2f}) {:s}".format(st, a)