def getCaptions(self, videoId):
    """Return the English captions of a video, preferring manual ones.

    The manually created English transcript is tried first, then the
    auto-generated one.

    Args:
        videoId: YouTube video id.

    Returns:
        The fetched transcript, or a single placeholder entry
        ``[{'text': '', 'start': -1, 'duration': -1}]`` when no English
        transcript could be retrieved.
    """
    # Built per call so callers never share (and mutate) one list.
    placeholder = [{'text': '', 'start': -1, 'duration': -1}]

    try:
        # List once and reuse for both lookups instead of hitting the
        # API a second time for the fallback.
        transcripts = YouTubeTranscriptApi.list_transcripts(videoId)
    except Exception:
        return placeholder

    finders = (
        transcripts.find_manually_created_transcript,
        transcripts.find_generated_transcript,
    )
    for find in finders:
        try:
            return find(['en']).fetch()
        except Exception:
            # Best-effort: fall through to the next source. `Exception`
            # (not a bare except) lets Ctrl-C / SystemExit propagate.
            continue
    return placeholder
def clean_transcript(link):
    """Fetch the auto-generated English transcript, censor it and save it.

    The transcript entries are censored in place with ``profanity.censor``
    and, unless a file for the video title already exists, written to
    ``<SAVE_TRANSCRIPT_PATH>/<title>.txt`` as one ``[text, start, duration]``
    list per line.

    Args:
        link: Video link; passed to ``gen_id`` and ``get_title``.

    Returns:
        The censored transcript as returned by ``fetch()``.
    """
    video_id = gen_id(link)

    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    transcript = transcript_list.find_generated_transcript(['en']).fetch()

    profanity.load_censor_words()
    for entry in transcript:
        entry['text'] = profanity.censor(entry['text'])

    title = get_title(link)
    # Compute the destination once instead of rebuilding it at every use.
    out_path = os.path.join(SAVE_TRANSCRIPT_PATH, title) + '.txt'

    if os.path.exists(out_path):
        print('File Already Exists!')
        print()
        return transcript

    # `with` guarantees the handle is closed even if a write fails.
    with open(out_path, 'a') as out_file:
        for entry in transcript:
            record = [entry['text'], entry['start'], entry['duration']]
            out_file.write(str(record))
            out_file.write('\n')
    print('Transcript saved to', out_path)

    return transcript
예제 #3
0
def fetch(Vid_ID):
    """Fetch an English transcript for a video and report its kind.

    Every transcript listed for the video is visited; each iteration
    looks up and fetches an English transcript again and overwrites
    ``script`` and ``type``, so the returned pair reflects the last
    transcript that was iterated.

    Args:
        Vid_ID: Video id passed to ``YTScript.list_transcripts``.

    Returns:
        Tuple ``(script, type)`` where ``type`` is ``"generated"`` or
        ``"manual"``.

    NOTE(review): if the listing yields no transcripts, ``script`` is
    never assigned and the final ``return`` raises UnboundLocalError.
    NOTE(review): the local name ``type`` shadows the builtin.
    """
    # Load all transcripts available for the video.
    possible_scripts = YTScript.list_transcripts(Vid_ID)
    # Original author's note: autogenerated transcripts seem to avoid
    # repeats better, at least for video 'CKZ58bXtQnU'.
    type = ''
    for scripts in possible_scripts:
        # First pass: only for translatable transcripts, look up plain 'en'.
        # NOTE(review): whatever this branch assigns is always overwritten
        # by the if/else below (or an exception propagates) -- it looks
        # redundant apart from the extra fetch; confirm before removing.
        if scripts.is_translatable:
            if scripts.is_generated:
                type = "generated"
                autogenerated = possible_scripts.find_generated_transcript(
                    ['en'])
                script = autogenerated.fetch()
            else:
                type = "manual"
                manual = possible_scripts.find_manually_created_transcript(
                    ['en'])
                script = manual.fetch()
        # Second pass: runs for every transcript, with a wider set of
        # English language codes.
        if scripts.is_generated:

            type = "generated"
            autogenerated = possible_scripts.find_generated_transcript(
                ['en', 'en-US', 'en-GB'])
            script = autogenerated.fetch()
        else:
            type = "manual"
            manual = possible_scripts.find_manually_created_transcript(
                ['en', 'en-US', 'en-GB'])
            script = manual.fetch()

    return script, type
예제 #4
0
def transcribe(video_id):
    """Build a punctuated subtitle page fragment for a YouTube video.

    The first available transcript is fetched, its text is sent to the
    punctuator service, and the video title is scraped from the watch
    page.

    Args:
        video_id: YouTube video id (string).

    Returns:
        dict with keys ``'subtitle'`` (punctuated text followed by an
        HTML link/embed for the video) and ``'title'`` (title with runs
        of non-word characters collapsed to spaces). If anything fails,
        or the video lists no transcripts at all, ``'subtitle'`` holds
        only the embed HTML and ``'title'`` the fixed error text.
    """
    embed_html = ("<h4>URL: https://www.youtube.com/watch?v=" + video_id
                  + "  </h4><iframe width='420' height='345' "
                  + "src='https://www.youtube.com/embed/" + video_id
                  + "'></iframe>")
    failure = {
        'subtitle': embed_html,
        'title': "Subtitle doesn't found for video",
    }

    try:
        # retrieve the available transcripts
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        # Only the first listed transcript is used: the loop body returns.
        for transcript in transcript_list:
            subtitles = "".join(
                subtitle['text'] + ' ' for subtitle in transcript.fetch())

            punctuated = requests.post('http://bark.phon.ioc.ee/punctuator',
                                       data={'text': subtitles})
            page = requests.get(
                "https://www.youtube.com/watch?v=" + video_id)
            # NOTE(review): no parser is named, so BeautifulSoup picks one
            # and warns; left as-is to avoid changing parse results.
            soup = bs(page.content)
            title = soup.find("meta", {"name": "title"})['content']
            # Raw string: '\W' in a plain literal is an invalid escape.
            clean_title = re.sub(r'\W+', ' ', title)
            return {
                'subtitle': punctuated.text + embed_html,
                'title': clean_title,
            }
    except Exception:
        # Best-effort page: any API/network/scrape failure degrades to the
        # plain embed. `Exception` keeps Ctrl-C / SystemExit working.
        return failure

    # The video listed no transcripts; previously this returned None.
    return failure
예제 #5
0
    def getTranscriptByLink(self, video_id):
        """Return the English transcript of a video as one string.

        Args:
            video_id: YouTube video id.

        Returns:
            ``[video_id, text]`` where ``text`` is every transcript
            segment prefixed by a single space and concatenated, or
            ``None`` (after printing a message) when the English
            transcript cannot be retrieved.
        """
        try:
            # The former extra list_transcripts() call was dropped: its
            # result was never used and it cost a second network request.
            segments = YouTubeTranscriptApi.get_transcript(
                video_id, languages=['en'])

            vectorOfWords = "".join(
                " " + segment['text'] for segment in segments)

            print("Am I next getting here?")
            return [video_id, vectorOfWords]
        except Exception:
            print("No English Transcript. For future use add British English")
            return None
예제 #6
0
    def test_translate_transcript(self):
        """Translating the English transcript targets the requested language."""
        available = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8')
        english = available.find_transcript(['en'])

        translated = english.translate('af')

        # Both the reported language and the request URL must reflect 'af'.
        self.assertEqual(translated.language_code, 'af')
        self.assertIn('&tlang=af', translated._url)
예제 #7
0
    def get_youtube_subtitle(self, youtube_id, generated_mode, language_code):
        '''
        Fetch a YouTube subtitle track through youtube_transcript_api.

        Parameters
        ----------
        youtube_id : string
            youtube video ID
        generated_mode : boolean
            True to fetch the generated subtitle, False for the manual one
        language_code : list
            list of subtitle language codes, e.g. ['en','fr','ar']

        Returns
        -------
        list
            subtitle entries as returned by ``fetch()``
            [{text, start, duration}, ...]

        '''
        available = YouTubeTranscriptApi.list_transcripts(youtube_id)
        # Pick the lookup method first, then run the shared find + fetch.
        find = (available.find_generated_transcript if generated_mode
                else available.find_manually_created_transcript)
        return find(language_code).fetch()
예제 #8
0
def get_english_subs(x_list):
    '''Fetch English subtitles for a list of videos.

    Args:
        x_list: sequence of video records; the video id is read from
            index 2 of each record.

    Returns:
        List of ``(video_id, subtitles)`` tuples for the videos whose
        English transcript could be fetched. Failures are reported on
        stdout and skipped.
    '''
    failed_list = []
    success_list = []
    print(f'Extracting cc will take at least {(len(x_list)*3)/60} minutes.')
    for vid in x_list:
        vid_id = vid[2]
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(vid_id)
            subs = transcript_list.find_transcript(['en']).fetch()
            success_list.append((vid_id, subs))
            # Pause between successful requests to throttle API usage.
            sleep(3)
        except Exception:
            # Best-effort batch: record the failure and keep going, but let
            # Ctrl-C / SystemExit through (a bare except would not).
            failed_list.append(vid)

    if failed_list:
        print('_' * 50)
        # The total is the input size; the previous `numb_vids` name was
        # not defined in this function and raised NameError here.
        print(
            f'{len(failed_list)} transcripts out of {len(x_list)} couldn\'t be extracted'
        )
        print(
            'This could be because auto-generated cc wasn\'t enabled,\nOr English couldn\'t be recognized as the language of the video.'
        )
        print('\n')
        print('The following videos couldn\'t be processed:')
        for failed in failed_list:
            print(failed)
            print('*' * 50)
    else:
        print('All videos transcripts were extracted')

    print('\n')
    print(f'Only {len(success_list)} transcirpts were extracted.')
    return success_list
예제 #9
0
def get_form(request):
    """Show the YouTube form, or render the transcript for a submitted video.

    On a valid POST with a non-empty ``video_link`` the link is treated
    as a bare video id when it has no query string, otherwise the ``v``
    query parameter is used; the 'en-US'/'en' transcript is fetched and
    rendered with ``transcript.html``. A valid POST without a link
    redirects to ``/thanks``. An invalid POST re-renders the bound form;
    any other method renders a blank form.
    """
    if request.method != 'POST':
        # GET (or any other method): blank form.
        return render(request, 'form.html', {'form': YoutubeForm()})

    form = YoutubeForm(request.POST)
    if not form.is_valid():
        return render(request, 'form.html', {'form': form})

    if not request.POST['video_link']:
        return HttpResponseRedirect('/thanks')

    # grab video transcript
    video_param = request.POST['video_link']
    parsed_url = urlparse(video_param)
    # Compare by value: `is ""` tests object identity, which is not
    # guaranteed for strings and triggers a SyntaxWarning on Python 3.8+.
    if parsed_url.query == "":
        video_id = video_param
    else:
        url_params = parse_qs(parsed_url.query)
        # NOTE(review): raises TypeError when the query has no `v`
        # parameter -- decide how such links should be reported.
        video_id = url_params.get('v')[0]
        print(video_id)

    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    transcript = transcript_list.find_transcript(['en-US', 'en'])
    full_transcript = transcript.fetch()
    return render(request, 'transcript.html',
                  {'transcript_items': full_transcript})
예제 #10
0
def download_auto_generated_transcript(target_url, output_path):
    """
    Download an English transcript of a YouTube video and save it as JSON.

    :param target_url: Video url of the form
        ``https://www.youtube.com/watch?v=<id>``; the prefix is stripped
        to obtain the video id.
    :param output_path: Name and path where to save the transcript in
        JSON format.
    :return: Absolute path of the written file, or ``None`` (after
        printing a message) when no transcript with language code 'en'
        is listed.

    Example:
    download_auto_generated_transcript("https://www.youtube.com/watch?v=vJ6MrDO0kgY", "Democratic Presidential Debate - June 26.json")
    OUT: /path/to/Democratic Presidential Debate - June 26.json
    """
    video_id = target_url.replace("https://www.youtube.com/watch?v=", '')
    # list_transcripts() takes the video id only; the second positional
    # parameter is `proxies`, so the former 'en' argument was being
    # passed as a proxy configuration rather than a language filter.
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

    auto_generated_transcript = None

    # NOTE(review): this keeps the last listed transcript whose code is
    # 'en' without checking `is_generated`, despite the function name.
    for transcript in transcript_list:
        if transcript.language_code == 'en':
            auto_generated_transcript = transcript.fetch()

    if not auto_generated_transcript:
        print("Couldn't find english transcript")
        return None

    with open(output_path, 'w') as f:
        json.dump(auto_generated_transcript, f, indent=4)

    return os.path.abspath(output_path)
예제 #11
0
    def test_translate_transcript__not_translatable(self):
        """translate() raises NotTranslatable when no target languages exist."""
        available = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8')
        english = available.find_transcript(['en'])
        # Clear the translation targets on the transcript under test.
        english.translation_languages = []

        with self.assertRaises(NotTranslatable):
            english.translate('af')
예제 #12
0
def get_transcription(url):
    """
    Return the English transcript text of a YouTube video as one string.

    Accepts ``https://www.youtube.com/watch?v=<id>`` and
    ``https://youtu.be/<id>`` URLs. A manually created 'en-US' transcript
    is preferred, then a manual 'en' one, then the auto-generated 'en'
    one. Segment texts are joined with single spaces.

    Returns the string "No Transcriptions Found" (also printed) when the
    URL format is not recognised, no such transcript exists, or fetching
    fails.
    """
    not_found = "No Transcriptions Found"

    # Extract the video id from the supported URL formats.
    if "https://www.youtube.com/watch?v=" in url:
        input_url_id = url.replace("https://www.youtube.com/watch?v=", "")
    elif "https://youtu.be/" in url:
        input_url_id = url.replace("https://youtu.be/", "")
    else:
        # Previously this left the id unbound and raised outside the try
        # block; report it through the function's normal failure value.
        print(not_found)
        return not_found

    try:
        list_of_transcripts = YouTubeTranscriptApi.list_transcripts(
            input_url_id)
        print("Checking for Transcriptions...")

        # NOTE(review): these are private attributes of the transcript
        # list; kept so the selection order and messages are unchanged.
        manual = list_of_transcripts._manually_created_transcripts
        if 'en-US' in manual:
            print("Manual Transcription Found.")
            transcript = list_of_transcripts.find_manually_created_transcript(
                ['en-US'])
        elif 'en' in manual:
            print("Manual Transcription Found.")
            transcript = list_of_transcripts.find_manually_created_transcript(
                ['en'])
        elif 'en' in list_of_transcripts._generated_transcripts:
            print("Auto-Generated Transcription Found.")
            transcript = list_of_transcripts.find_generated_transcript(['en'])
        else:
            print(not_found)
            return not_found

        # Join the 'text' of every segment with single spaces.
        return " ".join(part['text'] for part in transcript.fetch())

    except Exception:
        # Any API/network failure maps to the sentinel string; Ctrl-C and
        # SystemExit are no longer swallowed as they were by a bare except.
        print(not_found)
        return not_found
예제 #13
0
 def _has_manually_created_transcript(self, youtube_video_id):
     """Return True when a manually created 'en' transcript can be found.

     Errors from listing the transcripts propagate; only a failed lookup
     of the manual transcript is turned into ``False``.
     """
     available = YouTubeTranscriptApi.list_transcripts(youtube_video_id)
     try:
         available.find_manually_created_transcript(["en"])
     except Exception:
         return False
     else:
         return True
예제 #14
0
def get_transcript(videoId, language):
    """Fetch the manually created transcript of a video in one language.

    Args:
        videoId: YouTube video id.
        language: language code to look up.

    Returns:
        The fetched transcript, or ``None`` when listing, lookup or
        fetching fails.
    """
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(videoId)
        transcript = transcript_list.find_manually_created_transcript(
            [language])
        return transcript.fetch()
    except Exception:
        # `Exception` instead of a bare except so KeyboardInterrupt and
        # SystemExit are not silently converted into None.
        return None
예제 #15
0
def fetch_man_chosen(video_id, lang):
    """Write the manual transcript in ``lang`` to 'ccaptions.txt'.

    Each snippet is pretty-printed to stdout and its text is appended to
    the file followed by a space.

    Returns:
        The output file name.
    """
    output_name = 'ccaptions.txt'
    available = YouTubeTranscriptApi.list_transcripts(video_id)
    manual = available.find_manually_created_transcript(language_codes=[lang])
    with open(output_name, 'w', encoding='utf-8') as out:
        for entry in manual.fetch():
            pprint.pprint(entry)
            out.write(entry['text'] + ' ')
    return output_name
예제 #16
0
 def video_has_en_ts(self, video: str) -> bool:
     """Tell whether a generated 'en' transcript is available for ``video``.

     Returns False when transcripts are disabled, when listing raises
     KeyError, or when no generated English transcript is found.
     """
     try:
         transcripts = tsApi.list_transcripts(video)
     except (TranscriptsDisabled, KeyError):
         return False

     try:
         generated = transcripts.find_generated_transcript(["en"])
     except NoTranscriptFound:
         return False
     return bool(generated)
예제 #17
0
def get_available_lang(video_id):
    """List the languages of the non-generated transcripts of a video.

    Returns:
        Tuple ``(manual, codes)``: the ``language`` values and the
        matching ``language_code`` values, in listing order.
    """
    manual_transcripts = [
        transcript
        for transcript in YouTubeTranscriptApi.list_transcripts(video_id)
        if not transcript.is_generated
    ]
    manual = [transcript.language for transcript in manual_transcripts]
    codes = [transcript.language_code for transcript in manual_transcripts]
    return manual, codes
예제 #18
0
    def test_list_transcripts__find_generated(self):
        """Generated lookup fails for 'cs' and succeeds for 'en'."""
        available = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8')

        # No generated Czech transcript is expected for this video.
        with self.assertRaises(NoTranscriptFound):
            available.find_generated_transcript(['cs'])

        english = available.find_generated_transcript(['en'])
        self.assertTrue(english.is_generated)
예제 #19
0
    def ATL(self):
        """Look up a transcript for ``self.video_id`` in ``self.LANGUAGE_LIST``.

        The transcript list is always requested. The lookup itself only
        runs when ``DEBUG_MODE`` is truthy: the manually created
        transcript is preferred and the generated one is the fallback.
        Otherwise ``None`` is returned.

        NOTE(review): gating the lookup on DEBUG_MODE looks unintended --
        confirm with the caller before changing it.
        """
        available = YTA.list_transcripts(self.video_id)
        if not DEBUG_MODE:
            return None

        try:
            return available.find_manually_created_transcript(
                self.LANGUAGE_LIST)
        except Exception:
            return available.find_generated_transcript(self.LANGUAGE_LIST)
예제 #20
0
    def test_list_transcripts(self):
        """Listing yields exactly the expected set of language codes."""
        available = YouTubeTranscriptApi.list_transcripts('GJLlxj_dtq8')

        found_codes = set()
        for transcript in available:
            found_codes.add(transcript.language_code)

        # A set comparison, so a code listed twice (e.g. manual and
        # generated 'en') counts once.
        expected_codes = {'zh', 'de', 'en', 'hi', 'ja', 'ko', 'es', 'cs'}
        self.assertEqual(found_codes, expected_codes)
예제 #21
0
def fetch_auto_chosen(video_id, lang):
    """Write the generated 'en' transcript, translated to 'uk', to a file.

    Each translated snippet is pretty-printed to stdout and its text is
    written to 'tcaptions.txt', one snippet per line.

    Returns:
        'tcaptions.txt', or ``None`` when the transcript object is falsy.

    NOTE(review): ``lang`` is not used; the source language 'en' and the
    target 'uk' are hard-coded -- confirm which one it was meant to drive.
    """
    available = YouTubeTranscriptApi.list_transcripts(video_id)
    generated = available.find_generated_transcript(['en'])
    if not generated:
        return None

    with open('tcaptions.txt', 'w', encoding='utf-8') as out:
        for entry in generated.translate('uk').fetch():
            pprint.pprint(entry)
            out.write(entry['text'] + '\n')
    return 'tcaptions.txt'
예제 #22
0
 def get_trans(video_url,language):
     """Get a transcript in ``language``, translating another one if needed.

     The transcript is first requested directly in ``language``. If that
     fails, every listed transcript is tried in turn and the first one
     that can be translated to ``language`` and fetched is returned.

     Returns:
         The transcript, or ``''`` when no listed transcript could be
         translated.
     """
     try:
         return YouTubeTranscriptApi.get_transcript(
             video_url, languages=[language])
     except Exception:
         candidates = YouTubeTranscriptApi.list_transcripts(video_url)
         for candidate in candidates:
             try:
                 return candidate.translate(language).fetch()
             except Exception:
                 # This transcript cannot be translated; try the next one.
                 continue
     return ''
예제 #23
0
 def get_manually_created_transcript(self, video_id):
     ''' Fetch the manually created English transcript of a youtube video.

     Args:
         video_id: a string representing the id of a youtube video

     Returns:
         The fetched transcript for the video identified by video_id:
             an array of dictionaries each containing duration, start,
             and text keys. If no manually created English transcript
             exists, the lookup error propagates to the caller.
     '''
     available = YouTubeTranscriptApi.list_transcripts(video_id)
     manual = available.find_manually_created_transcript(['en'])
     return manual.fetch()
예제 #24
0
 def __init__(self, video_id):
     """Load the auto-generated English transcript for ``video_id``.

     Sets ``self.video_id``, ``self.lang`` (``['en']``) and
     ``self.content``. ``self.content`` holds the fetched transcript, or
     ``None`` when it could not be retrieved (in which case "Error!" is
     printed).
     """
     self.video_id = video_id
     self.lang = [
         'en',
     ]
     # Always define the attribute so later reads do not raise
     # AttributeError after a failed fetch.
     self.content = None
     try:
         transcription_list = YouTubeTranscriptApi.list_transcripts(
             self.video_id)
         gencc = transcription_list.find_generated_transcript(self.lang)
         self.content = gencc.fetch()
     except Exception:
         # `Exception` rather than a bare except: Ctrl-C / SystemExit must
         # not be reported as a transcript error.
         print("Error!")
예제 #25
0
def labelVideo(conn_dest, vid, best, transcript, filledIn, autogen, verbose):
    """Stitch a transcript into one labelled text and insert it into the DB.

    Walks ``transcript`` in order, appending each entry's cleaned text via
    ``appendData``. When ``filledIn`` is truthy, the auto-generated English
    transcript is fetched as well and text extracted from it for the
    segments in ``best`` is spliced in. The resulting text and label
    sequence are inserted as one row into ``SponsorStream``.

    Args:
        conn_dest: DB connection providing ``cursor()`` and ``commit()``.
        vid: video id; used for the transcript lookup and in the row.
        best: list of segments; ``b[0]`` is compared with entry start
            times -- presumably (start, end, ...) tuples, TODO confirm.
        transcript: sequence of dicts with "start", "duration", "text".
        filledIn: whether segments must be filled from the auto transcript.
        autogen: written into the inserted row as-is.
        verbose: forwarded to ``appendData``.

    Returns:
        None.
    """
    if filledIn:  #Must have been manual transcript, so we need the autogen
        transcript_list = YouTubeTranscriptApi.list_transcripts(vid)
        transcript_auto = transcript_list.find_generated_transcript(
            ["en"]).fetch()

    #Stitch together the transcript into a single string
    #Use the tokenized string to label each word as sponsor or not
    seq = []
    full_text = ""
    # Shallow copy so segments can be removed without touching `best`.
    segList = best.copy()

    for t in transcript:
        tStart = t["start"]
        tEnd = tStart + t["duration"]

        if filledIn:
            # Splice in every pending segment that starts at or before
            # this entry.
            # NOTE(review): segList is modified while it is being iterated,
            # which can skip the element after each removal -- confirm
            # whether that is relied upon.
            for b in segList:
                if b[0] <= tStart:
                    string, totalNumWords = extractText(b, transcript_auto)
                    # The 1/0 argument presumably flags spliced segment
                    # text vs. regular text -- verify against appendData.
                    full_text, seq = appendData(full_text, seq, string, tStart,
                                                tEnd, best, 1, verbose)
                    segList.remove((b[0], b[1], b[2]))

        raw_text = t["text"].replace("\n", " ")
        # NOTE(review): r"\u200b" is a raw string, i.e. the literal
        # backslash-u sequence rather than the U+200B character -- confirm
        # which one the transcripts actually contain.
        raw_text = re.sub(" +", " ",
                          raw_text.replace(r"\u200b",
                                           " "))  #strip out this unicode
        full_text, seq = appendData(full_text, seq, raw_text, tStart, tEnd,
                                    best, 0, verbose)

    if filledIn:
        # Segments starting after the last entry are appended using the
        # last entry's timing.
        for b in segList:
            if b[0] > transcript[-1]["start"]:
                tStart = transcript[-1]["start"]
                tEnd = tStart + transcript[-1]["duration"]
                string, totalNumWords = extractText(b, transcript_auto)
                full_text, seq = appendData(full_text, seq, string, tStart,
                                            tEnd, best, 1, verbose)

    # Collapse repeated spaces and double single quotes for the SQL literal.
    full_text = re.sub(" +", " ", full_text).replace("'", "''")  #format text

    #insert text and labels into db
    # NOTE(review): the statement is built by string interpolation; `vid`
    # and str(seq) are not escaped. Prefer a parameterized query using the
    # driver's placeholder style.
    cursor = conn_dest.cursor()
    cursor.execute(
        f"insert into SponsorStream values ('{vid}', '{full_text}' , '{seq}', {autogen}, {filledIn}, 1, current_date)"
    )
    conn_dest.commit()

    return
예제 #26
0
def transcribe_video(video_id, translate, language_from, translate_to):
    """Print a video's transcript text and copy it to the clipboard.

    When ``translate`` equals ``True`` the ``language_from`` transcript is
    translated to ``translate_to`` before fetching; otherwise the default
    transcript is fetched. Segment texts are concatenated, each followed
    by a space.
    """
    # Equality with True (not plain truthiness) is kept deliberately.
    if translate == True:
        available = YouTubeTranscriptApi.list_transcripts(video_id)
        source = available.find_transcript([language_from])
        segments = source.translate(translate_to).fetch()
    else:
        segments = YouTubeTranscriptApi.get_transcript(video_id)

    total_text = "".join(segment["text"] + " " for segment in segments)
    print(total_text)
    pc.copy(total_text)
    print("text copied")
예제 #27
0
 def text_download(self):
     """Write the English transcript of ``self.myVideo`` to a .txt file.

     The target is ``self.video_path / self.filename`` with the suffix
     replaced by '.txt'; it is stored on ``self.abs_path_suffix``. When
     the video reports no English captions, a fixed notice is printed and
     written instead.

     Returns:
         Always True.
     """
     captions = self.myVideo.captions.get_by_language_code('en')
     self.abs_path_suffix = (self.video_path / self.filename).with_suffix(
         '.txt')

     if not captions:
         print("No captions available in English")
         with open(self.abs_path_suffix, 'w') as f:
             f.write("No captions available in English")
         return True

     available = YouTubeTranscriptApi.list_transcripts(
         self.myVideo.video_id)
     english = available.find_transcript(['en'])
     with open(self.abs_path_suffix, 'w') as f:
         for line in english.fetch():
             f.write('%s\n' % line)
     return True
예제 #28
0
def main(args):
    """Download the English transcript of every listed video as JSON.

    Args:
        args: object with ``transcript_folder`` (output directory, created
            if missing) and ``youtube_ids_file`` (text file with one video
            id per line).

    Each transcript is written to ``<transcript_folder>/<id>.json``.
    Blank lines in the ids file are ignored.
    """
    os.makedirs(args.transcript_folder, exist_ok=True)
    # Open the main ids file
    with open(args.youtube_ids_file, "r") as ids_file:
        for line in ids_file:
            # `video_id` rather than `id`, which shadows the builtin.
            video_id = line.strip()
            if not video_id:
                # A blank or trailing empty line is not a video id;
                # skip it instead of sending an empty id to the API.
                continue
            print('Processing', video_id)
            # retrieve the available transcripts and pick the English one
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            transcript = transcript_list.find_transcript(['en']).fetch()

            out_path = path.join(args.transcript_folder, video_id + '.json')
            with open(out_path, "w") as out_file:
                json.dump(transcript, out_file)
예제 #29
0
def returnVideoLangages(vidID):
    """Describe every transcript available for a video.

    Returns:
        A list of dicts with keys 'code', 'name', 'isGenerated' and
        'isTranslatable', one per listed transcript. If listing fails,
        the error and the video id are printed and -1 is returned.
    """
    try:
        available = YTTranscriptAPI.list_transcripts(vidID)
    except Exception as e:
        print(e)
        print("VidID: " + vidID)
        return -1

    return [
        {
            'code': entry.language_code,
            'name': entry.language,
            'isGenerated': entry.is_generated,
            'isTranslatable': entry.is_translatable,
        }
        for entry in available
    ]
예제 #30
0
def fetch_transcript(vid_id, **kwargs) -> Tuple[str, List[Dict[str, Any]]]:
    """Return the title and the English transcript of a video.

    The title is read from the YouTube oEmbed endpoint; ``kwargs`` are
    merged into (and may override) the oEmbed query parameters. The
    request URL is printed before it is fetched.

    Returns:
        Tuple ``(title, transcript)`` where ``transcript`` is the fetched
        'en' transcript.
    """
    base_params = {
        "format": "json",
        "url": f"https://www.youtube.com/watch?v={vid_id}",
    }
    query_string = parse.urlencode(base_params | kwargs)
    url = f"https://www.youtube.com/oembed?{query_string}"

    print(f"Fetching from {url}")

    with urllib.request.urlopen(url) as response:
        payload = json.loads(response.read().decode())
        title = payload["title"]

    # retrieve the available transcripts and pick the English one
    available = YouTubeTranscriptApi.list_transcripts(vid_id)
    transcript = available.find_transcript(['en']).fetch()
    return title, transcript