Exemplo n.º 1
0
def vtt_to_df(fn):
    """
    Convert a WebVTT subtitle file to a DataFrame.

    args:
        fn - filepath to .vtt-file

    returns:
        DataFrame with one row per 'en-US' caption and the columns
        'time' (start formatted like '0h01m05s'), 'start' (start offset
        in whole seconds), 'duration' and 'text'
    """

    # WebVTT files are UTF-8; do not depend on the platform's locale encoding.
    with open(fn, encoding='utf-8') as f:
        text = f.read()

    vtt = WebVTTReader().read(text)

    subtitles = []
    for caption in vtt.get_captions('en-US'):
        # Parse the formatted start time once and reuse it for both columns.
        start = dt.datetime.strptime(caption.format_start(), '%H:%M:%S.%f')
        subtitles.append({
            # Built from the parsed fields instead of strftime: the old
            # pattern used '%m' (month, always '01' here) where minutes were
            # intended, and '%-H' is not supported on every platform.
            'time': '{}h{:02d}m{:02d}s'.format(start.hour, start.minute,
                                               start.second),
            'start': start.hour * 3600 + start.minute * 60 + start.second,
            # NOTE(review): the divisor is 100000; confirm the unit of
            # caption.start/end -- if they are microseconds this yields
            # tenths of a second rather than seconds.
            'duration': (caption.end - caption.start) / 100000,
            'text': caption.get_text(),
        })

    return pd.DataFrame(subtitles)
Exemplo n.º 2
0
def getCaptions(url, progress_cb, so_far, task_weight):
    """
    Fetch and parse the English captions of a video found via youtube_dl.

    args:
        url - video address handed to youtube_dl
        progress_cb - callable invoked as progress_cb(done, done) after the
            captions were parsed, with done = so_far + task_weight
        so_far - progress value accumulated before this task
        task_weight - progress value contributed by this task

    returns:
        the 'en-US' captions read from the downloaded vtt file, or an
        empty list when the video offers no english subtitles
    """
    ydl = youtube_dl.YoutubeDL({
        'writesubtitles': True,
        'allsubtitles': True,
        'writeautomaticsub': True
    })
    with ydl:
        res = ydl.extract_info(url, download=False)

    # 'requested_subtitles' may be empty/None, and may lack an 'en' entry even
    # when other languages exist -- look both up without raising KeyError.
    english = (res.get('requested_subtitles') or {}).get('en')
    if not english:
        # Previously this message sat after the return and was never shown.
        print('Youtube Video does not have any english captions')
        return []

    vtt_url = english['url']
    print('Grabbing vtt file from ' + vtt_url)
    response = requests.get(vtt_url)

    # Decode as UTF-8: caption text routinely contains non-ASCII characters
    # (accents, typographic quotes) that an 'ascii' decode would reject.
    arr = WebVTTReader().read(response.content.decode('utf-8'))
    progress_cb(so_far + task_weight, so_far + task_weight)
    return arr.get_captions('en-US')
Exemplo n.º 3
0
        new_captions = []
        for s, sentence in enumerate(self.sentences):
            for c, caption in enumerate(sentence.captions):
                trans = match[s][c]
                new_caption = deepcopy(caption.raw_caption)
                new_caption.nodes = [CaptionNode.create_text(trans.strip())]
                new_captions.append(new_caption)

                # print(f'"{caption.raw_text}"', f'"{trans}"')
        new_caption_set = CaptionSet({'en': new_captions})
        return new_caption_set


# Read the German WebVTT subtitles and feed every caption to a SentenceManager.
input_file = Path("./sendung-vom-15112020-video-ut102~_type-webvtt.vtt")
vtt_text = input_file.read_text(encoding='UTF-8')
read_srt = WebVTTReader().read(vtt_text, lang='de')

sentence_manager = SentenceManager()
for raw_caption in read_srt.get_captions('de'):
    caption = MyCaption(raw_caption)
    sentence_manager.add_caption(caption)

# Disabled steps, kept for reference (presumably used to produce the text
# file that was then corrected and translated by hand -- confirm):
# sentence_manager.finish()
# print(sentence_manager)
# sentence_manager.write_to_file(Path("./output.txt"))

# Match the corrected sentences against their translations, rebuild the
# caption set from that match and export it as SRT.
fixed_sentences_path = Path("./output_fixed.txt")
translated_sentences_path = Path("./translated.txt")
match = sentence_manager.match_translation_from_file(
    fixed_sentences_path,
    translated_sentences_path,
)
new_caption_set = sentence_manager.new_caption_set_from_match(match)

srt_output = SRTWriter().write(new_caption_set)
print(srt_output)

srt_path = Path("./translated.srt")
srt_path.write_text(srt_output, encoding='UTF-8')