Пример #1
0
    def test_save_specific_filename_no_extension(self):
        """Save to an extension-less path and expect a ``.vtt`` file on disk."""
        folder = os.path.join(OUTPUT_DIR, 'test_folder')
        os.makedirs(folder)

        captions = webvtt.read(self._get_file('one_caption.vtt'))
        captions.save(os.path.join(folder, 'custom_name'))

        # The assertion looks for the same name with '.vtt' appended.
        expected_file = os.path.join(folder, 'custom_name.vtt')
        self.assertTrue(os.path.exists(expected_file))
Пример #2
0
    def test_save_specific_filename(self):
        """Save to an explicit ``.vtt`` path and expect exactly that file."""
        folder = os.path.join(OUTPUT_DIR, 'test_folder')
        os.makedirs(folder)
        destination = os.path.join(folder, 'custom_name.vtt')

        captions = webvtt.read(self._get_file('one_caption.vtt'))
        captions.save(destination)

        self.assertTrue(os.path.exists(destination))
Пример #3
0
    def test_save_to_other_location(self):
        """Save into a directory and expect the source file name inside it."""
        folder = os.path.join(OUTPUT_DIR, 'test_folder')
        os.makedirs(folder)

        captions = webvtt.read(self._get_file('one_caption.vtt'))
        captions.save(folder)

        expected_file = os.path.join(folder, 'one_caption.vtt')
        self.assertTrue(os.path.exists(expected_file))
Пример #4
0
def convert_file(file_name, main_lang, sub_lang):
    """Print two subtitle tracks interleaved as simple HTML.

    The secondary track's file name is derived from ``file_name`` by
    replacing ``main_lang`` with ``sub_lang`` via ``rreplace``. Captions of
    the main track are printed as ``<h3>`` headings and captions of the
    secondary track as ``<p>`` paragraphs, merged by comparing their start
    timestamps. ``&lrm;`` entities are removed from the merged captions.

    Args:
        file_name: Path of the main-language ``.vtt`` file.
        main_lang: Language marker contained in ``file_name``.
        sub_lang: Language marker of the secondary file.

    Returns:
        None. All output goes to stdout.
    """
    file_name_sub = rreplace(file_name, main_lang, sub_lang)
    vtt_main = webvtt.read(file_name)
    vtt_sub = webvtt.read(file_name_sub)

    # Walk both tracks in lockstep; index_sub only ever moves forward.
    index_sub = 0
    for caption_main in vtt_main:
        while index_sub < len(vtt_sub):
            caption_sub = vtt_sub[index_sub]

            if caption_main.start <= caption_sub.start:
                # The main caption starts first: emit it and move on to the
                # next main caption without consuming the secondary one.
                print("<h3>" + caption_main.text.replace("&lrm;", "") +
                      "</h3>")
                break

            print("<p>" + caption_sub.text.replace("&lrm;", "") + "</p>")
            print("")
            index_sub += 1

    # Flush the secondary captions that are left once the main track is
    # exhausted. Look each one up by index: reusing the variable from the
    # loop above would print the same caption repeatedly (or fail with
    # NameError when that loop never ran).
    while index_sub < len(vtt_sub):
        print(vtt_sub[index_sub].text)
        index_sub += 1
Пример #5
0
    def _parse_vtt_file(file_path) -> List[Caption]:
        """Build a list of ``Caption`` objects from the VTT file at ``file_path``.

        First pass: for every cue, the first line containing ``<c>`` is split
        into ``Word`` objects using ``pattern_first`` / ``pattern_rest`` and
        ``Video._remove_tags``; these captions are created with
        ``is_word_aligned=True``. Word end times are not read from the file
        but estimated from neighbouring start times.

        Fallback: if the first pass produced nothing, the file is read again
        and each cue becomes a caption with ``is_word_aligned=False`` whose
        words come from ``raw_text.split()``; only the first word's start and
        the last word's end are set (to the cue bounds).

        Returns:
            The captions built by whichever pass produced results.
        """
        captions: List[Caption] = []
        for file_caption in webvtt.read(file_path):
            for line in file_caption.lines:
                # Only lines carrying <c> tags are treated as word-timed.
                if '<c>' not in line:
                    continue
                caption = Caption(is_word_aligned=True,
                                  start=file_caption.start_in_seconds,
                                  end=file_caption.end_in_seconds)

                # NOTE(review): presumably pattern_first always matches a line
                # containing '<c>'; if re.match returns None, first[0] raises
                # TypeError — confirm.
                first = re.match(pattern_first, line, re.M | re.I)
                first_word, start = Video._remove_tags(first[0])
                # Estimate the time here because it is not given in the vtt file
                if captions:
                    # Midpoint between this start value and the previous
                    # caption's last word start.
                    junction = round((start + captions[-1][-1].start) / 2, 3)
                    # NOTE(review): this compares the midpoint itself with 1,
                    # not the gap between the two words — confirm intent.
                    if junction > 1:
                        junction = start - 1
                    # The previous caption's last word ends where this
                    # caption's first word begins.
                    captions[-1][-1].end = junction
                    first_word = Word(text=first_word, start=junction)
                else:
                    # Very first word of the file: start one unit earlier,
                    # clamped at 0.
                    first_word = Word(
                        text=first_word,
                        start=max(round(start - 1, 3), 0))
                caption.append(first_word)

                rest = re.findall(pattern_rest, line, re.M | re.I)
                for match in rest:
                    next_word, start = Video._remove_tags(match)
                    # Drop a lone leading entry that has no word characters;
                    # otherwise close the previous word at this word's start.
                    if len(caption) == 1 and not re.search(r'\w', caption[0].text):
                        del caption[0]
                    else:
                        caption[-1].end = start
                    next_word = Word(text=next_word, start=start)
                    caption.append(next_word)
                captions.append(caption)
                # Only the first '<c>' line of each cue is used.
                break
        # If file is not word aligned with video
        if not captions:
            for file_caption in webvtt.read(file_path):
                caption = Caption(is_word_aligned=False,
                                  start=file_caption.start_in_seconds,
                                  end=file_caption.end_in_seconds)
                for word in file_caption.raw_text.split():
                    caption.append(Word(word.strip()))
                # Cues without any text are skipped entirely.
                if caption:
                    caption[0].start = file_caption.start_in_seconds
                    caption[-1].end = file_caption.end_in_seconds
                    captions.append(caption)
        else:
            # Also here, estimate the time which is not given in the vtt file
            captions[-1][-1].end = round(captions[-1][-1].start + 1, 3)

        return captions
Пример #6
0
    def test_save_identifiers(self):
        """Re-save a file that uses cue identifiers and compare its lines."""
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('using_identifiers.vtt'), OUTPUT_DIR)
        source_path = os.path.join(OUTPUT_DIR, 'using_identifiers.vtt')
        saved_path = os.path.join(OUTPUT_DIR, 'new_using_identifiers.vtt')

        webvtt.read(source_path).save(saved_path)

        with open(saved_path, 'r', encoding='utf-8') as saved_file:
            actual = [raw.rstrip() for raw in saved_file]

        # One cue per row; only cues 2 and 4 carry an identifier line.
        expected = [
            'WEBVTT', '',
            '00:00:00.500 --> 00:00:07.000', 'Caption text #1', '',
            'second caption',
            '00:00:07.000 --> 00:00:11.890', 'Caption text #2', '',
            '00:00:11.890 --> 00:00:16.320', 'Caption text #3', '',
            '4',
            '00:00:16.320 --> 00:00:21.580', 'Caption text #4', '',
            '00:00:21.580 --> 00:00:23.880', 'Caption text #5', '',
            '00:00:23.880 --> 00:00:27.280', 'Caption text #6',
        ]

        self.assertListEqual(actual, expected)
Пример #7
0
    def test_write_captions(self):
        """Append a two-line caption and check what ``write`` emits."""
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('one_caption.vtt'), OUTPUT_DIR)

        buffer = io.StringIO()
        vtt = webvtt.read(os.path.join(OUTPUT_DIR, 'one_caption.vtt'))
        vtt.captions.append(
            Caption('00:00:07.000',
                    '00:00:11.890',
                    ['New caption text line1', 'New caption text line2']))
        vtt.write(buffer)

        # Rewind so the buffer can be read back from the start.
        buffer.seek(0)
        actual = [raw.rstrip() for raw in buffer]

        expected = [
            'WEBVTT', '',
            '00:00:00.500 --> 00:00:07.000', 'Caption text #1', '',
            '00:00:07.000 --> 00:00:11.890',
            'New caption text line1',
            'New caption text line2',
        ]

        self.assertListEqual(actual, expected)
Пример #8
0
def getBreakdown(vtt):
    """Aggregate talk time and text per speaker for the transcript ``vtt``.

    Each entry of the result starts out as ``[name, time, text]``. The list is
    then passed through ``roundit`` and ``getSentiment``, printed, and
    returned.
    """
    breakdowns = []
    for caption in webvtt.read(vtt):
        name = getName(caption.text)
        if name == "NO NAME FOUND":
            continue

        if not searchName(name, breakdowns):
            # First caption for this speaker: start a new entry.
            breakdowns.append([
                name,
                timeDiff(caption.start, caption.end),
                getText(caption.text),
            ])
            continue

        # Known speaker: accumulate duration and text on the existing entry.
        entry = breakdowns[findName(name, breakdowns)]
        entry[1] = entry[1] + timeDiff(caption.start, caption.end)
        entry[2] = entry[2] + " " + getText(caption.text)

    breakdowns = getSentiment(roundit(breakdowns))

    for entry in breakdowns:
        print(entry[0], " talked for ", entry[1], " seconds")
        print("Sentiment: ")
        print(entry[2])

    return breakdowns


#getBreakdown("Example Transcript.vtt")
# getBreakdown("94923151321_audio_transcript_first-try.vtt")
# getBreakdown("94923151321_audio_transcript.vtt")
Пример #9
0
def parse_subtitles():
    """Render the subtitles in ``reference.vtt`` as HTML anchor blocks.

    Every caption becomes a ``<div class="subtitle-line">`` that wraps an
    ``<a>`` whose ``id`` is derived from the caption's start time. The file
    name is hard-coded; the ``.srt`` branch is kept for when it is changed.

    Returns:
        The concatenated HTML string (empty for unsupported extensions).
    """
    filename = 'reference.vtt'

    def render(time, text):
        # One subtitle entry; the markup is shared by both formats.
        return ('<div class="subtitle-line">\n'
                '<a id="' + str(time) + '" href="#">'
                + text +
                '</a>\n</div>\n\n<br>\n\n')

    if filename.endswith('.vtt'):
        # The slice picks the two-digit seconds field of 'HH:MM:SS.mmm'.
        cues = [(caption.start[-6:-4], caption.text)
                for caption in webvtt.read(filename)]
    elif filename.endswith('.srt'):
        print("reading SRT file")
        # Open the file only where it is needed and close it deterministically.
        with open(filename) as f:
            cues = [(line.start.seconds, line.content)
                    for line in srt.parse(f.read())]
    else:
        print("this script only accepts vtt or srt files")
        cues = []

    return "".join(render(time, text) for time, text in cues)
def get_caption(url):
    """Download the English subtitles for ``url`` and return them as one string.

    youtube-dl is asked to skip the media download and write the uploaded or
    auto-generated English subtitles to ``test.en.vtt``. The video title is
    stored in the module-level ``video_title``.

    Args:
        url: Video URL passed to youtube-dl.

    Returns:
        The caption texts joined by single spaces, with newlines replaced by
        spaces.
    """
    global video_title
    # Using Youtube-dl inside python
    ydl_opts = {
        'skip_download': True,  # Skipping the download of actual file
        'writesubtitles': True,  # Uploaded Subtitles
        "writeautomaticsub": True,  # Auto generated Subtitles
        "subtitleslangs": ['en'],  # Language Needed "en"-->English
        'outtmpl': 'test.%(ext)s',  # Saving downloaded file as 'test.en.vtt'
        'nooverwrites': False,  # Overwrite if the file exists
        'quiet': True  # Suppress youtube-dl's progress output
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([url])
            info_dict = ydl.extract_info(url, download=False)
            video_title = info_dict.get('title', None)
        except Exception:
            # Still best-effort, but no longer swallows KeyboardInterrupt or
            # SystemExit the way a bare ``except:`` does.
            print("Try with a YouTube URL")

    fragments = [caption.text for caption in webvtt.read('test.en.vtt')]
    # Join with a space so the last word of one caption does not fuse with
    # the first word of the next.
    return " ".join(fragments).replace('\n', ' ')
Пример #11
0
 def to_vtt_format(self,
                   attr: str = 'raw_text') -> None:
     """Print one attribute of every caption read from ``self.file_path``.

     attr: one of {start, end, text, raw_text, identifier}
     """
     for cue in webvtt.read(self.file_path):
         print('------')
         print(attr.upper())
         print(getattr(cue, attr))
Пример #12
0
def main(file_loc):
    """Merge every ``.vtt`` file in ``file_loc`` into one de-duplicated transcript.

    Caption texts are split into individual lines, ``&amp;`` is unescaped to
    ``&``, and any line identical to the one kept immediately before it is
    dropped. The result is written to ``cleaned_<directory name>.txt`` in the
    current working directory.

    Args:
        file_loc: Directory containing the ``.vtt`` files.
    """
    vtt_paths = [
        os.path.join(file_loc, name)
        for name in os.listdir(file_loc)
        if name.endswith(".vtt")
    ]

    lines = []
    for vtt_path in vtt_paths:
        for caption in webvtt.read(vtt_path):
            # Strip surrounding whitespace, then split multi-line captions so
            # that every text line is compared on its own.
            lines.extend(caption.text.strip().splitlines())

    # Remove consecutive repeated lines.
    kept = []
    previous = None
    for line in lines:
        line = line.replace("&amp;", "&")
        if line == previous:
            continue
        kept.append(line)
        previous = line
    # No ``previous.strip()`` here: it crashed on an empty directory
    # (``previous`` is None) and its result was never used.
    transcript = "\n".join(kept)

    filename = os.path.basename(os.path.normpath(file_loc))
    with open(f"cleaned_{filename}.txt", "w", encoding='utf8', errors="ignore") as f:
        f.write(transcript)
        print(f"Saved to cleaned_{filename}.txt")
Пример #13
0
 def test_parse_with_comments(self):
     """Parse ``comments.vtt`` and check the caption count and contents."""
     parsed = webvtt.read(self._get_file('comments.vtt'))
     self.assertEqual(len(parsed.captions), 3)

     first_lines = parsed.captions[0].lines
     self.assertListEqual(
         first_lines,
         ['- Ta en kopp varmt te.', '- Det är inte varmt.'])

     third_text = parsed.captions[2].text
     self.assertEqual(third_text, '- Ta en kopp')
Пример #14
0
    def test_parse_identifiers(self):
        """Check named, missing and numeric identifiers after parsing."""
        parsed = webvtt.read(self._get_file('using_identifiers.vtt'))
        self.assertEqual(len(parsed.captions), 6)

        second = parsed.captions[1]
        self.assertEqual(second.identifier, 'second caption')
        third = parsed.captions[2]
        self.assertEqual(third.identifier, None)
        fourth = parsed.captions[3]
        self.assertEqual(fourth.identifier, '4')
Пример #15
0
def generate_transcript(file):
    """Return the transcript text of a ``.vtt`` file.

    Caption lines are passed through ``ignore_descriptions`` (for square
    brackets, then for parentheses) and ``ignore_names``. A line equal to the
    one directly before it is skipped, and every kept line is followed by a
    single space. A path that does not end in ``.vtt`` prints an error and
    yields ``""``.
    """
    if not file.endswith(".vtt"):
        print("Error: non .vtt file passed in.")
        return ""

    # Flatten all captions into one list of text lines.
    captions = [
        text_line
        for cue in webvtt.read(file)
        for text_line in cue.text.strip().splitlines()
    ]

    brackets = ("[", "]")
    parenthesis = ("(", ")")

    # Three separate passes, in the same order as before.
    cleaned = [ignore_descriptions(text, brackets) for text in captions]
    cleaned = [ignore_descriptions(text, parenthesis) for text in cleaned]
    cleaned = [ignore_names(text) for text in cleaned]

    # Pair each line with its predecessor; the first line is compared to "".
    predecessors = [""] + cleaned[:-1]
    return "".join(
        current + " "
        for before, current in zip(predecessors, cleaned)
        if before != current
    )
Пример #16
0
def identify_file_format_data(filename, file_format, choose_index):
    """Load transcripts from a CSV or VTT file, selected by MIME type.

    Returns:
        ``(encounter_transcripts, data)``. For ``text/csv``, ``data`` is the
        loaded frame and the transcripts are its ``Encounter - Transcript``
        column, or a one-element list when ``choose_index`` is given. For
        ``text/vtt``, ``data`` is ``None`` and the transcripts are a list
        holding one list of caption strings. Any other ``file_format``
        returns ``None``.
    """
    if file_format == 'text/csv':
        data = pd.read_csv(filename, encoding="ISO-8859-1")
        # The result is not used here; the call itself is retained.
        manage_multiple_session(data)
        if choose_index is None:
            return data['Encounter - Transcript'], data
        return [data['Encounter - Transcript'][choose_index]], data

    if file_format == 'text/vtt':
        transcript = []
        for caption in webvtt.read(filename):
            pieces = caption.text.split(':')
            # Exactly one colon: keep only what follows it. Otherwise (no
            # colon, or several) keep the caption text unchanged.
            if len(pieces) == 2:
                transcript.append(pieces[1])
            else:
                transcript.append(caption.text)
        return [transcript], None

    return None
Пример #17
0
    def test_save_identifiers(self):
        """Re-save a file that uses cue identifiers and compare its lines."""
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('using_identifiers.vtt'), OUTPUT_DIR)
        source_path = os.path.join(OUTPUT_DIR, 'using_identifiers.vtt')
        saved_path = os.path.join(OUTPUT_DIR, 'new_using_identifiers.vtt')

        webvtt.read(source_path).save(saved_path)

        with open(saved_path, 'r', encoding='utf-8') as saved_file:
            actual = [raw.rstrip() for raw in saved_file]

        # One cue per row; only cues 2 and 4 carry an identifier line.
        expected = [
            'WEBVTT', '',
            '00:00:00.500 --> 00:00:07.000', 'Caption text #1', '',
            'second caption',
            '00:00:07.000 --> 00:00:11.890', 'Caption text #2', '',
            '00:00:11.890 --> 00:00:16.320', 'Caption text #3', '',
            '4',
            '00:00:16.320 --> 00:00:21.580', 'Caption text #4', '',
            '00:00:21.580 --> 00:00:23.880', 'Caption text #5', '',
            '00:00:23.880 --> 00:00:27.280', 'Caption text #6',
        ]

        self.assertListEqual(actual, expected)
Пример #18
0
 def test_parse_styles(self):
     """Parse ``styles.vtt`` and check the text of its first style block."""
     parsed = webvtt.read(self._get_file('styles.vtt'))
     self.assertEqual(len(parsed.captions), 1)

     expected_style = '::cue {background-image: linear-gradient(to bottom, dimgray, lightgray);color: papayawhip;}'
     self.assertEqual(parsed.styles[0].text, expected_style)
Пример #19
0
def get_words_from_subtitles(file):
    """Return every token of every caption in ``file``, in caption order.

    Tokenisation is done with a ``TweetTokenizer``.
    """
    tokenizer = TweetTokenizer()
    return [
        token
        for caption in webvtt.read(file)
        for token in tokenizer.tokenize(caption.text)
    ]
Пример #20
0
def make_caption_data(video_element_name, caption_path, timecodes, duration,
                      fps, punct, mode, segmenter):
    """Build the annotation dict for one video element from its caption file.

    Returns:
        ``{video_element_name: {'duration', 'timestamps', 'sentences'}}``, or
        ``{}`` when ``make_caption_dict_list`` yields no entries.
    """
    start = Timecode(fps, timecodes[0])
    end = Timecode(fps, timecodes[1])

    # Drop the first caption, but let the second one inherit its start time.
    captions = webvtt.read(caption_path)
    captions[1].start = captions[0].start
    captions = captions[1:]

    caption_dict_list, joined_sentence = make_caption_dict_list(
        captions, fps, start, end, mode)
    sentences = segement_sentences(joined_sentence, segmenter, punct)

    if len(caption_dict_list) == 0:
        return {}

    timestamps = make_timestamps(caption_dict_list, sentences, mode)
    # A length mismatch is reported on stdout but does not abort.
    try:
        assert len(timestamps) == len(
            sentences
        ), f'timestamps:{len(timestamps)} sentences:{len(sentences)}'
    except AssertionError as mismatch:
        print('AssertionError:', mismatch)

    payload = {
        'duration': duration,
        'timestamps': timestamps,
        'sentences': sentences,
    }
    return {video_element_name: payload}
Пример #21
0
def generate_captions_vtt(caption_path, time_per_segment, cutoff):
    """Group the caption lines of a ``.vtt`` file into fixed-length segments.

    Returns a list of strings. Each entry holds the distinct caption lines,
    in first-seen order and joined by spaces, whose offset falls into one
    ``time_per_segment``-wide window.
    """
    vtt = webvtt.read(caption_path)

    cap_df = pd.DataFrame({
        'time': [cue.start for cue in vtt.captions],
        'caption': [cue.text.split('\n') for cue in vtt.captions],
    })
    cap_df['time'] = cap_df['time'].apply(lambda stamp: pd.to_datetime(stamp))
    # Whole seconds since the first caption, shifted by ``cutoff``.
    cap_df['agg_time'] = cap_df['time'].apply(
        lambda stamp: (stamp - cap_df['time'].iloc[0]).seconds - cutoff)

    # NOTE(review): ``cutoff`` is subtracted again for the upper bound.
    boundaries = list(
        range(0, cap_df['agg_time'].iloc[-1] - cutoff, time_per_segment))

    caption_segments = []
    for lower, upper in zip(boundaries, boundaries[1:]):
        in_window = (lower <= cap_df['agg_time']) & (cap_df['agg_time'] < upper)
        collected = []
        for cue_lines in cap_df[in_window]['caption'].values:
            collected.extend(cue_lines)

        # dict keys keep first-seen order, which drops later duplicates.
        distinct = list(dict.fromkeys(collected))
        caption_segments.append(' '.join(distinct))

    return caption_segments
Пример #22
0
def extract_plaintext_from_webvtt(path_to_file: str):
    """Return the caption text of a WebVTT file as a single string.

    Keyword arguments:
    path_to_file -- string -- full path to webvtt file

    Return value:
    transcript_text -- string -- caption texts, each stripped, joined by
    single spaces, with surrounding whitespace removed
    """

    logger.debug("generate_plaintext_transcript: " + path_to_file)

    stripped_texts = [cue.text.strip() for cue in webvtt.read(path_to_file)]
    # The final strip removes the whitespace left by empty captions at either end.
    transcript_text = " ".join(stripped_texts).strip()

    logger.debug("Text extracted from: " + path_to_file)

    return transcript_text
def convert_intermedate_form(sub_path, files, id, save_location):
    """Cut an audio track into per-caption clips with matching text files.

    The first and last ten captions of ``sub_path`` are ignored. For each
    remaining caption the text is reduced to letters and spaces. Captions
    with three or fewer space-separated tokens, or lasting under 3 or over
    20 seconds, are skipped. Kept captions are written to
    ``<save_location>/<id>/<n>.txt`` and ``<n>.wav``, with ``n`` counting
    kept captions from 0. A ``MalformedCaptionError`` silently ends
    processing.
    """

    def to_seconds(stamp):
        # 'HH:MM:SS.mmm' -> seconds as float
        fields = stamp.split(":")
        return float(fields[0])*3600+float(fields[1])*60+float(fields[2])

    saved = 0
    track = convert_to_spec(files)
    try:
        for caption in webvtt.read(sub_path)[10:-10]:
            flattened = caption.text.replace('\n', ' ')
            flattened = flattened.replace(',', ' ').replace('-', ' ')
            clean_text = ''.join(
                ch for ch in flattened if ch.isalpha() or ch == ' ')

            begin = to_seconds(caption.start)
            finish = to_seconds(caption.end)
            # The track is sliced in milliseconds.
            clip = track[begin*1000:finish*1000]

            length = finish - begin
            too_few_words = len(clean_text.split(' ')) <= 3
            if too_few_words or length < 3 or length > 20:
                continue

            target_dir = save_location+'/'+id+'/'
            if not path.exists(target_dir):
                os.makedirs(target_dir)

            with open(target_dir+str(saved)+".txt", "w") as text_file:
                text_file.write(clean_text)

            clip.export(target_dir+str(saved)+".wav", format="wav")

            saved += 1
    except MalformedCaptionError:
        pass
Пример #24
0
def parse_subs_into_word_tockens_list(source_path, subs_file):
    """Split word-timed captions into per-word records.

    Only the second line of each caption's ``raw_text`` is used. ``<c>`` /
    ``</c>`` tags and spaces are removed, the text is lower-cased and
    wrapped in the caption's start and end timestamps, so the remaining
    ``<...>`` markers alternate with the words.

    Args:
        source_path: Path of the ``.vtt`` file to read.
        subs_file: Value stored under ``'filename'`` in every record.

    Returns:
        A list of dicts with keys ``filename``, ``start``, ``end``, ``text``.
    """
    # Raw string: the old non-raw literal contained the invalid escape '\/'.
    # The pattern matches the same tags as before.
    tag_pattern = re.compile(r'</?c[^<]*>')

    result = []
    for caption in webvtt.read(source_path):
        raw_lines = caption.raw_text.split('\n')
        if len(raw_lines) <= 1:
            continue

        line = tag_pattern.sub("", raw_lines[1])
        line = line.replace(" ", "").lower()
        if not line:
            continue

        line = "<" + caption.start + ">" + line + "<" + caption.end + ">"
        tockens = list(filter(None, re.split('[<>]', line)))
        # Tokens alternate timestamp, word, timestamp, ...; step by two so
        # each word is paired with the timestamps on either side of it.
        for position in range(0, len(tockens) - 2, 2):
            result.append({
                'filename': subs_file,
                'start': tockens[position],
                'end': tockens[position + 2],
                'text': tockens[position + 1],
            })
    return result
def file_writing(path):
    """Tag every caption of ``static/subtitle.vtt`` with its predicted emotion.

    Each caption text is wrapped as ``<c.COLOUR> EMOTION: TEXT</c>`` and the
    result is saved to ``static/my_captions.vtt``. Colours: ``joy`` is green;
    ``fear``, ``anger`` and ``sadness`` are red; anything else (including
    ``neutral``) is blue.

    Args:
        path: Unused; input and output paths are hard-coded. Kept so
            existing callers keep working.
    """
    # Looked up by value. The previous ``emotion is "joy"`` checks compared
    # object identity, which is not a reliable string comparison.
    colour_by_emotion = {
        "joy": "green",
        "fear": "red",
        "anger": "red",
        "sadness": "red",
    }

    vtt = WebVTT()
    for line in webvtt.read('static/subtitle.vtt'):
        emotion = predict(str(line.text))
        colour = colour_by_emotion.get(emotion, "blue")
        vtt.captions.append(
            Caption(
                line.start, line.end,
                "<c." + colour + "> " + emotion + ": " + line.text + "</c>"))
    vtt.save('static/my_captions.vtt')
Пример #26
0
    def test_save_updated_identifiers(self):
        """Edit, clear and add identifiers, then compare the saved lines."""
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('using_identifiers.vtt'), OUTPUT_DIR)
        source_path = os.path.join(OUTPUT_DIR, 'using_identifiers.vtt')
        saved_path = os.path.join(OUTPUT_DIR, 'new_using_identifiers.vtt')

        vtt = webvtt.read(source_path)
        vtt.captions[0].identifier = 'first caption'
        vtt.captions[1].identifier = None
        vtt.captions[3].identifier = '44'

        appended = Caption('00:00:27.280', '00:00:29.200', 'Caption text #7')
        appended.identifier = 'last caption'
        vtt.captions.append(appended)
        vtt.save(saved_path)

        with open(saved_path, 'r', encoding='utf-8') as saved_file:
            actual = [raw.rstrip() for raw in saved_file]

        # One cue per row; identifier lines precede the cues they belong to.
        expected = [
            'WEBVTT', '',
            'first caption',
            '00:00:00.500 --> 00:00:07.000', 'Caption text #1', '',
            '00:00:07.000 --> 00:00:11.890', 'Caption text #2', '',
            '00:00:11.890 --> 00:00:16.320', 'Caption text #3', '',
            '44',
            '00:00:16.320 --> 00:00:21.580', 'Caption text #4', '',
            '00:00:21.580 --> 00:00:23.880', 'Caption text #5', '',
            '00:00:23.880 --> 00:00:27.280', 'Caption text #6', '',
            'last caption',
            '00:00:27.280 --> 00:00:29.200', 'Caption text #7',
        ]

        self.assertListEqual(actual, expected)
Пример #27
0
def Subtitles_Processor(subtitle,video_title,video_title_summary,video_categories_list):
    """Read a .vtt file, normalise its text and store every word via InsertData.

    Each word is stored in lower case together with the minute and second
    fields of its caption's start timestamp.
    """
    # (old, new) pairs, applied in exactly this order.
    substitutions = (
        ("\\", " "),
        ("\n", " "),
        ("'d", " would"),
        ("'t", " not"),
        ("'s", " is"),
        ("'m", " am"),
        ("'ll", " will"),
        ("'re", " are"),
        ('?', " "),
        ('"', " "),
        (".", " "),
        (',', " "),
        ('-', " "),
        (':', " "),
        (';', ""),
        ("'", ""),
    )

    print(("INSERTING DATA INTO {} TABLE. THIS PROCESS CAN TAKE A WHILE.").format(video_title))
    for cue in webvtt.read(PATH+subtitle):
        # Slices of the start timestamp text: minutes, then whole seconds.
        minute = cue.start[3:5]
        seconds = cue.start[6:8]

        cleaned = cue.text
        for old, new in substitutions:
            cleaned = cleaned.replace(old, new)

        # Splitting on a single space keeps the empty strings produced by
        # runs of spaces; they are passed to InsertData as well.
        for word in cleaned.split(' '):
            InsertData(minute, seconds, word.lower(), video_title,
                       video_title_summary, video_categories_list)
    print("COMPLETED\n----------------\n")
Пример #28
0
def estimate_timing(filename):
    """Compute a beats-per-second figure per caption, then return ``6``.

    NOTE(review): the computed tempo is discarded and the constant 6 is
    returned — confirm whether that is intentional.
    """
    pieces = []
    for cue in webvtt.read(filename):
        duration = cue.end_in_seconds - cue.start_in_seconds
        _speaker, spoken = split_speaker(cue.text)

        marks = []
        for word in spoken.split():
            mark = stresses(word)
            # A trailing comma or full stop adds an extra marker.
            if word.endswith(","):
                mark = mark + "3"
            elif word.endswith("."):
                mark = mark + "4"
            marks.append(mark)

        # Trailing "4" markers at the end of the caption are not counted.
        pattern = "".join(marks).rstrip("4")
        n_beats = sum(PAUSE_PROPORTION.get(kind, 1) for kind in pattern)
        pieces.append(n_beats / duration)

    tempo_distribution = remove_outliers(pieces)
    beats_per_second = max(tempo_distribution)

    return 6
Пример #29
0
    def test_write_captions(self):
        """Append a two-line caption and check what ``write`` emits."""
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('one_caption.vtt'), OUTPUT_DIR)

        buffer = io.StringIO()
        vtt = webvtt.read(os.path.join(OUTPUT_DIR, 'one_caption.vtt'))
        vtt.captions.append(
            Caption('00:00:07.000',
                    '00:00:11.890',
                    ['New caption text line1', 'New caption text line2']))
        vtt.write(buffer)

        # Rewind so the buffer can be read back from the start.
        buffer.seek(0)
        actual = [raw.rstrip() for raw in buffer]

        expected = [
            'WEBVTT', '',
            '00:00:00.500 --> 00:00:07.000', 'Caption text #1', '',
            '00:00:07.000 --> 00:00:11.890',
            'New caption text line1',
            'New caption text line2',
        ]

        self.assertListEqual(actual, expected)
Пример #30
0
 def test_parse_styles(self):
     """Parse ``styles.vtt`` and check the text of its first style block."""
     parsed = webvtt.read(self._get_file('styles.vtt'))
     self.assertEqual(len(parsed.captions), 1)

     expected_style = '::cue {background-image: linear-gradient(to bottom, dimgray, lightgray);color: papayawhip;}'
     self.assertEqual(parsed.styles[0].text, expected_style)
    def cutYoutube(self, src="audio"):
        """Export one audio clip per caption and record each clip via ``saveCSV``.

        Clips are cut from ``self.wavPath`` at the caption's start and end
        (converted by ``timemath``) and written to ``<src>/<self.v>_<n>.wav``.
        The same record dict is updated and passed to ``self.saveCSV`` for
        every caption.
        """
        outputList = {
            "vttPath": self.vttPath,
            "wavPath": self.wavPath,
            "name": self.v,
            "sub_lang": self.sub_lang,
            "text": "",
            "start": "",
            "end": "",
            "key": "",
            "cutFile": ""
        }
        sound = AudioSegment.from_file(self.wavPath)
        for index, caption in enumerate(webvtt.read(self.vttPath)):
            clip_stem = src + "/" + self.v + "_" + str(index)

            outputList['text'] = caption.text.replace("\n", "")
            outputList['start'] = caption.start
            outputList['end'] = caption.end
            # NOTE(review): the recorded name uses self.audioType, while the
            # clip below is always exported as ".wav" — confirm they agree.
            outputList['cutFile'] = clip_stem + "." + self.audioType

            clip = sound[timemath(caption.start):timemath(caption.end)]
            clip.export(clip_stem + ".wav", format="wav")
            self.saveCSV(outputList)
def video_indices(subtitle, text_summary, tolerance=0):
    """Locate the time span of each summary sentence in a subtitle file.

    For every sentence in the summary file, the first caption containing the
    sentence's first five tokens gives the start time. The first caption at
    or after it containing the five tokens before the sentence's last token
    gives the end time. If no such closing caption exists, the end of the
    opening caption is used.

    Args:
        subtitle: Path of the ``.vtt`` file.
        text_summary: Path of the text file holding the summary.
        tolerance: Unused; kept for backward compatibility.

    Returns:
        The ``[start, end]`` pairs after ``cutttimes_filter(..., 10, 1000)``.
    """
    # Close the summary file deterministically instead of leaking the handle.
    with open(text_summary, "r") as summary_file:
        summary = summary_file.read()
    sents = [word_tokenize(s.lower()) for s in sent_tokenize(summary)]

    captions = webvtt.read(subtitle)
    # Normalise each caption once instead of once per sentence per caption.
    caption_texts = [
        filter_captext(caption.text.replace("\n", " "))
        for caption in captions
    ]

    cuttimes = []
    for sent in sents:
        beg = ' '.join(sent[:5])
        last = ' '.join(sent[-6:-1])
        for i, captext in enumerate(caption_texts):
            if beg not in captext:
                continue

            start = captions[i].start
            # Fallback so ``end`` is never unbound or left over from an
            # earlier sentence when no closing caption is found.
            end = captions[i].end
            for j in range(i, len(caption_texts)):
                if last in caption_texts[j]:
                    # Take the end of the caption that closes the sentence,
                    # not the one that opens it.
                    end = captions[j].end
                    break

            timestamp = [start, end]
            cuttimes.append([
                min(timestamp, key=time_for_one),
                max(timestamp, key=time_for_one)
            ])
            break
    return cutttimes_filter(cuttimes, 10, 1000)
Пример #33
0
    def test_parse_identifiers(self):
        """Check named, missing and numeric identifiers after parsing."""
        parsed = webvtt.read(self._get_file('using_identifiers.vtt'))
        self.assertEqual(len(parsed.captions), 6)

        second = parsed.captions[1]
        self.assertEqual(second.identifier, 'second caption')
        third = parsed.captions[2]
        self.assertEqual(third.identifier, None)
        fourth = parsed.captions[3]
        self.assertEqual(fourth.identifier, '4')
Пример #34
0
def vtt_to_df(vtt_filename):
    """Load a WebVTT file into a DataFrame with one row per caption line.

    Each caption's stripped text is split into lines, and every line carries
    the start and end of the caption it came from.  A line equal to the
    previously kept line is dropped.

    Args:
        vtt_filename: Path handed to ``webvtt.read``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``start``, ``end`` and ``text``.
    """
    rows = []
    last_kept = None
    for cue in webvtt.read(vtt_filename):
        for text in cue.text.strip().splitlines():
            if text == last_kept:
                # Same as the previously kept line: skip the repeat.
                continue
            rows.append((cue.start, cue.end, text))
            last_kept = text

    table = {
        "start": [row[0] for row in rows],
        "end": [row[1] for row in rows],
        "text": [row[2] for row in rows],
    }
    return pd.DataFrame(table)
Пример #35
0
    def from_vtt_file(self, filepath):
        """Append one subtitle annotation per caption found in a WebVTT file.

        Each caption's start and end timestamps are converted to seconds; the
        start frame is the start time multiplied by ``self.item.fps`` and
        rounded.  The resulting annotation is appended to ``self.annotations``.

        Args:
            filepath: Path handed to ``webvtt.read``.
        """
        def to_seconds(timestamp):
            # The timestamp is split on ':' into hours, minutes and seconds.
            hours, minutes, seconds = timestamp.split(':')
            delta = datetime.timedelta(hours=float(hours),
                                       minutes=float(minutes),
                                       seconds=float(seconds))
            return delta.total_seconds()

        for caption in webvtt.read(filepath):
            start_time = to_seconds(caption.start)
            end_time = to_seconds(caption.end)
            start_frame = round(start_time * self.item.fps)

            subtitle = entities.Subtitle(text=caption.text, label='Text')
            annotation = entities.Annotation.new(
                annotation_definition=subtitle,
                frame_num=start_frame,
                item=self.item,
                start_time=start_time)
            annotation.add_frames(annotation_definition=subtitle,
                                  frame_num=start_frame,
                                  end_time=end_time)

            self.annotations.append(annotation)
Пример #36
0
 def test_webvtt_parse_get_caption_data(self):
     """Check timing and text data of the first caption in one_caption.vtt."""
     first = webvtt.read(self._get_file('one_caption.vtt')).captions[0]

     # Start of the caption, as seconds and as a formatted timestamp.
     self.assertEqual(first.start_in_seconds, 0.5)
     self.assertEqual(first.start, '00:00:00.500')
     # End of the caption, as seconds and as a formatted timestamp.
     self.assertEqual(first.end_in_seconds, 7)
     self.assertEqual(first.end, '00:00:07.000')
     # The caption is expected to hold exactly one text line.
     self.assertEqual(first.lines[0], 'Caption text #1')
     self.assertEqual(len(first.lines), 1)
Пример #37
0
 def test_captions_prevent_write(self):
     """Assigning to ``captions`` on a parsed object must raise AttributeError."""
     parsed = webvtt.read(self._get_file('sample.vtt'))
     with self.assertRaises(AttributeError):
         parsed.captions = []
Пример #38
0
 def test_clean_cue_tags(self):
     """Check the text of the second and third captions of cue_tags.vtt."""
     captions = webvtt.read(self._get_file('cue_tags.vtt')).captions
     second_text = captions[1].text
     self.assertEqual(second_text, 'Like a big-a pizza pie')
     third_text = captions[2].text
     self.assertEqual(third_text, 'That\'s amore')
Пример #39
0
 def test_parse_with_comments(self):
     """Check the caption count and selected caption contents of comments.vtt."""
     captions = webvtt.read(self._get_file('comments.vtt')).captions
     self.assertEqual(len(captions), 3)

     expected_first_lines = ['- Ta en kopp varmt te.', '- Det är inte varmt.']
     self.assertListEqual(captions[0].lines, expected_first_lines)

     self.assertEqual(captions[2].text, '- Ta en kopp')
Пример #40
0
    def test_save_updated_identifiers(self):
        """Edit caption identifiers, save to a new file and compare its lines.

        A copy of using_identifiers.vtt is loaded, three identifiers are
        changed, a seventh caption with an identifier is appended, and the
        saved file is compared line by line (right-stripped) with the
        expected output.
        """
        source_copy = os.path.join(OUTPUT_DIR, 'using_identifiers.vtt')
        saved_path = os.path.join(OUTPUT_DIR, 'new_using_identifiers.vtt')

        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('using_identifiers.vtt'), OUTPUT_DIR)

        vtt = webvtt.read(source_copy)
        captions = vtt.captions
        captions[0].identifier = 'first caption'
        captions[1].identifier = None
        captions[3].identifier = '44'

        appended = Caption('00:00:27.280', '00:00:29.200', 'Caption text #7')
        appended.identifier = 'last caption'
        captions.append(appended)
        vtt.save(saved_path)

        with open(saved_path, 'r', encoding='utf-8') as saved_file:
            lines = [raw.rstrip() for raw in saved_file]

        expected_lines = [
            'WEBVTT',
            '',
            'first caption',
            '00:00:00.500 --> 00:00:07.000',
            'Caption text #1',
            '',
            '00:00:07.000 --> 00:00:11.890',
            'Caption text #2',
            '',
            '00:00:11.890 --> 00:00:16.320',
            'Caption text #3',
            '',
            '44',
            '00:00:16.320 --> 00:00:21.580',
            'Caption text #4',
            '',
            '00:00:21.580 --> 00:00:23.880',
            'Caption text #5',
            '',
            '00:00:23.880 --> 00:00:27.280',
            'Caption text #6',
            '',
            'last caption',
            '00:00:27.280 --> 00:00:29.200',
            'Caption text #7',
        ]

        self.assertListEqual(lines, expected_lines)
Пример #41
0
    def test_save_captions(self):
        """Append a two-line caption, save with no arguments and compare lines.

        The file is read from a copy inside OUTPUT_DIR, and the same path is
        read back after ``save()`` to verify the written content.
        """
        working_copy = os.path.join(OUTPUT_DIR, 'one_caption.vtt')

        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('one_caption.vtt'), OUTPUT_DIR)

        vtt = webvtt.read(working_copy)
        vtt.captions.append(
            Caption(
                '00:00:07.000',
                '00:00:11.890',
                ['New caption text line1', 'New caption text line2'],
            )
        )
        vtt.save()

        with open(working_copy, 'r', encoding='utf-8') as saved_file:
            lines = [raw.rstrip() for raw in saved_file]

        expected_lines = [
            'WEBVTT',
            '',
            '00:00:00.500 --> 00:00:07.000',
            'Caption text #1',
            '',
            '00:00:07.000 --> 00:00:11.890',
            'New caption text line1',
            'New caption text line2',
        ]

        self.assertListEqual(lines, expected_lines)
Пример #42
0
 def test_parse_captions_with_bom(self):
     """Parse captions_with_bom.vtt and expect four captions."""
     parsed = webvtt.read(self._get_file('captions_with_bom.vtt'))
     caption_count = len(parsed.captions)
     self.assertEqual(caption_count, 4)
Пример #43
0
 def test_captions(self):
     """The ``captions`` attribute of a parsed file must be a list."""
     captions = webvtt.read(self._get_file('sample.vtt')).captions
     self.assertIsInstance(captions, list)
Пример #44
0
 def test_metadata_headers_multiline(self):
     """Parse metadata_headers_multiline.vtt and expect two captions."""
     parsed = webvtt.read(self._get_file('metadata_headers_multiline.vtt'))
     caption_count = len(parsed.captions)
     self.assertEqual(caption_count, 2)
Пример #45
0
 def test_webvtt__parse_captions(self):
     """Parsing sample.vtt must yield a truthy ``captions`` value."""
     parsed = webvtt.read(self._get_file('sample.vtt'))
     self.assertTrue(parsed.captions)
Пример #46
0
 def test_webvtt_parse_get_captions(self):
     """Parse sample.vtt and expect sixteen captions."""
     parsed = webvtt.read(self._get_file('sample.vtt'))
     caption_count = len(parsed.captions)
     self.assertEqual(caption_count, 16)
Пример #47
0
 def test_webvtt_timestamps_format(self):
     """Check the start and end timestamp strings of the third caption."""
     third = webvtt.read(self._get_file('sample.vtt')).captions[2]
     self.assertEqual(third.start, '00:00:11.890')
     self.assertEqual(third.end, '00:00:16.320')
Пример #48
0
 def test_webvtt_caption_without_cue_text(self):
     """Parse missing_caption_text.vtt and expect five captions."""
     parsed = webvtt.read(self._get_file('missing_caption_text.vtt'))
     caption_count = len(parsed.captions)
     self.assertEqual(caption_count, 5)
Пример #49
0
    def test_save_to_other_location(self):
        """Saving to a directory must create one_caption.vtt inside it."""
        destination = os.path.join(OUTPUT_DIR, 'test_folder')
        os.makedirs(destination)

        parsed = webvtt.read(self._get_file('one_caption.vtt'))
        parsed.save(destination)

        saved_file = os.path.join(destination, 'one_caption.vtt')
        self.assertTrue(os.path.exists(saved_file))
Пример #50
0
 def test_webvtt_total_length(self):
     """The ``total_length`` of sample.vtt is expected to equal 64."""
     parsed = webvtt.read(self._get_file('sample.vtt'))
     self.assertEqual(parsed.total_length, 64)
Пример #51
0
import webvtt
from os import listdir

# Write a plain-text transcript next to every subtitle file in ./subs/,
# dropping caption lines that repeat the previously written line.

# Seed value for the "previous line" comparison.  It is not reset between
# files, so the first line of a file is also compared against the last line
# written for the file before it.
prevline = "HI I AM SHOUTING."
captions = []

files = listdir("./subs/")

for vttFile in files:
    # Transcripts written by an earlier run live in the same directory; skip
    # them so a second run does not feed its own output to the parser.
    if vttFile.endswith(".txt"):
        continue

    # Parse before creating the output so a file that fails to parse does not
    # leave an empty transcript behind.
    parsed = webvtt.read("./subs/" + vttFile)

    # The context manager closes the transcript even if writing fails.
    with open("./subs/" + vttFile + ".txt", 'w') as output:
        for caption in parsed:
            for line in caption.text.split("\n"):
                if line == " ":
                    # A line consisting of a single space is not written.
                    continue
                elif line != prevline:
                    output.write(line)
                    output.write('\n')
                    prevline = line
    
    
"""
print(captions[0])
print("----")
print(captions[1].split('\n'))
print("----")
print(captions[2])
Пример #52
0
 def test_sequence_iteration(self):
     """Check indexing and ``len()`` on the object returned by ``webvtt.read``."""
     vtt = webvtt.read(self._get_file('sample.vtt'))
     # Indexing the parsed object directly is expected to yield a Caption.
     self.assertIsInstance(vtt[0], Caption)
     # len() of the object is expected to match the length of its captions list.
     self.assertEqual(len(vtt), len(vtt.captions))