def download_and_convert_subtitles(path, lang_and_url, c):
    """Download one subtitle file per language and store it as ``<lang>.vtt``.

    Languages whose ``<lang>.vtt`` file already exists in *path* are not
    downloaded again; they are reported as available straight away.

    Args:
        path: Directory in which the ``.vtt`` files are stored.
        lang_and_url: Mapping of language code to subtitle URL.
        c: Connection object exposing ``get_page(url)``.

    Returns:
        A dict mapping each language that has a usable subtitle file to its
        file name (``"<lang>.vtt"``). Languages whose download or conversion
        failed are logged and left out.
    """
    real_subtitles = {}
    for lang in lang_and_url:
        filename = lang + ".vtt"
        path_lang = os.path.join(path, filename)
        if os.path.exists(path_lang):
            # Already fetched on a previous run.
            real_subtitles[lang] = filename
            continue
        url = lang_and_url[lang]
        try:
            subtitle = c.get_page(url)
            # Renumber a cue index of "0" to "1" before conversion
            # (presumably the SRT parser expects numbering to start at 1).
            subtitle = re.sub(r'^0$', '1', str(subtitle), flags=re.M)
            subtitle = html.unescape(subtitle)
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent non-ASCII subtitle text.
            with open(path_lang, 'w', encoding='utf-8') as f:
                f.write(subtitle)
            if not is_webvtt(path_lang):
                webvtt = WebVTT().from_srt(path_lang)
                webvtt.save()
            real_subtitles[lang] = filename
        except HTTPError as e:
            if e.code in (403, 404):
                logging.error("Fail to get subtitle from {}".format(url))
            else:
                # Still best-effort, but no longer silent.
                logging.error("HTTP error {} when getting subtitle from {}".format(
                    e.code, url))
        except Exception as e:
            logging.error("Error when converting subtitle {} : {}".format(
                url, e))
    return real_subtitles
class SRTCaptionsTestCase(unittest.TestCase):
    """Round-trip check: SRT -> WebVTT -> SRT must give back the same text."""

    def setUp(self):
        self.webvtt = WebVTT()
        self.srtcaptions = SRTCaptions()
        # Every test works inside a fresh output directory.
        os.makedirs(OUTPUT_DIR)

    def _get_file(self, filename):
        """Return the path of a fixture inside SUBTITLES_DIR."""
        return os.path.join(SUBTITLES_DIR, filename)

    def tearDown(self):
        if not os.path.exists(OUTPUT_DIR):
            return
        rmtree(OUTPUT_DIR)

    def test_convert_from_srt_to_vtt_and_back_gives_same_file(self):
        source_srt = os.path.join(OUTPUT_DIR, 'sample.srt')
        intermediate_vtt = os.path.join(OUTPUT_DIR, 'sample.vtt')
        round_trip_srt = os.path.join(OUTPUT_DIR, 'sample_converted.srt')

        copy(self._get_file('sample.srt'), OUTPUT_DIR)

        # SRT -> VTT (saved next to the copied source file).
        self.webvtt.from_srt(source_srt)
        self.webvtt.save()

        # VTT -> SRT under a new name.
        self.srtcaptions.from_vtt(intermediate_vtt)
        self.srtcaptions.save(round_trip_srt)

        with open(source_srt, 'r', encoding='utf-8') as handle:
            original = handle.read()
        with open(round_trip_srt, 'r', encoding='utf-8') as handle:
            converted = handle.read()

        self.assertEqual(original.strip(), converted.strip())
def file_writing(path):
    """Annotate every cue of ``static/subtitle.vtt`` with its predicted emotion.

    Each cue text is passed to ``predict`` and rewritten as
    ``<c.COLOUR> EMOTION: TEXT</c>``, where the colour is green for "joy",
    red for "fear", "anger" and "sadness", and blue for anything else
    (including "neutral"). The result is saved to ``static/my_captions.vtt``.

    Args:
        path: Not used by this function; kept so existing callers still work.
    """
    # Emotions are compared by value. The previous ``is`` comparisons tested
    # object identity, which is not guaranteed to hold for equal strings.
    colour_by_emotion = {
        "joy": "green",
        "fear": "red",
        "anger": "red",
        "sadness": "red",
    }
    vtt = WebVTT()
    for line in webvtt.read('static/subtitle.vtt'):
        emotion = predict(str(line.text))
        colour = colour_by_emotion.get(emotion, "blue")
        text = "<c." + colour + "> " + emotion + ": " + line.text + "</c>"
        vtt.captions.append(Caption(line.start, line.end, text))
    vtt.save('static/my_captions.vtt')
def create_overview_vtt(video_id, nb_img, image, duration, overviewfilename):
    """Write a WebVTT file whose cues point at regions of an overview image.

    One cue is produced per thumbnail. When ``nb_img`` is 99 each cue covers
    one hundredth of ``duration``; otherwise cue ``i`` covers seconds
    ``i`` to ``i + 1``. The cue text is ``<image_url>#xywh=x,0,width,height``
    with ``x`` advancing by one image width per cue.

    Returns the log message. If ``check_file`` rejects the written file, the
    failure is recorded through ``add_encoding_log``, ``change_encoding_step``
    and ``send_email``.
    """

    def _stamp(seconds):
        # Render with three decimals, then turn the integer part into
        # HH:MM:SS and re-attach the fractional digits unchanged.
        rendered = format(float(seconds), '.3f')
        pieces = str(rendered).split('.')
        clock = time.strftime('%H:%M:%S', time.gmtime(int(pieces[0])))
        return clock + ".%s" % (pieces[1])

    msg = "\ncreate overview vtt file"
    image_width = image["image_width"]
    image_height = image["image_height"]
    image_url = image["image_url"]

    # creating webvtt file
    webvtt = WebVTT()
    for index in range(0, nb_img):
        if nb_img == 99:
            begin = duration * index / 100
            finish = duration * (index + 1) / 100
        else:
            begin = index
            finish = index + 1
        region = '%s#xywh=%d,%d,%d,%d' % (
            image_url, image_width * index, 0, image_width, image_height)
        webvtt.captions.append(Caption(_stamp(begin), _stamp(finish), region))
    webvtt.save(overviewfilename)

    if not check_file(overviewfilename):
        msg = "overviewfilename Wrong file or path : "\
            + "\n%s" % overviewfilename
        add_encoding_log(video_id, msg)
        change_encoding_step(video_id, -1, msg)
        send_email(msg, video_id)
        return msg

    msg += "\n- overviewfilename :\n%s" % overviewfilename
    return msg
def download_and_convert_subtitles(output_path, subtitles, instance_connection):
    """Download one subtitle file per language and store it as ``<lang>.vtt``.

    Languages whose ``<lang>.vtt`` already exists in *output_path* are reused
    without being downloaded again.

    Args:
        output_path: Directory in which the ``.vtt`` files are stored.
        subtitles: Mapping of language code to subtitle URL.
        instance_connection: Object exposing ``get_page(url)``; a falsy
            result is treated as a failed fetch.

    Returns:
        A dict mapping each language with a usable subtitle file to its file
        name (``"<lang>.vtt"``). Failures are logged and left out.
    """
    output_dir = pathlib.Path(output_path)
    processed_subtitles = {}
    for lang in subtitles:
        filename = f"{lang}.vtt"
        subtitle_file = output_dir.joinpath(filename)
        if subtitle_file.exists():
            processed_subtitles[lang] = filename
            continue
        try:
            raw_subtitle = instance_connection.get_page(subtitles[lang])
            if not raw_subtitle:
                logger.error(
                    f"Subtitle fetch failed from {subtitles[lang]}")
                continue
            # Renumber a cue index of "0" to "1", then decode HTML entities.
            subtitle = html.unescape(
                re.sub(r"^0$", "1", str(raw_subtitle), flags=re.M))
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent non-ASCII subtitle text.
            with open(subtitle_file, "w", encoding="utf-8") as sub_file:
                sub_file.write(subtitle)
            if not is_webvtt(subtitle_file):
                webvtt = WebVTT().from_srt(subtitle_file)
                webvtt.save()
            processed_subtitles[lang] = filename
        except Exception as exc:
            logger.error(
                f"Error while converting subtitle {subtitles[lang]} : {exc}"
            )
    return processed_subtitles
def transcribe():
    """Transcribe the media file named by ``sys.argv[1]`` into WebVTT.

    The input is decoded by ``ffmpeg`` to mono signed 16-bit PCM at
    ``sample_rate`` and fed in 4000-byte chunks to the module-level
    recognizer ``rec``. Every recognizer result that contains word timings
    becomes one caption spanning its first to its last word.

    The captions are saved to ``sys.argv[2]`` when given, otherwise the
    WebVTT content is printed.
    """
    command = [
        'ffmpeg', '-nostdin', '-loglevel', 'quiet', '-i', sys.argv[1],
        '-ar', str(sample_rate), '-ac', '1', '-f', 's16le', '-'
    ]
    results = []
    # The context manager closes the pipe and waits for ffmpeg, so the child
    # process is reaped even if recognition raises.
    with subprocess.Popen(command, stdout=subprocess.PIPE) as process:
        for data in iter(lambda: process.stdout.read(4000), b''):
            if rec.AcceptWaveform(data):
                results.append(rec.Result())
    results.append(rec.FinalResult())

    vtt = WebVTT()
    for res in results:
        words = json.loads(res).get('result')
        if not words:
            # Nothing recognized in this segment.
            continue
        start = timeString(words[0]['start'])
        end = timeString(words[-1]['end'])
        content = ' '.join([w['word'] for w in words])
        vtt.captions.append(Caption(start, end, textwrap.fill(content)))

    # save or return webvtt
    if len(sys.argv) > 2:
        vtt.save(sys.argv[2])
    else:
        print(vtt.content)
def generate_vtt_file(all_preds, logits, save_path):
    """Write ``<save_path>/demo.vtt`` with one cue per predicted segment.

    Segments come from ``get_labels_start_end_time(all_preds, [1])``. Each
    gap between consecutive segments is shrunk by moving both neighbouring
    boundaries towards each other by ``floor(gap / 2)``. When ``logits`` is
    given, the cue text lists the ten classes with the highest summed score
    over the segment, highest first; otherwise the cue text is empty.
    """

    def _as_timestamp(seconds):
        # str(timedelta) omits the fractional part for whole seconds.
        rendered = str(datetime.timedelta(seconds=seconds))
        if '.' not in rendered:
            rendered = f'{rendered}.000000'
        return rendered

    vtt = WebVTT()
    labels, starts, ends = get_labels_start_end_time(all_preds, [1])

    # smaller boundaries
    for left in range(len(labels) - 1):
        right = left + 1
        gap = starts[right] - ends[left]
        starts[right] -= floor(gap / 2)
        ends[left] += floor(gap / 2)

    # load i3d classes
    with open('data/info/bslcp/info.pkl', 'rb') as handle:
        info_data = pickle.load(handle)

    for seg_start, seg_end in zip(starts, ends):
        if logits is None:
            class_str = ''
        else:
            summed = np.sum(np.asarray(logits)[seg_start:seg_end], axis=0)
            top = np.argpartition(summed, -10)[-10:]
            top = top[np.argsort(-summed[top])]
            class_str = ','.join([info_data['words'][k] for k in top])

        # Indices are converted to seconds as (index + 8) / 25.
        start_str = _as_timestamp((seg_start + 8) / 25)
        end_str = _as_timestamp((seg_end + 8) / 25)

        # creating a caption with a list of lines, then adding it
        vtt.captions.append(Caption(start_str, end_str, [class_str]))

    # save to a different file
    vtt.save(f'{save_path}/demo.vtt')
class SubtitleWrapper:
    """Thin convenience layer over a single ``WebVTT`` document."""

    def __init__(self):
        self.vtt = WebVTT()

    def write_caption(self, start, end, line):
        """Append one cue built from *start*, *end* and *line*."""
        self.vtt.captions.append(Caption(start, end, line))

    def save_caption(self, path):
        """Save the document to *path* with ``.vtt`` appended."""
        destination = '{}.vtt'.format(path)
        self.vtt.save(destination)

    def read_caption(self, vtt_file):
        """Return the result of ``WebVTT.read`` for *vtt_file*."""
        parsed = WebVTT.read(vtt_file)
        return parsed
def readVtt(input_file, output_file, input_language, output_language):
    """Translate every cue of a WebVTT file and swap it in for the original.

    Each cue text is run through ``translate``. When the module-level flag
    ``both_language`` is truthy, the translation is appended in parentheses
    after the original text; otherwise it replaces it. The original file is
    kept as ``<input_file>.old``. ``output_file`` is not used here.
    """
    document = WebVTT().read(input_file)
    for cue in document:
        print(cue.text)
        translated = translate(cue.text, input_language, output_language)
        cue.text = (cue.text + " (" + translated + ")"
                    if both_language else translated)
        print(cue.text)
    document.save()

    backup_name = input_file + ".old"
    os.rename(input_file, backup_name)
    # NOTE(review): assumes save() wrote next to the input under a ".vtt"
    # name that differs from input_file — confirm for ".vtt" inputs.
    saved_name = input_file.replace(".srt", ".vtt")
    os.rename(saved_name, input_file)
    print(">", input_file, "saved!")
def download_and_convert_subtitles(path, transcripts_data, already_in_vtt, headers):
    """Download each transcript and store it as ``<lang>.vtt`` in *path*.

    Args:
        path: Directory in which the ``.vtt`` files are written.
        transcripts_data: Mapping of language code to transcript URL.
        already_in_vtt: When falsy, the download is treated as SRT and
            converted with ``WebVTT().from_srt``.
        headers: Passed through to ``get_page``.

    HTTP errors are logged and the language is skipped; the function never
    raises for them and returns ``None``.
    """
    # Function-scope imports keep this block self-contained.
    import logging
    import re

    for lang in transcripts_data:
        path_lang = os.path.join(path, lang + ".vtt")
        try:
            subtitle = get_page(transcripts_data[lang], headers).decode('utf-8')
            if not already_in_vtt:
                # WebVTT.from_srt checks that the first cue is numbered 1, so
                # a cue index of "0" is renumbered. Done in-process rather
                # than by shelling out to ``sed`` with an unquoted path.
                subtitle = re.sub(r'^0$', '1', subtitle, flags=re.M)
            # The text was decoded as UTF-8, so write it back as UTF-8 rather
            # than in the platform default encoding.
            with open(path_lang, 'w', encoding='utf-8') as f:
                f.write(subtitle)
            if not already_in_vtt:
                webvtt = WebVTT().from_srt(path_lang)
                webvtt.save()
        except HTTPError as e:
            # Best effort: a missing or forbidden transcript must not stop
            # the remaining languages.
            logging.warning("Could not download subtitle %s (HTTP %s)",
                            transcripts_data[lang], e.code)
def transcode(source):
    """Convert the SRT file *source* to a sibling ``.vtt`` file.

    The source is first normalized in place by ``convert_ending`` and
    ``clean_file``, then converted with ``WebVTT().from_srt``.

    Returns:
        The path of the written ``.vtt`` file, or ``None`` on failure. On
        failure the error is logged and, if *source* still exists, it is
        renamed to ``<source>.failed``.
    """
    try:
        sourceDir = os.path.dirname(source)
        sourceFile = os.path.basename(source)
        sourceFileNoExt = os.path.splitext(sourceFile)[0]
        targetFile = sourceFileNoExt + '.vtt'
        convert_ending(source)
        clean_file(source)
        # A bare file name has an empty directory part; prefixing '/' would
        # then point at the filesystem root instead of the working directory.
        targetFull = sourceDir + '/' + targetFile if sourceDir else targetFile
        logging.debug(prelog + 'targetFull: ' + targetFull)
        webvtt = WebVTT().from_srt(source)
        webvtt.save(targetFull)
        return targetFull
    except Exception:
        # ``Exception`` rather than a bare ``except`` so KeyboardInterrupt
        # and SystemExit are not mistaken for conversion failures.
        logging.exception(prelog)
        if os.path.isfile(source):
            shutil.move(source, source + '.failed')
    return None
def readSrt(input_file, output_file, input_language, output_language):
    """Translate an SRT file and replace it with a WebVTT version.

    Each subtitle text is run through ``translate``. When the module-level
    flag ``both_language`` is truthy, the translation is appended in
    parentheses after the original text; otherwise it replaces it. The
    translated SRT is written to *output_file*, converted to WebVTT, and the
    WebVTT file is moved to *input_file*. The original is kept as
    ``<input_file>.old`` and the intermediate SRT is deleted.
    """
    print('processing file', input_file)
    subs = SubRipFile.open(input_file)
    print(">", "read file", input_file)

    for entry in subs:
        print(entry.text)
        translated = translate(entry.text, input_language, output_language)
        entry.text = (entry.text + " (" + translated + ")"
                      if both_language else translated)
        print(entry.text)

    subs.save(output_file, 'utf-8')
    converted = WebVTT().from_srt(output_file)
    converted.save()

    backup_name = input_file + ".old"
    os.rename(input_file, backup_name)
    os.remove(output_file)
    vtt_name = output_file.replace(".srt", ".vtt")
    os.rename(vtt_name, input_file)
    print(">", output_file, "saved!")
def translate(self):
    """Translate ``<fileNameWOType>.vtt`` into the target language.

    Every cue text is sent to ``Translate.AWSTranslate.translate_text`` with
    this object's source and target language codes. Timings and identifiers
    are copied over unchanged. The result is saved as
    ``<fileNameWOType>_<targetLanguage>.vtt``. Always returns 1.
    """
    translated_doc = WebVTT()
    source_name = self.fileNameWOType + '.vtt'
    for cue in webvtt.read(source_name):
        response = Translate.AWSTranslate.translate_text(
            Text=cue.text,
            SourceLanguageCode=self.sourceLanguage,
            TargetLanguageCode=self.targetLanguage)
        translated_cue = Caption(cue.start, cue.end,
                                 response.get('TranslatedText'))
        translated_cue.identifier = cue.identifier
        translated_doc.captions.append(translated_cue)

    target_name = self.fileNameWOType + '_' + self.targetLanguage + '.vtt'
    translated_doc.save(target_name)
    return 1
def process_video_url(url, pk):
    """Fetch a YouTube transcript and attach it to a ``Video`` as WebVTT.

    Args:
        url: Video URL; its id is extracted with ``get_youtube_vid_id``.
        pk: Primary key of the ``Video`` row that receives the transcript.

    The transcript is written to ``<CACHE>/<vid_id>.vtt``, assigned to
    ``Video.transcript`` and saved. The cache file is removed afterwards,
    whether or not saving succeeded.
    """

    def _stamp(delta):
        # str(timedelta) omits the fractional part for whole seconds.
        rendered = str(delta)
        return rendered if "." in rendered else rendered + ".000"

    vid_id = get_youtube_vid_id(url)
    captions = YouTubeTranscriptApi.get_transcript(video_id=vid_id)

    vtt = WebVTT()
    for t in captions:
        start = datetime.timedelta(milliseconds=t["start"] * 1000)
        end = datetime.timedelta(milliseconds=t["duration"] * 1000) + start
        vtt.captions.append(
            Caption(start=_stamp(start), end=_stamp(end), text=t["text"]))

    # exist_ok avoids the check-then-create race between concurrent calls.
    os.makedirs(CACHE, exist_ok=True)
    path = os.path.join(CACHE, f"{vid_id}.vtt")
    vtt.save(path)
    try:
        # Keep the file open only while the model is saved, and delete it
        # only after the handle is closed. Removing a file that is still
        # open fails on Windows, and the handle used to be leaked.
        with open(path, "rb") as handle:
            obj = VidSpark.management.models.Video.objects.get(pk=pk)
            obj.transcript = File(handle)
            obj.save()
    finally:
        os.remove(path)
def text_extract():
    """Split ``static/subtitle.vtt`` into three files by predicted emotion.

    ``static/Text`` is recreated from scratch. Each cue text is passed to
    ``predict``: "joy" goes to ``positive.vtt`` (green); "anger", "sadness"
    and "fear" go to ``negative.vtt`` (red); "neutral" goes to
    ``neutral.vtt`` (blue). Any other prediction is dropped.
    """
    try:
        # Start from an empty output folder.
        if os.path.exists('static/Text'):
            shutil.rmtree('static/Text')
        os.makedirs('static/Text')
    except OSError:
        # Report the failure but carry on.
        print('Error: Creating directory of data')

    positive = WebVTT()
    negative = WebVTT()
    neutral = WebVTT()

    for cue in webvtt.read('static/subtitle.vtt'):
        label = predict(str(cue.text))
        if label == "joy":
            prefix, bucket = "<c.green> ", positive
        elif label in ("anger", "sadness", "fear"):
            prefix, bucket = "<c.red> ", negative
        elif label == "neutral":
            prefix, bucket = "<c.blue> ", neutral
        else:
            continue
        bucket.captions.append(
            Caption(cue.start, cue.end,
                    prefix + label + ": " + cue.text + "</c>"))

    positive.save('static/Text/positive.vtt')
    negative.save('static/Text/negative.vtt')
    neutral.save('static/Text/neutral.vtt')
class WebVTTTestCase(unittest.TestCase):
    """Tests for ``WebVTT`` reading, saving and conversion, and for ``Caption``.

    Fixtures are read from SUBTITLES_DIR. Tests that write files create
    OUTPUT_DIR themselves; ``tearDown`` removes it after every test.
    """

    def setUp(self):
        self.webvtt = WebVTT()

    def _get_file(self, filename):
        # Path of a fixture file inside SUBTITLES_DIR.
        return os.path.join(SUBTITLES_DIR, filename)

    def tearDown(self):
        if os.path.exists(OUTPUT_DIR):
            rmtree(OUTPUT_DIR)

    def test_create_caption(self):
        # Constructor arguments are readable back, as strings and as seconds.
        caption = Caption('00:00:00.500', '00:00:07.000',
                          ['Caption test line 1', 'Caption test line 2'])
        self.assertEqual(caption.start, '00:00:00.500')
        self.assertEqual(caption.start_in_seconds, 0.5)
        self.assertEqual(caption.end, '00:00:07.000')
        self.assertEqual(caption.end_in_seconds, 7)
        self.assertEqual(caption.lines,
                         ['Caption test line 1', 'Caption test line 2'])

    def test_save_captions(self):
        # save() without arguments writes back to the file that was read.
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('one_caption.vtt'), OUTPUT_DIR)
        self.webvtt.read(os.path.join(OUTPUT_DIR, 'one_caption.vtt'))
        new_caption = Caption(
            '00:00:07.000', '00:00:11.890',
            ['New caption text line1', 'New caption text line2'])
        self.webvtt.captions.append(new_caption)
        self.webvtt.save()
        with open(os.path.join(OUTPUT_DIR, 'one_caption.vtt'), 'r',
                  encoding='utf-8') as f:
            lines = [line.rstrip() for line in f.readlines()]
        expected_lines = [
            'WEBVTT', '', '00:00:00.500 --> 00:00:07.000', 'Caption text #1',
            '', '00:00:07.000 --> 00:00:11.890', 'New caption text line1',
            'New caption text line2'
        ]
        self.assertListEqual(lines, expected_lines)

    def test_srt_conversion(self):
        # Converting an .srt and saving yields a .vtt with the same base name.
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('one_caption.srt'), OUTPUT_DIR)
        self.webvtt.from_srt(os.path.join(OUTPUT_DIR, 'one_caption.srt'))
        self.webvtt.save()
        self.assertTrue(
            os.path.exists(os.path.join(OUTPUT_DIR, 'one_caption.vtt')))
        with open(os.path.join(OUTPUT_DIR, 'one_caption.vtt'), 'r',
                  encoding='utf-8') as f:
            lines = [line.rstrip() for line in f.readlines()]
        expected_lines = [
            'WEBVTT',
            '',
            '00:00:00.500 --> 00:00:07.000',
            'Caption text #1',
        ]
        self.assertListEqual(lines, expected_lines)

    def test_sbv_conversion(self):
        # Same as above for the .sbv format, including a two-line caption.
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('two_captions.sbv'), OUTPUT_DIR)
        self.webvtt.from_sbv(os.path.join(OUTPUT_DIR, 'two_captions.sbv'))
        self.webvtt.save()
        self.assertTrue(
            os.path.exists(os.path.join(OUTPUT_DIR, 'two_captions.vtt')))
        with open(os.path.join(OUTPUT_DIR, 'two_captions.vtt'), 'r',
                  encoding='utf-8') as f:
            lines = [line.rstrip() for line in f.readlines()]
        expected_lines = [
            'WEBVTT',
            '',
            '00:00:00.378 --> 00:00:11.378',
            'Caption text #1',
            '',
            '00:00:11.378 --> 00:00:12.305',
            'Caption text #2 (line 1)',
            'Caption text #2 (line 2)',
        ]
        self.assertListEqual(lines, expected_lines)

    def test_save_to_other_location(self):
        # Passing a directory to save() keeps the original file name.
        target_path = os.path.join(OUTPUT_DIR, 'test_folder')
        os.makedirs(target_path)
        self.webvtt.read(self._get_file('one_caption.vtt')).save(target_path)
        self.assertTrue(
            os.path.exists(os.path.join(target_path, 'one_caption.vtt')))

    def test_save_specific_filename(self):
        # Passing a full file name to save() writes exactly that file.
        target_path = os.path.join(OUTPUT_DIR, 'test_folder')
        os.makedirs(target_path)
        output_file = os.path.join(target_path, 'custom_name.vtt')
        self.webvtt.read(self._get_file('one_caption.vtt')).save(output_file)
        self.assertTrue(os.path.exists(output_file))

    def test_save_specific_filename_no_extension(self):
        # A name without extension is expected to get ".vtt" appended.
        target_path = os.path.join(OUTPUT_DIR, 'test_folder')
        os.makedirs(target_path)
        output_file = os.path.join(target_path, 'custom_name')
        self.webvtt.read(self._get_file('one_caption.vtt')).save(output_file)
        self.assertTrue(
            os.path.exists(os.path.join(target_path, 'custom_name.vtt')))

    def test_caption_timestamp_update(self):
        # start and end can be reassigned after construction.
        c = Caption('00:00:00.500', '00:00:07.000')
        c.start = '00:00:01.750'
        c.end = '00:00:08.250'
        self.assertEqual(c.start, '00:00:01.750')
        self.assertEqual(c.end, '00:00:08.250')

    def test_caption_text(self):
        # A list of lines is exposed as newline-joined text.
        c = Caption(text=['Caption line #1', 'Caption line #2'])
        self.assertEqual(c.text, 'Caption line #1\nCaption line #2')

    def test_caption_receive_text(self):
        # A newline-separated string is split into lines.
        c = Caption(text='Caption line #1\nCaption line #2')
        self.assertEqual(len(c.lines), 2)
        self.assertEqual(c.text, 'Caption line #1\nCaption line #2')

    def test_supported_formats(self):
        self.assertListEqual(WebVTT().supported_formats(),
                             [sf[0] for sf in SUPPORTED_FORMATS])

    def test_update_text(self):
        c = Caption(text='Caption line #1')
        c.text = 'Caption line #1 updated'
        self.assertEqual(c.text, 'Caption line #1 updated')

    def test_update_text_multiline(self):
        c = Caption(text='Caption line #1')
        c.text = 'Caption line #1\nCaption line #2'
        self.assertEqual(len(c.lines), 2)
        self.assertEqual(c.text, 'Caption line #1\nCaption line #2')

    def test_update_text_wrong_type(self):
        # Assigning a non-string to text is expected to raise AttributeError.
        c = Caption(text='Caption line #1')
        self.assertRaises(AttributeError, setattr, c, 'text', 123)

    def test_manipulate_lines(self):
        c = Caption(text=['Caption line #1', 'Caption line #2'])
        c.lines[0] = 'Caption line #1 updated'
        self.assertEqual(c.lines[0], 'Caption line #1 updated')

    def test_captions(self):
        self.webvtt.read(self._get_file('sample.vtt'))
        self.assertIsInstance(self.webvtt.captions, list)

    def test_captions_prevent_write(self):
        # The captions attribute itself is expected to be read-only.
        self.webvtt.read(self._get_file('sample.vtt'))
        self.assertRaises(AttributeError, setattr, self.webvtt, 'captions', [])

    def test_sequence_iteration(self):
        # A WebVTT object supports indexing and len() over its captions.
        self.webvtt.read(self._get_file('sample.vtt'))
        self.assertIsInstance(self.webvtt[0], Caption)
        self.assertEqual(len(self.webvtt), len(self.webvtt.captions))

    def test_save_no_filename(self):
        # Saving a document that was never read and has no target must fail.
        webvtt = WebVTT()
        self.assertRaises(MissingFilenameError, webvtt.save)

    def test_malformed_start_timestamp(self):
        self.assertRaises(MalformedCaptionError, Caption, '01:00')
# -*- coding: utf-8 -*-
"""Convert one hard-coded SRT file to WebVTT, echoing every cue on the way."""
import pysrt
import webvtt
from webvtt import WebVTT, Caption

subs = pysrt.open(
    'Tanmay Bakshi - New Google Employee Indian Boy Going To Ninth Grade.srt',
    encoding='utf-8')
vtt = WebVTT()

for ligne in subs:
    # Start, end and text as strings, in that order.
    fields = (str(ligne.start), str(ligne.end), str(ligne.text))
    for field in fields:
        print(field)
    caption = Caption(*fields)
    vtt.captions.append(caption)

vtt.save('_fr.vtt')
# Translate the captions of `filename` by round-tripping them through HTML:
# each cue becomes a <span> that carries its timings as data attributes, the
# whole document is translated in one call, and the spans are read back.
vtt = WebVTT()
vtt.read(filename)
stmp = StringIO()
print("<div>", file=stmp)
for caption in vtt:
    # NOTE(review): cue text is interpolated without HTML escaping — confirm
    # that captions never contain '<' or '&'.
    print('<span data-start="{}" data-end="{}">{}</span>'.format(
        caption.start, caption.end, caption.text), file=stmp)
print("</div>", file=stmp)

# Translate
driver = TranslationDriver(args.lang)
strans = driver.translate(stmp.getvalue())

# Convert translated HTML back to VTT
vtt = WebVTT()
soup = BeautifulSoup(strans, "lxml")
for span in soup.find_all("span"):
    # Timings come back from the attributes written above.
    start = span["data-start"]
    end = span["data-end"]
    caption = Caption(start, end, span.text)
    vtt.captions.append(caption)

# Remove the english file
# NOTE(review): the source is deleted before the translation is saved, so a
# failing save() would leave neither file — confirm this is acceptable.
os.remove(filename)
# The output name swaps the ".en." marker for the target language.
outfile = filename.replace(".en.", ".{}.".format(args.lang))
vtt.save(outfile)
print(green(outfile, bold=True))
class WebVTTTestCase(unittest.TestCase):
    """Tests for ``WebVTT`` (read, write, save, conversion), ``Caption`` and ``Style``.

    Fixtures are read from SUBTITLES_DIR. Tests that write files create
    OUTPUT_DIR themselves; ``tearDown`` removes it after every test.
    """

    def setUp(self):
        self.webvtt = WebVTT()

    def _get_file(self, filename):
        # Path of a fixture file inside SUBTITLES_DIR.
        return os.path.join(SUBTITLES_DIR, filename)

    def tearDown(self):
        if os.path.exists(OUTPUT_DIR):
            rmtree(OUTPUT_DIR)

    def test_create_caption(self):
        # Constructor arguments are readable back, as strings and as seconds.
        caption = Caption('00:00:00.500', '00:00:07.000',
                          ['Caption test line 1', 'Caption test line 2'])
        self.assertEqual(caption.start, '00:00:00.500')
        self.assertEqual(caption.start_in_seconds, 0.5)
        self.assertEqual(caption.end, '00:00:07.000')
        self.assertEqual(caption.end_in_seconds, 7)
        self.assertEqual(caption.lines,
                         ['Caption test line 1', 'Caption test line 2'])

    def test_write_captions(self):
        # write() emits the document to a file-like object instead of a path.
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('one_caption.vtt'), OUTPUT_DIR)
        out = io.StringIO()
        self.webvtt.read(os.path.join(OUTPUT_DIR, 'one_caption.vtt'))
        new_caption = Caption(
            '00:00:07.000', '00:00:11.890',
            ['New caption text line1', 'New caption text line2'])
        self.webvtt.captions.append(new_caption)
        self.webvtt.write(out)
        out.seek(0)
        lines = [line.rstrip() for line in out.readlines()]
        expected_lines = [
            'WEBVTT', '', '00:00:00.500 --> 00:00:07.000', 'Caption text #1',
            '', '00:00:07.000 --> 00:00:11.890', 'New caption text line1',
            'New caption text line2'
        ]
        self.assertListEqual(lines, expected_lines)

    def test_save_captions(self):
        # save() without arguments writes back to the file that was read.
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('one_caption.vtt'), OUTPUT_DIR)
        self.webvtt.read(os.path.join(OUTPUT_DIR, 'one_caption.vtt'))
        new_caption = Caption(
            '00:00:07.000', '00:00:11.890',
            ['New caption text line1', 'New caption text line2'])
        self.webvtt.captions.append(new_caption)
        self.webvtt.save()
        with open(os.path.join(OUTPUT_DIR, 'one_caption.vtt'), 'r',
                  encoding='utf-8') as f:
            lines = [line.rstrip() for line in f.readlines()]
        expected_lines = [
            'WEBVTT', '', '00:00:00.500 --> 00:00:07.000', 'Caption text #1',
            '', '00:00:07.000 --> 00:00:11.890', 'New caption text line1',
            'New caption text line2'
        ]
        self.assertListEqual(lines, expected_lines)

    def test_srt_conversion(self):
        # Converting an .srt and saving yields a .vtt with the same base name.
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('one_caption.srt'), OUTPUT_DIR)
        self.webvtt.from_srt(os.path.join(OUTPUT_DIR, 'one_caption.srt'))
        self.webvtt.save()
        self.assertTrue(
            os.path.exists(os.path.join(OUTPUT_DIR, 'one_caption.vtt')))
        with open(os.path.join(OUTPUT_DIR, 'one_caption.vtt'), 'r',
                  encoding='utf-8') as f:
            lines = [line.rstrip() for line in f.readlines()]
        expected_lines = [
            'WEBVTT',
            '',
            '00:00:00.500 --> 00:00:07.000',
            'Caption text #1',
        ]
        self.assertListEqual(lines, expected_lines)

    def test_sbv_conversion(self):
        # Same as above for the .sbv format, including a two-line caption.
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('two_captions.sbv'), OUTPUT_DIR)
        self.webvtt.from_sbv(os.path.join(OUTPUT_DIR, 'two_captions.sbv'))
        self.webvtt.save()
        self.assertTrue(
            os.path.exists(os.path.join(OUTPUT_DIR, 'two_captions.vtt')))
        with open(os.path.join(OUTPUT_DIR, 'two_captions.vtt'), 'r',
                  encoding='utf-8') as f:
            lines = [line.rstrip() for line in f.readlines()]
        expected_lines = [
            'WEBVTT',
            '',
            '00:00:00.378 --> 00:00:11.378',
            'Caption text #1',
            '',
            '00:00:11.378 --> 00:00:12.305',
            'Caption text #2 (line 1)',
            'Caption text #2 (line 2)',
        ]
        self.assertListEqual(lines, expected_lines)

    def test_save_to_other_location(self):
        # Passing a directory to save() keeps the original file name.
        target_path = os.path.join(OUTPUT_DIR, 'test_folder')
        os.makedirs(target_path)
        self.webvtt.read(self._get_file('one_caption.vtt')).save(target_path)
        self.assertTrue(
            os.path.exists(os.path.join(target_path, 'one_caption.vtt')))

    def test_save_specific_filename(self):
        # Passing a full file name to save() writes exactly that file.
        target_path = os.path.join(OUTPUT_DIR, 'test_folder')
        os.makedirs(target_path)
        output_file = os.path.join(target_path, 'custom_name.vtt')
        self.webvtt.read(self._get_file('one_caption.vtt')).save(output_file)
        self.assertTrue(os.path.exists(output_file))

    def test_save_specific_filename_no_extension(self):
        # A name without extension is expected to get ".vtt" appended.
        target_path = os.path.join(OUTPUT_DIR, 'test_folder')
        os.makedirs(target_path)
        output_file = os.path.join(target_path, 'custom_name')
        self.webvtt.read(self._get_file('one_caption.vtt')).save(output_file)
        self.assertTrue(
            os.path.exists(os.path.join(target_path, 'custom_name.vtt')))

    def test_caption_timestamp_update(self):
        # start and end can be reassigned after construction.
        c = Caption('00:00:00.500', '00:00:07.000')
        c.start = '00:00:01.750'
        c.end = '00:00:08.250'
        self.assertEqual(c.start, '00:00:01.750')
        self.assertEqual(c.end, '00:00:08.250')

    def test_caption_timestamp_format(self):
        # Timestamps given without an hours field are expected to be reported
        # with a leading "00:".
        c = Caption('01:02:03.400', '02:03:04.500')
        self.assertEqual(c.start, '01:02:03.400')
        self.assertEqual(c.end, '02:03:04.500')
        c = Caption('02:03.400', '03:04.500')
        self.assertEqual(c.start, '00:02:03.400')
        self.assertEqual(c.end, '00:03:04.500')

    def test_caption_text(self):
        # A list of lines is exposed as newline-joined text.
        c = Caption(text=['Caption line #1', 'Caption line #2'])
        self.assertEqual(c.text, 'Caption line #1\nCaption line #2')

    def test_caption_receive_text(self):
        # A newline-separated string is split into lines.
        c = Caption(text='Caption line #1\nCaption line #2')
        self.assertEqual(len(c.lines), 2)
        self.assertEqual(c.text, 'Caption line #1\nCaption line #2')

    def test_update_text(self):
        c = Caption(text='Caption line #1')
        c.text = 'Caption line #1 updated'
        self.assertEqual(c.text, 'Caption line #1 updated')

    def test_update_text_multiline(self):
        c = Caption(text='Caption line #1')
        c.text = 'Caption line #1\nCaption line #2'
        self.assertEqual(len(c.lines), 2)
        self.assertEqual(c.text, 'Caption line #1\nCaption line #2')

    def test_update_text_wrong_type(self):
        # Assigning a non-string to text is expected to raise AttributeError.
        c = Caption(text='Caption line #1')
        self.assertRaises(AttributeError, setattr, c, 'text', 123)

    def test_manipulate_lines(self):
        c = Caption(text=['Caption line #1', 'Caption line #2'])
        c.lines[0] = 'Caption line #1 updated'
        self.assertEqual(c.lines[0], 'Caption line #1 updated')

    def test_captions(self):
        self.webvtt.read(self._get_file('sample.vtt'))
        self.assertIsInstance(self.webvtt.captions, list)

    def test_captions_prevent_write(self):
        # The captions attribute itself is expected to be read-only.
        self.webvtt.read(self._get_file('sample.vtt'))
        self.assertRaises(AttributeError, setattr, self.webvtt, 'captions', [])

    def test_sequence_iteration(self):
        # A WebVTT object supports indexing and len() over its captions.
        self.webvtt.read(self._get_file('sample.vtt'))
        self.assertIsInstance(self.webvtt[0], Caption)
        self.assertEqual(len(self.webvtt), len(self.webvtt.captions))

    def test_save_no_filename(self):
        # Saving a document that was never read and has no target must fail.
        webvtt = WebVTT()
        self.assertRaises(MissingFilenameError, webvtt.save)

    def test_malformed_start_timestamp(self):
        self.assertRaises(MalformedCaptionError, Caption, '01:00')

    def test_set_styles_from_text(self):
        # Assigning text to a Style is expected to split it into lines.
        style = Style()
        style.text = '::cue(b) {\n color: peachpuff;\n}'
        self.assertListEqual(style.lines,
                             ['::cue(b) {', ' color: peachpuff;', '}'])

    def test_get_styles_as_text(self):
        # Reading text back is expected to give the lines joined on one line.
        style = Style()
        style.lines = ['::cue(b) {', ' color: peachpuff;', '}']
        self.assertEqual(style.text, '::cue(b) {color: peachpuff;}')

    def test_save_identifiers(self):
        # Cue identifiers present in the source must survive a read/save cycle.
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('using_identifiers.vtt'), OUTPUT_DIR)
        self.webvtt.read(os.path.join(OUTPUT_DIR, 'using_identifiers.vtt'))
        self.webvtt.save(os.path.join(OUTPUT_DIR, 'new_using_identifiers.vtt'))
        with open(os.path.join(OUTPUT_DIR, 'new_using_identifiers.vtt'), 'r',
                  encoding='utf-8') as f:
            lines = [line.rstrip() for line in f.readlines()]
        expected_lines = [
            'WEBVTT', '', '00:00:00.500 --> 00:00:07.000', 'Caption text #1',
            '', 'second caption', '00:00:07.000 --> 00:00:11.890',
            'Caption text #2', '', '00:00:11.890 --> 00:00:16.320',
            'Caption text #3', '', '4', '00:00:16.320 --> 00:00:21.580',
            'Caption text #4', '', '00:00:21.580 --> 00:00:23.880',
            'Caption text #5', '', '00:00:23.880 --> 00:00:27.280',
            'Caption text #6'
        ]
        self.assertListEqual(lines, expected_lines)

    def test_save_updated_identifiers(self):
        # Identifiers that are added, cleared (None) or changed in memory must
        # all be reflected in the saved file.
        os.makedirs(OUTPUT_DIR)
        copy(self._get_file('using_identifiers.vtt'), OUTPUT_DIR)
        self.webvtt.read(os.path.join(OUTPUT_DIR, 'using_identifiers.vtt'))
        self.webvtt.captions[0].identifier = 'first caption'
        self.webvtt.captions[1].identifier = None
        self.webvtt.captions[3].identifier = '44'
        last_caption = Caption('00:00:27.280', '00:00:29.200',
                               'Caption text #7')
        last_caption.identifier = 'last caption'
        self.webvtt.captions.append(last_caption)
        self.webvtt.save(os.path.join(OUTPUT_DIR,
                                      'new_using_identifiers.vtt'))
        with open(os.path.join(OUTPUT_DIR, 'new_using_identifiers.vtt'), 'r',
                  encoding='utf-8') as f:
            lines = [line.rstrip() for line in f.readlines()]
        expected_lines = [
            'WEBVTT', '', 'first caption', '00:00:00.500 --> 00:00:07.000',
            'Caption text #1', '', '00:00:07.000 --> 00:00:11.890',
            'Caption text #2', '', '00:00:11.890 --> 00:00:16.320',
            'Caption text #3', '', '44', '00:00:16.320 --> 00:00:21.580',
            'Caption text #4', '', '00:00:21.580 --> 00:00:23.880',
            'Caption text #5', '', '00:00:23.880 --> 00:00:27.280',
            'Caption text #6', '', 'last caption',
            '00:00:27.280 --> 00:00:29.200', 'Caption text #7'
        ]
        self.assertListEqual(lines, expected_lines)
def _format_hms(total_seconds):
    """Return a non-negative whole number of seconds as zero-padded HH:MM:SS."""
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return '{:02d}:{:02d}:{:02d}'.format(hours, minutes, seconds)


def genarateSUB(url, lang):
    """Build a translated WebVTT subtitle file for the video at *url*.

    The video's audio is extracted and split into numbered ``.wav`` chunks
    under ``../Datas/Splits/<title>/``. Each chunk is transcribed, translated
    into *lang* and written as one caption in a 5-second window. Chunks whose
    transcript is a single space produce no caption.

    Args:
        url: Video URL handed to the caption, download and audio helpers.
        lang: Target language code; also used in the output file name.

    Returns:
        ``(videoName, vttName)``: the value returned by
        ``Extract_Caption.download_video`` and the name of the ``.vtt`` file
        saved under ``../webApp/static/SubtitleFile/``.

    Side effects:
        The split chunks, their directory and the extracted wav file are
        deleted once the subtitle file has been saved.
    """
    language = lang
    captionTitle = Extract_Caption.extractTitle(url)
    videoName = Extract_Caption.download_video(url)
    wavFilePath = extractWavAudio.extractWAV(url)

    # Split the audio file into multiple chunks.
    AudioSplit.split(wavFilePath, captionTitle)
    split_dir = '../Datas/Splits/' + captionTitle + '/'

    vtt = WebVTT()
    num_files = len(os.listdir(split_dir))
    start = 0
    end = 5
    for i in range(1, num_files + 1):
        text, confidence = ms_asr.transcribe(split_dir + str(i) + '.wav')
        print("Text: ", text)
        print("Confidence: ", confidence)
        has_speech = text != " "
        if has_speech:
            translated_text = TRANSLATR_TO_TEXT.translateFromTXT(
                text, language)
        else:
            translated_text = " "
        print("Translated Text: ", translated_text)
        if has_speech:
            # Timestamp suffixes and trailing whitespace are unchanged from
            # the previous implementation.
            caption = Caption(
                _format_hms(start) + '.001 ',
                _format_hms(end) + '.000\n',
                str(translated_text) + '\n')
            vtt.captions.append(caption)
        # Advance the window for every chunk, silent or not, so later
        # captions stay aligned with the audio.
        start += 5
        end += 5

    vttName = captionTitle + "_" + language + ".vtt"
    vttFilePath = "../webApp/static/SubtitleFile/" + vttName
    vtt.save(vttFilePath)

    # Clean up the temporary audio artefacts.
    for f in glob.glob('../Datas/Splits/' + captionTitle + '/*'):
        os.remove(f)
    os.rmdir('../Datas/Splits/' + captionTitle)
    os.remove(wavFilePath)
    return videoName, vttName
def add_word(self, word, collection, start, end, name, add_type, word_type,
             group, word_id='', wordset_id=''):
    """Cut a clip and matching subtitles around *word* and upload both.

    The video called *name* is searched for under ``D:\\BaiduYunDownload``.
    The section from *start* to *end* (seconds) is written as an ``.mp4``.
    Subtitles lying fully inside that window are shifted to start at zero,
    get *word* wrapped in ``<c.video-heightlight>`` tags and are saved as
    ``.srt`` and ``.vtt``. The clip and the ``.vtt`` are then posted to
    ``http://<server_ip>/video`` together with the word metadata.

    Returns:
        The string ``"true"``.
    """
    clean_word = word.strip()
    puresave_filename = name.split('.')[0] + "~" + clean_word
    data = {
        'filename': puresave_filename,
        'wordbase_collection': collection,
        'word': word,
        'add_type': add_type,
        'word_type': word_type,
        'group': group,
        'word_id': word_id,
        'wordset_id': wordset_id,
    }

    # Backslashes are escaped explicitly: "\B" is an invalid escape sequence
    # that newer Python versions warn about. The path values are unchanged.
    work_dir = "D:\\BaiduYunDownload"
    videos_dir = "D:\\BaiduYunDownload\\videos\\"

    # Locate the first directory that contains the requested video.
    file_path = ""
    parent_path = ""
    for parent, _dirnames, filenames in os.walk(work_dir, followlinks=True):
        if name in filenames:
            parent_path = parent
            file_path = os.path.join(parent, name)
            break

    start_time = float(start)
    end_time = float(end)
    pure_filename = name.split('.')[0]
    subfile_path = os.path.join(parent_path, pure_filename + ".srt")

    target = videos_dir + puresave_filename + ".mp4"
    video_clip = VideoFileClip(file_path)
    try:
        clip = video_clip.subclip(start_time, end_time)
        clip.write_videofile(target, codec='libx264', verbose=False,
                             audio=True)
    finally:
        # Release the reader even when encoding fails.
        video_clip.close()

    subtitle = SSAFile.load(subfile_path)
    # Seed document to which the selected subtitles are appended.
    # NOTE(review): line breaks inside this literal were reconstructed as a
    # conventional SRT cue — confirm against the original layout.
    text = '''
1
00:00:00,000 --> 00:00:00,000

'''
    temp = SSAFile().from_string(text)
    for sub in subtitle:
        # Subtitle times are compared against the window in milliseconds.
        if sub.start >= start_time * 1000 and sub.end <= end_time * 1000:
            sub.text = sub.text.replace(
                clean_word, '<c.video-heightlight>' + clean_word + '</c>')
            sub.shift(s=-start_time)
            temp.append(sub)

    sub_target = videos_dir + puresave_filename
    temp.save(sub_target + '.srt')
    vtt = WebVTT().from_srt(sub_target + '.srt')
    vtt.save(sub_target + '.vtt')

    # Both upload handles are closed once the request has completed.
    with open(target, "rb") as video_file, \
            open(sub_target + '.vtt', "rb") as subtitle_file:
        files = {"video": video_file, "subtitle": subtitle_file}
        requests.post('http://' + server_ip + '/video', data=data,
                      files=files)
    return "true"