def convert_to_vtt(self, new_file):
    """Convert the SRT file at ``new_file`` to WebVTT and return the new path.

    The output path is ``new_file`` with its last three characters replaced
    by ``vtt``. When webvtt raises ``UnicodeDecodeError`` the source is
    re-read as latin-1, written back out as UTF-8 to a sibling ``.txt``
    file, and the conversion is retried from that copy.

    Returns:
        The path of the written ``.vtt`` file, or ``""`` when webvtt reports
        the file (original or transcoded copy) as malformed.
    """
    stem = new_file[:-3]
    vtt_path = stem + "vtt"
    try:
        try:
            webvtt.from_srt(new_file).save(vtt_path)
        except UnicodeDecodeError:
            print("************ codecs ***********")
            utf8_copy = stem + "txt"
            with codecs.open(new_file, "r", encoding="latin-1") as source_file:
                contents = source_file.read()
            with codecs.open(utf8_copy, "w", "utf-8") as target_file:
                target_file.write(contents)
            # The retry stays inside the outer try so that a malformed
            # transcoded file is reported like any other malformed file
            # instead of escaping from the exception handler.
            webvtt.from_srt(utf8_copy).save(vtt_path)
    except webvtt.errors.MalformedFileError:
        print("************ "
              "The file does not have a valid format. !!!!! "
              "************")
        print(new_file)
        print("************ ************")
        return ""
    return vtt_path
def convert_vtt_caption(request, file):
    """Return the captions of a stored SRT file as WebVTT text.

    ``file`` is the primary key used to look up the ``File`` record; the
    record's ``file.path`` is parsed with ``webvtt.from_srt`` and its
    captions are rendered by ``WebVTTWriter``. The rendered text is returned
    in a ``JsonResponse`` under the ``"content"`` key.
    """
    from nas.utils.utils2 import WebVTTWriter

    record = File.objects.get(pk=file)
    parsed = webvtt.from_srt(record.file.path)
    rendered = WebVTTWriter().write(parsed.captions)
    return JsonResponse(data={"content": rendered})
def srtToCaptions(self, vttObject):
    """Load an SRT document from S3 and return its captions as dicts.

    Args:
        vttObject: mapping with ``"Bucket"`` and ``"Key"`` entries naming the
            S3 object to read.

    Returns:
        A list of dicts with ``"start"`` and ``"end"`` (converted by
        ``self.formatTimeVTTtoSeconds``) and ``"caption"`` (the first text
        line of each cue).

    Raises:
        Whatever ``S3Helper().readFromS3`` or the webvtt parser raises; errors
        are not translated.
    """
    import os

    self.logger.debug("Getting data from s3://" + vttObject["Bucket"] + "/" + vttObject["Key"])
    srt = S3Helper().readFromS3(vttObject["Bucket"], vttObject["Key"])
    self.logger.debug(srt)

    # webvtt.from_srt is given a file name here, so the downloaded text is
    # staged in a temporary file that is closed before parsing.
    with NamedTemporaryFile(mode='w+', delete=False) as staged:
        staged.write(srt)

    captions = []
    try:
        for srtcaption in webvtt.from_srt(staged.name):
            self.logger.debug(srtcaption)
            caption = {
                "start": self.formatTimeVTTtoSeconds(srtcaption.start),
                "end": self.formatTimeVTTtoSeconds(srtcaption.end),
                "caption": srtcaption.lines[0],
            }
            self.logger.debug("Caption Object:{}".format(caption))
            captions.append(caption)
    finally:
        # delete=False means nobody else cleans this file up.
        os.remove(staged.name)
    return captions
def srt_translate(upload_folder, download_folder, file_name, file_name_persian):
    """Translate the captions of an SRT file from English to Persian.

    The source is ``upload_folder + file_name``; translated cues are appended
    to ``download_folder + file_name_persian`` as numbered blocks of
    ``start --> end`` followed by the translated text. Folder arguments are
    concatenated as-is, so they are expected to end with a path separator.

    Returns:
        ``file_name_persian`` unchanged.
    """
    source_path = "%s%s" % (upload_folder, file_name)
    target_path = "%s%s" % (download_folder, file_name_persian)

    # Explicit UTF-8: the translated text is Persian and must not depend on
    # the platform's default encoding.
    with open(target_path, "a+", encoding="utf-8") as f:
        captions = webvtt.from_srt(source_path)
        for counter, caption in enumerate(captions, start=1):
            translated = translator.translate(
                caption.text, src="en", dest="fa").text
            f.write("%d\n%s --> %s\n%s\n\n"
                    % (counter, caption.start, caption.end, translated))

    # Report the file that was actually written (the download folder).
    print("Done. --> %s" % target_path)
    return file_name_persian


# TODO: detect the file format and dispatch accordingly, e.g.
# if y == ".srt":
#     srt_translate(file_name, file_name_persian)
# elif y == ".vtt":
#     vtt_translate(file_name, file_name_persian)
# elif y == ".sbv":
#     pass
# else:
#     print("dont detect!!!")
def save_media_caption_file(file_guid, language, file_name, f_handle):
    """Store an uploaded caption file next to its media file.

    The destination is the path returned by ``get_media_file_path(file_guid)``
    with ``.mp4`` replaced by ``_<language><ext>``, where ``<ext>`` is the
    lower-cased extension of ``file_name``. When the stored file is an SRT, a
    WebVTT copy is written beside it.

    Args:
        file_guid: identifier passed to ``get_media_file_path``.
        language: language code embedded in the destination file name.
        file_name: original upload name; only its extension is used.
        f_handle: readable binary file-like object holding the caption data.

    Returns:
        True when the file (and the VTT copy, if needed) was written, False
        if any step raised; the error is printed rather than propagated.
    """
    ret = False
    print("Trying to save caption file: " + file_guid + "/" + language + "/" +
          file_name)
    # Bound before the try block so the error message below can always use it,
    # even when the failure happens before the destination is computed.
    dest_path = ""
    try:
        file_name = os.path.basename(file_name)
        extension = os.path.splitext(file_name)[1].lower()
        dest_path = get_media_file_path(file_guid)
        dest_path = dest_path.replace(".mp4", "_" + language + extension)

        with open(dest_path, 'wb') as out_file:
            out_file.write(f_handle.read())

        # Do we need to convert to VTT?
        if dest_path.lower().endswith("srt"):
            vtt = webvtt.from_srt(dest_path)
            # Swap only the trailing extension; a plain str.replace would
            # also rewrite any "srt" found in a directory name.
            output_caption_file = dest_path[:-3] + "vtt"
            vtt.save(output_caption_file)
        ret = True
    except Exception as ex:
        print("Error saving caption file! " + file_guid + "/" + dest_path +
              "/" + language + "\n" + str(ex))
    return ret
def test_srt_parse_get_caption_data(self):
    """The only caption of one_caption.srt exposes its timing and text."""
    parsed = webvtt.from_srt(self._get_file('one_caption.srt'))
    first = parsed.captions[0]

    self.assertEqual(first.start_in_seconds, 0.5)
    self.assertEqual(first.start, '00:00:00.500')
    self.assertEqual(first.end_in_seconds, 7)
    self.assertEqual(first.end, '00:00:07.000')
    self.assertEqual(first.lines[0], 'Caption text #1')
    self.assertEqual(len(first.lines), 1)
def srt2vtt(srt_filename, vtt_filename):
    """Convert ``srt_filename`` to WebVTT at ``vtt_filename``.

    Returns the error text when webvtt raises ``MalformedCaptionError`` and
    ``None`` otherwise. Both arguments are passed as path strings (per the
    original note, webvtt does not accept io.buffered objects here).
    """
    try:
        webvtt.from_srt(srt_filename).save(vtt_filename)
    except webvtt.errors.MalformedCaptionError as error:
        return str(error)
    return None
def test_srt_parse_get_caption_data(self):
    """Check start, end and text of the caption parsed from one_caption.srt."""
    caption = webvtt.from_srt(self._get_file('one_caption.srt')).captions[0]

    # Timing, both as seconds and as formatted timestamps.
    self.assertEqual(caption.start_in_seconds, 0.5)
    self.assertEqual(caption.start, '00:00:00.500')
    self.assertEqual(caption.end_in_seconds, 7)
    self.assertEqual(caption.end, '00:00:07.000')

    # Text payload: exactly one line.
    self.assertEqual(caption.lines[0], 'Caption text #1')
    self.assertEqual(len(caption.lines), 1)
def to_vtt(self, filename):
    """Return the path of a WebVTT version of ``filename``.

    A name ending in ``.vtt`` (any case) is returned untouched. Otherwise the
    file is probed with ``webvtt.read``; if that succeeds the original path
    is returned. If probing fails, the file is converted from SRT and the
    path of the generated ``.vtt`` file is returned.

    The probe and the conversion run while holding ``self._file_lock``.
    """
    if filename.lower().endswith('.vtt'):
        return filename

    import os
    import webvtt

    with self._file_lock:
        try:
            webvtt.read(filename)
            return filename
        except Exception:
            webvtt.from_srt(filename).save()
            # Derive the name from the extension only, so a dot elsewhere in
            # the path (or no extension at all) cannot truncate the result.
            return os.path.splitext(filename)[0] + '.vtt'
def craete_vtt_from(srt_file):
    """Write a ``.vtt`` file next to ``srt_file``, ignoring conversion errors.

    The output path is ``srt_file`` with its extension replaced by ``.vtt``.
    Conversion is best-effort: any ordinary error (for example a file that
    does not have a valid format) is swallowed.

    Returns:
        Always ``None``.
    """
    try:
        vtt = webvtt.from_srt(srt_file)
        path_vtt_file = os.path.splitext(srt_file)[0] + '.vtt'
        vtt.save(path_vtt_file)
    except Exception:
        # Best-effort by design, but no longer a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        return
def get_subtitle_file(filename: str) -> "WebVTT":
    """Parse a subtitle file with the webvtt reader matching its extension.

    ``.srt`` files go through ``webvtt.from_srt``, ``.sbv`` through
    ``webvtt.from_sbv`` and ``.vtt`` through ``webvtt.read``. The extension is
    compared case-insensitively, so ``MOVIE.SRT`` is accepted as well.

    Raises:
        ValueError: if the extension is none of the above; the exception
            carries ``filename``.
    """
    file_ext = os.path.splitext(filename)[1].lower()
    if file_ext == ".srt":
        return webvtt.from_srt(filename)
    if file_ext == ".sbv":
        return webvtt.from_sbv(filename)
    if file_ext == ".vtt":
        return webvtt.read(filename)
    raise ValueError(filename)
def _init_content(self):
    """Populate ``self.content`` from the SRT file at ``self.file``.

    Each caption becomes a dict with its ``start`` and ``end`` timestamps,
    its text passed through ``filter_typos``, and a 1-based ``identifier``
    stored as a string.
    """
    self.content = []
    parsed = webvtt.from_srt(self.file)
    self.content.extend(
        {
            "start": caption.start,
            "end": caption.end,
            "text": filter_typos(caption.text),
            "identifier": str(position),
        }
        for position, caption in enumerate(parsed.captions, start=1)
    )
def convert_to_vtt(srt_path):
    """Return the WebVTT text for the SRT file at ``srt_path``.

    The source is read as UTF-8, falling back to ISO-8859-1, stripped, and
    re-written as UTF-8 to a temporary ``.srt`` file that webvtt converts.
    Both temporary files are removed before returning.

    Raises:
        AssertionError: if the file yields no content (message
            ``'no detectable encoding'``).
    """
    import os

    srt_content = None
    for encoding in ('utf-8', 'iso-8859-1'):
        try:
            with open(str(srt_path), encoding=encoding) as f:
                srt_content = f.read().strip()
            break
        except UnicodeDecodeError:
            continue
    if not srt_content:
        # Raised explicitly (same type and message as before) so the check
        # is not stripped when Python runs with -O.
        raise AssertionError('no detectable encoding')

    with tempfile.NamedTemporaryFile(mode='w', suffix='.srt',
                                     encoding='utf-8',
                                     delete=False) as tmp_srt:
        tmp_srt.write(srt_content)

    # save() without an argument writes next to the source with a .vtt
    # extension; derive that name from the extension only.
    vtt_path = os.path.splitext(tmp_srt.name)[0] + '.vtt'
    try:
        # Convert only after the temp file is closed, so its content is
        # fully flushed to disk.
        webvtt.from_srt(tmp_srt.name).save()
        with open(vtt_path, encoding='utf-8') as vtt:
            return vtt.read()
    finally:
        for leftover in (tmp_srt.name, vtt_path):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass
def search_subs(film, lang):
    """Download the first available subtitle for ``film`` and store SRT + VTT.

    Queries subtitle-api.org for SUBRIP subtitles of ``film.imdb_id`` in
    ``lang``. The first listed item that downloads with HTTP 200 is saved to
    the film's ``ru_sub_srt`` field when ``lang == "ru"`` and to ``en_sub_srt``
    for any other language, converted to WebVTT next to it, and the matching
    ``*_sub_vtt`` attribute is set before the film is saved.

    Returns:
        Always ``None``; nothing is stored if the listing request fails or no
        item can be downloaded.
    """
    url = 'https://subtitle-api.org/videos/{}/subtitles'.format(film.imdb_id)
    params = dict(
        lang=lang,
        format="SUBRIP",
    )
    r = requests.get(url=url, params=params)
    if r.status_code != 200:
        return
    resp = json.loads(r.text)

    # Only Russian has dedicated fields; every other language shares the
    # English ones, exactly as the two original branches did.
    suffix = "ru" if lang == "ru" else "en"
    srt_attr = suffix + "_sub_srt"
    vtt_attr = suffix + "_sub_vtt"

    for item in resp['items']:
        url = 'https://subtitle-api.org/videos/{}/subtitles/{}'.format(
            film.imdb_id, item['id'])
        r = requests.get(url=url, params=params)
        if r.status_code != 200:
            continue

        # The context manager closes (and thereby deletes) the temp file
        # once the content has been copied into the model field.
        with NamedTemporaryFile(delete=True) as temp_file:
            temp_file.write(r.content)
            temp_file.flush()
            getattr(film, srt_attr).save(
                film.imdb_id + "_" + suffix + ".srt", File(temp_file),
                save=True)

        webvtt.from_srt(
            settings.MEDIA_ROOT + '/' + str(getattr(film, srt_attr))).save()
        setattr(film, vtt_attr, film.imdb_id + "_" + suffix + ".vtt")
        film.save()
        # The first successful download wins.
        return
def test_convert_from_srt_to_vtt_and_back_gives_same_file(self):
    """Parsing sample.srt and saving it back as SRT reproduces the file."""
    source_path = os.path.join(OUTPUT_DIR, 'sample.srt')
    roundtrip_path = os.path.join(OUTPUT_DIR, 'sample_converted.srt')

    copy(self._get_file('sample.srt'), OUTPUT_DIR)
    webvtt.from_srt(source_path).save_as_srt(roundtrip_path)

    with open(source_path, 'r', encoding='utf-8') as source_file:
        original = source_file.read()
    with open(roundtrip_path, 'r', encoding='utf-8') as roundtrip_file:
        converted = roundtrip_file.read()

    # Surrounding whitespace is not part of the comparison.
    self.assertEqual(original.strip(), converted.strip())
def generate_time_data(yt, framesdir="frames"):
    """Build a time-ordered list of subtitle lines and frame images for ``yt``.

    Every entry is a list: [0] time in seconds, [1] data (frame number or
    subtitle text), [2] type ("image" or "text"), [3] time as a timestamp.

    Side effects: sets ``yt.maxseconds`` (from ``yt.meta["duration"]``),
    ``yt.maxframes`` (frame count reported by the ``mediainfo`` command for
    ``yt.videofile``) and ``yt.time_data`` (the returned list).
    """
    yt.maxseconds = int(yt.meta["duration"])
    frame_count_output = os.popen(
        'mediainfo --Output="Video;%FrameCount%" ' + yt.videofile).read()
    yt.maxframes = int(frame_count_output.replace("\n", ""))

    time_data = []
    if yt.subs == 1:
        subtitle_kind = yt.subfile[-3:]
        if subtitle_kind == "srt":
            subvtt = webvtt.from_srt(yt.subfile)
        if subtitle_kind == "vtt":
            subvtt = webvtt.read(yt.subfile)
        # Text entries are keyed by the cue's END time in seconds but keep
        # the cue's START timestamp string.
        for cue in subvtt:
            end_seconds = sum(
                x * float(t)
                for x, t in zip([3600, 60, 1], cue.end.split(":")))
            time_data.append([end_seconds, cue.text, "text", cue.start])

    # Split multi-line cues into single non-blank lines.
    text_lines = []
    for entry in time_data:
        if entry[2] != "text":
            continue
        for line in entry[1].split("\n"):
            if line.replace(" ", "") != "":
                text_lines.append([entry[0], line, entry[3]])

    # Drop a line when it repeats the line immediately before it.
    deduplicated = []
    previous_line = []
    for seconds, line, stamp in text_lines:
        if line != previous_line:
            deduplicated.append([seconds, line, "text", stamp])
            previous_line = line
    time_data = deduplicated

    # Frame files are named "<frame number>.<ext>"; the frame's position in
    # the video is scaled to seconds and rendered as HH:MM:SS.
    for frame_name in os.listdir(framesdir):
        framenr = int(frame_name[:frame_name.find(".")])
        framesec = (framenr / yt.maxframes) * yt.maxseconds
        hours = int(framesec / 3600)
        minutes = int((framesec / 60) - hours * 60)
        seconds = int((framesec - (hours * 3600)) - (minutes * 60))
        tstmp = ":".join(
            str(part).zfill(2) for part in (hours, minutes, seconds))
        time_data.append([framesec, framenr, "image", tstmp])

    time_data.sort(key=lambda tup: tup[0])
    yt.time_data = time_data
    return time_data
def test_srt_conversion(self):
    """Saving a parsed SRT writes a .vtt file with the expected lines."""
    os.makedirs(OUTPUT_DIR)
    copy(self._get_file('one_caption.srt'), OUTPUT_DIR)

    webvtt.from_srt(os.path.join(OUTPUT_DIR, 'one_caption.srt')).save()

    vtt_path = os.path.join(OUTPUT_DIR, 'one_caption.vtt')
    self.assertTrue(os.path.exists(vtt_path))

    with open(vtt_path, 'r', encoding='utf-8') as vtt_file:
        lines = [line.rstrip() for line in vtt_file]

    self.assertListEqual(lines, [
        'WEBVTT',
        '',
        '00:00:00.500 --> 00:00:07.000',
        'Caption text #1',
    ])
def test_srt_conversion(self):
    """Convert one_caption.srt in OUTPUT_DIR and verify the written VTT."""
    srt_copy = os.path.join(OUTPUT_DIR, 'one_caption.srt')
    vtt_output = os.path.join(OUTPUT_DIR, 'one_caption.vtt')
    expected_lines = [
        'WEBVTT',
        '',
        '00:00:00.500 --> 00:00:07.000',
        'Caption text #1',
    ]

    os.makedirs(OUTPUT_DIR)
    copy(self._get_file('one_caption.srt'), OUTPUT_DIR)
    parsed = webvtt.from_srt(srt_copy)
    parsed.save()

    self.assertTrue(os.path.exists(vtt_output))
    with open(vtt_output, 'r', encoding='utf-8') as handle:
        # Trailing whitespace (including the newline) is ignored.
        lines = [raw.rstrip() for raw in handle.readlines()]
    self.assertListEqual(lines, expected_lines)
def fix_subtitle_sequencing(filename):
    """Remove overlaps between consecutive subtitles and rewrite as SRT.

    Nothing is done when ``filename + ".bk"`` already exists. ``.srt`` and
    ``.sbv`` inputs are parsed with webvtt. With ``--fix-live`` in
    ``sys.argv`` every cue is first shifted 8 seconds earlier (clamped at
    zero) and given a fixed 4 second duration. Any cue that ends after the
    next one starts is then cut to end at the next cue's start. Unless
    ``--dry`` is in ``sys.argv``, the original is copied to the ``.bk``
    backup and the result is written to ``<stem>.srt``.
    """
    if os.path.isfile(filename + ".bk"):
        print("Not overwriting original backup for {}, skipping.".format(
            filename))
        return

    stem, extension = os.path.splitext(filename)
    subs = None
    if extension == ".srt":
        subs = webvtt.from_srt(filename)
    elif extension == ".sbv":
        subs = webvtt.from_sbv(filename)

    # Adjust timing and stretch subtitles for fixing the live ones which
    # get messed up by Youtube
    if "--fix-live" in sys.argv:
        zero = timedelta(hours=0, minutes=0, seconds=0, milliseconds=0)
        for index in range(len(subs)):
            shifted = parse_time_stamp(subs[index].start) - timedelta(seconds=8)
            if shifted < zero:
                shifted = zero
            subs[index].start = format_time_stamp(shifted)
            subs[index].end = format_time_stamp(shifted + timedelta(seconds=4))

    # Trim each cue that runs into its successor.
    for index in range(len(subs) - 1):
        following_start = subs[index + 1].start
        if parse_time_stamp(subs[index].end) > parse_time_stamp(following_start):
            subs[index].end = following_start

    if "--dry" not in sys.argv:
        shutil.copy(filename, filename + ".bk")
        out_srt = stem + ".srt"
        with open(out_srt, "w", encoding="utf8") as f:
            subs.write(f, format="srt")
def audio_crop(video_id):
    """Cut a video's audio track into one MP3 clip per subtitle cue.

    Steps, all under ``settings.MEDIA_ROOT + str(video.id)``:

    1. Save a WebVTT copy of the video's SRT subtitle file.
    2. Write ``/subtitle//tse.txt`` with one ``index, start, end`` line per
       cue (times as ``H:M:S.mmm``).
    3. Create ``/audio`` and ``/audio/crop`` and extract the full audio track
       to ``/audio/audio.mp3`` with ffmpeg.
    4. For each cue, cut ``/audio/crop/op_<index>.mp3`` from the full track.

    Commands are run through the shell with ``os.system``; their exit status
    is not checked. Path arguments are quoted with ``shlex.quote``, which
    assumes a POSIX shell (the ``mkdir``/forward-slash usage suggests one —
    confirm for the deployment target).
    """
    import shlex

    def _clock(moment):
        # Milliseconds must be zero-padded to three digits: 5 ms has to
        # render as ".005", not ".5" (which ffmpeg reads as half a second).
        return "{}:{}:{}.{:03d}".format(
            moment.hours, moment.minutes, moment.seconds,
            int(moment.milliseconds))

    video = VideoUpload.objects.get(pk=video_id)
    subtitle_path = str(video.subfile.path)
    webvtt.from_srt(subtitle_path).save()

    media_dir = settings.MEDIA_ROOT + str(video.id)
    segments = [
        (number, _clock(cue.start), _clock(cue.end))
        for number, cue in enumerate(pysrt.open(subtitle_path), start=1)
    ]

    time_start_end = media_dir + '/subtitle/' + '/tse.txt'
    with open(time_start_end, "w") as listing:
        for number, start, end in segments:
            listing.write("{}, {}, {}\n".format(number, start, end))

    ip_video = settings.MEDIA_ROOT + str(video.videofile)
    audio_dir = media_dir + '/audio'
    crop_dir = audio_dir + '/crop'
    os.system('mkdir {}'.format(shlex.quote(audio_dir)))
    os.system('mkdir {}'.format(shlex.quote(crop_dir)))

    ip_audio = audio_dir + "/audio.mp3"
    os.system('ffmpeg -hide_banner -i {} -vn {}'.format(
        shlex.quote(ip_video), shlex.quote(ip_audio)))
    time.sleep(3)

    for number, start, end in segments:
        op_audio = crop_dir + "/op_{}.mp3".format(number)
        os.system(
            'ffmpeg -hide_banner -loglevel panic -i {} -ss {} -to {} {}'.format(
                shlex.quote(ip_audio), start, end, shlex.quote(op_audio)))
        time.sleep(0.2)
def srt_text(self):
    """Return all caption text of ``self.filename`` as one string.

    Each caption's text is passed through ``filter_typos`` and the results
    are joined with single spaces.
    """
    parsed = webvtt.from_srt(self.filename)
    cleaned = [filter_typos(caption.text) for caption in parsed.captions]
    return " ".join(cleaned)
def test_srt_empty_caption_text(self):
    """An SRT containing a caption without text still yields captions."""
    parsed = webvtt.from_srt(self._get_file('missing_caption_text.srt'))
    self.assertTrue(parsed.captions)
def test_sbv_parse_captions(self):
    """sample.srt parses into five captions.

    Despite the ``sbv`` in its name, this test exercises ``webvtt.from_srt``
    on the SRT sample.
    """
    parsed = webvtt.from_srt(self._get_file('sample.srt'))
    self.assertEqual(len(parsed.captions), 5)
def test_srt_empty_gets_removed(self):
    """Parsing missing_caption_text.srt leaves exactly four captions."""
    parsed = webvtt.from_srt(self._get_file('missing_caption_text.srt'))
    self.assertEqual(len(parsed.captions), 4)
def test_srt_timestamps_format(self):
    """The third caption of sample.srt exposes dot-separated timestamps."""
    third = webvtt.from_srt(self._get_file('sample.srt')).captions[2]
    self.assertEqual(third.start, '00:00:11.890')
    self.assertEqual(third.end, '00:00:16.320')
def test_srt_parse_captions(self):
    """Parsing sample.srt produces a non-empty caption list."""
    parsed = webvtt.from_srt(self._get_file('sample.srt'))
    self.assertTrue(parsed.captions)
def test_srt_empty_caption_text(self):
    """Captions are still returned when one SRT cue has no text."""
    sample_path = self._get_file('missing_caption_text.srt')
    captions = webvtt.from_srt(sample_path).captions
    self.assertTrue(captions)
def pull_youtube_caption(yt_url, media_guid):
    """Download every caption track of a YouTube video as SRT and WebVTT.

    Args:
        yt_url: video URL; an ``/embed/`` URL is rewritten to ``/watch?v=``.
        media_guid: identifier of the media record the captions belong to.

    Returns:
        True when captions were already present or the caption loop ran to
        completion (tracks that fail individually are reported and skipped);
        False when no database record exists for ``media_guid`` or the
        YouTube object could not be created.
    """
    # Nothing to download when caption files already exist.
    if is_media_captions_present(media_guid):
        print("VTT file present.")
        time.sleep(5)
        return True

    # Pull the db info
    media_file = db(db.media_files.media_guid == media_guid).select().first()
    if media_file is None:
        print("ERROR - Unable to find a db record for " + str(media_guid))
        return False

    # The returned folders are not needed here; the call is kept in case it
    # has side effects.
    get_app_folders()

    target_file = get_media_file_path(media_guid, "srt")

    from pytube import YouTube
    try:
        yt = YouTube(yt_url.replace("/embed/", "/watch?v="),
                     proxies=get_youtube_proxies())
    except HTTPError as ex:
        # NOTE: 429 (try again later) is currently reported like any other
        # HTTP error rather than being passed up the stack.
        print("HTTP ERROR: " + str(ex))
        # Slight pause - let scheduler grab output
        time.sleep(5)
        return False
    except Exception as ex:
        print("Bad YT URL? " + yt_url + " -- " + str(ex))
        return False

    for cap in yt.captions:
        lang = cap.code
        output_caption_file = target_file.replace(".srt", "_" + lang + ".srt")
        try:
            print("Saving " + lang + " to " + output_caption_file)
            caption_srt = cap.generate_srt_captions()

            # Save SRT file
            with open(output_caption_file, "wb") as srt_file:
                srt_file.write(caption_srt.encode('utf-8'))

            # Convert to webvtt format. Only the last "srt" (the extension)
            # is swapped, so an "srt" elsewhere in the path is left intact.
            vtt = webvtt.from_srt(output_caption_file)
            output_caption_file = "vtt".join(
                output_caption_file.rsplit("srt", 1))
            vtt.save(output_caption_file)
            print("Saved " + lang + " to " + output_caption_file)
        except Exception as ex:
            print("Error - unable to grab caption for lang: " + yt_url +
                  " / " + lang + "\n\n" + str(ex))
            continue

    # Slight pause - let scheduler grab output
    time.sleep(5)
    return True
def test_srt_total_length(self):
    """sample.srt reports a total length of 23."""
    parsed = webvtt.from_srt(self._get_file('sample.srt'))
    self.assertEqual(parsed.total_length, 23)
def convert_files_to_vtt(self):
    """Convert every ``.srt`` file below ``self.srt_path`` to WebVTT.

    Each output is written to ``self.vtt_path`` followed by the source file
    name with its extension replaced by ``.vtt``; ``self.vtt_path`` is used
    as a plain prefix, so it is expected to end with a path separator. Files
    whose extension is not ``.srt`` (compared case-insensitively) are
    skipped.
    """
    for path, directories, files in os.walk(self.srt_path):
        for name in files:
            stem, extension = os.path.splitext(name)
            if extension.lower() != '.srt':
                continue
            # Only the extension is swapped; "srt" appearing elsewhere in
            # the file name is left alone.
            webvtt.from_srt(f"{path}/{name}").save(
                f"{self.vtt_path}{stem}.vtt"
            )
def test_srt_parse_captions(self):
    """The SRT sample yields at least one caption."""
    sample_path = self._get_file('sample.srt')
    captions = webvtt.from_srt(sample_path).captions
    self.assertTrue(captions)
def test_srt_empty_gets_removed(self):
    """Four captions remain after parsing missing_caption_text.srt."""
    sample_path = self._get_file('missing_caption_text.srt')
    remaining = len(webvtt.from_srt(sample_path).captions)
    self.assertEqual(remaining, 4)
import os

import webvtt

# Dump the caption text of every subtitle file below a user-supplied folder
# into "<base name>.txt" files in the current working directory.
path = input('Filepath where your files are located: ')
files = [os.path.join(root, name)
         for root, subdirs, names in os.walk(path)
         for name in names]
captionfiles = ['.srt', '.webvtt', '.vtt']

for f in files:
    extension = os.path.splitext(f)[-1]
    if extension.lower() in captionfiles:
        try:
            # Compare case-insensitively, matching the filter above, so that
            # ".SRT" files are parsed as SRT rather than as WebVTT.
            if extension.lower() == '.srt':
                caption = webvtt.from_srt(f)
            else:
                caption = webvtt.read(f)
            txtcontent = [cap.text for cap in caption]
            # Swap only the extension; str.replace would also rewrite the
            # same text appearing earlier in the file name.
            outputtxt = os.path.splitext(os.path.basename(f))[0] + '.txt'
            with open(outputtxt, 'w', encoding='utf-8') as outputfile:
                outputfile.write('\n'.join(txtcontent))
        except Exception as e:
            # Report the failure and carry on with the remaining files.
            print("{} for {}".format(e, f))
def test_srt_timestamps_format(self):
    """Start and end of caption index 2 in sample.srt use HH:MM:SS.mmm."""
    captions = webvtt.from_srt(self._get_file('sample.srt')).captions
    start, end = captions[2].start, captions[2].end
    self.assertEqual(start, '00:00:11.890')
    self.assertEqual(end, '00:00:16.320')
def test_sbv_parse_captions(self):
    """Five captions are parsed from sample.srt.

    Note: the test name mentions SBV, but the call under test is
    ``webvtt.from_srt`` with the SRT sample.
    """
    sample_path = self._get_file('sample.srt')
    caption_count = len(webvtt.from_srt(sample_path).captions)
    self.assertEqual(caption_count, 5)
path = os.path.join(zipDir, dirs.pop()) if os.path.isdir(path): for anotherPath in os.listdir(path): dirs.append(anotherPath) elif os.path.isfile(path): files.append(path) # get all srt file srts = [] for file in files: if file[-3:] == 'srt': srts.append(file) # convert try: for srt in srts: webvtt.from_srt(srt).save() except Exception: for srt in srts: f = open(srt, 'r', encoding='utf-16') sub = "WEBVTT\n\n" for line in f: temp = line[:-1] if not temp.isdigit(): sub += line.replace(',', '.') f.close() vtt = srt.replace('.srt', '.vtt') f = open(vtt, 'w') f.write(sub) f.close() except Exception as e:
#!/usr/bin/env python3
import webvtt

# Convert one SRT transcript to WebVTT; save() is called without a target
# path. The result gets its own name so the imported ``webvtt`` module is
# not shadowed.
captions = webvtt.from_srt('/Users/hepting/Downloads/01_CS-428+828-201930_otter_ai.srt')
captions.save()
def test_srt_total_length(self):
    """The total length reported for sample.srt is 23."""
    sample_path = self._get_file('sample.srt')
    total = webvtt.from_srt(sample_path).total_length
    self.assertEqual(total, 23)