def is_valid(self):
    """Check if a subtitle text is a valid SubRip format.

    The text is parsed strictly with ``pysrt`` (``ERROR_RAISE``). A
    ``pysrt.Error`` whose first argument is greater than 80 is still
    treated as valid.

    :return: `True` if the subtitle is considered valid, `False` otherwise
    :rtype: bool
    """
    try:
        pysrt.from_string(self.text, error_handling=pysrt.ERROR_RAISE)
    except pysrt.Error as e:
        # NOTE(review): e.args[0] is presumably the position (line index) at
        # which pysrt gave up, so an error late in the file is tolerated --
        # confirm against the pysrt version in use.
        return e.args[0] > 80
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit; only ordinary errors are logged and reported as invalid.
        logger.exception('Unexpected error when validating subtitle')
        return False
    return True
def is_valid(self):
    """Check if a :attr:`text` is a valid SubRip format.

    A positive result is cached in ``self._is_valid``. When strict SubRip
    parsing fails, the text is parsed with pysubs2 instead and
    ``self.content`` is overwritten with the converted subtitle.

    :return: whether or not the subtitle is valid.
    :rtype: bool
    """
    if self._is_valid:
        return True

    text = self.text
    if not text:
        return False

    # valid srt
    try:
        pysrt.from_string(text, error_handling=pysrt.ERROR_RAISE)
    except Exception:
        logger.error("PySRT-parsing failed, trying pysubs2")
    else:
        self._is_valid = True
        return True

    # something else, try to return srt
    try:
        logger.debug("Trying parsing with PySubs2")
        try:
            # in case of microdvd, try parsing the fps from the subtitle
            subs = pysubs2.SSAFile.from_string(text)
            if subs.format == "microdvd":
                logger.info("Got FPS from MicroDVD subtitle: %s", subs.fps)
            else:
                logger.info("Got format: %s", subs.format)
        except pysubs2.UnknownFPSError:
            # if parsing failed, use frame rate from provider
            sub_fps = self.get_fps()
            if not isinstance(sub_fps, float) or sub_fps < 10.0:
                # or use our media file's fps as a fallback
                sub_fps = self.plex_media_fps
                logger.info(
                    "No FPS info in subtitle. Using our own media FPS for the MicroDVD subtitle: %s",
                    self.plex_media_fps)
            subs = pysubs2.SSAFile.from_string(text, fps=sub_fps)

        unicontent = self.pysubs2_to_unicode(subs)
        self.content = unicontent.encode(self.get_encoding())
    except Exception:
        # Was a bare ``except:``; KeyboardInterrupt/SystemExit now propagate
        # instead of being reported as an invalid subtitle.
        logger.exception("Couldn't convert subtitle %s to .srt format: %s", self, traceback.format_exc())
        return False

    self._is_valid = True
    return True
def is_valid_subtitle(subtitle_text):
    """Tell whether ``subtitle_text`` parses as SubRip under strict checking.

    :return: `True` if pysrt parses the text without raising
        ``pysrt.Error``, `False` otherwise
    :rtype: bool
    """
    try:
        pysrt.from_string(subtitle_text, error_handling=pysrt.ERROR_RAISE)
    except pysrt.Error:
        return False
    else:
        return True
def __init__(self, movie_sub_number):
    """Initialise the instance: read the matching .srt file and parse it.

    The file name is ``CONFIG.subtitles_path`` + the number + ``".srt"``.
    The bytes are decoded as UTF-8 first; on any failure the same bytes
    are decoded as latin-1 and parsed again.
    """
    self.movie_sub_number = str(movie_sub_number)
    self.filename = CONFIG.subtitles_path + self.movie_sub_number + ".srt"
    self.all_frames = None

    # Read the raw bytes of the srt file
    with open(self.filename, "rb") as srt_file:
        raw_bytes = srt_file.read()

    try:
        parsed = pysrt.from_string(raw_bytes.decode("utf-8"))
    except Exception:
        parsed = pysrt.from_string(raw_bytes.decode("latin-1"))
    self.raw_sub = parsed
def test_windows1252(self):
    """Parse a windows-1252 fixture from a string and check length and EOL.

    Also checks that opening the UTF-8 fixture as ASCII raises
    ``UnicodeDecodeError``.
    """
    # Close the fixture deterministically instead of leaking the handle.
    with codecs.open(self.windows_path, encoding='windows-1252') as fixture:
        srt_string = fixture.read()
    srt_file = pysrt.from_string(srt_string, encoding='windows-1252',
                                 eol='\r\n')
    # assertEquals is a deprecated alias that was removed in Python 3.12.
    self.assertEqual(len(srt_file), 1332)
    self.assertEqual(srt_file.eol, '\r\n')
    self.assertRaises(UnicodeDecodeError, pysrt.open,
                      self.utf8_path, encoding='ascii')
def srt_to_html(srt_string):
    """Take an srt formatted string and return marked up html.

    Strips all timestamps; this is to simply render it neatly. Each
    subtitle item becomes one ``<p>`` element, joined by newlines.

    :param srt_string: SubRip content, either UTF-8 encoded bytes or
        already-decoded text.
    :return: the generated HTML fragment.
    """
    # Previously ``.decode`` was called unconditionally, which fails for
    # already-decoded text; bytes input behaves exactly as before.
    if isinstance(srt_string, bytes):
        srt_string = srt_string.decode('utf-8')
    sequences = pysrt.from_string(srt_string)
    # NOTE(review): item text is inserted without HTML escaping -- confirm
    # the input is trusted or that inline subtitle tags are meant to pass
    # through.
    return "\n".join(f"<p>{x.text}</p>" for x in sequences)
def is_valid_subtitle(subtitle_text):
    """Check if a subtitle text is a valid SubRip format

    The text is parsed strictly with ``pysrt`` (``ERROR_RAISE``). A
    ``pysrt.Error`` whose first argument is greater than 80 is still
    treated as valid.

    :return: `True` if the subtitle is valid, `False` otherwise
    :rtype: bool
    """
    try:
        pysrt.from_string(subtitle_text, error_handling=pysrt.ERROR_RAISE)
    except pysrt.Error as e:
        # NOTE(review): e.args[0] is presumably the position (line index) at
        # which pysrt failed, so late errors are tolerated -- confirm against
        # the pysrt version in use.
        return e.args[0] > 80
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit.
        logger.exception('Unexpected error when validating subtitle')
        return False
    return True
def download_subtitles(movie_title):
    """Download English subtitles for ``movie_title`` and parse them.

    Returns the object produced by ``pysrt.from_string`` for the first
    subtitle that subliminal selected, or an empty list when nothing was
    found.
    """
    # Build a subliminal video object from the bare title.
    video = subliminal.Video.fromname(movie_title)
    print("Downloading subtitles for '{}'...".format(movie_title))

    # Let subliminal pick the best subtitles for this video.
    wanted_languages = {babelfish.Language('eng')}
    best_subtitles = subliminal.download_best_subtitles({video},
                                                        wanted_languages)
    found = best_subtitles[video]
    if not found:
        print("No subtitles found for '{}'...".format(movie_title))
        return []

    # To keep the downloaded file for later use, it can be saved with:
    # subliminal.save_subtitles(video, [best_subtitles[video][0]])

    # Only the first subtitle is used; its text is handed to pysrt.
    subtitles_string = ''.join(item.text for item in found[:1])
    return pysrt.from_string(subtitles_string)
def is_valid(self):
    """Check if a :attr:`text` is a valid SubRip format.

    A positive result is cached in ``self._is_valid``. When strict SubRip
    parsing fails, the text is parsed with pysubs2 instead and
    ``self.content`` is overwritten with the converted subtitle, encoded
    with ``self._guessed_encoding``.

    :return: whether or not the subtitle is valid.
    :rtype: bool
    """
    if self._is_valid:
        return True

    text = self.text
    if not text:
        return False

    # valid srt
    try:
        pysrt.from_string(text, error_handling=pysrt.ERROR_RAISE)
    except Exception:
        logger.error("PySRT-parsing failed, trying pysubs2")
    else:
        self._is_valid = True
        return True

    # something else, try to return srt
    try:
        logger.debug("Trying parsing with PySubs2")
        try:
            # in case of microdvd, try parsing the fps from the subtitle
            subs = pysubs2.SSAFile.from_string(text)
            if subs.format == "microdvd":
                logger.info("Got FPS from MicroDVD subtitle: %s", subs.fps)
            else:
                logger.info("Got format: %s", subs.format)
        except pysubs2.UnknownFPSError:
            # if parsing failed, suggest our media file's fps
            logger.info("No FPS info in subtitle. Using our own media FPS for the MicroDVD subtitle: %s",
                        self.plex_media_fps)
            subs = pysubs2.SSAFile.from_string(text, fps=self.plex_media_fps)

        unicontent = self.pysubs2_to_unicode(subs)
        self.content = unicontent.encode(self._guessed_encoding)
    except Exception:
        # Was a bare ``except:``; KeyboardInterrupt/SystemExit now propagate
        # instead of being reported as an invalid subtitle.
        logger.exception("Couldn't convert subtitle %s to .srt format: %s", self, traceback.format_exc())
        return False

    self._is_valid = True
    return True
def is_valid(self):
    """Report whether :attr:`text` is acceptable SubRip content.

    Empty text is never valid. Otherwise the text is parsed strictly; a
    ``pysrt.Error`` whose first argument is below 80 makes the subtitle
    invalid, any other outcome counts as valid.

    :return: whether or not the subtitle is valid.
    :rtype: bool
    """
    if self.text:
        try:
            pysrt.from_string(self.text, error_handling=pysrt.ERROR_RAISE)
        except pysrt.Error as parse_error:
            return not parse_error.args[0] < 80
        return True
    return False
def save_srt_to_file(self, srt_string):
    """Back up the current file, then overwrite it with ``srt_string``.

    The backup is written next to the original as
    ``<basename>-backup.<extension>``. The new content is parsed with
    pysrt, its indexes are cleaned, and it is saved as UTF-8.
    """
    # Keep a temporary copy of the file before it is overwritten.
    backup = "".join([self._basename, "-backup.", self._extension])
    shutil.copyfile(self.filename, backup)
    print("Created backup at {}".format(backup))

    parsed = pysrt.from_string(srt_string)
    parsed.clean_indexes()
    parsed.save(self.filename, encoding='utf-8')
def reset_index(sub_unicode):
    """Renumber the items of an SRT document starting from 1.

    Args:
        sub_unicode: text containing SRT subtitles.

    Returns:
        The re-serialised SRT text with consecutive indexes.
    """
    subs = pysrt.from_string(sub_unicode)
    for position, item in enumerate(subs, 1):
        item.index = position

    out = StringIO.StringIO()
    subs.write_into(out)
    renumbered = out.getvalue()
    out.close()
    return renumbered
def get_srts(video_id):
    """Fetch and parse the English subtitles for a YouTube video.

    The page at ``URL`` is queried for the video, the first link inside the
    ``div#show`` element is followed, and the downloaded subtitle text is
    reduced to printable characters before it is parsed with pysrt.

    :param video_id: YouTube video id appended to the watch URL.
    :return: ``{'result_list': [...]}`` with one dict per subtitle item
        (keys ``videoId``, ``startMinutes``, ``endMinutes``,
        ``startSeconds``, ``endSeconds``, ``text``, ``num``). The list is
        empty when anything goes wrong.
    """
    result_list = []
    # Built once; the original rebuilt this set for every character.
    printable = set(string.printable)
    try:
        part_url = urllib.urlencode(
            {'url': 'https://www.youtube.com/watch?v=' + video_id})
        url = URL + '?' + part_url
        content = get_url(url)
        dom = BeautifulSoup(content, 'lxml')
        show = dom.find('div', {'id': 'show'})
        eng_url = show.find_all('b')[0].find_all('a')[0]['href'][2:]
        if not show.contents[2].strip().startswith('English'):
            raise Exception('Correct language not found for video ' +
                            video_id)
        url = URL + eng_url
        content = get_url(url)
        # ''.join keeps the result a string; filter() would hand pysrt an
        # iterator on Python 3.
        content = ''.join(ch for ch in content if ch in printable)
        subs = pysrt.from_string(content)
        for num, s in enumerate(subs):
            result = {
                'videoId': video_id,
                'startMinutes': s.start.minutes,
                'endMinutes': s.end.minutes,
                'startSeconds': s.start.seconds,
                'endSeconds': s.end.seconds,
                'text': s.text_without_tags,
                'num': num
            }
            # Collapse an item whose end minute precedes its start minute
            # to a zero-length item at the start.
            if result['endMinutes'] < result['startMinutes']:
                result['endMinutes'] = result['startMinutes']
                result['endSeconds'] = result['startSeconds']
            result_list.append(result)
    except Exception:
        # Best effort: report the failure and return no results. Was a bare
        # ``except:``, which also swallowed KeyboardInterrupt/SystemExit.
        print('Unable to capture subtitles for video %s' % video_id)
        traceback.print_exc()
        result_list = []
    return {'result_list': result_list}
def test_windows1252(self):
    """Parse a windows-1252 fixture from a string and check length and EOL.

    Also checks that opening the UTF-8 fixture as ASCII raises
    ``UnicodeDecodeError``.
    """
    # Close the fixture deterministically instead of leaking the handle.
    with codecs.open(self.windows_path, encoding='windows-1252') as fixture:
        srt_string = fixture.read()
    srt_file = pysrt.from_string(srt_string, encoding='windows-1252',
                                 eol='\r\n')
    self.assertEqual(len(srt_file), 1332)
    self.assertEqual(srt_file.eol, '\r\n')
    self.assertRaises(UnicodeDecodeError, pysrt.open,
                      self.utf8_path, encoding='ascii')
def generateSub(args, _subtitle, _filename):
    """Translate every non-empty subtitle item and save the result.

    ``_subtitle`` is converted with ``str`` and decoded as UTF-8 before
    parsing. Each item's text is passed through ``cleanhtml`` and
    ``translate`` (called with ``args.LANG_TO`` then ``args.LANG_FROM``).
    The file is written to ``args.OUTPUT + _filename``.
    """
    subs = pysrt.from_string(str(_subtitle).decode('utf-8'))
    output = args.OUTPUT + _filename

    for item in subs:
        if item.text == '':
            continue
        if args.VERBOSE:
            print("Translating line:" + cleanhtml(item.text))
        item.text = translate(
            cleanhtml(item.text).encode('utf-8'),
            args.LANG_TO, args.LANG_FROM)

    subs.save(output)
def remove_ads_and_save(sub_contents, path):
    """Drop subtitle items that contain advertising words and save the rest.

    :param sub_contents: raw subtitle bytes, decoded as ISO-8859-15.
    :param path: destination file; written as UTF-8.

    An item is removed when its lower-cased text contains any entry of
    ``ADS_WORDS``. Remaining items are renumbered consecutively from 1.
    """
    sub_contents = sub_contents.decode('iso-8859-15')
    srt_sub = pysrt.from_string(sub_contents)
    index = 0
    while index < len(srt_sub):
        sub_item = srt_sub[index]
        # Renumber as we go; after a deletion the next item lands on the
        # same position and receives this index on the following pass.
        sub_item.index = index + 1
        lowered = sub_item.text.lower()  # lower-case once, not once per word
        if any(word in lowered for word in ADS_WORDS):
            del srt_sub[index]
        else:
            index += 1
    srt_sub.save(path, encoding='utf-8')
def put_subtitles(video, subt_str):
    """Store every item of an SRT string as a ``models.Subtitle`` row.

    Each row links to ``video`` and carries the item's text plus its start
    and end converted to ``datetime.time`` (milliseconds are scaled to
    microseconds).
    """
    def as_time(stamp):
        # datetime.time takes microseconds as its fourth argument.
        return datetime.time(stamp.hours, stamp.minutes, stamp.seconds,
                             stamp.milliseconds * 1000)

    for entry in pysrt.from_string(subt_str):
        begins = as_time(entry.start)
        ends = as_time(entry.end)
        record = models.Subtitle(video=video, content=entry.text,
                                 start_time=begins, end_time=ends)
        record.save()
def remove_ads_and_save(sub_contents, path):
    """Drop subtitle items that contain advertising words and save the rest.

    :param sub_contents: raw subtitle bytes, decoded as ISO-8859-15.
    :param path: destination file; written as UTF-8.

    An item is removed when its lower-cased text contains any entry of
    ``ADS_WORDS``. Remaining items are renumbered consecutively from 1.
    """
    sub_contents = sub_contents.decode('iso-8859-15')
    srt_sub = pysrt.from_string(sub_contents)
    index = 0
    while index < len(srt_sub):
        sub_item = srt_sub[index]
        # Renumber as we go; after a deletion the next item lands on the
        # same position and receives this index on the following pass.
        sub_item.index = index + 1
        lowered = sub_item.text.lower()  # lower-case once, not once per word
        if any(word in lowered for word in ADS_WORDS):
            del srt_sub[index]
        else:
            index += 1
    srt_sub.save(path, encoding='utf-8')
def load_transcript(video):
    """Load and time-shift the subtitle file belonging to ``video``.

    :param video: object exposing ``srt_extension``, ``item_name()`` and
        ``path``.
    :return: the parsed subtitles shifted by -5 seconds, or ``None`` when
        the video has no subtitle extension or the file cannot be read,
        decoded or parsed (in which case ``video.path`` is printed).
    """
    if video.srt_extension == '':
        return None

    path = '/app/data/subs/orig/{}.{}.srt'.format(video.item_name(),
                                                  video.srt_extension)

    # TODO(wcrichto): small subset of documents are failing with utf8 decode errors
    try:
        # ``with`` closes the handle even when decoding or parsing fails.
        with open(path, 'rb') as srt_file:
            subs = pysrt.from_string(srt_file.read().decode('utf-8'))
    except Exception:
        print(video.path)
        return None

    # In practice, seems like subs are usually about 5 seconds late, so this is a hand-tuned shift
    subs.shift(seconds=-5)
    return subs
def srt_to_vtt(s, subs_shift=0):
    """Convert SRT text to WebVTT text.

    :param s: SRT content.
    :param subs_shift: seconds by which all items are shifted first.
    :return: ``WEBVTT`` header followed by one entry per item, separated
        by blank lines. Entries are numbered from 0 and timestamps use a
        ``.`` before the milliseconds.
    """
    subs = pysrt.from_string(s)
    subs.shift(seconds=subs_shift)

    def fmt_time(t):
        return '{:02d}:{:02d}:{:02d}.{:03d}'.format(
            t.hours, t.minutes, t.seconds, t.milliseconds)

    chunks = ['WEBVTT']
    for position, sub in enumerate(subs):
        chunks.append('{position}\n{start} --> {end}\n{text}'.format(
            position=position,
            start=fmt_time(sub.start),
            end=fmt_time(sub.end),
            text=sub.text))
    return '\n\n'.join(chunks)
def translate(request):
    """Translate posted SRT subtitles and return them as SSML in JSON.

    For a POST request the first posted value is parsed as SRT, all item
    texts are joined with ``' | '``, stripped of tags and translated in one
    call. The translated pieces are then concatenated with an SSML
    ``<break>`` between consecutive items whose duration is the gap between
    them. Any other method gets a fixed placeholder JSON body.

    :param request: the incoming HTTP request.
    :return: ``HttpResponse`` with content type ``application/json``.
    """
    # Two leftover ``pdb.set_trace()`` calls were removed: they block the
    # request handler waiting on a debugger.
    ret_dict = {}
    if request.method == 'POST':
        final_str = ''
        # First get the srt data
        fs = unicode(request.POST.items()[0][1])
        subs = pysrt.from_string(fs)

        # Get the translated version
        st_to_tl = strip_tags(' | '.join(sub.text for sub in subs))
        st_list = do_translate(st_to_tl)

        for i, sub in enumerate(subs):
            try:
                next_start = subs[i+1].start
            except IndexError as e:
                # Last item: there is no following subtitle to pause before.
                print(e)
                final_str += ' ' + st_list[i] + '.'
                break
            break_duration = to_milliseconds(next_start - sub.end)
            final_str += st_list[i] + '<break time="' + unicode(break_duration) + 'ms"/>'

        ret_dict['success'] = '1'
        # NOTE(review): hard-coded API key returned to the client -- consider
        # moving it to configuration.
        ret_dict['api_key'] = '59e482ac28dd52db23a22aff4ac1d31e'
        ssml = '<?xml version="1.0"?> ' + \
               '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" ' + \
               'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" ' + \
               'xsi:schemaLocation="http://www.w3.org/2001/10/synthesis ' + \
               'http://www.w3.org/TR/speech-synthesis/synthesis.xsd" ' + \
               'xml:lang="en-US">' + final_str + '</speak>'
        ssml = smart_str(ssml)
        ret_dict['ssml'] = ssml
        json_response = json.dumps(ret_dict)
    else:
        json_response = '{"Hello" : "world"}'
    return HttpResponse(json_response, content_type="application/json")
def subsrt(srt, left, right, srt_padding):
    """Write the part of ``srt`` around ``[left, right]`` to a temp file.

    Items starting after ``left - srt_padding`` and ending before
    ``right + srt_padding`` are selected, re-indexed and shifted by
    ``-left`` milliseconds.

    :return: path of the saved UTF-8 ``.srt`` file, or ``None`` when the
        selection is empty.
    """
    window = srt.slice(starts_after=left - srt_padding,
                       ends_before=right + srt_padding)
    if window:
        # NOTE: the slice still references items of the original srt and
        # there seems to be no way to deep copy it, so it is serialised to
        # text and parsed again to get independent items.
        text_buffer = StringIO()
        window.write_into(text_buffer)
        detached = pysrt.from_string(text_buffer.getvalue())

        # Modify only the detached copy.
        detached.clean_indexes()
        detached.shift(milliseconds=-left)

        path = tempfile_path('.srt')
        detached.save(path, encoding='utf-8')
        return path
    return None
def srt_to_vtt(s):
    """Convert SRT text to WebVTT text, shifting all items by -5 seconds.

    :param s: SRT content.
    :return: ``WEBVTT`` header followed by one entry per item, separated
        by blank lines. Entries are numbered from 0 and timestamps use a
        ``.`` before the milliseconds.
    """
    subs = pysrt.from_string(s)
    # Seems like TV news captions are delayed by a few seconds
    subs.shift(seconds=-5)

    def fmt_time(t):
        return u'{:02d}:{:02d}:{:02d}.{:03d}'.format(
            t.hours, t.minutes, t.seconds, t.milliseconds)

    chunks = [u'WEBVTT']
    for position, sub in enumerate(subs):
        chunks.append(u'{position}\n{start} --> {end}\n{text}'.format(
            position=position,
            start=fmt_time(sub.start),
            end=fmt_time(sub.end),
            text=sub.text))
    return u'\n\n'.join(chunks)
def reset_index(sub_unicode):
    '''Renumber SRT subtitle items.

    After the call the item indexes run consecutively starting at 1.

    Args:
        sub_unicode: unicode object containing SRT subtitles

    Returns:
        new_sub_unicode: the re-serialised unicode SRT text.
    '''
    subs = pysrt.from_string(sub_unicode)
    for position, item in enumerate(subs, 1):
        item.index = position

    out = StringIO.StringIO()
    subs.write_into(out)
    new_sub_unicode = out.getvalue()
    out.close()
    return new_sub_unicode
def test_compare_from_string_and_from_path(self):
    """Items parsed from a path and from the same text must stringify equally."""
    # Close the fixture deterministically instead of leaking the handle.
    with codecs.open(self.utf8_path, encoding='utf_8') as fixture:
        unicode_content = fixture.read()
    iterator = zip(pysrt.open(self.utf8_path),
                   pysrt.from_string(unicode_content))
    for file_item, string_item in iterator:
        self.assertEqual(str(file_item), str(string_item))
def test_compare_from_string_and_from_path(self):
    """Items parsed from a path and from the same text must stringify equally."""
    # Close the fixture deterministically instead of leaking the handle.
    with codecs.open(self.utf8_path, encoding='utf_8') as fixture:
        unicode_content = fixture.read()
    iterator = izip(pysrt.open(self.utf8_path),
                    pysrt.from_string(unicode_content))
    for file_item, string_item in iterator:
        # assertEquals is a deprecated alias of assertEqual.
        self.assertEqual(unicode(file_item), unicode(string_item))
def processFolder(foldercursor):
    """Pair the English and Chinese .srt files of one folder and record flags.

    All ``.srt`` files matched by ``foldercursor["address"] + srt_extension``
    are read, re-encoded to UTF-8, stripped of ``<...>``/``{...}`` tags,
    language-detected and parsed. Every English subtitle is then compared
    with the Chinese ones (first those with the same number of items, then
    the remaining ones). Matched lines are printed and status flags are
    written to ``movies_info_collection`` under the folder's ``_id``:
    ``srt_count``, ``bilingual_zh_en``, ``matched``, ``no_en_subtitle`` and
    ``finished``.

    :param foldercursor: mapping with at least ``"address"`` and ``"_id"``.
    :return: ``None``.
    """
    filelist = glob.glob(foldercursor["address"]+srt_extension)
    folder_name=foldercursor["_id"]
    if len(filelist) == 0 :
        movies_info_collection.update_one({'_id':folder_name},{"$set": {'finished':True}})
        movies_info_collection.update_one({'_id':folder_name},{"$set": {'srt_count':0}})
        return None #if no .srt file, skip
    movies_info_collection.update_one({'_id':folder_name},{"$set": {'srt_count':len(filelist)}})

    #=================================================
    #hint: reduce the read and write (io) to disk
    #=================================================
    movies_content_in_a_folder = []
    #create a list to store matched subtitles
    matched_content_in_a_folder = []
    # Tag-stripping pattern, compiled once instead of once per file.
    pattern = re.compile('<[^>]*>|{[^}]*}')
    for filename in filelist :
        try:
            with open(filename) as f :
                content = f.read()
            movie_content = {}
            content_encoding = checkEncoding(content)
            if not content_encoding :
                continue
            movie_content["content"] = content.decode(content_encoding, 'ignore').encode("utf-8")
            #remove html tags
            movie_content["content"]=pattern.sub('',movie_content["content"])
            movie_content["language"] = checkLanguage(movie_content["content"])
            # (disabled) conversion of traditional to simplified Chinese
            movie_content["parsed_content"] = pysrt.from_string(movie_content["content"].decode("utf-8", 'ignore'))
            movie_content["total_lines"] = len(movie_content["parsed_content"])
            movie_content["filename"] = filename
            movies_content_in_a_folder.append(movie_content)
        except Exception as e:
            # Best effort: a file that cannot be processed is reported and skipped.
            print(e)

    #check if one srt file folder's srt file is zh and bilingual
    if len(movies_content_in_a_folder) == 1 :
        if movies_content_in_a_folder[0]['language'] == 'zh':
            if (checkBilingualzhSubtitles(movies_content_in_a_folder[0]['parsed_content'])):
                movies_info_collection.update_one({'_id':folder_name},{"$set": {'bilingual_zh_en':True}})

    # Subtitles are identified by their number of items ("total_lines"):
    # files with the same count are treated as the same subtitle.
    #TODO: change to set
    finished_en_subtitles = set()
    finished_zh_subtitles = set()

    #=================================================
    #start the iteration for every file in the folder
    #=================================================
    #check eng subtitles counts
    en_subtitle_counts = 0
    #walk through all english subtitles
    for en_movie_content in movies_content_in_a_folder:
        if en_movie_content["language"]!="en":
            continue
        else:
            en_subtitle_counts += 1
        if en_movie_content["total_lines"] in finished_en_subtitles:
            continue

        #compare begins
        failed_zh_attempts=set() #record the failed zh srt file(s) for a en srt file
        # 1st, check the zh srt which has the same number of lines with this eng subtitle
        for zh_movie_content in movies_content_in_a_folder:
            if zh_movie_content["language"]!="zh" :
                continue
            if zh_movie_content["total_lines"] != en_movie_content["total_lines"] :
                continue
            #this line is optional
            if zh_movie_content["total_lines"] in failed_zh_attempts :
                continue
            result = compare_subtitles(en_movie_content['parsed_content'], zh_movie_content['parsed_content'])
            if result :
                #if this is a bilingual subtitle, delete eng content in the zh srt file
                if (checkBilingualzhSubtitles(zh_movie_content['parsed_content'])):
                    for line in result:
                        line[3] = line[3].replace(line[2],"")
                    movies_info_collection.update_one({'_id':folder_name},{"$set": {'bilingual_zh_en':True}})
                matched_content_in_a_folder = matched_content_in_a_folder + result
                finished_en_subtitles.add(en_movie_content["total_lines"])
                finished_zh_subtitles.add(zh_movie_content["total_lines"])
                # Key was misspelled 'matchedd' here; the second pass below
                # writes 'matched', so both now set the same field.
                movies_info_collection.update_one({'_id':folder_name},{"$set": {'matched':True}})
                break
            else :
                failed_zh_attempts.add(zh_movie_content["total_lines"])

        # if this eng sub has been successfully processed, skip to next eng subtitle
        if en_movie_content["total_lines"] in finished_en_subtitles :
            continue

        # if this eng sub has not been paired, check zh srt with lines other than the same lines
        for zh_movie_content in movies_content_in_a_folder:
            if zh_movie_content["language"]!="zh":
                continue
            #saves time, but optional
            if zh_movie_content["total_lines"] in finished_zh_subtitles:
                continue
            if zh_movie_content["total_lines"] in failed_zh_attempts:
                continue
            result = compare_subtitles(en_movie_content['parsed_content'], zh_movie_content['parsed_content'])
            if result :
                #if this is a bilingual subtitle, delete eng content in the zh srt file
                if (checkBilingualzhSubtitles(zh_movie_content['parsed_content'])):
                    for line in result:
                        line[3] = line[3].replace(line[2],"")
                    movies_info_collection.update_one({'_id':folder_name},{"$set": {'bilingual_zh_en':True}})
                matched_content_in_a_folder = matched_content_in_a_folder + result
                finished_en_subtitles.add(en_movie_content["total_lines"])
                finished_zh_subtitles.add(zh_movie_content["total_lines"])
                movies_info_collection.update_one({'_id':folder_name},{"$set": {'matched':True}})
                break
            else :
                failed_zh_attempts.add(zh_movie_content["total_lines"])

    #if there is no eng subtitles
    if en_subtitle_counts == 0:
        movies_info_collection.update_one({'_id':folder_name},{"$set": {'no_en_subtitle':True}})
        #check if there is bilingual zh-en subtitle
        for zh_movie_content in movies_content_in_a_folder:
            if zh_movie_content["language"]!="zh":
                continue
            if (checkBilingualzhSubtitles(zh_movie_content['parsed_content'])):
                movies_info_collection.update_one({'_id':folder_name},{"$set": {'bilingual_zh_en':True}})
                break

    #TODO: output
    if matched_content_in_a_folder:
        for line in matched_content_in_a_folder:
            print("%s%f%f%s%s" % (folder_name.encode("utf8"), srttime2totaltime(line[0]), srttime2totaltime(line[1]), line[2], line[3]))

    #mark this folder as processed
    movies_info_collection.update_one({'_id':folder_name},{"$set": {'finished':True}})
def read_dialogues(self):
    """Return the text of every subtitle item, one newline after each.

    ``self.contents`` is parsed with pysrt and the result is converted
    with ``unicode``.
    """
    quotes = pysrt.from_string(self.contents)
    dialogue = ''.join(quote.text + '\n' for quote in quotes)
    return unicode(dialogue)
# File name endings (last four characters, lower-cased) that get converted.
SUPPORTED_EXTENSIONS = [".xml", ".vtt"]

if __name__ == "__main__":
    # Convert every supported file in the input directory to "<name>.srt"
    # in the output directory.
    directory = "."
    help_text = u"path to the {} directory (defaults to current directory)"

    parser = argparse.ArgumentParser()
    for short_flag, long_flag, label in (("-i", "--input", "input"),
                                         ("-o", "--output", "output")):
        parser.add_argument(short_flag, long_flag, type=str,
                            default=directory,
                            help=help_text.format(label, directory))
    a = parser.parse_args()

    filenames = []
    for candidate in os.listdir(a.input):
        if candidate[-4:].lower() in SUPPORTED_EXTENSIONS:
            filenames.append(candidate)

    for fn in tqdm(filenames):
        source_path = "{}/{}".format(a.input, fn)
        with codecs.open(source_path, 'rb', "utf-8") as f:
            text = f.read()
        # Convert to SRT text first, then drop HTML markup.
        text = strip_html(to_srt(text, fn[-4:]))
        subs = stack_subs(pysrt.from_string(text))
        subs.save("{}/{}.srt".format(a.output, fn), encoding='utf-8')
def test_utf8(self):
    """Parse the UTF-8 fixture from a string and check its length.

    Also checks that the windows-1252 fixture cannot be read as UTF-8.
    """
    # Close the fixture deterministically instead of leaking the handle.
    with codecs.open(self.utf8_path, encoding='utf_8') as fixture:
        unicode_content = fixture.read()
    self.assertEqual(len(pysrt.from_string(unicode_content)), 1332)
    # The encoding is explicit so the outcome no longer depends on the
    # platform's default text encoding.
    with open(self.windows_path, encoding='utf_8') as windows_file:
        self.assertRaises(UnicodeDecodeError, windows_file.read)
def test_utf8(self):
    """Parse the UTF-8 fixture from a string and check its length.

    Also checks that handing the raw content of the windows-1252 fixture
    to ``pysrt.from_string`` raises ``UnicodeDecodeError``.
    """
    # Close both fixtures deterministically instead of leaking the handles.
    with codecs.open(self.utf8_path, encoding='utf_8') as fixture:
        unicode_content = fixture.read()
    # assertEquals is a deprecated alias of assertEqual.
    self.assertEqual(len(pysrt.from_string(unicode_content)), 1332)
    with open(self.windows_path) as windows_file:
        windows_content = windows_file.read()
    self.assertRaises(UnicodeDecodeError, pysrt.from_string,
                      windows_content)
def test_windows1252(self):
    """Parse a windows-1252 fixture from a string and check length and EOL.

    Also checks that opening the UTF-8 fixture as ASCII raises
    ``UnicodeDecodeError``.
    """
    # Close the fixture deterministically instead of leaking the handle.
    with codecs.open(self.windows_path, encoding="windows-1252") as fixture:
        srt_string = fixture.read()
    srt_file = pysrt.from_string(srt_string, encoding="windows-1252",
                                 eol="\r\n")
    self.assertEqual(len(srt_file), 1332)
    self.assertEqual(srt_file.eol, "\r\n")
    self.assertRaises(UnicodeDecodeError, pysrt.open,
                      self.utf8_path, encoding="ascii")