def get_processors(self):
    """Assemble the OCR-fix processors backed by ``self.data_dict``.

    Returns an empty list when ``self.data_dict`` is falsy.  Otherwise returns
    a list holding two regex processors followed by the dictionary-driven
    replacement processors, in the order they are appended below.
    """
    data = self.data_dict
    if not data:
        return []

    # broken HI tag colons (ANNOUNCER'., ". instead of :) after at least
    # 3 uppercase chars; text inside quotes is left alone
    hi_colon_re = re.compile(
        r'(?u)(^[^"\'’ʼ❜‘‛”“‟„]*(?<=[A-ZÀ-Ž]{3})[A-ZÀ-Ž-_\s0-9]+)'
        r'(["\'’ʼ❜‘‛”“‟„]*[.,‚،⹁、;]+)(\s*)(?!["\'’ʼ❜‘‛”“‟„])')

    # F'bla -> Fbla
    f_quote_re = re.compile(r'(?u)(\bF)(\')([A-zÀ-ž]*\b)')

    processors = [
        NReProcessor(hi_colon_re, r"\1:\3", name="OCR_fix_HI_colons",
                     supported=lambda p: not p.only_uppercase),
        NReProcessor(f_quote_re, r"\1\3", name="OCR_fix_F"),
        WholeLineProcessor(data["WholeLines"], name="OCR_replace_line"),
    ]

    # (data_dict key, processor name) pairs that all use MultipleWordReProcessor
    word_level = (
        ("WholeWords", "OCR_replace_word"),
        ("BeginLines", "OCR_replace_beginline"),
        ("EndLines", "OCR_replace_endline"),
        ("PartialLines", "OCR_replace_partialline"),
    )
    for key, processor_name in word_level:
        processors.append(MultipleWordReProcessor(data[key], name=processor_name))

    processors.append(
        MultipleLineProcessor(data["PartialWordsAlways"], name="OCR_replace_partialwordsalways"))
    return processors
class ReverseRTL(SubtitleModification):
    """Modification that physically swaps leading/trailing punctuation per line.

    Declared for the languages 'heb', 'ara' and 'fas' (see ``languages``).
    """
    identifier = "reverse_rtl"
    description = "Reverse punctuation in RTL languages"
    exclusive = True
    order = 50
    # one Language object per language code this modification is declared for
    languages = [Language(l) for l in ('heb', 'ara', 'fas')]

    long_description = "Some playback devices don't properly handle right-to-left markers for punctuation. " \
                       "Physically swap punctuation. Applicable to languages: hebrew, arabic, farsi, persian"

    processors = [
        # new? (?u)(^([\s.!?]*)(.+?)(\s*)(-?\s*)$); \5\4\3\2
        #NReProcessor(re.compile(r"(?u)((?=(?<=\b|^)|(?<=\s))([.!?-]+)([^.!?-]+)(?=\b|$|\s))"), r"\3\2",
        #             name="CM_RTL_reverse")

        # Groups: 2 = leading run of whitespace/punctuation, 3 = the text (lazy),
        # 4 = trailing whitespace, 5 = optional trailing dash plus whitespace.
        # The replacement emits them as 5, 4, 3, 2: the leading punctuation ends up
        # at the end of the line and a trailing dash ends up at the start.
        NReProcessor(re.compile(r"(?u)(^([\s.!?:,'-]*)(.+?)(\s*)(-?\s*)$)"), r"\5\4\3\2", name="CM_RTL_reverse")
    ]
class CommonFixes(SubtitleTextModification):
    """Bundle of basic whitespace/punctuation clean-ups for subtitle text.

    ``processors`` is an ordered list; presumably the entries run in list
    order, so later patterns see the output of earlier ones -- confirm against
    the code that consumes ``processors``.
    """
    identifier = "common"
    description = "Basic common fixes"
    exclusive = True
    order = 40

    long_description = """\
    Fix common and whitespace/punctuation issues in subtitles
    """

    processors = [
        # -- = ...
        StringProcessor("-- ", '... ', name="CM_doubledash"),

        # '' = "
        StringProcessor("''", '"', name="CM_double_apostrophe"),

        # remove leading ... (and any whitespace directly after it)
        NReProcessor(re.compile(r'(?u)^\.\.\.[\s]*'), "", name="CM_leading_ellipsis"),

        # remove "downloaded from" tags (case-insensitive; needs text on both sides of the phrase)
        NReProcessor(re.compile(r'(?ui).+downloaded\s+from.+'), "", name="CM_crap"),

        # no space after ellipsis (not at end of line, not before whitespace/punctuation/quotes)
        NReProcessor(re.compile(r'(?u)\.\.\.(?![\s.,!?\'"])(?!$)'), "... ", name="CM_ellipsis_no_space"),

        # multiple spaces: any run of 2+ whitespace chars becomes one space
        NReProcessor(re.compile(r'(?u)[\s]{2,}'), " ", name="CM_multiple_spaces"),

        # no space after starting dash (a second dash directly after it is left alone)
        NReProcessor(re.compile(r'(?u)^-(?![\s-])'), "- ", name="CM_dash_space"),

        # remove starting spaced dots (not matching ellipses)
        NReProcessor(re.compile(r'(?u)^(?!\s?(\.\s\.\s\.)|(\s?\.{3}))[\s.]*'), "", name="CM_starting_spacedots"),

        # space missing before doublequote
        # ReProcessor(re.compile(r'(?u)(?<!^)(?<![\s(\["])("[^"]+")'), r' \1', name="CM_space_before_dblquote"),

        # space missing after doublequote
        # ReProcessor(re.compile(r'(?u)("[^"\s][^"]+")([^\s.,!?)\]]+)'), r"\1 \2", name="CM_space_after_dblquote"),

        # space before ending doublequote?

        # remove >>
        NReProcessor(re.compile(r'(?u)^\s?>>\s*'), "", name="CM_leading_crocodiles"),

        # replace uppercase I with lowercase L in words: every "I" in the run following
        # at least two letters is replaced by an "l"
        # NOTE(review): the range A-z also covers [ \ ] ^ _ ` -- confirm this is intended
        NReProcessor(re.compile(ur'(?u)([A-zÀ-ž][a-zà-ž]+)(I+)'),
                     lambda match: ur'%s%s' % (match.group(1), "l" * len(match.group(2))),
                     name="CM_uppercase_i_in_word"),

        # fix spaces in numbers (allows for punctuation: ,.:' (comma/dot only fixed if after space, those may be
        # countdowns otherwise); don't break up ellipses
        # the callback removes every plain space character from the matched span
        NReProcessor(re.compile(
            r'(?u)([0-9]+[0-9:\']*(?<!\.\.)\s+(?!\.\.)[0-9,.:\']*(?=[0-9]+)[0-9,.:\'\s]+)(?=\s|$)'
        ), lambda match: match.group(1).replace(" ", ""), name="CM_spaces_in_numbers"),

        # uppercase after dot: group 1 (text up to and including ". ") is kept, group 2 is upper-cased
        NReProcessor(re.compile(ur'(?u)((?:[^.\s])+\.\s+)([a-zà-ž])'),
                     lambda match: ur'%s%s' % (match.group(1), match.group(2).upper()),
                     name="CM_uppercase_after_dot"),

        # remove spaces before punctuation (only a single punctuation char, not a run of them)
        NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=\w)) +([!?.,](?![!?.,]))'), r"\1", name="CM_punctuation_space"),
    ]

    # same post-processing list object as defined at module level
    post_processors = empty_line_post_processors
class CommonFixes(SubtitleTextModification): identifier = "common" description = "Basic common fixes" exclusive = True order = 40 long_description = "Fix common and whitespace/punctuation issues in subtitles" processors = [ # normalize hyphens NReProcessor(re.compile(ur'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"), # -- = em dash NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), ur"\1—", name="CM_multidash"), # line = _/-/\s NReProcessor(re.compile(r'(?u)(^\W*[-_.:<>~"\']+\W*$)'), "", name="CM_non_word_only"), # remove >> NReProcessor(re.compile(r'(?u)^\s?>>\s*'), "", name="CM_leading_crocodiles"), # line = : text NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"), # fix music symbols NReProcessor(re.compile(ur'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'), lambda x: u"♪ " if x.group(1) else u" ♪", name="CM_music_symbols"), # '' = " NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"), # double quotes instead of single quotes inside words NReProcessor(re.compile(ur'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), ur"\1'\2", name="CM_double_as_single"), # normalize quotes NReProcessor(re.compile(ur'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'), lambda match: '"' + (" " if match.group(2).endswith(" ") else ""), name="CM_normalize_quotes"), # normalize single quotes NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"), # remove leading ... NReProcessor(re.compile(r'(?u)^\.\.\.[\s]*'), "", name="CM_leading_ellipsis"), # remove "downloaded from" tags NReProcessor(re.compile(r'(?ui).+downloaded\s+from.+'), "", name="CM_crap"), # no space after ellipsis NReProcessor(re.compile(r'(?u)\.\.\.(?![\s.,!?\'"])(?!$)'), "... ", name="CM_ellipsis_no_space"), # no space before spaced ellipsis NReProcessor(re.compile(r'(?u)(?<=[^\s])(?<!\s)\. \. \.'), " . . 
.", name="CM_ellipsis_no_space2"), # multiple spaces NReProcessor(re.compile(r'(?u)[\s]{2,}'), " ", name="CM_multiple_spaces"), # more than 3 dots NReProcessor(re.compile(r'(?u)\.{3,}'), "...", name="CM_dots"), # no space after starting dash NReProcessor(re.compile(r'(?u)^-(?![\s-])'), "- ", name="CM_dash_space"), # remove starting spaced dots (not matching ellipses) NReProcessor( re.compile(r'(?u)^(?!\s?(\.\s\.\s\.)|(\s?\.{3}))(?=\.+\s+)[\s.]*'), "", name="CM_starting_spacedots"), # space missing before doublequote # ReProcessor(re.compile(r'(?u)(?<!^)(?<![\s(\["])("[^"]+")'), r' \1', name="CM_space_before_dblquote"), # space missing after doublequote # ReProcessor(re.compile(r'(?u)("[^"\s][^"]+")([^\s.,!?)\]]+)'), r"\1 \2", name="CM_space_after_dblquote"), # space before ending doublequote? # replace uppercase I with lowercase L in words NReProcessor(re.compile(ur'(?u)([a-zà-ž]+)(I+)'), lambda match: ur'%s%s' % (match.group(1), "l" * len(match.group(2))), name="CM_uppercase_i_in_word"), # fix spaces in numbers (allows for punctuation: ,.:' (comma/dot only fixed if after space, those may be # countdowns otherwise); don't break up ellipses NReProcessor(re.compile( r'(?u)(\b[0-9]+[0-9:\']*(?<!\.\.)\s+(?!\.\.)[0-9,.:\'\s]*(?=[0-9]+)[0-9,.:\'])' ), lambda match: match.group(1).replace(" ", "") if match.group(1).count(" ") == 1 else match.group(1), name="CM_spaces_in_numbers"), # uppercase after dot NReProcessor(re.compile( ur'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'), lambda match: ur'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"), # remove double interpunction NReProcessor(re.compile(ur'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'), lambda match: match.group(1).strip() + (" " if match.group(2).endswith(" ") else ""), name="CM_double_interpunct"), # remove spaces before punctuation; don't break spaced ellipses NReProcessor( re.compile(r'(?u)(?:(?<=^)|(?<=\w)) +([!?.,](?![!?.,]| \.))'), r"\1", name="CM_punctuation_space"), 
# add space after punctuation NReProcessor(re.compile(r'(?u)(([^\s]*)([!?.,:])([A-zÀ-ž]{2,}))'), lambda match: u"%s%s %s" % (match.group(2), match.group(3), match.group(4)) if not get_tld(match.group(1), fail_silently=True, fix_protocol=True) else match.group(1), name="CM_punctuation_space2"), # fix lowercase I in english NReProcessor(re.compile(r'(?u)(\b)i(\b)'), r"\1I\2", name="CM_EN_lowercase_i", supported=lambda p: p.language == ENGLISH), ] post_processors = empty_line_post_processors
def get_signature(cls, **kwargs): string_args = ",".join( ["%s=%s" % (key, value) for key, value in kwargs.iteritems()]) return "%s(%s)" % (cls.identifier, string_args) @classmethod def merge_args(cls, args1, args2): raise NotImplementedError class SubtitleTextModification(SubtitleModification): pass TAG = ur"(?:\s*{\\[iusb][0-1]}\s*)*" EMPTY_TAG_PROCESSOR = ReProcessor(re.compile(r'({\\\w1})[\s.,-_!?]*({\\\w0})'), "", name="empty_tag") empty_line_post_processors = [ # empty tag EMPTY_TAG_PROCESSOR, # empty line (needed?) NReProcessor(re.compile(r'^[\s-]+$'), "", name="empty_line"), ] class EmptyEntryError(Exception): pass
class CommonFixes(SubtitleTextModification):
    """Bundle of basic whitespace/punctuation clean-ups for subtitle text.

    ``processors`` is an ordered list; presumably the entries run in list
    order, so later patterns see the output of earlier ones -- confirm against
    the code that consumes ``processors``.
    """
    identifier = "common"
    description = "Basic common fixes"
    exclusive = True
    order = 40

    long_description = """\
    Fix common and whitespace/punctuation issues in subtitles
    """

    processors = [
        # -- = em dash
        NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), ur"\1—", name="CM_multidash"),

        # line = _/-/\s
        NReProcessor(re.compile(r'(?u)(^\W*[-_.]+\W*$)'), "", name="CM_non_word_only"),

        # multi space
        # NOTE(review): same replacement as CM_multiple_spaces further down
        NReProcessor(re.compile(r'(?u)(\s{2,})'), " ", name="CM_multi_space"),

        # fix music symbols: a line made up only of * # ¶ and whitespace becomes a single ♪
        NReProcessor(re.compile(ur'(?u)(^[*#¶\s]*[*#¶]+[*#¶\s]*$)'), u"♪", name="CM_music_symbols"),

        # '' = "
        StringProcessor("''", '"', name="CM_double_apostrophe"),

        # remove leading ...
        NReProcessor(re.compile(r'(?u)^\.\.\.[\s]*'), "", name="CM_leading_ellipsis"),

        # remove "downloaded from" tags
        NReProcessor(re.compile(r'(?ui).+downloaded\s+from.+'), "", name="CM_crap"),

        # no space after ellipsis
        NReProcessor(re.compile(r'(?u)\.\.\.(?![\s.,!?\'"])(?!$)'), "... ", name="CM_ellipsis_no_space"),

        # no space before spaced ellipsis
        NReProcessor(re.compile(r'(?u)(?<=[^\s])(?<!\s)\. \. \.'), " . . .", name="CM_ellipsis_no_space2"),

        # multiple spaces
        NReProcessor(re.compile(r'(?u)[\s]{2,}'), " ", name="CM_multiple_spaces"),

        # more than 3 dots
        NReProcessor(re.compile(r'(?u)\.{3,}'), "...", name="CM_dots"),

        # no space after starting dash
        NReProcessor(re.compile(r'(?u)^-(?![\s-])'), "- ", name="CM_dash_space"),

        # remove starting spaced dots (not matching ellipses)
        NReProcessor(re.compile(r'(?u)^(?!\s?(\.\s\.\s\.)|(\s?\.{3}))(?=\.+\s+)[\s.]*'), "",
                     name="CM_starting_spacedots"),

        # space missing before doublequote
        # ReProcessor(re.compile(r'(?u)(?<!^)(?<![\s(\["])("[^"]+")'), r' \1', name="CM_space_before_dblquote"),

        # space missing after doublequote
        # ReProcessor(re.compile(r'(?u)("[^"\s][^"]+")([^\s.,!?)\]]+)'), r"\1 \2", name="CM_space_after_dblquote"),

        # space before ending doublequote?

        # remove >>
        NReProcessor(re.compile(r'(?u)^\s?>>\s*'), "", name="CM_leading_crocodiles"),

        # replace uppercase I with lowercase L in words
        NReProcessor(re.compile(ur'(?u)([a-zà-ž]+)(I+)'),
                     lambda match: ur'%s%s' % (match.group(1), "l" * len(match.group(2))),
                     name="CM_uppercase_i_in_word"),

        # fix spaces in numbers (allows for punctuation: ,.:' (comma/dot only fixed if after space, those may be
        # countdowns otherwise); don't break up ellipses
        # only spans containing exactly one space are joined; anything else is returned unchanged
        NReProcessor(
            re.compile(r'(?u)(\b[0-9]+[0-9:\']*(?<!\.\.)\s+(?!\.\.)[0-9,.:\'\s]*(?=[0-9]+)[0-9,.:\'])'),
            lambda match: match.group(1).replace(" ", "") if match.group(1).count(" ") == 1 else match.group(1),
            name="CM_spaces_in_numbers"),

        # uppercase after dot
        NReProcessor(re.compile(ur'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'),
                     lambda match: ur'%s%s' % (match.group(1), match.group(2).upper()),
                     name="CM_uppercase_after_dot"),

        # remove double interpunction: keep the first mark, drop the rest of the run;
        # a trailing space is kept when the run ended with one
        NReProcessor(re.compile(ur'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'),
                     lambda match: match.group(1).strip() + (" " if match.group(2).endswith(" ") else ""),
                     name="CM_double_interpunct"),

        # remove spaces before punctuation; don't break spaced ellipses
        NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=\w)) +([!?.,](?![!?.,]| \.))'), r"\1",
                     name="CM_punctuation_space"),
    ]

    # same post-processing list object as defined at module level
    post_processors = empty_line_post_processors
class HearingImpaired(SubtitleTextModification):
    """Modification that strips hearing-impaired annotations from subtitle text.

    Three processor lists are declared: ``processors``, ``post_processors`` and
    ``last_processors``.  Presumably they run in that order and each list in
    its own list order -- confirm against the code that consumes them.
    Patterns containing ``%(t)s`` have the module-level ``TAG`` fragment
    substituted in before compilation.
    """
    identifier = "remove_HI"
    description = "Remove Hearing Impaired tags"
    exclusive = True
    order = 20

    long_description = "Removes tags, text and characters from subtitles that are meant for hearing impaired people"

    processors = [
        # full bracket entry, single or multiline; starting with brackets and ending with brackets
        FullBracketEntryProcessor(re.compile(r'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}),
                                  "", name="HI_brackets_full"),

        # uppercase text before colon (at least 3 uppercase chars); at start or after a sentence,
        # possibly with a dash in front; ignore anything ending with a quote
        NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s\->~]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])'
                                r'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "",
                     name="HI_before_colon_caps"),

        # any text before colon (at least 3 chars); at start or after a sentence,
        # possibly with a dash in front; try not breaking actual sentences with a colon at the end by not matching if
        # a space is inside the text; ignore anything ending with a quote
        # Callback: the match is returned unchanged when the label (group 2) contains a space or the
        # whole prefix (group 1) contains a dash; otherwise it is replaced by "" -- or by a single
        # space when the prefix started with one.
        NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s\->~]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])'
                                r'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9]|//)'),
                     lambda match:
                     match.group(1) if (match.group(2).count(" ") > 0 or match.group(1).count("-") > 0)
                     else "" if not match.group(1).startswith(" ") else " ",
                     name="HI_before_colon_noncaps"),

        # brackets (only remove if at least 3 chars in brackets)
        NReProcessor(re.compile(r'(?sux)-?%(t)s["\']*[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]]["\']*[\s:]*%(t)s' %
                                {"t": TAG}), "", name="HI_brackets"),

        #NReProcessor(re.compile(r'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
        #             "", name="HI_bracket_open_start"),

        #NReProcessor(re.compile(r'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
        #             name="HI_bracket_open_end"),

        # text before colon (and possible dash in front), max 11 chars after the first whitespace (if any)
        # NReProcessor(re.compile(r'(?u)(^[A-z\-\'"_]+[\w\s]{0,11}:[^0-9{2}][\s]*)'), "", name="HI_before_colon"),

        # starting text before colon (at least 3 chars)
        #NReProcessor(re.compile(r'(?u)(\b|^)([\s-]*(?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s*)'), "",
        #             name="HI_before_colon"),

        # text in brackets at start, after optional dash, before colon or at end of line
        # fixme: may be too aggressive
        #NReProcessor(re.compile(r'(?um)(^-?\s?[([][A-zÀ-ž-_\s]{3,}[)\]](?:(?=$)|:\s*))'), "",
        #             name="HI_brackets_special"),

        # all caps line (at least 4 consecutive uppercase chars)
        NReProcessor(re.compile(r'(?u)(^(?=.*[A-ZÀ-Ž&+]{4,})[A-ZÀ-Ž-_\s&+]+$)'), "", name="HI_all_caps",
                     supported=lambda p: not p.only_uppercase),

        # remove MAN: / WOMAN: (case-insensitive)
        # The "WO" prefix is optional; without the "?" only "WOMAN:" matched, which
        # contradicted both the processor name and this comment.
        NReProcessor(re.compile(r'(?suxi)(\b(?:WO)?MAN:\s*)'), "", name="HI_remove_man"),

        # dash in front
        # NReProcessor(re.compile(r'(?u)^\s*-\s*'), "", name="HI_starting_dash"),

        # all caps at start before new sentence: only the sentence part (group 1) is kept
        NReProcessor(re.compile(r'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1",
                     name="HI_starting_upper_then_sentence", supported=lambda p: not p.only_uppercase),
    ]

    # same post-processing list object as defined at module level
    post_processors = empty_line_post_processors

    last_processors = [
        # remove music symbols: lines made up only of music symbols, whitespace and tags
        NReProcessor(re.compile(r'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
                     "", name="HI_music_symbols_only"),

        # remove music entries: text that starts or ends with a music symbol
        NReProcessor(re.compile(r'(?ums)(^[-\s>~]*[*#¶♫♪]+\s*.+|.+\s*[*#¶♫♪]+\s*$)'), "", name="HI_music", entry=True),
    ]