class SubtitleConverter:

    def __init__(self, input, output, rawOutput=None, language=None,
                 meta=None, encoding=None, alwaysSplit=False):
        """Creates a new converter for a given input and output (as file
        objects). A second file object for the raw output can also be
        provided.

        Args:
            input(file object or list of file objects): raw subtitle files
            output(file object): XML subtitle file for the tokenised output
            rawOutput(file object): XML subtitle file for the untokenised output
            language(Language object): language for the subtitle
            meta(dict): meta-data to append to the end of the XML file(s)
            encoding(str): file encoding to use to read the raw subtitle files
            alwaysSplit(bool): whether to always split subtitle blocks as new
                sentences (default is false).

        """
        self.lang = language
        self.alwaysSplit = alwaysSplit
        if self.lang and self.lang.alwaysSplit:
            self.alwaysSplit = True
        self.inputs = input if isinstance(input, list) else [input]
        self.encodings = [encoding] if encoding else []
        self.encodings += (self.lang.encodings if self.lang else [])
        if not self.lang or self.lang.codes[0] in difficult_langs:
            detected = detectEncoding(self.inputs[0], self.encodings)
            self.encodings = [detected] + self.encodings
        self.output = output
        self.rawOutput = rawOutput
        self.meta = meta

    def doConversion(self):
        """Performs the conversion process, reading the full subtitle file
        and writing the converted content into the output file.

        """
        self.curLine = None         # Current line in the raw file
        self.curBlock = None        # Current block
        self.curLineIndex = 0       # Current line index in the raw file
        self.timeOffset = 0         # Time offset (for multi-CD subtitles)
        self.sid = 0                # Current sentence identifier
        self.nbTokens = 0           # Total number of words
        self.nbIgnoredBlocks = 0    # Number of ignored subtitle blocks
        self.sentence = Sentence()  # Tokens in the current sentence
        self.text = ""              # Collection of all subtitle lines

        # Starting the tokeniser and spellchecker
        self.tokeniser = Tokeniser(self.lang)
        self.spellchecker = SpellChecker(self.lang)

        self._startDocument()

        # Looping on the subtitle blocks
        block = self._readBlock()
        while block:
            # Ignoring spurious subtitle blocks
            if block.isSpurious():
                self.nbIgnoredBlocks += 1
                block = self._readBlock()
                continue
            self._writeBlock(block)
            block = self._readBlock()

        self._flushDocument()
        self.tokeniser.close()

    def _startDocument(self):
        """Writes the header of the XML subtitle file.

        """
        id = self.meta["id"] if self.meta and "id" in self.meta else ""
        if not id and self.inputs and hasattr(self.inputs[0], "name"):
            id = os.path.basename(self.inputs[0].name).split(".")[0]
        id = id.encode("utf-8")

        self.output.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
        self.output.write(b'<document id="' + id + b'">\n')
        if self.rawOutput:
            self.rawOutput.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
            self.rawOutput.write(b'<document id="' + id + b'">\n')

    def _readBlock(self, recursive=0):
        """Reads one subtitle block and returns it.

        """
        block = SubtitleBlock()
        block.previous = self.curBlock
        block.offset = self.timeOffset

        # Reads the very first line
        if not self.curLine:
            self._readline()
        elif recursive > 20:
            raise RuntimeError("Wrong encoding format for subtitle")

        # Continues until a non-empty line is found
        while self.curLine and not self.curLine.strip():
            self._readline()

        # If we arrive at the end of the file object, checks whether any
        # other file should be read (in case of multi-CD subtitles). If yes,
        # opens the new file and continues. Else, returns None.
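        # Illustration (hypothetical figures): if CD1 ends around 00:42:10
        # and the first block of CD2 restarts at 00:00:01, the CD2 timestamps
        # are shifted forward by the end time of CD1 so that the merged
        # subtitle keeps a monotonically increasing timeline.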
        if not self.curLine:
            self.inputs.pop(0)
            self.curLineIndex = 0
            if self.inputs:
                nextBlock = self._readBlock()
                lasttime = tosecs(block.previous.end) if block.previous else 0
                # Shifting the start and end times after the first CD
                if (nextBlock and nextBlock.start
                        and lasttime > tosecs(nextBlock.start)):
                    nextBlock.start = addsecs(nextBlock.start, lasttime - self.timeOffset)
                    nextBlock.end = addsecs(nextBlock.end, lasttime - self.timeOffset)
                    self.timeOffset = lasttime
                return nextBlock
            else:
                return None

        # Detects the subtitle identifier
        numberMatch = numberRegex.match(self.curLine)
        if numberMatch:
            block.setId(int(numberMatch.group(1)))
            self._readline()
        else:
            block.setId((self.curBlock.id + 1) if self.curBlock else 1)

        # Ignores empty lines
        while self.curLine and not self.curLine.strip():
            self._readline()

        # Detects the start and end time
        timingMatch = timingRegex.match(self.curLine)
        if not timingMatch:
            sys.stderr.write("Cannot parse timing (line number: %i): %s"
                             % (self.curLineIndex, self.curLine))
            self._readline()
            self.nbIgnoredBlocks += 1
            return self._readBlock(recursive + 1)
        block.setTiming(timingMatch.group(1), timingMatch.group(2))

        # Reads the subtitle content until we arrive at the next subtitle ID
        # or the end of the file (NB: simply stopping at an empty line does
        # not always work, since some files strangely contain empty lines
        # within subtitle blocks).
        self._readline()
        while self.curLine.strip():
            block.addLine(self.curLine)
            self._readline()
        while self.curLine and not numberRegex.match(self.curLine):
            block.addLine(self.curLine)
            self._readline()

        self.curBlock = block
        return block

    def _readline(self):
        """Reads the next line in the file, decodes it according to the
        current encoding, and returns it. If a decoding error is detected,
        tries to change the encoding if an alternative is possible.

        """
        if self.inputs:
            binaryLine = self.inputs[0].readline()
            self.curLine = None
            while self.curLine == None and self.encodings:
                encoding = self.encodings[0]
                try:
                    self.curLine = binaryLine.decode(encoding)
                except UnicodeDecodeError:
                    # If we get a decoding error, removes the encoding from
                    # the list of possible encodings, and retries.
                    self.encodings.remove(encoding)
            if self.curLine == None:
                raise RuntimeError("Decoding error (encoding: %s, line: %i)"
                                   % (encoding, self.curLineIndex))
            elif self.curLineIndex == 0:
                self.curLine = self.curLine.lstrip("\ufeff")
            self.curLineIndex += 1

    def _writeBlock(self, block):
        """Processes the block content by doing sentence segmentation and
        tokenisation, and writes the results into the XML file.

        """
        # First check whether the block is a continuation of the previous
        # sentence. If not, "flush" the current sentence to start a new one.
        if not self._isContinuation(block):
            self._flushSentence()

        self.sentence.addStamp("T%sS" % block.id, block.start)

        # Loops on each line of the subtitle block
        for linenum in range(0, len(block.lines)):
            self.sentence.addRawChar(' ' if self.sentence.raw else '')
            self._recordLine(block, linenum)

        self.sentence.addStamp("T%sE" % block.id, block.end)

    def _recordLine(self, block, linenum):
        """Records the subtitle line, checking for the occurrence of
        end-of-sentence markers along the way, and flushing the current
        sentence in that case.

""" # Doing the actual tokenisation line = block.lines[linenum] tokens = self.tokeniser.tokenise(line) curPos = 0 # Current character position in the line upperline = len([c for c in line if c.isupper() or not c.isalpha()]) > 2*len(line)/3 for i, token in enumerate(tokens): curPos += len(token) # Assume a new sentence if an utterance started with "-" is found if (token=="-" and i < len(tokens)-1 and (tokens[i+1][0].isupper() or (self.lang and self.lang.unicase))): self._flushSentence() # Handle all-uppercase tokens emphasised = block.isEmphasised(linenum, curPos) prev = self.sentence.lastToken if token.isupper() and ((not token.istitle() and self.spellchecker.lm) or upperline): corrected = self.spellchecker.recapitalise(token, prev, upperline) if corrected != token: self.sentence.addToken(token, emphasised|(not upperline), alternative=corrected) else: self.sentence.addToken(token, emphasised) # Usual case else: corrected, prob = self.spellchecker.spellcheck(token, prev) if prev in stopPunctuations2 and corrected.istitle(): self._flushSentence() emphasised = block.isEmphasised(linenum, curPos) if corrected == token: self.sentence.addToken(token, emphasised) elif prob > 0.8: self.sentence.addToken(corrected, emphasised, initial=token) else: self.sentence.addToken(token, emphasised, alternative=corrected) while curPos < len(line) and line[curPos].isspace(): self.sentence.addRawChar(line[curPos]) curPos += 1 # Do not flush the sentence for the last token in the last line if ((linenum==len(block.lines)-1 and i==len(tokens)-1) or (i < len(tokens)-1 and tokens[i+1]=="\"")): continue if token[0] in stopPunctuations1: self._flushSentence() elif (token[0] in stopPunctuations2 and i > 0 and (i==len(tokens)-1 or tokens[i+1][0].isupper() or tokens[i+1][0]=="l" or (self.lang and self.lang.unicase))): self._flushSentence() def _isContinuation(self, block): """Returns true if the block is likely to be a continuation of the current sentence """ if (not self.sentence or not block.lines or not block.previous or not block.previous.lines): return True elif self.alwaysSplit: return False score = 0 #Initial continuation score # Scoring based on the end of the previous block lastline = block.previous.lines[-1].rstrip(")]} ") stopEndings = stopPunctuations1 + stopPunctuations2 + ["\""] if lastline.endswith("..."): score += 2 elif lastline and lastline[-1] in stopEndings: score += -3 # Scoring based on the beginning of the current block newline = block.lines[0].lstrip("'[*# ") if not newline: score += -2 elif lastline.endswith("-") and newline.startswith("-"): score += 2 elif newline[0] in ["-","\"", "¿", "¡", "'"]: score += -2 elif newline.startswith("..."): score += 2 elif newline[0].isupper(): score += -3 elif newline[0].islower(): score += 2 elif newline[0].isnumeric() or (self.lang and self.lang.unicase): score += 1 # Scoring based on time gaps if block.start and block.previous.end: pause = tosecs(block.start) - tosecs(block.previous.end) score += (-1 if pause > PAUSE_THR1 else 0) score += (-1 if pause > PAUSE_THR2 else 0) # Scoring based on sentence lengths score += (-1 if self.sentence.getNbStamps() >3 else 0) score += (-1 if self.sentence.getNbTokens() > WORDS_THR else 0) return True if score > 0 else False def _flushSentence(self): """ Writes the tokens to the XML file (and the untokenised output if that option is activated) and clears the current sentence. 
""" nbTokens = self.sentence.getNbTokens() if not nbTokens: return self.nbTokens += nbTokens self.sid += 1 self._pruneTokens() self._writeTokens() if self.rawOutput: self._writeRaw() # We record the text content for language identification purposes self.text += self.sentence.rawCorrected + "\n" self.sentence = Sentence() def _pruneTokens(self): entities= self.sentence.entities for i in range(1, len(entities)-4): if (entities[i][0]=="w" and (entities[i][1]=="..." or entities[i][1]=="-") and entities[i+1][0]=="time" and entities[i+2][0]=="time" and entities[i+3][0]=="w" and entities[i+3][1]==entities[i][1]): self.sentence.entities = entities[0:i] +entities[i+1:i+3] + entities[i+4:] self.sentence.raw = self.sentence.raw.replace("... ...", " ") self.sentence.raw = self.sentence.raw.replace("- -", " ") break def _writeTokens(self): """ Writes the tokens in self.sentence to the XML file. """ builder = et.TreeBuilder() sattrs = {"id":str(self.sid)} if self.sentence.isEmphasised(): sattrs.update({"emphasis":"true"}) for w in self.sentence.getTokens(): del w[2]["emphasis"] builder.start("s",sattrs) tokid = 0 entities= self.sentence.getEntities() for i, entity in enumerate(entities): if entity[0]=="w": token = entity[1] tokid += 1 builder.data("\n ") wattrs = {"id":"%i.%i"%(self.sid,tokid)} wattrs.update(entity[2]) builder.start("w",wattrs) builder.data(token) builder.end("w") # Write a <time> entity elif entity[0]=="time": builder.data("\n ") builder.start("time",entity[1]) builder.end("time") builder.data("\n ") builder.end("s") tree = et.ElementTree(builder.close()) self.output.write(b" ") tree.write(self.output, encoding='utf-8') self.output.write(b"\n") def _writeRaw(self): """ Writes the raw sentence to the XML file. """ builder = et.TreeBuilder() attrs = {"id":str(self.sid)} builder.start("s",attrs) # Add timing info at the beginning of the sentence entities = self.sentence.getEntities() if entities and entities[0][0] == "time": builder.data("\n ") builder.start("time",entities[0][1]) builder.end("time") builder.data("\n") builder.data(self.sentence.raw) # Add timing info at the end of the sentence if entities and entities[-1][0] == "time": builder.data("\n ") builder.start("time",entities[-1][1]) builder.end("time") builder.data("\n ") builder.end("s") tree = et.ElementTree(builder.close()) self.rawOutput.write(b" ") tree.write(self.rawOutput, encoding='utf-8') self.rawOutput.write(b"\n") def _extractMetadata(self): """ Extracts meta-data on the subtitle and the conversion process, in order to append it to the end of the XML file. 
""" meta = self.meta if self.meta else {} if "id" in meta: del meta["id"] meta["subtitle"] = meta["subtitle"] if "subtitle" in meta else {} meta["conversion"] = {} if self.lang: meta["subtitle"]["language"] = self.lang.name # Performs language identification langProb = self.lang.getProb(self.text) if langProb < 0.1 and not isinstance(self, BilingualConverter): msg = "Subtitle is not encoded in " + self.lang.name msg += " (distrib: " + str(utils.getProbDist(self.text)) + ")" raise RuntimeError(msg) meta["subtitle"]["confidence"] = str(langProb) if self.curBlock: meta["subtitle"]["blocks"] = str(self.curBlock.id) meta["subtitle"]["duration"] = self.curBlock.end meta["conversion"]["sentences"] = str(self.sid) meta["conversion"]["tokens"] = str(self.nbTokens) meta["conversion"]["encoding"] = self.encodings[0] meta["conversion"]["ignored_blocks"] = str(self.nbIgnoredBlocks) sc = self.spellchecker meta["conversion"]["unknown_words"] = str(sc.nbUnknowns) meta["conversion"]["corrected_words"] = str(sc.nbCorrections) meta["conversion"]["truecased_words"] = str(sc.nbTruecased) return meta def _flushDocument(self): """ Adds the final meta-data to the XML file, and closes the XML document. """ self._flushSentence() meta = self._extractMetadata() metaBuilder = et.TreeBuilder() metaBuilder.start("meta") for part in meta: metaBuilder.data("\n ") metaBuilder.start(part) if isinstance(meta[part],dict): for key in meta[part]: metaBuilder.data("\n ") metaBuilder.start(key) metaBuilder.data(meta[part][key]) metaBuilder.end(key) metaBuilder.data("\n ") metaBuilder.end(part) metaBuilder.data("\n ") metaBuilder.end("meta") tree = et.ElementTree(metaBuilder.close()) for fd in [self.output,self.rawOutput]: if fd: fd.write(b" ") tree.write(fd, encoding='utf-8') fd.write(b"\n</document>\n") def closeOutputs(self): if self.output != sys.stdout.buffer: self.output.close() if self.rawOutput: self.rawOutput.close()