def applyRegexes(inputFile, outputFile, regularFile): """Apply the regular expressions contained in 'regularFile'. params: - inputFile : a text file in 'utf-8' encoding - outputFile : the result text file in 'utf-8' encoding - regularFile : the file containing the regular expressions to apply. """ regexFormula = RegularExpressionFormula(rulesFile=regularFile) io = Ioread() fd = io.openFile(inputFile) count, linesList = 0, [] #Read first line l = fd.readline() while l != "": l = l.rstrip().strip() #Remove punctuation using regular expressions linesList.append(regexFormula.apply(l, FRENCH)) count += 1 if count % 50000 == 0: print "Processed %d values" % count #Read next line l = fd.readline() io.closeFile(fd) strContent = u"\n".join(linesList) io.writeFileContent(outputFile, strContent)
def testAcronyms(self):
    """Acronym sequences must be expanded into dotted lower-case letters."""
    formula = RegularExpressionFormula(
        None, RegexList.removeComments(ACRONYMREGEXLIST))

    cases = [
        (u"ADG SPO PS", u"a. d. g. s. p. o. p. s."),
        (u"ADG SPO PS PDCC", u"a. d. g. s. p. o. p. s. p. d. c. c."),
        (u"A ADG SPO PS PDCCC", u"A a. d. g. s. p. o. p. s. p. d. c. c. c."),
        (u"ABCDs ABCs ABs", u"a. b. c. d. s. a. b. c. s. a. b. s."),
    ]

    for source, expected in cases:
        outcome = formula.apply(source, 0, False)
        #Delimiter markers inserted by the formula are stripped before comparing
        outcome = re.sub(ACRONYMDELIMITER, u"", outcome, flags=re.UNICODE)
        self.assertEquals(expected.encode('utf-8'), outcome.encode('utf-8'))
def testApostrophe(self):
    """An apostrophe contraction is split from the following word."""
    formula = RegularExpressionFormula(
        None, RegexList.removeComments(APOSTHROPHELIST))
    self.verifyEqual([(u"d'avant", u"d' avant")], formula, 1)
def setSubstitutionList(self, regexList):
    """Set the user regexes substitution list.

    param regexList: a four columns list of lists:
           [u'matching pattern', u'substitution', u'type', u'language id']
    """
    #Replace any previously configured formula
    self.substitutionRegexFormula = RegularExpressionFormula(None)

    #Keep only the four expected columns of each rule
    patterns = [tuple(row[:4]) for row in regexList]
    self.substitutionRegexFormula.setSubstitutionPatternList(patterns)
def __init__(self, inputFile, outputDir):
    """Default constructor.

    param inputFile : path of the document to prepare (may be None when
                      text is supplied later via 'setFormattedText')
    param outputDir : directory for the prepared output; also used as the
                      initial temporary directory
    """
    self.inputFile = inputFile
    self.outputDir = outputDir
    #Temporary files go to the output directory unless overridden
    self.tempDir = outputDir
    self.formattedText = None
    self.debug = False
    #Optional csv file with user validation/substitution regexes
    self.regexFile = None
    #Processing flags, all disabled by default
    self.lmModeling = False
    self.filterSentences = False
    self.removePunctuation = False
    self.verbalizePunctuation = False
    #Set once a document has been prepared
    self.doc = None
    self.wordClassifier = None
    #Empty formula until user regexes are loaded
    self.substitutionRegexFormula = RegularExpressionFormula(None)
    self.validationPatternList = []
def testRegexTypes(self): TYPEREGEXLIST = [(ur"ADG", ur"a. d. g.", ur"6", ur"0", ur"")] TESTLIST = [(u"ADG", u"a. d. g."), (u"ADG/LA", u"ADG/LA"), (u"a ADG b", u"a a. d. g. b"), (u"l ADG ", u"l a. d. g. "), (u"l'ADG'", u"l'a. d. g.'"), (u"\"ADG\"", u"\"a. d. g.\""), (u"\"ADG", u"\"a. d. g."), (u"e-ADG-", u"e-a. d. g.-"), (u"l'ADG,", u"l'a. d. g.,"), (u"l'ADG.", u"l'a. d. g.."), (u"l'ADG?", u"l'a. d. g.?"), (u"l'ADG!", u"l'a. d. g.!"), (u"l'ADG;", u"l'a. d. g.;"), (u"l'ADG:", u"l'a. d. g.:")] f = RegularExpressionFormula(None, RegexList.removeComments(TYPEREGEXLIST)) for t, gt in TESTLIST: r = f.apply(t, 0) self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))
def testAcronyms(self):
    """Upper-case acronym groups are split into dotted lower-case letters."""
    f = RegularExpressionFormula(
        None, RegexList.removeComments(ACRONYMREGEXLIST))

    for text, groundTruth in [
            (u"ADG SPO PS", u"a. d. g. s. p. o. p. s."),
            (u"ADG SPO PS PDCC", u"a. d. g. s. p. o. p. s. p. d. c. c."),
            (u"A ADG SPO PS PDCCC",
             u"A a. d. g. s. p. o. p. s. p. d. c. c. c."),
            (u"ABCDs ABCs ABs", u"a. b. c. d. s. a. b. c. s. a. b. s.")]:
        #Apply without debug output, then drop the acronym delimiters
        got = re.sub(ACRONYMDELIMITER, u"", f.apply(text, 0, False),
                     flags=re.UNICODE)
        self.assertEquals(groundTruth.encode('utf-8'), got.encode('utf-8'))
def testContractionPrefixes(self):
    """Contraction prefixes (d', j', qu', ...) are separated as expected."""
    f = RegularExpressionFormula(None,
            RegexList.removeComments(CONTRACTIONPREFIXELIST))

    #First pass: each rule's pattern applied to itself should be unchanged.
    #NOTE(review): 'not p.find("gr1")' is only true when the pattern STARTS
    #with "gr1" (find returns 0); the intent was probably to skip patterns
    #that CONTAIN a "gr1" group reference — confirm before changing.
    for p, s, t, i, c in CONTRACTIONPREFIXELIST:
        if not p.find("gr1"):
            resultString = f.apply(p, 1, False)
            self.assertEquals(s.encode('utf-8'), resultString.encode('utf-8'))

    #Second pass: concrete French contraction examples (language id 1)
    testList = [(ur"d une",ur"d' une"),(ur"j' ai",ur"j' ai"),
                (ur"l' y ",ur"l' y "), (ur"m' a",ur"m' a"),
                (ur"n' est",ur"n' est"),(ur"n' a",ur"n' a"),
                (ur"d' y",ur"d' y"),(ur"c' en",ur"c' en"),
                (ur"qu' y",ur"qu' y"), (ur"qu' en",ur"qu' en"),
                (ur"-t-on",ur" -t-on")]

    for p, gt in testList:
        resultString = f.apply(p, 1, False)
        self.assertEquals(gt.encode('utf-8'), resultString.encode('utf-8'))
def testDates(self):
    """Date separators (dot, slash) are replaced by spaces."""
    formula = RegularExpressionFormula(
        None, RegexList.removeComments(DATEREGEXLIST))
    cases = [
        (u"01.01.2015", u"01 01 2015"),
        (u"01/01/2015", u"01 01 2015"),
        (u"01.01.15", u"01 01 15"),
    ]
    self.verifyEqual(cases, formula, 0)
def testContractionPrefixes(self):
    """Contraction prefixes (d', j', qu', ...) are separated as expected."""
    f = RegularExpressionFormula(
        None, RegexList.removeComments(CONTRACTIONPREFIXELIST))

    #First pass: each rule's pattern applied to itself should be unchanged.
    #NOTE(review): 'not p.find("gr1")' is only true when the pattern STARTS
    #with "gr1" (find returns 0); the intent was probably to skip patterns
    #that CONTAIN a "gr1" group reference — confirm before changing.
    for p, s, t, i, c in CONTRACTIONPREFIXELIST:
        if not p.find("gr1"):
            resultString = f.apply(p, 1, False)
            self.assertEquals(s.encode('utf-8'), resultString.encode('utf-8'))

    #Second pass: concrete French contraction examples (language id 1)
    testList = [(ur"d une", ur"d' une"), (ur"j' ai", ur"j' ai"),
                (ur"l' y ", ur"l' y "), (ur"m' a", ur"m' a"),
                (ur"n' est", ur"n' est"), (ur"n' a", ur"n' a"),
                (ur"d' y", ur"d' y"), (ur"c' en", ur"c' en"),
                (ur"qu' y", ur"qu' y"), (ur"qu' en", ur"qu' en"),
                (ur"-t-on", ur" -t-on")]

    for p, gt in testList:
        resultString = f.apply(p, 1, False)
        self.assertEquals(gt.encode('utf-8'), resultString.encode('utf-8'))
def setRegexList(self, regexList):
    """Set both validation and substitution user regexes.

    param regexList: a list of the following form:
           [u'matching pattern', u'substitution', u'type', u'language id']
    """
    #Reset current lists
    self.substitutionRegexFormula = RegularExpressionFormula(None)
    self.validationPatternList = []

    substitutionList = []
    for pattern, substitution, ruleType, langId in \
            ((r[0], r[1], r[2], r[3]) for r in regexList):
        #Validation rules only keep the pattern and the language id
        if int(ruleType) == VALIDATION_TYPE:
            self.validationPatternList.append((pattern, langId))
        else:
            substitutionList.append(
                (pattern, substitution, ruleType, langId))

    self.substitutionRegexFormula.setSubstitutionPatternList(
        substitutionList)
def setSubstitutionList(self, regexList):
    """Set the user regexes substitution list.

    param regexList: a four columns list of lists:
           [u'matching pattern', u'substitution', u'type', u'language id']
    """
    #Start from a clean formula, then register every rule as a 4-tuple
    self.substitutionRegexFormula = RegularExpressionFormula(None)

    rules = []
    for entry in regexList:
        rules.append((entry[0], entry[1], entry[2], entry[3]))

    self.substitutionRegexFormula.setSubstitutionPatternList(rules)
def testRegexTypes(self): TYPEREGEXLIST = [(ur"ADG", ur"a. d. g.",ur"6",ur"0",ur"")] TESTLIST = [(u"ADG",u"a. d. g."), (u"ADG/LA",u"ADG/LA"), (u"a ADG b",u"a a. d. g. b"), (u"l ADG ",u"l a. d. g. "), (u"l'ADG'",u"l'a. d. g.'"), (u"\"ADG\"",u"\"a. d. g.\""), (u"\"ADG",u"\"a. d. g."), (u"e-ADG-",u"e-ADG-"), (u"l'ADG,",u"l'a. d. g.,"), (u"l'ADG.",u"l'a. d. g.."), (u"l'ADG?",u"l'a. d. g.?"), (u"l'ADG!",u"l'a. d. g.!"), (u"l'ADG;",u"l'a. d. g.;"), (u"l'ADG:",u"l'a. d. g.:")] f = RegularExpressionFormula(None, RegexList.removeComments(TYPEREGEXLIST)) for t, gt in TESTLIST: r = f.apply(t, 0) self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))
def setRegexList(self, regexList):
    """Set both validation and substitution user regexes.

    param regexList: a list of the following form:
           [u'matching pattern', u'substitution', u'type', u'language id']
    """
    #Reset current lists before distributing the rules
    self.substitutionRegexFormula = RegularExpressionFormula(None)
    self.validationPatternList = []

    substitutionList = []
    for row in regexList:
        isValidation = int(row[2]) == VALIDATION_TYPE
        if isValidation:
            #Validation rules keep only pattern and language id
            self.validationPatternList.append((row[0], row[3]))
        else:
            substitutionList.append((row[0], row[1], row[2], row[3]))

    self.substitutionRegexFormula.setSubstitutionPatternList(substitutionList)
class DataPreparationAPI():
    """Import sentences from one file, classifying sentences into languages.

    Pipeline driver: loads a document (text or pdf), cleans it, classifies
    or assigns sentence languages, applies user regexes and optional
    filtering / punctuation handling / LM preparation.
    """
    logger = logging.getLogger("Asrt.DataPreparationAPI")

    def __init__(self, inputFile, outputDir):
        """Default constructor.

        param inputFile : document to prepare (may be None when text is
                          supplied via 'setFormattedText')
        param outputDir : output directory, also the default temp directory
        """
        self.inputFile = inputFile
        self.outputDir = outputDir
        self.tempDir = outputDir
        self.formattedText = None
        self.debug = False
        self.regexFile = None
        #Processing flags, all disabled by default
        self.lmModeling = False
        self.filterSentences = False
        self.removePunctuation = False
        self.verbalizePunctuation = False
        #Set by 'prepareDocument'
        self.doc = None
        self.wordClassifier = None
        #Empty until user regexes are loaded
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    #####################
    #Getters and setters
    #
    def setInputFile(self, inputFile):
        self.inputFile = inputFile

    def setOutputDir(self, outputDir):
        self.outputDir = outputDir

    def setTempDir(self, tempDir):
        self.tempDir = tempDir

    def setFormattedText(self, formattedText):
        self.formattedText = formattedText

    def getCleanedText(self):
        """Return the prepared text, or "" when no document was prepared."""
        if self.doc != None:
            return self.doc.getCleanedText()
        return ""

    def getCleanedTextPerLanguage(self):
        """Return the prepared text grouped by language, or "" when empty."""
        if self.doc != None:
            return self.doc.getCleanedTextPerLanguage()
        return ""

    def setDebugMode(self, debug):
        self.debug = debug

    def setRegexFile(self, regexFile):
        self.regexFile = regexFile

    def setRegexList(self, regexList):
        """Set both validation and substitution user regexes.

        param regexList: a list of the following form:
               ['matching pattern', 'substitution', 'type', 'language id']
        """
        substitutionList = []
        #Skip header
        for row in regexList:
            #Validation rules keep only pattern and language id
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0],row[3]))
            else:
                substitutionList.append((row[0],row[1],row[2],row[3]))
        self.substitutionRegexFormula.setSubstitutionPatternList(substitutionList)

    def setLMModeling(self, modelNgram):
        self.lmModeling = modelNgram

    def setFilterSentences(self, filterSentences):
        self.filterSentences = filterSentences

    def setRemovePunctuation(self, removePunctuation):
        self.removePunctuation = removePunctuation

    def setVerbalizePunctuation(self, verbalizePunctuation):
        self.verbalizePunctuation = verbalizePunctuation

    def getDocument(self):
        """Get the underlying 'TextDocument'.
        """
        return self.doc

    #####################
    #Public interface
    #
    def trainClassifier(self):
        """Train the underlying classifier (lazy: only on first call).
        """
        if self.wordClassifier == None:
            self.logger.info("Prepare the word classifier ...")
            self.wordClassifier = WordClassifier()
            self.wordClassifier.train()

    def getRegexes(self):
        """Fetch validation and substitution regexes from csv file.
        """
        #User did not specified rules
        if self.regexFile == None:
            return

        #Are regexes already loaded in API
        if self.substitutionRegexFormula.hasPatterns() or \
                len(self.validationPatternList) > 0:
            return

        regexList = RegexList().loadFromFile(self.regexFile)
        self.setRegexList(regexList)

    def prepareDocument(self, language = 0):
        """Segment the document into sentences and prepare them.

        param language: an int between 0-4
            - unknown : 0
            - french  : 1
            - german  : 2
            - english : 3
            - italian : 4
        """
        if language > 4 or language < 0:
            raise Exception("Unknown language")

        #Done at the API level to share resources between
        #documents
        self.logger.info("Getting regexes")
        self.getRegexes()

        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Using following regexes substitution:\n" +\
                str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))

        if len(self.validationPatternList) > 0:
            self.logger.info("Using following regexes for sentence validation:\n" +\
                str(self.validationPatternList[0:3]))

        try:
            self.logger.info("Document file: %s" % self.inputFile)

            #The main document
            self.doc = TextDocument(self.inputFile, language,
                                    self.substitutionRegexFormula,
                                    self.validationPatternList,
                                    self.outputDir)

            #A file path takes precedence over a preset text string
            if self.inputFile != None:
                self.logger.info("Load file, convert to text when pdf document")
                self.doc.loadDocumentAsSentences(self.tempDir)
            elif self.formattedText != None:
                self.logger.info("Load text string as sentences")
                self.doc.loadAsSentences(self.formattedText)
            else:
                raise Exception("No input file or text string provided!")

            #Control character and strip
            self.logger.info("Cleaning control characters")
            self.doc.cleanTextSentences()

            #Language 0 means: let the classifier decide per sentence
            if language == 0:
                self.logger.info("Classifying sentences")
                self.doc.setClassifier(self.wordClassifier)
                self.doc.classifySentences()
            else:
                self.doc.setSentencesLanguage(language)

            #User's supplied regular expression
            if self.substitutionRegexFormula.hasPatterns():
                self.logger.info("Applying user regular expressions per language")
                self.doc.normalizeTextSentences()

            if self.filterSentences:
                self.logger.info("Filtering data")
                self.doc.filterTextSentences()

            #If LM option is selected, it will be done at
            #the prepareLM stage
            if self.removePunctuation and not self.lmModeling:
                self.doc.removeTextPunctuation()

            if self.verbalizePunctuation and not self.removePunctuation:
                self.doc.verbalizeTextPunctuation()

            #After language id has been set as it depends of
            #languages (i.e. numbers expansion)
            if self.lmModeling:
                self.logger.info("Preparing for language modeling")
                self.doc.prepareLM()

        except Exception, e:
            errorMessage = "An error as occurred when importing sentences: %s\n%s" % (str(e), self.inputFile)
            errorMessage = getErrorMessage(e, errorMessage)

            self.logger.critical(errorMessage)
            raise Exception(e)

        return self.doc
class LMPreparationFormula():
    """Main formula for language modeling text preparation.

    Normalizes a text string step by step (noise filtering, utf-8 mapping,
    dates, abbreviations, numbers, punctuation, acronyms, case) via
    'prepareText'.
    """
    logger = logging.getLogger("Asrt.LMPreparationFormula")

    #Class-level caches, built lazily by _getOrdDict/_getAbbreviationsDict
    ordDict = {}
    abbreviationsDict = {}

    #Regular expressions formulas, shared by all instances
    dateFormula = RegularExpressionFormula(
        None, RegexList.removeComments(DATEREGEXLIST))
    apostropheFormula = RegularExpressionFormula(
        None, RegexList.removeComments(APOSTHROPHELIST))
    contractionPrefixFormula = RegularExpressionFormula(
        None, RegexList.removeComments(CONTRACTIONPREFIXELIST))
    acronymFormula = RegularExpressionFormula(
        None, RegexList.removeComments(ACRONYMREGEXLIST))

    PUNCTUATIONREGEX = re.compile(PUNCTUATIONPATTERN, flags=re.UNICODE)
    ALLPUNCTUATIONSYMBOLS = "".join(PUNCTUATIONEXCLUDE + DOTCOMMAEXCLUDE)

    def __init__(self):
        """Default constructor.
        """
        self.strText = ""
        self.languageId = 0
        self.keepNewWords = False
        #Number verbalization is only available for these languages
        self.numberFormula = {
            FRENCH: FrenchNumberFormula,
            GERMAN: GermanNumberFormula
        }

    #####################
    #Getters and setters
    #
    def getText(self):
        return self.strText

    def getLanguageId(self):
        """Return a number between 0 and 4:

        0:'unknown', 1:'French', 2:'German',
        3:'English', 4:'Italian'
        """
        return self.languageId

    def setText(self, strText):
        """Set the underlying text with 'strText'.

        param strText: an utf-8 encoded string
        """
        self.strText = strText

    def setLanguageId(self, languageId):
        """Set the language id.

        param 'languageId': a value between 0 and 4:
               0:'unknown', 1:'French', 2:'German',
               3:'English', 4:'Italian'
        """
        self.languageId = languageId
        #Invalidate then rebuild the shared characters-mapping cache
        LMPreparationFormula.ordDict = {}
        LMPreparationFormula.ordDict = LMPreparationFormula._getOrdDict(
            self.languageId)

    def setKeepNewWords(self, keepNewWords):
        """Keep new words (skip number and acronym expansion).
        """
        self.keepNewWords = keepNewWords

    ##################
    #Public interface
    #
    def prepareText(self):
        """Prepare 'strText' for language modeling.

        Heuristic is :
            Noise words filtering
            Character based normalization
            Dates normalization
            Language based abbreviations expansion
            Word based normalization
            Acronyms normalization
            Contraction prefixes separation
            Lowercase normalization

        return the normalized text in utf-8 encoding
        """
        #Some preprocessing
        self._filterNoiseWords()
        self._normalizeUtf8()

        #Before punctuation removal, some rules
        #are applied
        self._normalizeDates()
        self._expandAbbreviations()
        if not self.keepNewWords:
            self._expandNumberInWords()

        #Removal of some of punctuation symbols
        self._normalizePunctuation(PUNCTUATIONEXCLUDE)

        #Dot and comma punctuation symbols are still needed
        self._normalizeWords()
        self._normalizeContractionPrefixes()

        #Make sure no punctuation is remaining
        self._normalizePunctuation(self.ALLPUNCTUATIONSYMBOLS)

        if not self.keepNewWords:
            self._expandAcronyms()

        self._normalizeCase()
        return self.strText

    ##################
    #Implementation
    #
    def _filterNoiseWords(self):
        """Do not keep some words considered as noise.

        For example words consisting of 4 or more
        punctuation characters.
        """
        wordsList = re.split(SPACEPATTERN, self.strText, flags=re.UNICODE)
        newWordsList = []
        for w in wordsList:
            if not LMPreparationFormula._isNoise(w):
                newWordsList.append(w)
        self.strText = u" ".join(newWordsList)
        return self.strText

    def _normalizeUtf8(self):
        """Some punctuation characters are normalized.

        Maps utf-8 code points through the language-specific UTF8MAP
        table and collapses repeated whitespace.
        """
        languageId = self.getLanguageId()

        #Mapping dictionary
        ordDict = LMPreparationFormula._getOrdDict(languageId)

        utf8List = []
        #Loop through unicode characters
        for i, c in enumerate(self.strText):
            if ord(c) in ordDict:
                utf8List.append(ordDict[ord(c)])
            else:
                utf8List.append(c)

        self.strText = u"".join(utf8List).rstrip().strip()

        #Strip a trailing punctuation symbol directly after a digit
        if len(self.strText) > 1 and \
                self.strText[-1] in self.ALLPUNCTUATIONSYMBOLS and \
                self.strText[-2].isdigit():
            self.strText = self.strText.rstrip(self.ALLPUNCTUATIONSYMBOLS)

        self.strText = re.sub(SPACEPATTERN, u" ",
                              self.strText, flags=re.UNICODE)

    def _normalizeDates(self):
        """Normalize dates.
        """
        self.strText = self.dateFormula.apply(self.strText, self.languageId)

    def _expandAbbreviations(self):
        """Expand language abbreviations.
        """
        aDict = self._getAbbreviationsDict()
        if self.languageId not in aDict:
            return

        wordsList = re.split(SPACEPATTERN, self.strText, flags=re.UNICODE)
        newWordsList = []
        for w in wordsList:
            #Dictionary keys are byte strings, compare in utf-8
            wByte = w.encode('utf-8')
            if wByte in aDict[self.languageId]:
                newWordsList.append(aDict[self.languageId][wByte])
            else:
                newWordsList.append(w)
        self.strText = u" ".join(newWordsList)

    def _expandNumberInWords(self):
        """If there are numbers in words, split them.

        i.e. A1 --> A. 1
             P3B --> P. 3 B.
             P5B4 --> P. 5 B. 4
             PPB5 --> PPB 5
             (acronyms are expanded later on)
        """
        wordsList = re.split(SPACEPATTERN, self.strText, flags=re.UNICODE)
        newWordsList = []
        for w in wordsList:
            tokenList = re.split(CAPTURINGDIGITPATTERN, w, flags=re.UNICODE)
            #Numbers need to contain a digit
            #Ordinal numbers are not expanded
            if not re.search(u"[0-9]", w) or w.endswith(EXPANDEXCEPTIONS):
                newWordsList.append(w)
            #We have a match
            elif len(tokenList) > 1:
                #Single letter acronyms
                for i, t in enumerate(tokenList):
                    #Digit return false
                    if len(t) == 1 and t.isupper():
                        tokenList[i] = tokenList[i] + u"."
                newWord = u" ".join(tokenList).strip()
                #Group P . 5 into P. 5
                newWord = re.sub(GROUPINGDOTCOMMAPATTERN, u"\g<2> ", newWord)
                newWordsList.append(newWord)
            else:
                newWordsList.append(w)
        self.strText = u" ".join(newWordsList)

    def _expandAcronyms(self):
        """Acronyms are splitted.

        i.e. PDC --> p. d. c.
        """
        self.strText = self.acronymFormula.apply(self.strText,
                                                 self.languageId)
        self.strText = re.sub(ACRONYMDELIMITER, u"", self.strText,
                              flags=re.UNICODE)

    def _normalizePunctuation(self, excludeList):
        """Some punctuation characters are normalized:

            - Removal by spacing
            - Single, double quotes
            - Exclamation, Interrogation marks
            - Braces, round, square, curly
            - Slashes, back, forward
            - Sharp symbol
            - Star, plus, minus
            - Comma, column, semi-column, dot (keep it for abbreviations)
            - Lower, greater equal sign
            - Alone diacritics marks (circumflex accent)
            - Hyphen, underscore
            - Back quote
            - Pipe
            - Tilde

            - Modification
            - Percent %   --> percent
            - Ampersand & --> and
            - At sign @   --> at
            - Dollars symbol $ --> dollars

        param 'excludeList' : a list of exclude punctuation symbols
        """
        unicodeList, prevC, beforePrevC = [], u"", u""
        for i, c in enumerate(self.strText):
            strC = c.encode('utf-8')
            #For date format, i.e. 21-Jul
            if strC in excludeList:
                #Keep dots after uppercase letters
                if beforePrevC in (""," ") and not prevC.isdigit() \
                        and strC == ".":
                    unicodeList.append(c)
                unicodeList.append(u" ")
            #Keep some special characters if they appear after a non-space value
            elif self.keepNewWords and prevC not in (
                    "", " ") and strC in PUNCTUATIONKEEPINWORD:
                unicodeList.append(c)
            #Verbalize symbols like %, &, @, $ for known languages
            elif self.languageId != 0 and strC in PUNCTUATIONMAP:
                unicodeList.append(u" " + PUNCTUATIONMAP[strC][self.languageId] + u" ")
            else:
                unicodeList.append(c)
            beforePrevC = prevC
            prevC = strC

        self.strText = u"".join(unicodeList).rstrip().strip()

        #Remove leading/trailing/isolated hyphens
        self.strText = re.sub(u"(^- *| - |-$)", u"", self.strText,
                              flags=re.UNICODE)
        self.strText = re.sub(u"(- )", u" ", self.strText,
                              flags=re.UNICODE)
        self.strText = re.sub(SPACEPATTERN, u" ",
                              self.strText, flags=re.UNICODE)

    def _normalizeWords(self):
        """Word base normalization. This is language dependant.

            - Contraction prefixes, suffixes  --> separate
            - Abbreviations                   --> normalize
            - Acronyms (upper case words)     --> split into letters
            - Decimal numbers                 --> add comma or dot words
            - Ordinal numbers                 --> transform
            - Cardinal numbers                --> transform
        """
        languageId = self.getLanguageId()
        if languageId not in self.numberFormula:
            #No number formula available for this language: nothing to do
            return
        numberFormula = self.numberFormula[languageId]
        self.strText = numberFormula.apply(self.strText)

    def _normalizeContractionPrefixes(self):
        """Contraction prefixes are separated and
           acronyms are normalized.
        """
        self.strText = self.apostropheFormula.apply(self.strText,
                                                    self.languageId)
        self.strText = self.contractionPrefixFormula.apply(
            self.strText, self.languageId, False)

    def _normalizeCase(self):
        """Case normalization (change to lower case)
        """
        self.strText = self.strText.lower()

    @staticmethod
    def _getOrdDict(langId):
        """Utf-8 characters mapping in the form of a
           code point dictionary.

        NOTE(review): the cache is returned whenever non-empty, without
        checking it was built for the same 'langId' — confirm callers
        always reset 'ordDict' when switching languages (setLanguageId does).
        """
        if len(LMPreparationFormula.ordDict.keys()) > 0:
            return LMPreparationFormula.ordDict

        #Substitution dictionary, assume one character only
        ordDict = {}
        for match, sub, comment, languageId in UTF8MAP:
            if ord(match) in ordDict:
                raise Exception("Already in dictionary '%s' '%s'!" % (
                    unichr(ord(match)), comment.encode('utf8')))
            #Language id 0 means: applies to all languages
            if (langId == int(languageId) or int(languageId) == 0):
                ordDict[ord(match)] = sub

        LMPreparationFormula.ordDict = ordDict
        return LMPreparationFormula.ordDict

    @staticmethod
    def _getAbbreviationsDict():
        """Get the abbreviations dictionary with keys
           encoded in byte string for comparison.
        """
        if len(LMPreparationFormula.abbreviationsDict.keys()) > 0:
            return LMPreparationFormula.abbreviationsDict

        aDict = {}
        for lang in ABBREVIATIONS.keys():
            if lang not in aDict:
                aDict[lang] = {}
            for k, v in ABBREVIATIONS[lang].items():
                aDict[lang][k.encode('utf-8')] = v

        LMPreparationFormula.abbreviationsDict = aDict
        return LMPreparationFormula.abbreviationsDict

    @staticmethod
    def _isNoise(strWord):
        """Check if 'strWord' is a noise word.

        return True or False
        """
        return LMPreparationFormula.PUNCTUATIONREGEX.search(strWord) != None

    @staticmethod
    def _applyRegexes(strText, regexList):
        #Apply each (pattern, replacement, type) rule in order
        for p, r, t in regexList:
            strText = re.sub(p, r, strText, flags=re.UNICODE)
        return strText
def resetAllPatterns(self):
    """Empty all validation and substitution regexes."""
    #Fresh, pattern-less formula and an empty validation list
    self.substitutionRegexFormula = RegularExpressionFormula(None)
    self.validationPatternList = []
help="enable debug output", dest="debug", action="store_true") # Parse arguments args = parser.parse_args() regexFile = args.regexFile[0] inputText = args.inputText[0] languageId = int(args.language[0]) # Flags display = args.display debug = args.debug setupLogging(logging.INFO) substitutionPatternList = [] for line in RegexList.loadFromFile(regexFile): if int(line[RegexList.TYPEINDICE]) != -1: substitutionPatternList.append(line) f = RegularExpressionFormula(None, substitutionPatternList) if display: f.displayPatterns(languageId) result = f.apply(inputText, languageId, debug) print(("Result --------------\n", result.encode('utf-8'), "\n---------------------"))
class DataPreparationAPI(): """Import sentences from one file, classifying sentences into languages. """ logger = logging.getLogger("Asrt.DataPreparationAPI") def __init__(self, inputFile, outputDir): """Default constructor. """ self.inputFile = inputFile self.outputDir = outputDir self.tempDir = outputDir self.formattedText = None self.debug = False self.regexFile = None self.lmModeling = False self.filterSentences = False self.removePunctuation = False self.verbalizePunctuation = False self.segmentWithNLTK = True self.keepNewWords = False self.doc = None self.wordClassifier = None self.substitutionRegexFormula = RegularExpressionFormula(None) self.validationPatternList = [] ##################### #Getters and setters # def setInputFile(self, inputFile): self.inputFile = inputFile def setOutputDir(self, outputDir): self.outputDir = outputDir def setTempDir(self, tempDir): self.tempDir = tempDir def setFormattedText(self, formattedText): self.formattedText = formattedText def getCleanedText(self): if self.doc != None: return self.doc.getCleanedText() return "" def getCleanedTextPerLanguage(self): if self.doc != None: return self.doc.getCleanedTextPerLanguage() return "" def setDebugMode(self, debug): self.debug = debug def setRegexFile(self, regexFile): self.regexFile = regexFile def setRegexList(self, regexList): """Set both validation and substitution user regexes. param regexList: a list of the following form: [u'matching pattern', u'substitution', u'type', u'language id'] """ #Reset current lists self.substitutionRegexFormula = RegularExpressionFormula(None) self.validationPatternList = [] substitutionList = [] for row in regexList: if int(row[2]) == VALIDATION_TYPE: self.validationPatternList.append((row[0], row[3])) else: substitutionList.append((row[0], row[1], row[2], row[3])) self.substitutionRegexFormula.setSubstitutionPatternList( substitutionList) def getSubstitutionList(self): """Get the user defined substitution list. 
return a four columns list of lists: [u'matching pattern', u'substitution', u'type', u'language id'] """ return self.substitutionRegexFormula.getSubstitutionPatterns() def setSubstitutionList(self, regexList): """Set the user regexes substitution list. param regexList: a four columns list of lists: [u'matching pattern', u'substitution', u'type', u'language id'] """ self.substitutionRegexFormula = RegularExpressionFormula(None) substitutionList = [] for row in regexList: substitutionList.append((row[0], row[1], row[2], row[3])) self.substitutionRegexFormula.setSubstitutionPatternList( substitutionList) def getValidationList(self): """Get the user defined validation list. return a four columns list of lists: [u'matching pattern', u'', u'-1', u'0'] """ validationList = [] for pattern, regexType in self.validationPatternList: validationList.append(pattern, u"", regexType, u"0") return validationList def setValidationList(self, regexList): """Set the user regexes validation list. Filter 'regexList' for validation rules only. param regexList: a four columns list of lists: ['matching pattern', 'substitution', 'type', 'language id'] """ self.validationPatternList = [] for row in regexList: if int(row[2]) == VALIDATION_TYPE: self.validationPatternList.append((row[0], row[3])) def setLMModeling(self, modelNgram): self.lmModeling = modelNgram def setFilterSentences(self, filterSentences): self.filterSentences = filterSentences def setRemovePunctuation(self, removePunctuation): self.removePunctuation = removePunctuation def setVerbalizePunctuation(self, verbalizePunctuation): self.verbalizePunctuation = verbalizePunctuation def setSegmentWithNLTK(self, segmentWithNLTK): self.segmentWithNLTK = segmentWithNLTK def setKeepNewWords(self, keepNewWords): self.keepNewWords = keepNewWords def getDocument(self): """Get the underlying 'TextDocument'. """ return self.doc ##################### #Public interface # def trainClassifier(self): """Train the underlying classifier. 
""" if self.wordClassifier == None: self.logger.info("Prepare the word classifier ...") self.wordClassifier = WordClassifier() self.wordClassifier.train() def getRegexes(self): """Fetch validation and substitution regexes from csv file. """ #User did not specified rules if self.regexFile == None: return #Are regexes already loaded in API if self.substitutionRegexFormula.hasPatterns() or \ len(self.validationPatternList) > 0: return regexList = RegexList().loadFromFile(self.regexFile) self.setRegexList(regexList) def resetAllPatterns(self): """Empty all validation and substitution regexes. """ self.substitutionRegexFormula = RegularExpressionFormula(None) self.validationPatternList = [] def prepareDocument(self, language=0): """Segment the document into sentences and prepare them. param language: an int between 0-4 - unknown : 0 - french : 1 - german : 2 - english : 3 - italian : 4 """ if language > 4 or language < 0: raise Exception("Unknown language") #Done at the API level to share resources between #documents self.logger.info("Getting regexes") self.getRegexes() if self.substitutionRegexFormula.hasPatterns(): self.logger.info("Using following regexes substitution:\n" +\ str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3])) if len(self.validationPatternList) > 0: self.logger.info("Using following regexes for sentence validation:\n" +\ str(self.validationPatternList[0:3])) try: self.logger.info("Document file: %s" % self.inputFile) #The main document self.doc = TextDocument(self.inputFile, language, self.substitutionRegexFormula, self.validationPatternList, self.outputDir, self.segmentWithNLTK, self.keepNewWords) if self.inputFile != None: self.logger.info( "Load file, convert to text when pdf document") self.doc.loadDocumentAsSentences(self.tempDir) elif self.formattedText != None: self.logger.info("Load text string as sentences") self.doc.loadAsSentences(self.formattedText) else: raise Exception("No input file or text string provided!") #print 
self.doc.getCleanedText() #Control character and strip self.logger.info("Cleaning control characters") self.doc.cleanTextSentences() #print self.doc.getCleanedText() if language == 0: self.logger.info("Classifying sentences") self.doc.setClassifier(self.wordClassifier) self.doc.classifySentences() else: self.doc.setSentencesLanguage(language) #print self.doc.getCleanedText() #User's supplied regular expression if self.substitutionRegexFormula.hasPatterns(): self.logger.info( "Applying user regular expressions per language") self.doc.normalizeTextSentences() #print self.doc.getCleanedText() if self.filterSentences: self.logger.info("Filtering data") self.doc.filterTextSentences() #If LM option is selected, it will be done at #the prepareLM stage if self.removePunctuation and not self.lmModeling: self.doc.removeTextPunctuation() if self.verbalizePunctuation and not self.removePunctuation: self.doc.verbalizeTextPunctuation() #print self.doc.getCleanedText() #After language id has been set as it depends of #languages (i.e. numbers expansion) if self.lmModeling: self.logger.info("Preparing for language modeling") self.doc.prepareLM() except Exception, e: errorMessage = "An error as occurred when importing sentences: %s\n%s" % \ (getByteString(e.message), self.inputFile) errorMessage = getErrorMessage(e, errorMessage) self.logger.critical(errorMessage) raise Exception(e) return self.doc
class DataPreparationAPI():
    """Import sentences from one file, classifying sentences into languages.

       High-level facade: configure via the setters, then call
       'prepareDocument' and 'outputSentencesToFiles'.
    """
    logger = logging.getLogger("Asrt.DataPreparationAPI")

    def __init__(self, inputFile, outputDir):
        """Default constructor.

           param inputFile: path of the document to import (may be None
                 when a text string is supplied via 'setFormattedText')
           param outputDir: directory for results; also used as the
                 temporary directory by default
        """
        self.inputFile = inputFile
        self.outputDir = outputDir
        #Temporary working directory, defaults to the output directory
        self.tempDir = outputDir
        #Alternative input: a text string instead of a file
        self.formattedText = None
        self.debug = False
        #Optional csv file with user validation/substitution regexes
        self.regexFile = None
        #Processing flags, all settable through the setters below
        self.lmModeling = False
        self.filterSentences = False
        self.filterTextSentences2ndStage = False
        self.removePunctuation = False
        self.verbalizePunctuation = False
        self.segmentWithNLTK = True
        self.expandNumberInWords = True
        #Underlying 'TextDocument', built by 'prepareDocument'
        self.doc = None
        #Shared language classifier, trained lazily by 'trainClassifier'
        self.wordClassifier = None
        #User regexes: substitutions wrapped in a formula, validations
        #kept as a plain list of (pattern, language id) tuples
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    #####################
    #Getters and setters
    #
    def setInputFile(self, inputFile):
        self.inputFile = inputFile

    def setOutputDir(self, outputDir):
        self.outputDir = outputDir

    def setTempDir(self, tempDir):
        self.tempDir = tempDir

    def setFormattedText(self, formattedText):
        self.formattedText = formattedText

    def getCleanedText(self):
        #Empty string when no document has been prepared yet
        if self.doc != None:
            return self.doc.getCleanedText()
        return ""

    def getCleanedTextPerLanguage(self):
        #Empty string when no document has been prepared yet
        if self.doc != None:
            return self.doc.getCleanedTextPerLanguage()
        return ""

    def setDebugMode(self, debug):
        self.debug = debug

    def setRegexFile(self, regexFile):
        self.regexFile = regexFile

    def setRegexList(self, regexList):
        """Set both validation and substitution user regexes.

           param regexList: a list of the following form:
               [u'matching pattern', u'substitution', u'type', u'language id']
        """
        #Reset current lists
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

        #Dispatch each rule on its type column: validation rules keep
        #only (pattern, language id), the rest are substitutions
        substitutionList = []
        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))
            else:
                substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)

    def getSubstitutionList(self):
        """Get the user defined substitution list.

           return a four columns list of lists:
               [u'matching pattern', u'substitution', u'type', u'language id']
        """
        return self.substitutionRegexFormula.getSubstitutionPatterns()

    def setSubstitutionList(self, regexList):
        """Set the user regexes substitution list.

           param regexList: a four columns list of lists:
               [u'matching pattern', u'substitution', u'type', u'language id']
        """
        #Unlike 'setRegexList', every row is taken as a substitution;
        #the validation list is left untouched
        self.substitutionRegexFormula = RegularExpressionFormula(None)

        substitutionList = []
        for row in regexList:
            substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)

    def getValidationList(self):
        """Get the user defined validation list.

           return a four columns list of lists:
               [u'matching pattern', u'', u'-1', u'0']
        """
        validationList = []
        for pattern, regexType in self.validationPatternList:
            validationList.append([pattern, "", regexType, "0"])
        return validationList

    def setValidationList(self, regexList):
        """Set the user regexes validation list.

           Filter 'regexList' for validation rules only.

           param regexList: a four columns list of lists:
               ['matching pattern', 'substitution', 'type', 'language id']
        """
        self.validationPatternList = []
        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))

    def setLMModeling(self, modelNgram):
        self.lmModeling = modelNgram

    def setFilterSentences(self, filterSentences):
        self.filterSentences = filterSentences

    def setFilterSentences2ndStage(self, filterTextSentences2ndStage):
        self.filterTextSentences2ndStage = filterTextSentences2ndStage

    def setRemovePunctuation(self, removePunctuation):
        self.removePunctuation = removePunctuation

    def setVerbalizePunctuation(self, verbalizePunctuation):
        self.verbalizePunctuation = verbalizePunctuation

    def setSegmentWithNLTK(self, segmentWithNLTK):
        self.segmentWithNLTK = segmentWithNLTK

    def setExpandNumberInWords(self, expandNumberInWords):
        self.expandNumberInWords = expandNumberInWords

    def getDocument(self):
        """Get the underlying 'TextDocument'.
        """
        return self.doc

    #####################
    #Public interface
    #
    def trainClassifier(self):
        """Train the underlying classifier.

           No-op when the classifier was already built.
        """
        if self.wordClassifier == None:
            self.logger.info("Prepare the word classifier ...")
            self.wordClassifier = WordClassifier()
            self.wordClassifier.train()

    def getRegexes(self):
        """Fetch validation and substitution regexes from csv file.
        """
        #User did not specified rules
        if self.regexFile == None:
            return

        #Are regexes already loaded in API
        if self.substitutionRegexFormula.hasPatterns() or \
            len(self.validationPatternList) > 0:
            return

        regexList = RegexList().loadFromFile(self.regexFile)
        self.setRegexList(regexList)

    def resetAllPatterns(self):
        """Empty all validation and substitution regexes.
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    def prepareDocument(self, language=0):
        """Segment the document into sentences and prepare them.

           param language: an int between 0-4
               - unknown : 0
               - french  : 1
               - german  : 2
               - english : 3
               - italian : 4
        """
        if language > 4 or language < 0:
            raise Exception("Unknown language")

        #Done at the API level to share resources between
        #documents
        self.logger.info("Getting regexes")
        self.getRegexes()

        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Using following regexes substitution:\n" +\
                str(self.substitutionRegexFormula.getSubstitutionPatterns()[:]))
            # str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))

        if len(self.validationPatternList) > 0:
            self.logger.info("Using following regexes for sentence validation:\n" +\
                str(self.validationPatternList[0:3]))

        try:
            self.logger.info("Document file: %s" % self.inputFile)

            #The main document
            self.doc = TextDocument(self.inputFile, language,
                                    self.substitutionRegexFormula,
                                    self.validationPatternList,
                                    self.outputDir, self.segmentWithNLTK,
                                    self.expandNumberInWords)

            #Input comes either from a file or from an already
            #formatted text string
            if self.inputFile != None:
                self.logger.info(
                    "Load file, convert to text when pdf document")
                self.doc.loadDocumentAsSentences(self.tempDir)
            elif self.formattedText != None:
                self.logger.info("Load text string as sentences")
                self.doc.loadAsSentences(self.formattedText)
            else:
                raise Exception("No input file or text string provided!")

            #print self.doc.getCleanedText()

            #Control character and strip
            self.logger.info("Cleaning control characters")
            self.doc.cleanTextSentences()

            #print self.doc.getCleanedText()
            if language == 0:
                self.logger.info("Classifying sentences")
                self.doc.setClassifier(self.wordClassifier)
                self.doc.classifySentences()
            else:
                self.doc.setSentencesLanguage(language)

            #print self.doc.getCleanedText()

            #User's supplied regular expression
            if self.substitutionRegexFormula.hasPatterns():
                self.logger.info(
                    "Applying user regular expressions per language")
                self.doc.normalizeTextSentences()

            #print self.doc.getCleanedText()
            if self.filterSentences:
                self.logger.info("Filtering data")
                self.doc.filterTextSentences()

            #If LM option is selected, it will be done at
            #the prepareLM stage
            if self.removePunctuation and not self.lmModeling:
                self.doc.removeTextPunctuation()

            if self.verbalizePunctuation and not self.removePunctuation:
                self.doc.verbalizeTextPunctuation()

            #print self.doc.getCleanedText()

            #After language id has been set as it depends of
            #languages (i.e. numbers expansion)
            if self.lmModeling:
                self.logger.info("Preparing for language modeling")
                self.doc.prepareLM()

            #Second-stage filtering is only implemented for German
            if self.filterTextSentences2ndStage:
                if language == GERMAN:
                    self.logger.info(
                        "Filtering data - 2nd stage (remove web address and check German orthograph)"
                    )
                    self.doc.filterTextSentences2ndStage()

        except Exception as e:
            #NOTE(review): 'e.message' is a Python 2 idiom; this attribute
            #does not exist on Python 3 exceptions
            errorMessage = "An error has occurred when importing sentences: %s\n%s" % \
                (getByteString(e.message), self.inputFile)
            errorMessage = getErrorMessage(e, errorMessage)

            self.logger.critical(errorMessage)
            raise Exception(e)

        return self.doc

    def outputSentencesToFiles(self, outputDir):
        """Output the original sentences with language
           information to the 'outputFile'
        """
        self.logger.info("Output results to language files.")

        #One bucket per supported language label
        sentencesDict = {
            FRENCH_LABEL: [],
            GERMAN_LABEL: [],
            ITALIAN_LABEL: [],
            ENGLISH_LABEL: [],
            UNKNOWN_LABEL: []
        }

        self.appendDocumentSentences(self.doc, sentencesDict)
        self.outputPerLanguage(sentencesDict, outputDir)

    @staticmethod
    def appendDocumentSentences(textDocument, sentencesDict):
        """Update 'sentencesDict' with the 'textDocument' content.

           Each text cluster is appended to the bucket matching its
           detected language, or to UNKNOWN_LABEL otherwise.
        """
        #Save all sentences
        for textCluster in textDocument.getListContent():
            strSentence = textCluster.getTextSentence()
            currentLanguage = UNKNOWN_LABEL

            if textCluster.isFrench():
                currentLanguage = FRENCH_LABEL
            elif textCluster.isGerman():
                currentLanguage = GERMAN_LABEL
            elif textCluster.isItalian():
                currentLanguage = ITALIAN_LABEL
            elif textCluster.isEnglish():
                currentLanguage = ENGLISH_LABEL

            #strOut = u"<" + textDocument.sourceFileName + u">: " + strSentence
            strOut = strSentence.rstrip()
            sentencesDict[currentLanguage].append(strOut)

    @staticmethod
    def outputPerLanguage(sentencesDict, outputDir):
        """Output sentences in language files.

           Writes one 'sentences_<language>.txt' file per non-empty
           bucket of 'sentencesDict' into 'outputDir'.
        """
        io = Ioread()

        #Finally output to disk
        for resultLanguage, results in list(sentencesDict.items()):
            if len(results) > 0:
                DataPreparationAPI.logger.info("%d sentences found for: %s" %
                                               (len(results), resultLanguage))
                strContent = "\n".join(results)
                strContent = strContent.rstrip() + "\n"
                outputPath = "%s/sentences_%s.txt" % (outputDir,\
                                                      resultLanguage)
                DataPreparationAPI.logger.info("Writing content to: %s" %
                                               outputPath)
                io.writeFileContent(outputPath, strContent)
            else:
                DataPreparationAPI.logger.info("No sentences found for: %s" %
                                               resultLanguage)
parser.add_argument("-l", "--language", help="language (0=unk,1=fr,2=ge,3=en,4=it)", nargs=1, dest="language", default=[0]) parser.add_argument("-s", "--display", help="display regular expressions", dest="display",action="store_true") parser.add_argument("-d", "--debug", help="enable debug output", dest="debug",action="store_true") #Parse arguments args = parser.parse_args() regexFile = args.regexFile[0] inputText = args.inputText[0] languageId = int(args.language[0]) #Flags display = args.display debug = args.debug setupLogging(logging.INFO) substitutionPatternList = [] for line in RegexList.loadFromFile(regexFile): if int(line[RegexList.TYPEINDICE]) != -1: substitutionPatternList.append(line) f = RegularExpressionFormula(None, substitutionPatternList) if display: f.displayPatterns(languageId) result = f.apply(inputText, languageId, debug) print "Result --------------\n", result.encode('utf-8'),"\n---------------------"