Exemplo n.º 1
0
class DataPreparationAPI():
    """Import sentences from one file, classifying
       sentences into languages.
    """
    logger = logging.getLogger("Asrt.DataPreparationAPI")

    def __init__(self, inputFile, outputDir):
        """Default constructor.
        """
        self.inputFile = inputFile
        self.outputDir = outputDir
        self.tempDir = outputDir
        self.formattedText = None
        self.debug = False
        self.regexFile = None
        self.lmModeling = False
        self.filterSentences = False
        self.removePunctuation = False
        self.verbalizePunctuation = False
        self.segmentWithNLTK = True
        self.keepNewWords = False
        self.doc = None
        self.wordClassifier = None
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    #####################
    #Getters and setters
    #
    def setInputFile(self, inputFile):
        self.inputFile = inputFile

    def setOutputDir(self, outputDir):
        self.outputDir = outputDir

    def setTempDir(self, tempDir):
        self.tempDir = tempDir

    def setFormattedText(self, formattedText):
        self.formattedText = formattedText

    def getCleanedText(self):
        if self.doc != None:
            return self.doc.getCleanedText()
        return ""

    def getCleanedTextPerLanguage(self):
        if self.doc != None:
            return self.doc.getCleanedTextPerLanguage()
        return ""

    def setDebugMode(self, debug):
        self.debug = debug

    def setRegexFile(self, regexFile):
        self.regexFile = regexFile

    def setRegexList(self, regexList):
        """Set both validation and substitution user regexes.

           param regexList: a list of the following form:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        #Reset current lists
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

        substitutionList = []

        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))
            else:
                substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)

    def getSubstitutionList(self):
        """Get the user defined substitution list.

           return a four columns list of lists:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        return self.substitutionRegexFormula.getSubstitutionPatterns()

    def setSubstitutionList(self, regexList):
        """Set the user regexes substitution list.

           param regexList: a four columns list of lists:
          
           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)

        substitutionList = []

        for row in regexList:
            substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)

    def getValidationList(self):
        """Get the user defined validation list.

           return a four columns list of lists:

           [u'matching pattern', u'', u'-1', u'0']
        """
        validationList = []
        for pattern, regexType in self.validationPatternList:
            validationList.append(pattern, u"", regexType, u"0")

        return validationList

    def setValidationList(self, regexList):
        """Set the user regexes validation list.

           Filter 'regexList' for validation rules only.

           param regexList: a four columns list of lists:
          
           ['matching pattern', 'substitution', 'type', 'language id']
        """
        self.validationPatternList = []

        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))

    def setLMModeling(self, modelNgram):
        self.lmModeling = modelNgram

    def setFilterSentences(self, filterSentences):
        self.filterSentences = filterSentences

    def setRemovePunctuation(self, removePunctuation):
        self.removePunctuation = removePunctuation

    def setVerbalizePunctuation(self, verbalizePunctuation):
        self.verbalizePunctuation = verbalizePunctuation

    def setSegmentWithNLTK(self, segmentWithNLTK):
        self.segmentWithNLTK = segmentWithNLTK

    def setKeepNewWords(self, keepNewWords):
        self.keepNewWords = keepNewWords

    def getDocument(self):
        """Get the underlying 'TextDocument'.
        """
        return self.doc

    #####################
    #Public interface
    #
    def trainClassifier(self):
        """Train the underlying classifier.
        """
        if self.wordClassifier == None:
            self.logger.info("Prepare the word classifier ...")
            self.wordClassifier = WordClassifier()
            self.wordClassifier.train()

    def getRegexes(self):
        """Fetch validation and substitution regexes
           from csv file.
        """
        #User did not specified rules
        if self.regexFile == None:
            return

        #Are regexes already loaded in API
        if self.substitutionRegexFormula.hasPatterns() or \
            len(self.validationPatternList) > 0:
            return

        regexList = RegexList().loadFromFile(self.regexFile)
        self.setRegexList(regexList)

    def resetAllPatterns(self):
        """Empty all validation and substitution regexes.
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    def prepareDocument(self, language=0):
        """Segment the document into sentences and prepare them.

           param language: an int between 0-4
                - unknown : 0
                - french  : 1
                - german  : 2
                - english : 3
                - italian : 4
        """
        if language > 4 or language < 0:
            raise Exception("Unknown language")

        #Done at the API level to share resources between
        #documents
        self.logger.info("Getting regexes")
        self.getRegexes()

        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Using following regexes substitution:\n" +\
                    str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))

        if len(self.validationPatternList) > 0:
            self.logger.info("Using following regexes for sentence validation:\n" +\
                    str(self.validationPatternList[0:3]))

        try:
            self.logger.info("Document file: %s" % self.inputFile)

            #The main document
            self.doc = TextDocument(self.inputFile, language,
                                    self.substitutionRegexFormula,
                                    self.validationPatternList, self.outputDir,
                                    self.segmentWithNLTK, self.keepNewWords)

            if self.inputFile != None:
                self.logger.info(
                    "Load file, convert to text when pdf document")
                self.doc.loadDocumentAsSentences(self.tempDir)
            elif self.formattedText != None:
                self.logger.info("Load text string as sentences")
                self.doc.loadAsSentences(self.formattedText)
            else:
                raise Exception("No input file or text string provided!")

            #print self.doc.getCleanedText()

            #Control character and strip
            self.logger.info("Cleaning control characters")
            self.doc.cleanTextSentences()

            #print self.doc.getCleanedText()

            if language == 0:
                self.logger.info("Classifying sentences")
                self.doc.setClassifier(self.wordClassifier)
                self.doc.classifySentences()
            else:
                self.doc.setSentencesLanguage(language)

            #print self.doc.getCleanedText()

            #User's supplied regular expression
            if self.substitutionRegexFormula.hasPatterns():
                self.logger.info(
                    "Applying user regular expressions per language")
                self.doc.normalizeTextSentences()

            #print self.doc.getCleanedText()

            if self.filterSentences:
                self.logger.info("Filtering data")
                self.doc.filterTextSentences()

            #If LM option is selected, it will be done at
            #the prepareLM stage
            if self.removePunctuation and not self.lmModeling:
                self.doc.removeTextPunctuation()

            if self.verbalizePunctuation and not self.removePunctuation:
                self.doc.verbalizeTextPunctuation()

            #print self.doc.getCleanedText()

            #After language id has been set as it depends of
            #languages (i.e. numbers expansion)
            if self.lmModeling:
                self.logger.info("Preparing for language modeling")
                self.doc.prepareLM()

        except Exception, e:
            errorMessage = "An error as occurred when importing sentences: %s\n%s" % \
                             (getByteString(e.message), self.inputFile)
            errorMessage = getErrorMessage(e, errorMessage)

            self.logger.critical(errorMessage)

            raise Exception(e)

        return self.doc
Exemplo n.º 2
0
class DataPreparationAPI():
    """Import sentences from one file, classifying
       sentences into languages.
    """
    logger  = logging.getLogger("Asrt.DataPreparationAPI")

    def __init__(self, inputFile, outputDir):
        """Default constructor.
        """
        self.inputFile = inputFile
        self.outputDir = outputDir
        self.tempDir = outputDir
        self.formattedText = None
        self.debug = False
        self.regexFile = None
        self.lmModeling = False
        self.filterSentences = False
        self.removePunctuation = False
        self.verbalizePunctuation = False
        self.doc = None
        self.wordClassifier = None
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    #####################
    #Getters and setters
    #
    def setInputFile(self, inputFile):
        self.inputFile = inputFile

    def setOutputDir(self, outputDir):
        self.outputDir = outputDir

    def setTempDir(self, tempDir):
        self.tempDir = tempDir

    def setFormattedText(self, formattedText):
        self.formattedText = formattedText

    def getCleanedText(self):
        if self.doc != None:
            return self.doc.getCleanedText()
        return ""

    def getCleanedTextPerLanguage(self):
        if self.doc != None:
            return self.doc.getCleanedTextPerLanguage()
        return ""

    def setDebugMode(self, debug):
        self.debug = debug

    def setRegexFile(self, regexFile):
        self.regexFile = regexFile

    def setRegexList(self, regexList):
        """Set the acronyms to be used.

           param acronymList: a list of the following form:

           ['matching pattern', 'substitution', 'type', 'language id']
        """
        substitutionList = []

        #Skip header
        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0],row[3]))
            else:
                substitutionList.append((row[0],row[1],row[2],row[3]))
            
        self.substitutionRegexFormula.setSubstitutionPatternList(substitutionList)

    def setLMModeling(self, modelNgram):
        self.lmModeling = modelNgram

    def setFilterSentences(self, filterSentences):
        self.filterSentences = filterSentences

    def setRemovePunctuation(self, removePunctuation):
        self.removePunctuation = removePunctuation

    def setVerbalizePunctuation(self, verbalizePunctuation):
        self.verbalizePunctuation = verbalizePunctuation

    def getDocument(self):
        """Get the underlying 'TextDocument'.
        """
        return self.doc

    #####################
    #Public interface
    #
    def trainClassifier(self):
        """Train the underlying classifier.
        """
        if self.wordClassifier == None:
            self.logger.info("Prepare the word classifier ...")
            self.wordClassifier = WordClassifier()
            self.wordClassifier.train()
    
    def getRegexes(self):
        """Fetch validation and substitution regexes
           from csv file.
        """
        #User did not specified rules
        if self.regexFile == None:
            return

        #Are regexes already loaded in API
        if self.substitutionRegexFormula.hasPatterns() or \
            len(self.validationPatternList) > 0:
            return

        regexList = RegexList().loadFromFile(self.regexFile)
        self.setRegexList(regexList)

    def prepareDocument(self, language = 0):
        """Segment the document into sentences and prepare them.

           param language: an int between 0-4
                - unknown : 0
                - french  : 1
                - german  : 2
                - english : 3
                - italian : 4
        """
        if language> 4 or language < 0:
            raise Exception("Unknown language")

        #Done at the API level to share resources between
        #documents
        self.logger.info("Getting regexes")
        self.getRegexes()

        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Using following regexes substitution:\n" +\
                    str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))

        if len(self.validationPatternList) > 0:
            self.logger.info("Using following regexes for sentence validation:\n" +\
                    str(self.validationPatternList[0:3]))

        try:
            self.logger.info("Document file: %s" % self.inputFile)

            #The main document
            self.doc = TextDocument(self.inputFile, language,
                                    self.substitutionRegexFormula,
                                    self.validationPatternList,
                                    self.outputDir)
            
            if self.inputFile != None:
                self.logger.info("Load file, convert to text when pdf document")
                self.doc.loadDocumentAsSentences(self.tempDir)
            elif self.formattedText != None:
                self.logger.info("Load text string as sentences")
                self.doc.loadAsSentences(self.formattedText)
            else:
                raise Exception("No input file or text string provided!")

            #print self.doc.getCleanedText()

            #Control character and strip
            self.logger.info("Cleaning control characters")
            self.doc.cleanTextSentences()

            #print self.doc.getCleanedText()

            if language == 0:
                self.logger.info("Classifying sentences")
                self.doc.setClassifier(self.wordClassifier)
                self.doc.classifySentences()
            else:
                self.doc.setSentencesLanguage(language)

            #print self.doc.getCleanedText()

            #User's supplied regular expression
            if self.substitutionRegexFormula.hasPatterns():
                self.logger.info("Applying user regular expressions per language")
                self.doc.normalizeTextSentences()

            #print self.doc.getCleanedText()

            if self.filterSentences:
                self.logger.info("Filtering data")
                self.doc.filterTextSentences()

            #If LM option is selected, it will be done at
            #the prepareLM stage
            if self.removePunctuation and not self.lmModeling:
                self.doc.removeTextPunctuation()
            
            if self.verbalizePunctuation and not self.removePunctuation:
                self.doc.verbalizeTextPunctuation()

            #print self.doc.getCleanedText()

            #After language id has been set as it depends of
            #languages (i.e. numbers expansion)
            if self.lmModeling:
                self.logger.info("Preparing for language modeling")
                self.doc.prepareLM()

        except Exception, e:
            errorMessage = "An error as occurred when importing sentences: %s\n%s" % (str(e), self.inputFile)
            errorMessage = getErrorMessage(e, errorMessage)
            
            self.logger.critical(errorMessage)

            raise Exception(e)

        return self.doc
Exemplo n.º 3
0
class DataPreparationAPI():
    """Import sentences from one file, classifying
       sentences into languages.
    """
    logger = logging.getLogger("Asrt.DataPreparationAPI")

    def __init__(self, inputFile, outputDir):
        """Default constructor.
        """
        self.inputFile = inputFile
        self.outputDir = outputDir
        self.tempDir = outputDir
        self.formattedText = None
        self.debug = False
        self.regexFile = None
        self.lmModeling = False
        self.filterSentences = False
        self.filterTextSentences2ndStage = False
        self.removePunctuation = False
        self.verbalizePunctuation = False
        self.segmentWithNLTK = True
        self.expandNumberInWords = True
        self.doc = None
        self.wordClassifier = None
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    #####################
    #Getters and setters
    #
    def setInputFile(self, inputFile):
        self.inputFile = inputFile

    def setOutputDir(self, outputDir):
        self.outputDir = outputDir

    def setTempDir(self, tempDir):
        self.tempDir = tempDir

    def setFormattedText(self, formattedText):
        self.formattedText = formattedText

    def getCleanedText(self):
        if self.doc != None:
            return self.doc.getCleanedText()
        return ""

    def getCleanedTextPerLanguage(self):
        if self.doc != None:
            return self.doc.getCleanedTextPerLanguage()
        return ""

    def setDebugMode(self, debug):
        self.debug = debug

    def setRegexFile(self, regexFile):
        self.regexFile = regexFile

    def setRegexList(self, regexList):
        """Set both validation and substitution user regexes.

           param regexList: a list of the following form:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        #Reset current lists
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

        substitutionList = []

        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))
            else:
                substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)

    def getSubstitutionList(self):
        """Get the user defined substitution list.

           return a four columns list of lists:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        return self.substitutionRegexFormula.getSubstitutionPatterns()

    def setSubstitutionList(self, regexList):
        """Set the user regexes substitution list.

           param regexList: a four columns list of lists:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)

        substitutionList = []

        for row in regexList:
            substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)

    def getValidationList(self):
        """Get the user defined validation list.

           return a four columns list of lists:

           [u'matching pattern', u'', u'-1', u'0']
        """
        validationList = []
        for pattern, regexType in self.validationPatternList:
            validationList.append([pattern, "", regexType, "0"])

        return validationList

    def setValidationList(self, regexList):
        """Set the user regexes validation list.

           Filter 'regexList' for validation rules only.

           param regexList: a four columns list of lists:

           ['matching pattern', 'substitution', 'type', 'language id']
        """
        self.validationPatternList = []

        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))

    def setLMModeling(self, modelNgram):
        self.lmModeling = modelNgram

    def setFilterSentences(self, filterSentences):
        self.filterSentences = filterSentences

    def setFilterSentences2ndStage(self, filterTextSentences2ndStage):
        self.filterTextSentences2ndStage = filterTextSentences2ndStage

    def setRemovePunctuation(self, removePunctuation):
        self.removePunctuation = removePunctuation

    def setVerbalizePunctuation(self, verbalizePunctuation):
        self.verbalizePunctuation = verbalizePunctuation

    def setSegmentWithNLTK(self, segmentWithNLTK):
        self.segmentWithNLTK = segmentWithNLTK

    def setExpandNumberInWords(self, expandNumberInWords):
        self.expandNumberInWords = expandNumberInWords

    def getDocument(self):
        """Get the underlying 'TextDocument'.
        """
        return self.doc

    #####################
    #Public interface
    #
    def trainClassifier(self):
        """Train the underlying classifier.
        """
        if self.wordClassifier == None:
            self.logger.info("Prepare the word classifier ...")
            self.wordClassifier = WordClassifier()
            self.wordClassifier.train()

    def getRegexes(self):
        """Fetch validation and substitution regexes
           from csv file.
        """
        #User did not specified rules
        if self.regexFile == None:
            return

        #Are regexes already loaded in API
        if self.substitutionRegexFormula.hasPatterns() or \
            len(self.validationPatternList) > 0:
            return

        regexList = RegexList().loadFromFile(self.regexFile)
        self.setRegexList(regexList)

    def resetAllPatterns(self):
        """Empty all validation and substitution regexes.
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    def prepareDocument(self, language=0):
        """Segment the document into sentences and prepare them.

           param language: an int between 0-4
                - unknown : 0
                - french  : 1
                - german  : 2
                - english : 3
                - italian : 4
        """
        if language > 4 or language < 0:
            raise Exception("Unknown language")

        #Done at the API level to share resources between
        #documents
        self.logger.info("Getting regexes")
        self.getRegexes()

        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Using following regexes substitution:\n" +\
                    str(self.substitutionRegexFormula.getSubstitutionPatterns()[:]))
            # str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))

        if len(self.validationPatternList) > 0:
            self.logger.info("Using following regexes for sentence validation:\n" +\
                    str(self.validationPatternList[0:3]))

        try:
            self.logger.info("Document file: %s" % self.inputFile)

            #The main document
            self.doc = TextDocument(self.inputFile, language,
                                    self.substitutionRegexFormula,
                                    self.validationPatternList, self.outputDir,
                                    self.segmentWithNLTK,
                                    self.expandNumberInWords)

            if self.inputFile != None:
                self.logger.info(
                    "Load file, convert to text when pdf document")
                self.doc.loadDocumentAsSentences(self.tempDir)
            elif self.formattedText != None:
                self.logger.info("Load text string as sentences")
                self.doc.loadAsSentences(self.formattedText)
            else:
                raise Exception("No input file or text string provided!")

            #print self.doc.getCleanedText()

            #Control character and strip
            self.logger.info("Cleaning control characters")
            self.doc.cleanTextSentences()

            #print self.doc.getCleanedText()

            if language == 0:
                self.logger.info("Classifying sentences")
                self.doc.setClassifier(self.wordClassifier)
                self.doc.classifySentences()
            else:
                self.doc.setSentencesLanguage(language)

            #print self.doc.getCleanedText()

            #User's supplied regular expression
            if self.substitutionRegexFormula.hasPatterns():
                self.logger.info(
                    "Applying user regular expressions per language")
                self.doc.normalizeTextSentences()

            #print self.doc.getCleanedText()

            if self.filterSentences:
                self.logger.info("Filtering data")
                self.doc.filterTextSentences()

            #If LM option is selected, it will be done at
            #the prepareLM stage
            if self.removePunctuation and not self.lmModeling:
                self.doc.removeTextPunctuation()

            if self.verbalizePunctuation and not self.removePunctuation:
                self.doc.verbalizeTextPunctuation()

            #print self.doc.getCleanedText()

            #After language id has been set as it depends of
            #languages (i.e. numbers expansion)
            if self.lmModeling:
                self.logger.info("Preparing for language modeling")
                self.doc.prepareLM()

            if self.filterTextSentences2ndStage:
                if language == GERMAN:
                    self.logger.info(
                        "Filtering data - 2nd stage (remove web address and check German orthograph)"
                    )
                    self.doc.filterTextSentences2ndStage()

        except Exception as e:
            errorMessage = "An error has occurred when importing sentences: %s\n%s" % \
                             (getByteString(e.message), self.inputFile)
            errorMessage = getErrorMessage(e, errorMessage)

            self.logger.critical(errorMessage)

            raise Exception(e)

        return self.doc

    def outputSentencesToFiles(self, outputDir):
        """Output the original sentences with language
           information to the 'outputFile'
        """
        self.logger.info("Output results to language files.")

        sentencesDict = {
            FRENCH_LABEL: [],
            GERMAN_LABEL: [],
            ITALIAN_LABEL: [],
            ENGLISH_LABEL: [],
            UNKNOWN_LABEL: []
        }

        self.appendDocumentSentences(self.doc, sentencesDict)
        self.outputPerLanguage(sentencesDict, outputDir)

    @staticmethod
    def appendDocumentSentences(textDocument, sentencesDict):
        """Update 'sentencesDict' with the 'textDocument'
           content.
        """
        #Save all sentences
        for textCluster in textDocument.getListContent():
            strSentence = textCluster.getTextSentence()
            currentLanguage = UNKNOWN_LABEL

            if textCluster.isFrench():
                currentLanguage = FRENCH_LABEL
            elif textCluster.isGerman():
                currentLanguage = GERMAN_LABEL
            elif textCluster.isItalian():
                currentLanguage = ITALIAN_LABEL
            elif textCluster.isEnglish():
                currentLanguage = ENGLISH_LABEL

            #strOut = u"<" + textDocument.sourceFileName + u">: " + strSentence
            strOut = strSentence.rstrip()
            sentencesDict[currentLanguage].append(strOut)

    @staticmethod
    def outputPerLanguage(sentencesDict, outputDir):
        """Output sentences in language files.
        """
        io = Ioread()
        #Finally output to disk
        for resultLanguage, results in list(sentencesDict.items()):
            if len(results) > 0:
                DataPreparationAPI.logger.info("%d sentences found for: %s" %
                                               (len(results), resultLanguage))
                strContent = "\n".join(results)
                strContent = strContent.rstrip() + "\n"
                outputPath = "%s/sentences_%s.txt" % (outputDir,\
                                                      resultLanguage)
                DataPreparationAPI.logger.info("Writing content to: %s" %
                                               outputPath)
                io.writeFileContent(outputPath, strContent)
            else:
                DataPreparationAPI.logger.info("No sentences found for: %s" %
                                               resultLanguage)