Example No. 1
def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

       params: - inputFile   : a text file in 'utf-8' encoding
               - outputFile  : the result text file in 'utf-8' encoding
               - regularFile : the file containing the regular expressions
                               to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)

    io = Ioread()
    fd = io.openFile(inputFile)

    count, linesList = 0, []

    #Read first line
    l = fd.readline()

    while l != "":
        l = l.rstrip().strip()

        #Remove punctuation using regular expressions
        linesList.append(regexFormula.apply(l, FRENCH))

        count += 1
        if count % 50000 == 0:
            print "Processed %d values" % count

        #Read next line
        l = fd.readline()

    io.closeFile(fd)

    strContent = u"\n".join(linesList)
    io.writeFileContent(outputFile, strContent)
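A minimal driver for the function above might look as follows. It is a sketch only: it assumes it lives in the same module as applyRegexes, so that Ioread, RegularExpressionFormula and FRENCH are already imported there, and the file names are placeholders.

if __name__ == "__main__":
    #Placeholder file names: 'regexes.csv' holds one substitution rule per row,
    #and both text files are expected to be utf-8 encoded
    applyRegexes("input.txt", "output.txt", "regexes.csv")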
Example No. 2
def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

       params: - inputFile   : a text file in 'utf-8' encoding
               - outputFile  : the result text file in 'utf-8' encoding
               - regularFile : the file containing the regular expressions
                               to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)

    io = Ioread()
    fd = io.openFile(inputFile)

    count, linesList = 0, []

    #Read first line
    l = fd.readline()

    while l != "":
        l = l.rstrip().strip()

        #Remove punctuation using regular expressions
        linesList.append(regexFormula.apply(l, FRENCH))
        
        count += 1
        if count % 50000 == 0:
            print "Processed %d values" % count

        #Read next line
        l = fd.readline()

    io.closeFile(fd)

    strContent = u"\n".join(linesList)
    io.writeFileContent(outputFile, strContent)
Example No. 3
    def testAcronyms(self):
        f = RegularExpressionFormula(None,
                RegexList.removeComments(ACRONYMREGEXLIST))

        testList = [(u"ADG SPO PS",u"a. d. g.  s. p. o.  p. s."),
                    (u"ADG SPO PS PDCC",u"a. d. g.  s. p. o.  p. s.  p. d. c. c."),
                    (u"A ADG SPO PS PDCCC",u"A a. d. g.  s. p. o.  p. s.  p. d. c. c. c."),
                    (u"ABCDs ABCs ABs",u"a. b. c. d. s.  a. b. c. s.  a. b. s.")]

        for t, gt in testList:
            resultString = f.apply(t, 0, False)
            resultString = re.sub(ACRONYMDELIMITER, u"", resultString, flags=re.UNICODE)
            self.assertEquals(gt.encode('utf-8'), resultString.encode('utf-8'))
Example No. 4
    def testApostrophe(self):
        f = RegularExpressionFormula(None,
                                     RegexList.removeComments(APOSTHROPHELIST))

        testList = [(u"d'avant", u"d' avant")]

        self.verifyEqual(testList, f, 1)
Example No. 5
    def setSubstitutionList(self, regexList):
        """Set the user regexes substitution list.

           param regexList: a four-column list of lists:
          
           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)

        substitutionList = []

        for row in regexList:
            substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)
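For illustration, a possible four-column list passed to setSubstitutionList. The patterns, substitutions and type values are invented for this sketch, and 'api' stands for an already constructed DataPreparationAPI instance (see the later examples).

#Hypothetical substitution list: [pattern, substitution, type, language id]
#(1 = French, 2 = German); the rule contents are illustrative only
userRegexes = [
    [u"\\bSt\\b", u"Saint", u"1", u"1"],
    [u"\\bDr\\b", u"Doktor", u"1", u"2"],
]

api.setSubstitutionList(userRegexes)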
Example No. 6
    def __init__(self, inputFile, outputDir):
        """Default constructor.
        """
        self.inputFile = inputFile
        self.outputDir = outputDir
        self.tempDir = outputDir
        self.formattedText = None
        self.debug = False
        self.regexFile = None
        self.lmModeling = False
        self.filterSentences = False
        self.removePunctuation = False
        self.verbalizePunctuation = False
        self.doc = None
        self.wordClassifier = None
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []
Example No. 7
    def testRegexTypes(self):
        TYPEREGEXLIST = [(ur"ADG", ur"a. d. g.", ur"6", ur"0", ur"")]

        TESTLIST = [(u"ADG", u"a. d. g."), (u"ADG/LA", u"ADG/LA"),
                    (u"a ADG b", u"a a. d. g. b"), (u"l ADG ", u"l a. d. g. "),
                    (u"l'ADG'", u"l'a. d. g.'"), (u"\"ADG\"", u"\"a. d. g.\""),
                    (u"\"ADG", u"\"a. d. g."), (u"e-ADG-", u"e-a. d. g.-"),
                    (u"l'ADG,", u"l'a. d. g.,"), (u"l'ADG.", u"l'a. d. g.."),
                    (u"l'ADG?", u"l'a. d. g.?"), (u"l'ADG!", u"l'a. d. g.!"),
                    (u"l'ADG;", u"l'a. d. g.;"), (u"l'ADG:", u"l'a. d. g.:")]

        f = RegularExpressionFormula(None,
                                     RegexList.removeComments(TYPEREGEXLIST))

        for t, gt in TESTLIST:
            r = f.apply(t, 0)
            self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))
Example No. 8
    def testAcronyms(self):
        f = RegularExpressionFormula(
            None, RegexList.removeComments(ACRONYMREGEXLIST))

        testList = [
            (u"ADG SPO PS", u"a. d. g.  s. p. o.  p. s."),
            (u"ADG SPO PS PDCC", u"a. d. g.  s. p. o.  p. s.  p. d. c. c."),
            (u"A ADG SPO PS PDCCC",
             u"A a. d. g.  s. p. o.  p. s.  p. d. c. c. c."),
            (u"ABCDs ABCs ABs", u"a. b. c. d. s.  a. b. c. s.  a. b. s.")
        ]

        for t, gt in testList:
            resultString = f.apply(t, 0, False)
            resultString = re.sub(ACRONYMDELIMITER,
                                  u"",
                                  resultString,
                                  flags=re.UNICODE)
            self.assertEquals(gt.encode('utf-8'), resultString.encode('utf-8'))
Example No. 9
    def testContractionPrefixes(self):
        f = RegularExpressionFormula(None,
                RegexList.removeComments(CONTRACTIONPREFIXELIST))
        
        for p, s, t, i, c in CONTRACTIONPREFIXELIST:
            if not p.find("gr1"):
                resultString = f.apply(p, 1, False)
                self.assertEquals(s.encode('utf-8'), 
                              resultString.encode('utf-8'))

        testList = [(ur"d une",ur"d' une"),(ur"j' ai",ur"j' ai"), (ur"l' y ",ur"l' y "),
                    (ur"m' a",ur"m' a"), (ur"n' est",ur"n' est"),(ur"n' a",ur"n' a"),
                    (ur"d' y",ur"d' y"),(ur"c' en",ur"c' en"), (ur"qu' y",ur"qu' y"),
                    (ur"qu' en",ur"qu' en"), (ur"-t-on",ur" -t-on")]

        for p, gt in testList:
            resultString = f.apply(p, 1, False)
            self.assertEquals(gt.encode('utf-8'), 
                              resultString.encode('utf-8'))
Example No. 10
    def testDates(self):
        f = RegularExpressionFormula(None,
                                     RegexList.removeComments(DATEREGEXLIST))

        testList = [
            (u"01.01.2015", u"01 01 2015"),
            (u"01/01/2015", u"01 01 2015"),
            (u"01.01.15", u"01 01 15"),
        ]

        self.verifyEqual(testList, f, 0)
Example No. 11
    def testContractionPrefixes(self):
        f = RegularExpressionFormula(
            None, RegexList.removeComments(CONTRACTIONPREFIXELIST))

        for p, s, t, i, c in CONTRACTIONPREFIXELIST:
            if not p.find("gr1"):
                resultString = f.apply(p, 1, False)
                self.assertEquals(s.encode('utf-8'),
                                  resultString.encode('utf-8'))

        testList = [(ur"d une", ur"d' une"), (ur"j' ai", ur"j' ai"),
                    (ur"l' y ", ur"l' y "), (ur"m' a", ur"m' a"),
                    (ur"n' est", ur"n' est"), (ur"n' a", ur"n' a"),
                    (ur"d' y", ur"d' y"), (ur"c' en", ur"c' en"),
                    (ur"qu' y", ur"qu' y"), (ur"qu' en", ur"qu' en"),
                    (ur"-t-on", ur" -t-on")]

        for p, gt in testList:
            resultString = f.apply(p, 1, False)
            self.assertEquals(gt.encode('utf-8'), resultString.encode('utf-8'))
Example No. 12
    def setRegexList(self, regexList):
        """Set both validation and substitution user regexes.

           param regexList: a list of the following form:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        #Reset current lists
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

        substitutionList = []

        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))
            else:
                substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)
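A sketch of a mixed list for setRegexList. Judging from the command-line example further below, which keeps substitution rules with type != -1, VALIDATION_TYPE is presumably -1, but that value and the patterns themselves are assumptions made only for this illustration; 'api' is again an existing DataPreparationAPI instance.

#Hypothetical mixed list: a row whose type equals VALIDATION_TYPE becomes a
#validation pattern, every other row becomes a substitution
mixedRegexes = [
    [u"^[A-Za-z' ]+$", u"", u"-1", u"0"],    #validation rule (type assumed to be -1)
    [u"\\bMr\\b", u"Mister", u"1", u"3"],    #substitution rule, English (3)
]

api.setRegexList(mixedRegexes)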
Example No. 13
    def setSubstitutionList(self, regexList):
        """Set the user regexes substitution list.

           param regexList: a four-column list of lists:
          
           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        
        substitutionList = []

        for row in regexList:
            substitutionList.append((row[0],row[1],row[2],row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(substitutionList)
Example No. 14
    def testRegexTypes(self):
        TYPEREGEXLIST = [(ur"ADG", ur"a. d. g.",ur"6",ur"0",ur"")]

        TESTLIST = [(u"ADG",u"a. d. g."),
                    (u"ADG/LA",u"ADG/LA"),
                    (u"a ADG b",u"a a. d. g. b"),
                    (u"l ADG ",u"l a. d. g. "),
                    (u"l'ADG'",u"l'a. d. g.'"),
                    (u"\"ADG\"",u"\"a. d. g.\""),
                    (u"\"ADG",u"\"a. d. g."),
                    (u"e-ADG-",u"e-ADG-"),
                    (u"l'ADG,",u"l'a. d. g.,"),
                    (u"l'ADG.",u"l'a. d. g.."),
                    (u"l'ADG?",u"l'a. d. g.?"),
                    (u"l'ADG!",u"l'a. d. g.!"),
                    (u"l'ADG;",u"l'a. d. g.;"),
                    (u"l'ADG:",u"l'a. d. g.:")]

        f = RegularExpressionFormula(None,
                RegexList.removeComments(TYPEREGEXLIST))
        
        for t, gt in TESTLIST:
            r = f.apply(t, 0)
            self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))
Example No. 15
    def __init__(self, inputFile, outputDir):
        """Default constructor.
        """
        self.inputFile = inputFile
        self.outputDir = outputDir
        self.tempDir = outputDir
        self.formattedText = None
        self.debug = False
        self.regexFile = None
        self.lmModeling = False
        self.filterSentences = False
        self.removePunctuation = False
        self.verbalizePunctuation = False
        self.doc = None
        self.wordClassifier = None
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []
Example No. 16
    def setRegexList(self, regexList):
        """Set both validation and substitution user regexes.

           param regexList: a list of the following form:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        #Reset current lists
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

        substitutionList = []

        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0],row[3]))
            else:
                substitutionList.append((row[0],row[1],row[2],row[3]))
            
        self.substitutionRegexFormula.setSubstitutionPatternList(substitutionList)
Example No. 17
class DataPreparationAPI():
    """Import sentences from one file, classifying
       sentences into languages.
    """
    logger  = logging.getLogger("Asrt.DataPreparationAPI")

    def __init__(self, inputFile, outputDir):
        """Default constructor.
        """
        self.inputFile = inputFile
        self.outputDir = outputDir
        self.tempDir = outputDir
        self.formattedText = None
        self.debug = False
        self.regexFile = None
        self.lmModeling = False
        self.filterSentences = False
        self.removePunctuation = False
        self.verbalizePunctuation = False
        self.doc = None
        self.wordClassifier = None
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    #####################
    #Getters and setters
    #
    def setInputFile(self, inputFile):
        self.inputFile = inputFile

    def setOutputDir(self, outputDir):
        self.outputDir = outputDir

    def setTempDir(self, tempDir):
        self.tempDir = tempDir

    def setFormattedText(self, formattedText):
        self.formattedText = formattedText

    def getCleanedText(self):
        if self.doc != None:
            return self.doc.getCleanedText()
        return ""

    def getCleanedTextPerLanguage(self):
        if self.doc != None:
            return self.doc.getCleanedTextPerLanguage()
        return ""

    def setDebugMode(self, debug):
        self.debug = debug

    def setRegexFile(self, regexFile):
        self.regexFile = regexFile

    def setRegexList(self, regexList):
        """Set the acronyms to be used.

           param acronymList: a list of the following form:

           ['matching pattern', 'substitution', 'type', 'language id']
        """
        substitutionList = []

        #Skip header
        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0],row[3]))
            else:
                substitutionList.append((row[0],row[1],row[2],row[3]))
            
        self.substitutionRegexFormula.setSubstitutionPatternList(substitutionList)

    def setLMModeling(self, modelNgram):
        self.lmModeling = modelNgram

    def setFilterSentences(self, filterSentences):
        self.filterSentences = filterSentences

    def setRemovePunctuation(self, removePunctuation):
        self.removePunctuation = removePunctuation

    def setVerbalizePunctuation(self, verbalizePunctuation):
        self.verbalizePunctuation = verbalizePunctuation

    def getDocument(self):
        """Get the underlying 'TextDocument'.
        """
        return self.doc

    #####################
    #Public interface
    #
    def trainClassifier(self):
        """Train the underlying classifier.
        """
        if self.wordClassifier == None:
            self.logger.info("Prepare the word classifier ...")
            self.wordClassifier = WordClassifier()
            self.wordClassifier.train()
    
    def getRegexes(self):
        """Fetch validation and substitution regexes
           from csv file.
        """
        #User did not specify any rules
        if self.regexFile == None:
            return

        #Are regexes already loaded in API
        if self.substitutionRegexFormula.hasPatterns() or \
            len(self.validationPatternList) > 0:
            return

        regexList = RegexList().loadFromFile(self.regexFile)
        self.setRegexList(regexList)

    def prepareDocument(self, language = 0):
        """Segment the document into sentences and prepare them.

           param language: an int between 0-4
                - unknown : 0
                - french  : 1
                - german  : 2
                - english : 3
                - italian : 4
        """
        if language > 4 or language < 0:
            raise Exception("Unknown language")

        #Done at the API level to share resources between
        #documents
        self.logger.info("Getting regexes")
        self.getRegexes()

        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Using following regexes substitution:\n" +\
                    str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))

        if len(self.validationPatternList) > 0:
            self.logger.info("Using following regexes for sentence validation:\n" +\
                    str(self.validationPatternList[0:3]))

        try:
            self.logger.info("Document file: %s" % self.inputFile)

            #The main document
            self.doc = TextDocument(self.inputFile, language,
                                    self.substitutionRegexFormula,
                                    self.validationPatternList,
                                    self.outputDir)
            
            if self.inputFile != None:
                self.logger.info("Load file, convert to text when pdf document")
                self.doc.loadDocumentAsSentences(self.tempDir)
            elif self.formattedText != None:
                self.logger.info("Load text string as sentences")
                self.doc.loadAsSentences(self.formattedText)
            else:
                raise Exception("No input file or text string provided!")

            #print self.doc.getCleanedText()

            #Control character and strip
            self.logger.info("Cleaning control characters")
            self.doc.cleanTextSentences()

            #print self.doc.getCleanedText()

            if language == 0:
                self.logger.info("Classifying sentences")
                self.doc.setClassifier(self.wordClassifier)
                self.doc.classifySentences()
            else:
                self.doc.setSentencesLanguage(language)

            #print self.doc.getCleanedText()

            #User-supplied regular expressions
            if self.substitutionRegexFormula.hasPatterns():
                self.logger.info("Applying user regular expressions per language")
                self.doc.normalizeTextSentences()

            #print self.doc.getCleanedText()

            if self.filterSentences:
                self.logger.info("Filtering data")
                self.doc.filterTextSentences()

            #If LM option is selected, it will be done at
            #the prepareLM stage
            if self.removePunctuation and not self.lmModeling:
                self.doc.removeTextPunctuation()
            
            if self.verbalizePunctuation and not self.removePunctuation:
                self.doc.verbalizeTextPunctuation()

            #print self.doc.getCleanedText()

            #Done after the language id has been set, as it depends
            #on the language (i.e. number expansion)
            if self.lmModeling:
                self.logger.info("Preparing for language modeling")
                self.doc.prepareLM()

        except Exception, e:
            errorMessage = "An error as occurred when importing sentences: %s\n%s" % (str(e), self.inputFile)
            errorMessage = getErrorMessage(e, errorMessage)
            
            self.logger.critical(errorMessage)

            raise Exception(e)

        return self.doc
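A possible end-to-end use of the API above. This is a hedged sketch: the paths are placeholders and the option values simply exercise the switches shown in the class.

#Hypothetical usage of DataPreparationAPI; the paths are placeholders
api = DataPreparationAPI("document.pdf", "/tmp/out")
api.setRegexFile("user_rules.csv")   #optional csv file with user regexes
api.setFilterSentences(True)
api.setLMModeling(True)

api.trainClassifier()                #needed when language is 0 (unknown)
api.prepareDocument(language=0)      #classify sentences per language

print api.getCleanedText()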
Example No. 18
class LMPreparationFormula():
    """Main formula for language modeling text
       preparation.
    """
    logger = logging.getLogger("Asrt.LMPreparationFormula")

    ordDict = {}
    abbreviationsDict = {}

    #Regular expressions formulas
    dateFormula = RegularExpressionFormula(
        None, RegexList.removeComments(DATEREGEXLIST))
    apostropheFormula = RegularExpressionFormula(
        None, RegexList.removeComments(APOSTHROPHELIST))
    contractionPrefixFormula = RegularExpressionFormula(
        None, RegexList.removeComments(CONTRACTIONPREFIXELIST))
    acronymFormula = RegularExpressionFormula(
        None, RegexList.removeComments(ACRONYMREGEXLIST))

    PUNCTUATIONREGEX = re.compile(PUNCTUATIONPATTERN, flags=re.UNICODE)
    ALLPUNCTUATIONSYMBOLS = "".join(PUNCTUATIONEXCLUDE + DOTCOMMAEXCLUDE)

    def __init__(self):
        """Default constructor.
        """
        self.strText = ""
        self.languageId = 0
        self.keepNewWords = False
        self.numberFormula = {
            FRENCH: FrenchNumberFormula,
            GERMAN: GermanNumberFormula
        }

    #####################
    #Getters and setters
    #
    def getText(self):
        return self.strText

    def getLanguageId(self):
        """Return a number between 0 and 4:

           0:'unknown', 1:'French', 2:'German',
           3:'English', 4:'Italian'
        """
        return self.languageId

    def setText(self, strText):
        """Set the underlying text with 'strText'.

           param strText: a utf-8 encoded string
        """
        self.strText = strText

    def setLanguageId(self, languageId):
        """Set the language id.

           param 'languageId': a value between 0 and 4:

           0:'unknown', 1:'French', 2:'German',
           3:'English', 4:'Italian'
        """
        self.languageId = languageId
        LMPreparationFormula.ordDict = {}
        LMPreparationFormula.ordDict = LMPreparationFormula._getOrdDict(
            self.languageId)

    def setKeepNewWords(self, keepNewWords):
        """Keep new words.
        """
        self.keepNewWords = keepNewWords

    ##################
    #Public interface
    #
    def prepareText(self):
        """Prepare 'strText' for language modeling.

           The heuristic is:
                Noise words filtering
                Character based normalization
                Dates normalization
                Language based abbreviations expansion
                Word based normalization
                Acronyms normalization
                Contraction prefixes separation
                Lowercase normalization

            return the normalized text in utf-8 encoding
        """
        #print self.strText
        #Some preprocessing
        self._filterNoiseWords()
        self._normalizeUtf8()
        #Before punctuation removal, some rules
        #are applied
        self._normalizeDates()
        self._expandAbbreviations()

        if not self.keepNewWords:
            self._expandNumberInWords()
            #print self.strText

        #Removal of some of punctuation symbols
        self._normalizePunctuation(PUNCTUATIONEXCLUDE)
        #print self.strText

        #Dot and comma punctuation symbols are still needed
        self._normalizeWords()
        #print self.strText

        self._normalizeContractionPrefixes()
        #print self.strText

        #Make sure no punctuation is remaining
        self._normalizePunctuation(self.ALLPUNCTUATIONSYMBOLS)
        #print self.strText

        if not self.keepNewWords:
            self._expandAcronyms()
            #print self.strText

        self._normalizeCase()
        #print self.strText

        return self.strText

    ##################
    #Implementation
    #
    def _filterNoiseWords(self):
        """Do not keep some words considered as noise.

           For example words consisting of 4 or more punctuation
           characters.
        """
        wordsList = re.split(SPACEPATTERN, self.strText, flags=re.UNICODE)
        newWordsList = []
        for w in wordsList:
            if not LMPreparationFormula._isNoise(w):
                newWordsList.append(w)

        self.strText = u" ".join(newWordsList)
        return self.strText

    def _normalizeUtf8(self):
        """Some punctuation characters are normalized.
        """

        languageId = self.getLanguageId()

        #Mapping dictionary
        ordDict = LMPreparationFormula._getOrdDict(languageId)

        utf8List = []
        #Loop through unicode characters
        for i, c in enumerate(self.strText):
            if ord(c) in ordDict:
                utf8List.append(ordDict[ord(c)])
            else:
                utf8List.append(c)

        self.strText = u"".join(utf8List).rstrip().strip()

        if len(self.strText) > 1 and \
               self.strText[-1] in self.ALLPUNCTUATIONSYMBOLS and \
               self.strText[-2].isdigit():
            self.strText = self.strText.rstrip(self.ALLPUNCTUATIONSYMBOLS)

        self.strText = re.sub(SPACEPATTERN,
                              u" ",
                              self.strText,
                              flags=re.UNICODE)

    def _normalizeDates(self):
        """Normalize dates.
        """
        self.strText = self.dateFormula.apply(self.strText, self.languageId)

    def _expandAbbreviations(self):
        """Expand language abbreviations.
        """
        aDict = self._getAbbreviationsDict()
        if self.languageId not in aDict:
            return

        wordsList = re.split(SPACEPATTERN, self.strText, flags=re.UNICODE)
        newWordsList = []
        for w in wordsList:
            wByte = w.encode('utf-8')
            if wByte in aDict[self.languageId]:
                newWordsList.append(aDict[self.languageId][wByte])
            else:
                newWordsList.append(w)

        self.strText = u" ".join(newWordsList)

    def _expandNumberInWords(self):
        """If there are numbers in words, split them.

           i.e. A1   --> A. 1
                P3B  --> P. 3 B.
                P5B4 --> P. 5 B. 4
                PPB5 --> PPB 5 (acronyms are expanded later on)
        """
        wordsList = re.split(SPACEPATTERN, self.strText, flags=re.UNICODE)

        newWordsList = []
        for w in wordsList:
            tokenList = re.split(CAPTURINGDIGITPATTERN, w, flags=re.UNICODE)
            #Numbers need to contain a digit
            #Ordinal numbers are not expanded
            if not re.search(u"[0-9]", w) or w.endswith(EXPANDEXCEPTIONS):
                newWordsList.append(w)
            #We have a match
            elif len(tokenList) > 1:
                #Single letter acronyms
                for i, t in enumerate(tokenList):
                    #isupper() returns False for digits
                    if len(t) == 1 and t.isupper():
                        tokenList[i] = tokenList[i] + u"."
                newWord = u" ".join(tokenList).strip()
                #Group P . 5 into P. 5
                newWord = re.sub(GROUPINGDOTCOMMAPATTERN, u"\g<2> ", newWord)
                newWordsList.append(newWord)
            else:
                newWordsList.append(w)

        self.strText = u" ".join(newWordsList)

    def _expandAcronyms(self):
        """Acronyms are splitted.

           i.e. PDC --> p. d. c.
        """
        self.strText = self.acronymFormula.apply(self.strText, self.languageId)
        self.strText = re.sub(ACRONYMDELIMITER,
                              u"",
                              self.strText,
                              flags=re.UNICODE)

    def _normalizePunctuation(self, excludeList):
        """Some punctuation characters are
           normalized:
           - Removal by spacing
                    - Single, double quotes
                    - Exclamation, Interrogation marks
                    - Braces, round, square, curly
                    - Slashes, back, forward
                    - Sharp symbol
                    - Star, plus, minus
                    - Comma, column, semi-column, dot (keep it for abbreviations)
                    - Lower, greater equal sign
                    - Alone diacritics marks (circumflex accent)
                    - Hyphen, underscore
                    - Back quote
                    - Pipe
                    - Tilde
                - Modification
                    - Percent % --> percent
                    - Ampersand & --> and
                    - At sign @ --> at
                    - Dollars symbol $ --> dollars

            param 'excludeList' : a list of exclude punctuation symbols
        """
        unicodeList, prevC, beforePrevC = [], u"", u""
        for i, c in enumerate(self.strText):
            strC = c.encode('utf-8')
            #For date format, i.e. 21-Jul
            if strC in excludeList:
                #Keep dots after uppercase letters
                if beforePrevC in (""," ") and not prevC.isdigit() \
                    and strC == ".":
                    unicodeList.append(c)
                    unicodeList.append(u" ")
                #Keep some special characters if they appear after a non-space value
                elif self.keepNewWords and prevC not in (
                        "", " ") and strC in PUNCTUATIONKEEPINWORD:
                    unicodeList.append(c)
            elif self.languageId != 0 and strC in PUNCTUATIONMAP:
                unicodeList.append(u" " +
                                   PUNCTUATIONMAP[strC][self.languageId] +
                                   u" ")
            else:
                unicodeList.append(c)
            beforePrevC = prevC
            prevC = strC

        self.strText = u"".join(unicodeList).rstrip().strip()
        self.strText = re.sub(u"(^- *| - |-$)",
                              u"",
                              self.strText,
                              flags=re.UNICODE)
        self.strText = re.sub(u"(- )", u" ", self.strText, flags=re.UNICODE)
        self.strText = re.sub(SPACEPATTERN,
                              u" ",
                              self.strText,
                              flags=re.UNICODE)

    def _normalizeWords(self):
        """Word base normalization.

           This is language dependant.

            - Contraction prefixes, suffixes --> separate
            - Abbreviations --> normalize
            - Acronyms (upper case words) --> split into letters
            - Decimal numbers --> add comma or dot words
            - Ordinal numbers  --> transform
            - Cardinal numbers --> transform
        """
        languageId = self.getLanguageId()
        if languageId not in self.numberFormula:
            #self.logger.warning("LM preparation not implemented for language id %d" % languageId)
            return
        numberFormula = self.numberFormula[languageId]

        self.strText = numberFormula.apply(self.strText)

    def _normalizeContractionPrefixes(self):
        """Contraction prefixes are separated and
           acronyms are normalized.
        """
        self.strText = self.apostropheFormula.apply(self.strText,
                                                    self.languageId)
        self.strText = self.contractionPrefixFormula.apply(
            self.strText, self.languageId, False)

    def _normalizeCase(self):
        """Case normalization (change to lower case)
        """
        self.strText = self.strText.lower()

    @staticmethod
    def _getOrdDict(langId):
        """Utf-8 characters mapping in the form of a
           code point dictionary.
        """
        if len(LMPreparationFormula.ordDict.keys()) > 0:
            return LMPreparationFormula.ordDict

        #Substitution dictionary, assume one character only
        ordDict = {}
        for match, sub, comment, languageId in UTF8MAP:
            if ord(match) in ordDict:
                raise Exception("Already in dictionary '%s' '%s'!" %
                                (unichr(ord(match)), comment.encode('utf8')))
            if (langId == int(languageId) or int(languageId) == 0):
                ordDict[ord(match)] = sub

        LMPreparationFormula.ordDict = ordDict
        return LMPreparationFormula.ordDict

    @staticmethod
    def _getAbbreviationsDict():
        """Get the abbreviations dictionary with keys
           encoded in byte string for comparison.
        """
        if len(LMPreparationFormula.abbreviationsDict.keys()) > 0:
            return LMPreparationFormula.abbreviationsDict

        aDict = {}
        for lang in ABBREVIATIONS.keys():
            if lang not in aDict:
                aDict[lang] = {}
            for k, v in ABBREVIATIONS[lang].items():
                aDict[lang][k.encode('utf-8')] = v

        LMPreparationFormula.abbreviationsDict = aDict
        return LMPreparationFormula.abbreviationsDict

    @staticmethod
    def _isNoise(strWord):
        """Check if 'strWord' is a noise word.

           return True or False
        """
        return LMPreparationFormula.PUNCTUATIONREGEX.search(strWord) != None

    @staticmethod
    def _applyRegexes(strText, regexList):
        for p, r, t in regexList:
            strText = re.sub(p, r, strText, flags=re.UNICODE)
        return strText
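A brief sketch of how LMPreparationFormula might be used; the sample sentence and the language id (1 = French) are chosen only for illustration.

#Hypothetical usage of LMPreparationFormula
formula = LMPreparationFormula()
formula.setLanguageId(1)                                    #1 = French
formula.setText(u"Le PDC a obtenu 3 sièges le 01.01.2015.")
print formula.prepareText()
#Per the prepareText heuristic: the date is split, numbers are expanded,
#the acronym is spelled out and the result is lower-cased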
Example No. 19
    def resetAllPatterns(self):
        """Empty all validation and substitution regexes.
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []
Example No. 20
                        help="enable debug output",
                        dest="debug",
                        action="store_true")

    # Parse arguments
    args = parser.parse_args()
    regexFile = args.regexFile[0]
    inputText = args.inputText[0]
    languageId = int(args.language[0])

    # Flags
    display = args.display
    debug = args.debug

    setupLogging(logging.INFO)

    substitutionPatternList = []
    for line in RegexList.loadFromFile(regexFile):
        if int(line[RegexList.TYPEINDICE]) != -1:
            substitutionPatternList.append(line)

    f = RegularExpressionFormula(None, substitutionPatternList)

    if display:
        f.displayPatterns(languageId)

    result = f.apply(inputText, languageId, debug)

    print(("Result --------------\n", result.encode('utf-8'),
           "\n---------------------"))
Example No. 21
    def resetAllPatterns(self):
        """Empty all validation and substitution regexes.
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []
Example No. 22
class DataPreparationAPI():
    """Import sentences from one file, classifying
       sentences into languages.
    """
    logger = logging.getLogger("Asrt.DataPreparationAPI")

    def __init__(self, inputFile, outputDir):
        """Default constructor.
        """
        self.inputFile = inputFile
        self.outputDir = outputDir
        self.tempDir = outputDir
        self.formattedText = None
        self.debug = False
        self.regexFile = None
        self.lmModeling = False
        self.filterSentences = False
        self.removePunctuation = False
        self.verbalizePunctuation = False
        self.segmentWithNLTK = True
        self.keepNewWords = False
        self.doc = None
        self.wordClassifier = None
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    #####################
    #Getters and setters
    #
    def setInputFile(self, inputFile):
        self.inputFile = inputFile

    def setOutputDir(self, outputDir):
        self.outputDir = outputDir

    def setTempDir(self, tempDir):
        self.tempDir = tempDir

    def setFormattedText(self, formattedText):
        self.formattedText = formattedText

    def getCleanedText(self):
        if self.doc != None:
            return self.doc.getCleanedText()
        return ""

    def getCleanedTextPerLanguage(self):
        if self.doc != None:
            return self.doc.getCleanedTextPerLanguage()
        return ""

    def setDebugMode(self, debug):
        self.debug = debug

    def setRegexFile(self, regexFile):
        self.regexFile = regexFile

    def setRegexList(self, regexList):
        """Set both validation and substitution user regexes.

           param regexList: a list of the following form:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        #Reset current lists
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

        substitutionList = []

        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))
            else:
                substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)

    def getSubstitutionList(self):
        """Get the user defined substitution list.

           return a four-column list of lists:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        return self.substitutionRegexFormula.getSubstitutionPatterns()

    def setSubstitutionList(self, regexList):
        """Set the user regexes substitution list.

           param regexList: a four-column list of lists:
          
           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)

        substitutionList = []

        for row in regexList:
            substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)

    def getValidationList(self):
        """Get the user defined validation list.

           return a four-column list of lists:

           [u'matching pattern', u'', u'-1', u'0']
        """
        validationList = []
        for pattern, regexType in self.validationPatternList:
            validationList.append([pattern, u"", regexType, u"0"])

        return validationList

    def setValidationList(self, regexList):
        """Set the user regexes validation list.

           Filter 'regexList' for validation rules only.

           param regexList: a four-column list of lists:
          
           ['matching pattern', 'substitution', 'type', 'language id']
        """
        self.validationPatternList = []

        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))

    def setLMModeling(self, modelNgram):
        self.lmModeling = modelNgram

    def setFilterSentences(self, filterSentences):
        self.filterSentences = filterSentences

    def setRemovePunctuation(self, removePunctuation):
        self.removePunctuation = removePunctuation

    def setVerbalizePunctuation(self, verbalizePunctuation):
        self.verbalizePunctuation = verbalizePunctuation

    def setSegmentWithNLTK(self, segmentWithNLTK):
        self.segmentWithNLTK = segmentWithNLTK

    def setKeepNewWords(self, keepNewWords):
        self.keepNewWords = keepNewWords

    def getDocument(self):
        """Get the underlying 'TextDocument'.
        """
        return self.doc

    #####################
    #Public interface
    #
    def trainClassifier(self):
        """Train the underlying classifier.
        """
        if self.wordClassifier == None:
            self.logger.info("Prepare the word classifier ...")
            self.wordClassifier = WordClassifier()
            self.wordClassifier.train()

    def getRegexes(self):
        """Fetch validation and substitution regexes
           from csv file.
        """
        #User did not specify any rules
        if self.regexFile == None:
            return

        #Are regexes already loaded in API
        if self.substitutionRegexFormula.hasPatterns() or \
            len(self.validationPatternList) > 0:
            return

        regexList = RegexList().loadFromFile(self.regexFile)
        self.setRegexList(regexList)

    def resetAllPatterns(self):
        """Empty all validation and substitution regexes.
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    def prepareDocument(self, language=0):
        """Segment the document into sentences and prepare them.

           param language: an int between 0-4
                - unknown : 0
                - french  : 1
                - german  : 2
                - english : 3
                - italian : 4
        """
        if language > 4 or language < 0:
            raise Exception("Unknown language")

        #Done at the API level to share resources between
        #documents
        self.logger.info("Getting regexes")
        self.getRegexes()

        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Using following regexes substitution:\n" +\
                    str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))

        if len(self.validationPatternList) > 0:
            self.logger.info("Using following regexes for sentence validation:\n" +\
                    str(self.validationPatternList[0:3]))

        try:
            self.logger.info("Document file: %s" % self.inputFile)

            #The main document
            self.doc = TextDocument(self.inputFile, language,
                                    self.substitutionRegexFormula,
                                    self.validationPatternList, self.outputDir,
                                    self.segmentWithNLTK, self.keepNewWords)

            if self.inputFile != None:
                self.logger.info(
                    "Load file, convert to text when pdf document")
                self.doc.loadDocumentAsSentences(self.tempDir)
            elif self.formattedText != None:
                self.logger.info("Load text string as sentences")
                self.doc.loadAsSentences(self.formattedText)
            else:
                raise Exception("No input file or text string provided!")

            #print self.doc.getCleanedText()

            #Control character and strip
            self.logger.info("Cleaning control characters")
            self.doc.cleanTextSentences()

            #print self.doc.getCleanedText()

            if language == 0:
                self.logger.info("Classifying sentences")
                self.doc.setClassifier(self.wordClassifier)
                self.doc.classifySentences()
            else:
                self.doc.setSentencesLanguage(language)

            #print self.doc.getCleanedText()

            #User-supplied regular expressions
            if self.substitutionRegexFormula.hasPatterns():
                self.logger.info(
                    "Applying user regular expressions per language")
                self.doc.normalizeTextSentences()

            #print self.doc.getCleanedText()

            if self.filterSentences:
                self.logger.info("Filtering data")
                self.doc.filterTextSentences()

            #If LM option is selected, it will be done at
            #the prepareLM stage
            if self.removePunctuation and not self.lmModeling:
                self.doc.removeTextPunctuation()

            if self.verbalizePunctuation and not self.removePunctuation:
                self.doc.verbalizeTextPunctuation()

            #print self.doc.getCleanedText()

            #Done after the language id has been set, as it depends
            #on the language (i.e. number expansion)
            if self.lmModeling:
                self.logger.info("Preparing for language modeling")
                self.doc.prepareLM()

        except Exception, e:
            errorMessage = "An error as occurred when importing sentences: %s\n%s" % \
                             (getByteString(e.message), self.inputFile)
            errorMessage = getErrorMessage(e, errorMessage)

            self.logger.critical(errorMessage)

            raise Exception(e)

        return self.doc
Example No. 23
class DataPreparationAPI():
    """Import sentences from one file, classifying
       sentences into languages.
    """
    logger = logging.getLogger("Asrt.DataPreparationAPI")

    def __init__(self, inputFile, outputDir):
        """Default constructor.
        """
        self.inputFile = inputFile
        self.outputDir = outputDir
        self.tempDir = outputDir
        self.formattedText = None
        self.debug = False
        self.regexFile = None
        self.lmModeling = False
        self.filterSentences = False
        self.filterTextSentences2ndStage = False
        self.removePunctuation = False
        self.verbalizePunctuation = False
        self.segmentWithNLTK = True
        self.expandNumberInWords = True
        self.doc = None
        self.wordClassifier = None
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    #####################
    #Getters and setters
    #
    def setInputFile(self, inputFile):
        self.inputFile = inputFile

    def setOutputDir(self, outputDir):
        self.outputDir = outputDir

    def setTempDir(self, tempDir):
        self.tempDir = tempDir

    def setFormattedText(self, formattedText):
        self.formattedText = formattedText

    def getCleanedText(self):
        if self.doc != None:
            return self.doc.getCleanedText()
        return ""

    def getCleanedTextPerLanguage(self):
        if self.doc != None:
            return self.doc.getCleanedTextPerLanguage()
        return ""

    def setDebugMode(self, debug):
        self.debug = debug

    def setRegexFile(self, regexFile):
        self.regexFile = regexFile

    def setRegexList(self, regexList):
        """Set both validation and substitution user regexes.

           param regexList: a list of the following form:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        #Reset current lists
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

        substitutionList = []

        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))
            else:
                substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)

    def getSubstitutionList(self):
        """Get the user defined substitution list.

           return a four-column list of lists:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        return self.substitutionRegexFormula.getSubstitutionPatterns()

    def setSubstitutionList(self, regexList):
        """Set the user regexes substitution list.

           param regexList: a four-column list of lists:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)

        substitutionList = []

        for row in regexList:
            substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)

    def getValidationList(self):
        """Get the user defined validation list.

           return a four-column list of lists:

           [u'matching pattern', u'', u'-1', u'0']
        """
        validationList = []
        for pattern, regexType in self.validationPatternList:
            validationList.append([pattern, "", regexType, "0"])

        return validationList

    def setValidationList(self, regexList):
        """Set the user regexes validation list.

           Filter 'regexList' for validation rules only.

           param regexList: a four-column list of lists:

           ['matching pattern', 'substitution', 'type', 'language id']
        """
        self.validationPatternList = []

        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))

    def setLMModeling(self, modelNgram):
        self.lmModeling = modelNgram

    def setFilterSentences(self, filterSentences):
        self.filterSentences = filterSentences

    def setFilterSentences2ndStage(self, filterTextSentences2ndStage):
        self.filterTextSentences2ndStage = filterTextSentences2ndStage

    def setRemovePunctuation(self, removePunctuation):
        self.removePunctuation = removePunctuation

    def setVerbalizePunctuation(self, verbalizePunctuation):
        self.verbalizePunctuation = verbalizePunctuation

    def setSegmentWithNLTK(self, segmentWithNLTK):
        self.segmentWithNLTK = segmentWithNLTK

    def setExpandNumberInWords(self, expandNumberInWords):
        self.expandNumberInWords = expandNumberInWords

    def getDocument(self):
        """Get the underlying 'TextDocument'.
        """
        return self.doc

    #####################
    #Public interface
    #
    def trainClassifier(self):
        """Train the underlying classifier.
        """
        if self.wordClassifier == None:
            self.logger.info("Prepare the word classifier ...")
            self.wordClassifier = WordClassifier()
            self.wordClassifier.train()

    def getRegexes(self):
        """Fetch validation and substitution regexes
           from csv file.
        """
        #User did not specify any rules
        if self.regexFile == None:
            return

        #Are regexes already loaded in API
        if self.substitutionRegexFormula.hasPatterns() or \
            len(self.validationPatternList) > 0:
            return

        regexList = RegexList().loadFromFile(self.regexFile)
        self.setRegexList(regexList)

    def resetAllPatterns(self):
        """Empty all validation and substitution regexes.
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    def prepareDocument(self, language=0):
        """Segment the document into sentences and prepare them.

           param language: an int between 0-4
                - unknown : 0
                - french  : 1
                - german  : 2
                - english : 3
                - italian : 4
        """
        if language > 4 or language < 0:
            raise Exception("Unknown language")

        #Done at the API level to share resources between
        #documents
        self.logger.info("Getting regexes")
        self.getRegexes()

        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Using following regexes substitution:\n" +\
                    str(self.substitutionRegexFormula.getSubstitutionPatterns()[:]))
            # str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))

        if len(self.validationPatternList) > 0:
            self.logger.info("Using following regexes for sentence validation:\n" +\
                    str(self.validationPatternList[0:3]))

        try:
            self.logger.info("Document file: %s" % self.inputFile)

            #The main document
            self.doc = TextDocument(self.inputFile, language,
                                    self.substitutionRegexFormula,
                                    self.validationPatternList, self.outputDir,
                                    self.segmentWithNLTK,
                                    self.expandNumberInWords)

            if self.inputFile != None:
                self.logger.info(
                    "Load file, convert to text when pdf document")
                self.doc.loadDocumentAsSentences(self.tempDir)
            elif self.formattedText != None:
                self.logger.info("Load text string as sentences")
                self.doc.loadAsSentences(self.formattedText)
            else:
                raise Exception("No input file or text string provided!")

            #print self.doc.getCleanedText()

            #Control character and strip
            self.logger.info("Cleaning control characters")
            self.doc.cleanTextSentences()

            #print self.doc.getCleanedText()

            if language == 0:
                self.logger.info("Classifying sentences")
                self.doc.setClassifier(self.wordClassifier)
                self.doc.classifySentences()
            else:
                self.doc.setSentencesLanguage(language)

            #print self.doc.getCleanedText()

            #User-supplied regular expressions
            if self.substitutionRegexFormula.hasPatterns():
                self.logger.info(
                    "Applying user regular expressions per language")
                self.doc.normalizeTextSentences()

            #print self.doc.getCleanedText()

            if self.filterSentences:
                self.logger.info("Filtering data")
                self.doc.filterTextSentences()

            #If LM option is selected, it will be done at
            #the prepareLM stage
            if self.removePunctuation and not self.lmModeling:
                self.doc.removeTextPunctuation()

            if self.verbalizePunctuation and not self.removePunctuation:
                self.doc.verbalizeTextPunctuation()

            #print self.doc.getCleanedText()

            #Done after the language id has been set, as it depends
            #on the language (i.e. number expansion)
            if self.lmModeling:
                self.logger.info("Preparing for language modeling")
                self.doc.prepareLM()

            if self.filterTextSentences2ndStage:
                if language == GERMAN:
                    self.logger.info(
                        "Filtering data - 2nd stage (remove web address and check German orthograph)"
                    )
                    self.doc.filterTextSentences2ndStage()

        except Exception as e:
            errorMessage = "An error has occurred when importing sentences: %s\n%s" % \
                             (getByteString(e.message), self.inputFile)
            errorMessage = getErrorMessage(e, errorMessage)

            self.logger.critical(errorMessage)

            raise Exception(e)

        return self.doc

    def outputSentencesToFiles(self, outputDir):
        """Output the original sentences with language
           information to files in 'outputDir'.
        """
        self.logger.info("Output results to language files.")

        sentencesDict = {
            FRENCH_LABEL: [],
            GERMAN_LABEL: [],
            ITALIAN_LABEL: [],
            ENGLISH_LABEL: [],
            UNKNOWN_LABEL: []
        }

        self.appendDocumentSentences(self.doc, sentencesDict)
        self.outputPerLanguage(sentencesDict, outputDir)

    @staticmethod
    def appendDocumentSentences(textDocument, sentencesDict):
        """Update 'sentencesDict' with the 'textDocument'
           content.
        """
        #Save all sentences
        for textCluster in textDocument.getListContent():
            strSentence = textCluster.getTextSentence()
            currentLanguage = UNKNOWN_LABEL

            if textCluster.isFrench():
                currentLanguage = FRENCH_LABEL
            elif textCluster.isGerman():
                currentLanguage = GERMAN_LABEL
            elif textCluster.isItalian():
                currentLanguage = ITALIAN_LABEL
            elif textCluster.isEnglish():
                currentLanguage = ENGLISH_LABEL

            #strOut = u"<" + textDocument.sourceFileName + u">: " + strSentence
            strOut = strSentence.rstrip()
            sentencesDict[currentLanguage].append(strOut)

    @staticmethod
    def outputPerLanguage(sentencesDict, outputDir):
        """Output sentences in language files.
        """
        io = Ioread()
        #Finally output to disk
        for resultLanguage, results in list(sentencesDict.items()):
            if len(results) > 0:
                DataPreparationAPI.logger.info("%d sentences found for: %s" %
                                               (len(results), resultLanguage))
                strContent = "\n".join(results)
                strContent = strContent.rstrip() + "\n"
                outputPath = "%s/sentences_%s.txt" % (outputDir,\
                                                      resultLanguage)
                DataPreparationAPI.logger.info("Writing content to: %s" %
                                               outputPath)
                io.writeFileContent(outputPath, strContent)
            else:
                DataPreparationAPI.logger.info("No sentences found for: %s" %
                                               resultLanguage)
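To round off this variant, a hedged sketch of classifying a short text and writing one sentence file per language with outputSentencesToFiles; the input string and output directory are invented for the example.

#Hypothetical usage: classify a short mixed-language text and write the
#per-language sentence files
api = DataPreparationAPI(None, "/tmp/out")
api.setFormattedText(u"Bonjour tout le monde. Guten Tag zusammen.")
api.trainClassifier()
api.prepareDocument(language=0)          #0 = unknown, triggers classification
api.outputSentencesToFiles("/tmp/out")   #writes sentences_<language>.txt files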
Example No. 24
    parser.add_argument("-l", "--language", help="language (0=unk,1=fr,2=ge,3=en,4=it)", nargs=1, dest="language", default=[0])
    parser.add_argument("-s", "--display", help="display regular expressions", dest="display",action="store_true")
    parser.add_argument("-d", "--debug", help="enable debug output", dest="debug",action="store_true")
    

    #Parse arguments
    args = parser.parse_args()
    regexFile = args.regexFile[0]
    inputText = args.inputText[0]
    languageId = int(args.language[0])

    #Flags
    display = args.display
    debug = args.debug

    setupLogging(logging.INFO)

    substitutionPatternList = []
    for line in RegexList.loadFromFile(regexFile):
        if int(line[RegexList.TYPEINDICE]) != -1:
            substitutionPatternList.append(line)

    f = RegularExpressionFormula(None, substitutionPatternList)

    if display:
        f.displayPatterns(languageId)
    
    result = f.apply(inputText, languageId, debug)

    print "Result --------------\n", result.encode('utf-8'),"\n---------------------"