Exemplos de RegularExpressionFormula.apply em Python, exemplos de asrt.common.formula.FormulaRegularExpression.RegularExpressionFormula.apply em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: run_apply_regex.py Projeto: d-unknown-processor/asrt

def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

    Every line of 'inputFile' is stripped of surrounding whitespace and
    passed to RegularExpressionFormula.apply() together with the FRENCH
    constant; the results are joined with newlines and written to
    'outputFile'.

       params: - inputFile   : a text file in 'utf-8' encoding
               - outputFile  : the result text file in 'utf-8' encoding
               - regularFile : the file containing the regular expressions
                               to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)

    io = Ioread()
    fd = io.openFile(inputFile)

    linesList = []
    try:
        # readline() yields "" only at end of file (a blank line is still
        # "\n"), so "" is a safe sentinel for stopping the iteration.
        for count, rawLine in enumerate(iter(fd.readline, ""), 1):
            linesList.append(regexFormula.apply(rawLine.strip(), FRENCH))

            # Progress report every 50000 processed lines.
            if count % 50000 == 0:
                print("Processed %d values" % count)
    finally:
        # Release the input handle even when a rule raises part-way through.
        io.closeFile(fd)

    strContent = u"\n".join(linesList)
    io.writeFileContent(outputFile, strContent)

Exemplo n.º 2

0

Exibir arquivo

def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

    Every line of 'inputFile' is stripped of surrounding whitespace and
    passed to RegularExpressionFormula.apply() together with the FRENCH
    constant; the results are joined with newlines and written to
    'outputFile'.

       params: - inputFile   : a text file in 'utf-8' encoding
               - outputFile  : the result text file in 'utf-8' encoding
               - regularFile : the file containing the regular expressions
                               to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)

    io = Ioread()
    fd = io.openFile(inputFile)

    linesList = []
    try:
        # readline() yields "" only at end of file (a blank line is still
        # "\n"), so "" is a safe sentinel for stopping the iteration.
        for count, rawLine in enumerate(iter(fd.readline, ""), 1):
            linesList.append(regexFormula.apply(rawLine.strip(), FRENCH))

            # Progress report every 50000 processed lines.
            if count % 50000 == 0:
                print("Processed %d values" % count)
    finally:
        # Release the input handle even when a rule raises part-way through.
        io.closeFile(fd)

    strContent = u"\n".join(linesList)
    io.writeFileContent(outputFile, strContent)

Exemplo n.º 3

0

Exibir arquivo

Arquivo: FormulaRegexUnitTest.py Projeto: wolverineq/asrt

    def testContractionPrefixes(self):
        """Check the contraction-prefix rules against the rule list and samples.

        First, every rule whose pattern starts with "gr1" is fed its own
        pattern and must yield its second field. Then a fixed list of
        (input, expected) pairs is run through the same formula.
        """
        f = RegularExpressionFormula(
            None, RegexList.removeComments(CONTRACTIONPREFIXELIST))

        # Only the first two fields of each five-field rule are used here.
        for pattern, expected, _t, _i, _c in CONTRACTIONPREFIXELIST:
            # Same test as the former `not pattern.find("gr1")`: find() is 0
            # (falsy) only when the pattern begins with "gr1".
            # NOTE(review): if "contains gr1" was intended, this needs `in`.
            if pattern.startswith("gr1"):
                resultString = f.apply(pattern, 1, False)
                self.assertEqual(expected.encode('utf-8'),
                                 resultString.encode('utf-8'))

        # Plain u"" literals: identical values to the old ur"" forms (no
        # backslashes involved) but also valid syntax on Python 3.
        testList = [
            (u"d une", u"d' une"),
            (u"j' ai", u"j' ai"),
            (u"l' y ", u"l' y "),
            (u"m' a", u"m' a"),
            (u"n' est", u"n' est"),
            (u"n' a", u"n' a"),
            (u"d' y", u"d' y"),
            (u"c' en", u"c' en"),
            (u"qu' y", u"qu' y"),
            (u"qu' en", u"qu' en"),
            (u"-t-on", u" -t-on"),
        ]

        for sample, groundTruth in testList:
            resultString = f.apply(sample, 1, False)
            self.assertEqual(groundTruth.encode('utf-8'),
                             resultString.encode('utf-8'))

Exemplo n.º 4

0

Exibir arquivo

Arquivo: FormulaRegexUnitTest.py Projeto: hdubey/asrt

    def testContractionPrefixes(self):
        """Check the contraction-prefix rules against the rule list and samples.

        First, every rule whose pattern starts with "gr1" is fed its own
        pattern and must yield its second field. Then a fixed list of
        (input, expected) pairs is run through the same formula.
        """
        f = RegularExpressionFormula(
            None, RegexList.removeComments(CONTRACTIONPREFIXELIST))

        # Only the first two fields of each five-field rule are used here.
        for pattern, expected, _t, _i, _c in CONTRACTIONPREFIXELIST:
            # Same test as the former `not pattern.find("gr1")`: find() is 0
            # (falsy) only when the pattern begins with "gr1".
            # NOTE(review): if "contains gr1" was intended, this needs `in`.
            if pattern.startswith("gr1"):
                resultString = f.apply(pattern, 1, False)
                self.assertEqual(expected.encode('utf-8'),
                                 resultString.encode('utf-8'))

        # Plain u"" literals: identical values to the old ur"" forms (no
        # backslashes involved) but also valid syntax on Python 3.
        testList = [
            (u"d une", u"d' une"),
            (u"j' ai", u"j' ai"),
            (u"l' y ", u"l' y "),
            (u"m' a", u"m' a"),
            (u"n' est", u"n' est"),
            (u"n' a", u"n' a"),
            (u"d' y", u"d' y"),
            (u"c' en", u"c' en"),
            (u"qu' y", u"qu' y"),
            (u"qu' en", u"qu' en"),
            (u"-t-on", u" -t-on"),
        ]

        for sample, groundTruth in testList:
            resultString = f.apply(sample, 1, False)
            self.assertEqual(groundTruth.encode('utf-8'),
                             resultString.encode('utf-8'))

Exemplo n.º 5

0

Exibir arquivo

Arquivo: FormulaRegexUnitTest.py Projeto: wolverineq/asrt

    def testAcronyms(self):
        """Check that the acronym rules spell upper-case acronyms letter by letter.

        Each sample is run through the formula built from ACRONYMREGEXLIST;
        every ACRONYMDELIMITER match is removed from the result before it is
        compared with the expected text.
        """
        f = RegularExpressionFormula(
            None, RegexList.removeComments(ACRONYMREGEXLIST))

        testList = [
            (u"ADG SPO PS", u"a. d. g.  s. p. o.  p. s."),
            (u"ADG SPO PS PDCC", u"a. d. g.  s. p. o.  p. s.  p. d. c. c."),
            (u"A ADG SPO PS PDCCC",
             u"A a. d. g.  s. p. o.  p. s.  p. d. c. c. c."),
            (u"ABCDs ABCs ABs", u"a. b. c. d. s.  a. b. c. s.  a. b. s."),
        ]

        for sample, groundTruth in testList:
            resultString = f.apply(sample, 0, False)
            # Drop the delimiter markers so only the spelled-out text remains.
            resultString = re.sub(ACRONYMDELIMITER, u"", resultString,
                                  flags=re.UNICODE)
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(groundTruth.encode('utf-8'),
                             resultString.encode('utf-8'))

Exemplo n.º 6

0

Exibir arquivo

Arquivo: FormulaRegexUnitTest.py Projeto: hdubey/asrt

    def testRegexTypes(self):
        """Check how a single "ADG" rule behaves next to various neighbours.

        The rule list holds one five-field rule; each sample places "ADG"
        beside spaces, quotes, hyphens, a slash or punctuation and states the
        text expected back from apply().
        """
        # Plain u"" literals: identical values to the old ur"" forms (no
        # backslashes involved) but also valid syntax on Python 3.
        TYPEREGEXLIST = [(u"ADG", u"a. d. g.", u"6", u"0", u"")]

        TESTLIST = [
            (u"ADG", u"a. d. g."),
            (u"ADG/LA", u"ADG/LA"),
            (u"a ADG b", u"a a. d. g. b"),
            (u"l ADG ", u"l a. d. g. "),
            (u"l'ADG'", u"l'a. d. g.'"),
            (u"\"ADG\"", u"\"a. d. g.\""),
            (u"\"ADG", u"\"a. d. g."),
            (u"e-ADG-", u"e-a. d. g.-"),
            (u"l'ADG,", u"l'a. d. g.,"),
            (u"l'ADG.", u"l'a. d. g.."),
            (u"l'ADG?", u"l'a. d. g.?"),
            (u"l'ADG!", u"l'a. d. g.!"),
            (u"l'ADG;", u"l'a. d. g.;"),
            (u"l'ADG:", u"l'a. d. g.:"),
        ]

        f = RegularExpressionFormula(
            None, RegexList.removeComments(TYPEREGEXLIST))

        for sample, groundTruth in TESTLIST:
            result = f.apply(sample, 0)
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(groundTruth.encode('utf-8'),
                             result.encode('utf-8'))

Exemplo n.º 7

0

Exibir arquivo

Arquivo: FormulaRegexUnitTest.py Projeto: hdubey/asrt

    def testAcronyms(self):
        """Check that the acronym rules spell upper-case acronyms letter by letter.

        Each sample is run through the formula built from ACRONYMREGEXLIST;
        every ACRONYMDELIMITER match is removed from the result before it is
        compared with the expected text.
        """
        f = RegularExpressionFormula(
            None, RegexList.removeComments(ACRONYMREGEXLIST))

        testList = [
            (u"ADG SPO PS", u"a. d. g.  s. p. o.  p. s."),
            (u"ADG SPO PS PDCC", u"a. d. g.  s. p. o.  p. s.  p. d. c. c."),
            (u"A ADG SPO PS PDCCC",
             u"A a. d. g.  s. p. o.  p. s.  p. d. c. c. c."),
            (u"ABCDs ABCs ABs", u"a. b. c. d. s.  a. b. c. s.  a. b. s."),
        ]

        for sample, groundTruth in testList:
            resultString = f.apply(sample, 0, False)
            # Drop the delimiter markers so only the spelled-out text remains.
            resultString = re.sub(ACRONYMDELIMITER, u"", resultString,
                                  flags=re.UNICODE)
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(groundTruth.encode('utf-8'),
                             resultString.encode('utf-8'))

Exemplo n.º 8

0

Exibir arquivo

Arquivo: FormulaRegexUnitTest.py Projeto: idiap/asrt

    def testRegexTypes(self):
        """Check how a single "ADG" rule behaves next to various neighbours.

        The rule list holds one five-field rule; each sample places "ADG"
        beside spaces, quotes, hyphens, a slash or punctuation and states the
        text expected back from apply().
        """
        # Plain u"" literals: identical values to the old ur"" forms (no
        # backslashes involved) but also valid syntax on Python 3.
        TYPEREGEXLIST = [(u"ADG", u"a. d. g.", u"6", u"0", u"")]

        TESTLIST = [
            (u"ADG", u"a. d. g."),
            (u"ADG/LA", u"ADG/LA"),
            (u"a ADG b", u"a a. d. g. b"),
            (u"l ADG ", u"l a. d. g. "),
            (u"l'ADG'", u"l'a. d. g.'"),
            (u"\"ADG\"", u"\"a. d. g.\""),
            (u"\"ADG", u"\"a. d. g."),
            # Here a hyphen-wrapped acronym is expected to stay untouched.
            (u"e-ADG-", u"e-ADG-"),
            (u"l'ADG,", u"l'a. d. g.,"),
            (u"l'ADG.", u"l'a. d. g.."),
            (u"l'ADG?", u"l'a. d. g.?"),
            (u"l'ADG!", u"l'a. d. g.!"),
            (u"l'ADG;", u"l'a. d. g.;"),
            (u"l'ADG:", u"l'a. d. g.:"),
        ]

        f = RegularExpressionFormula(
            None, RegexList.removeComments(TYPEREGEXLIST))

        for sample, groundTruth in TESTLIST:
            result = f.apply(sample, 0)
            # assertEqual replaces the deprecated assertEquals alias.
            self.assertEqual(groundTruth.encode('utf-8'),
                             result.encode('utf-8'))

Exemplo n.º 9

0

Exibir arquivo

Arquivo: run_test_regex.py Projeto: d-unknown-processor/asrt

    # Optional arguments: language id plus two boolean switches.
    # nargs=1 makes argparse store a one-element list, hence the list default.
    parser.add_argument("-l", "--language", help="language (0=unk,1=fr,2=ge,3=en,4=it)", nargs=1, dest="language", default=[0])
    parser.add_argument("-s", "--display", help="display regular expressions", dest="display",action="store_true")
    parser.add_argument("-d", "--debug", help="enable debug output", dest="debug",action="store_true")
    

    # Parse arguments; each value is read from index 0 of its list.
    # NOTE(review): regexFile/inputText are declared outside this excerpt,
    # presumably with nargs=1 as well -- confirm.
    args = parser.parse_args()
    regexFile = args.regexFile[0]
    inputText = args.inputText[0]
    languageId = int(args.language[0])

    # Flags
    display = args.display
    debug = args.debug

    setupLogging(logging.INFO)

    # Keep only the rules whose type field is not -1.
    # NOTE(review): presumably -1 marks a rule that must not be applied
    # as a substitution -- confirm against RegexList.
    substitutionPatternList = []
    for line in RegexList.loadFromFile(regexFile):
        if int(line[RegexList.TYPEINDICE]) != -1:
            substitutionPatternList.append(line)

    # Build the formula from the in-memory rule list (first argument None).
    f = RegularExpressionFormula(None, substitutionPatternList)

    # Optionally show the patterns for the selected language first.
    if display:
      f.displayPatterns(languageId)
    
    result = f.apply(inputText, languageId, debug)

    # Python 2 print statement: the three items are written space-separated.
    print "Result --------------\n", result.encode('utf-8'),"\n---------------------"

Exemplo n.º 10

0

Exibir arquivo

                        help="enable debug output",
                        dest="debug",
                        action="store_true")

    # Parse arguments; each value is read from index 0 of its list.
    # NOTE(review): the argument declarations are outside this excerpt,
    # presumably using nargs=1 -- confirm.
    args = parser.parse_args()
    regexFile = args.regexFile[0]
    inputText = args.inputText[0]
    languageId = int(args.language[0])

    # Flags
    display = args.display
    debug = args.debug

    setupLogging(logging.INFO)

    # Keep only the rules whose type field is not -1.
    # NOTE(review): presumably -1 marks a rule that must not be applied
    # as a substitution -- confirm against RegexList.
    substitutionPatternList = []
    for line in RegexList.loadFromFile(regexFile):
        if int(line[RegexList.TYPEINDICE]) != -1:
            substitutionPatternList.append(line)

    # Build the formula from the in-memory rule list (first argument None).
    f = RegularExpressionFormula(None, substitutionPatternList)

    # Optionally show the patterns for the selected language first.
    if display:
        f.displayPatterns(languageId)

    result = f.apply(inputText, languageId, debug)

    # NOTE(review): the doubled parentheses pass ONE tuple to print, so the
    # tuple's repr is shown instead of three space-separated items; this
    # looks like an automated print-statement conversion artifact -- confirm.
    print(("Result --------------\n", result.encode('utf-8'),
           "\n---------------------"))