Exemplo n.º 1
0
def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

       Every line of 'inputFile' is stripped of its surrounding whitespace,
       passed through the regular expression formula with the FRENCH
       language id, and the results are written to 'outputFile' joined by
       newline characters.

       params: - inputFile   : a text file in 'utf-8' encoding
               - outputFile  : the result text file in 'utf-8' encoding
               - regularFile : the file containing the regular expressions
                               to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)

    io = Ioread()
    fd = io.openFile(inputFile)

    linesList = []
    try:
        # The loop stops as soon as readline() returns an empty string
        for count, l in enumerate(iter(fd.readline, ""), 1):
            # strip() removes both leading and trailing whitespace
            linesList.append(regexFormula.apply(l.strip(), FRENCH))

            # Progress feedback every 50000 lines
            if count % 50000 == 0:
                print("Processed %d values" % count)
    finally:
        # Release the input file even when a rule fails on some line
        io.closeFile(fd)

    strContent = u"\n".join(linesList)
    io.writeFileContent(outputFile, strContent)
Exemplo n.º 2
0
def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

       The input file is read line by line; each stripped line is
       transformed by the regular expression formula using the FRENCH
       language id. The transformed lines are joined with newlines and
       written to the output file in one go.

       params: - inputFile   : a text file in 'utf-8' encoding
               - outputFile  : the result text file in 'utf-8' encoding
               - regularFile : the file containing the regular expressions
                               to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)

    io = Ioread()
    fd = io.openFile(inputFile)

    count, linesList = 0, []

    try:
        # Read first line
        l = fd.readline()

        while l != "":
            # A single strip() trims both ends of the line
            linesList.append(regexFormula.apply(l.strip(), FRENCH))

            count += 1
            if count % 50000 == 0:
                print("Processed %d values" % count)

            # Read next line
            l = fd.readline()
    finally:
        # Always close the input, including when apply() raises
        io.closeFile(fd)

    strContent = u"\n".join(linesList)
    io.writeFileContent(outputFile, strContent)
Exemplo n.º 3
0
    def testContractionPrefixes(self):
        """Check the rules of CONTRACTIONPREFIXELIST on sample inputs.

        A formula is built from the comment-free rule list and applied with
        the arguments (1, False). Its output is compared, utf-8 encoded,
        with the expected text.
        """
        formula = RegularExpressionFormula(
            None, RegexList.removeComments(CONTRACTIONPREFIXELIST))

        # NOTE(review): the original condition `not p.find("gr1")` is true
        # only when find() returns 0, i.e. when the pattern STARTS with
        # "gr1". That behaviour is kept here. If the intent was to skip
        # patterns that contain a "gr1" group, the condition should be
        # `"gr1" not in pattern` -- confirm against the rule list.
        for pattern, substitution, _, _, _ in CONTRACTIONPREFIXELIST:
            if pattern.startswith("gr1"):
                resultString = formula.apply(pattern, 1, False)
                self.assertEqual(substitution.encode('utf-8'),
                                 resultString.encode('utf-8'))

        # (input text, expected output) pairs
        testList = [
            (u"d une", u"d' une"),
            (u"j' ai", u"j' ai"),
            (u"l' y ", u"l' y "),
            (u"m' a", u"m' a"),
            (u"n' est", u"n' est"),
            (u"n' a", u"n' a"),
            (u"d' y", u"d' y"),
            (u"c' en", u"c' en"),
            (u"qu' y", u"qu' y"),
            (u"qu' en", u"qu' en"),
            (u"-t-on", u" -t-on"),
        ]

        for text, expected in testList:
            resultString = formula.apply(text, 1, False)
            self.assertEqual(expected.encode('utf-8'),
                             resultString.encode('utf-8'))
Exemplo n.º 4
0
    def testContractionPrefixes(self):
        """Exercise the contraction prefix rules (CONTRACTIONPREFIXELIST).

        Each sample is run through the formula with the arguments
        (1, False) and the utf-8 encoded result must equal the utf-8
        encoded expectation.
        """
        f = RegularExpressionFormula(
            None, RegexList.removeComments(CONTRACTIONPREFIXELIST))

        def check(source, expected):
            # Apply the formula and compare the encoded strings
            resultString = f.apply(source, 1, False)
            self.assertEqual(expected.encode('utf-8'),
                             resultString.encode('utf-8'))

        # NOTE(review): `not p.find("gr1")` in the original holds only when
        # the pattern begins with "gr1" (find() == 0); this is preserved.
        # If patterns merely containing "gr1" were meant to be skipped,
        # use `"gr1" not in p` instead -- to be confirmed.
        for p, s, _, _, _ in CONTRACTIONPREFIXELIST:
            if p.find("gr1") == 0:
                check(p, s)

        # (input text, expected output) pairs
        testList = [(u"d une", u"d' une"), (u"j' ai", u"j' ai"),
                    (u"l' y ", u"l' y "), (u"m' a", u"m' a"),
                    (u"n' est", u"n' est"), (u"n' a", u"n' a"),
                    (u"d' y", u"d' y"), (u"c' en", u"c' en"),
                    (u"qu' y", u"qu' y"), (u"qu' en", u"qu' en"),
                    (u"-t-on", u" -t-on")]

        for p, gt in testList:
            check(p, gt)
Exemplo n.º 5
0
    def testAcronyms(self):
        """Check that the ACRONYMREGEXLIST rules spell out acronyms.

        Each input is run through the formula with the arguments
        (0, False). Every match of ACRONYMDELIMITER is then removed from
        the result before comparing it, utf-8 encoded, with the expected
        text.
        """
        formula = RegularExpressionFormula(
            None, RegexList.removeComments(ACRONYMREGEXLIST))

        # Compiled once: the delimiter pattern does not change per case
        delimiter = re.compile(ACRONYMDELIMITER, flags=re.UNICODE)

        # (input text, expected output) pairs
        testList = [
            (u"ADG SPO PS",
             u"a. d. g.  s. p. o.  p. s."),
            (u"ADG SPO PS PDCC",
             u"a. d. g.  s. p. o.  p. s.  p. d. c. c."),
            (u"A ADG SPO PS PDCCC",
             u"A a. d. g.  s. p. o.  p. s.  p. d. c. c. c."),
            (u"ABCDs ABCs ABs",
             u"a. b. c. d. s.  a. b. c. s.  a. b. s."),
        ]

        for text, expected in testList:
            resultString = delimiter.sub(u"", formula.apply(text, 0, False))
            self.assertEqual(expected.encode('utf-8'),
                             resultString.encode('utf-8'))
Exemplo n.º 6
0
    def testRegexTypes(self):
        """Check one 'ADG' -> 'a. d. g.' rule against varied surroundings.

        The rule is tried on the bare token and on the token next to
        spaces, quotes, apostrophes, hyphens and punctuation. In this data
        set only the 'ADG/LA' input is expected to come back unchanged.
        """
        # Single rule under test.
        # NOTE(review): the third field (u"6") is presumably the rule type
        # this test is named after -- confirm against RegexList.
        typeRegexList = [(u"ADG", u"a. d. g.", u"6", u"0", u"")]

        # (input text, expected output) pairs
        testList = [
            (u"ADG", u"a. d. g."),
            (u"ADG/LA", u"ADG/LA"),
            (u"a ADG b", u"a a. d. g. b"),
            (u"l ADG ", u"l a. d. g. "),
            (u"l'ADG'", u"l'a. d. g.'"),
            (u"\"ADG\"", u"\"a. d. g.\""),
            (u"\"ADG", u"\"a. d. g."),
            (u"e-ADG-", u"e-a. d. g.-"),
            (u"l'ADG,", u"l'a. d. g.,"),
            (u"l'ADG.", u"l'a. d. g.."),
            (u"l'ADG?", u"l'a. d. g.?"),
            (u"l'ADG!", u"l'a. d. g.!"),
            (u"l'ADG;", u"l'a. d. g.;"),
            (u"l'ADG:", u"l'a. d. g.:"),
        ]

        formula = RegularExpressionFormula(
            None, RegexList.removeComments(typeRegexList))

        for text, expected in testList:
            result = formula.apply(text, 0)
            self.assertEqual(expected.encode('utf-8'), result.encode('utf-8'))
Exemplo n.º 7
0
    def testAcronyms(self):
        """Verify the acronym rules (ACRONYMREGEXLIST) on sample inputs.

        The formula output, obtained with the arguments (0, False), has all
        ACRONYMDELIMITER matches stripped and is then compared with the
        expected spelling, both utf-8 encoded.
        """
        f = RegularExpressionFormula(
            None, RegexList.removeComments(ACRONYMREGEXLIST))

        # Loop-invariant: build the delimiter pattern a single time
        delimiterRegex = re.compile(ACRONYMDELIMITER, flags=re.UNICODE)

        # (input text, expected output) pairs
        testList = [
            (u"ADG SPO PS", u"a. d. g.  s. p. o.  p. s."),
            (u"ADG SPO PS PDCC", u"a. d. g.  s. p. o.  p. s.  p. d. c. c."),
            (u"A ADG SPO PS PDCCC",
             u"A a. d. g.  s. p. o.  p. s.  p. d. c. c. c."),
            (u"ABCDs ABCs ABs", u"a. b. c. d. s.  a. b. c. s.  a. b. s.")
        ]

        for t, gt in testList:
            resultString = f.apply(t, 0, False)
            resultString = delimiterRegex.sub(u"", resultString)
            self.assertEqual(gt.encode('utf-8'), resultString.encode('utf-8'))
Exemplo n.º 8
0
    def testRegexTypes(self):
        """Check one 'ADG' -> 'a. d. g.' rule against varied surroundings.

        In this data set the inputs 'ADG/LA' and 'e-ADG-' are expected to
        be left untouched. Every other input must have the token replaced
        while its neighbouring characters are preserved.
        """
        # Single rule under test.
        # NOTE(review): the third field (u"6") is presumably the rule type
        # being exercised -- confirm against RegexList.
        rules = [(u"ADG", u"a. d. g.", u"6", u"0", u"")]

        # (input text, expected output) pairs
        cases = [(u"ADG", u"a. d. g."),
                 (u"ADG/LA", u"ADG/LA"),
                 (u"a ADG b", u"a a. d. g. b"),
                 (u"l ADG ", u"l a. d. g. "),
                 (u"l'ADG'", u"l'a. d. g.'"),
                 (u"\"ADG\"", u"\"a. d. g.\""),
                 (u"\"ADG", u"\"a. d. g."),
                 (u"e-ADG-", u"e-ADG-"),
                 (u"l'ADG,", u"l'a. d. g.,"),
                 (u"l'ADG.", u"l'a. d. g.."),
                 (u"l'ADG?", u"l'a. d. g.?"),
                 (u"l'ADG!", u"l'a. d. g.!"),
                 (u"l'ADG;", u"l'a. d. g.;"),
                 (u"l'ADG:", u"l'a. d. g.:")]

        f = RegularExpressionFormula(None, RegexList.removeComments(rules))

        for source, expected in cases:
            produced = f.apply(source, 0)
            self.assertEqual(expected.encode('utf-8'),
                             produced.encode('utf-8'))
Exemplo n.º 9
0
    # Optional arguments. '--language' uses nargs=1, so its value is stored
    # as a one-element list (hence the list default); the two flags are
    # plain booleans.
    parser.add_argument("-l", "--language", help="language (0=unk,1=fr,2=ge,3=en,4=it)", nargs=1, dest="language", default=[0])
    parser.add_argument("-s", "--display", help="display regular expressions", dest="display",action="store_true")
    parser.add_argument("-d", "--debug", help="enable debug output", dest="debug",action="store_true")
    

    # Parse arguments; each value is read from the first element of its list
    args = parser.parse_args()
    regexFile = args.regexFile[0]
    inputText = args.inputText[0]
    languageId = int(args.language[0])

    # Flags
    display = args.display
    debug = args.debug

    setupLogging(logging.INFO)

    # Keep only the rules whose type field is not -1
    # NOTE(review): -1 presumably marks comment rows -- confirm in RegexList
    substitutionPatternList = []
    for line in RegexList.loadFromFile(regexFile):
        if int(line[RegexList.TYPEINDICE]) != -1:
            substitutionPatternList.append(line)

    # Build the formula directly from the filtered rule list
    f = RegularExpressionFormula(None, substitutionPatternList)

    # Optionally show the patterns for the selected language
    if display:
      f.displayPatterns(languageId)
    
    # Apply the rules to the text given on the command line
    result = f.apply(inputText, languageId, debug)

    # Python 2 print statement: emits the utf-8 encoded result between two
    # separator lines
    print "Result --------------\n", result.encode('utf-8'),"\n---------------------"
Exemplo n.º 10
0
                        help="enable debug output",
                        dest="debug",
                        action="store_true")

    # Parse arguments; each value is read from the first element of its list
    args = parser.parse_args()
    regexFile = args.regexFile[0]
    inputText = args.inputText[0]
    languageId = int(args.language[0])

    # Flags
    display = args.display
    debug = args.debug

    setupLogging(logging.INFO)

    # Keep only the rules whose type field is not -1
    # NOTE(review): -1 presumably marks comment rows -- confirm in RegexList
    substitutionPatternList = []
    for line in RegexList.loadFromFile(regexFile):
        if int(line[RegexList.TYPEINDICE]) != -1:
            substitutionPatternList.append(line)

    # Build the formula directly from the filtered rule list
    f = RegularExpressionFormula(None, substitutionPatternList)

    # Optionally show the patterns for the selected language
    if display:
        f.displayPatterns(languageId)

    # Apply the rules to the text given on the command line
    result = f.apply(inputText, languageId, debug)

    # The former print((a, b, c)) call printed the repr of a tuple (with
    # the encoded bytes shown as a literal) instead of the text itself.
    # A single formatted string prints the result between the separators.
    print("Result --------------\n%s\n---------------------" % result)