def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

       Every line of 'inputFile' is stripped of its surrounding
       whitespace and run through the regular expressions. The
       resulting lines are joined with newlines and written to
       'outputFile'.

       params: - inputFile   : a text file in 'utf-8' encoding
               - outputFile  : the result text file in 'utf-8' encoding
               - regularFile : the file containing the regular
                               expressions to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)
    io = Ioread()

    processedLines = []
    fd = io.openFile(inputFile)
    try:
        # readline() only returns "" at end of file: an empty line in
        # the input still carries its line terminator.
        for count, rawLine in enumerate(iter(fd.readline, ""), 1):
            # Remove punctuation using regular expressions
            processedLines.append(regexFormula.apply(rawLine.strip(), FRENCH))

            if count % 50000 == 0:
                print("Processed %d values" % count)
    finally:
        # Release the input handle even when a regular expression fails.
        io.closeFile(fd)

    io.writeFileContent(outputFile, u"\n".join(processedLines))
def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

       Every line of 'inputFile' is stripped of its surrounding
       whitespace and run through the regular expressions. The
       resulting lines are joined with newlines and written to
       'outputFile'.

       params: - inputFile   : a text file in 'utf-8' encoding
               - outputFile  : the result text file in 'utf-8' encoding
               - regularFile : the file containing the regular
                               expressions to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)
    io = Ioread()

    processedLines = []
    fd = io.openFile(inputFile)
    try:
        # readline() only returns "" at end of file: an empty line in
        # the input still carries its line terminator.
        for count, rawLine in enumerate(iter(fd.readline, ""), 1):
            # Remove punctuation using regular expressions
            processedLines.append(regexFormula.apply(rawLine.strip(), FRENCH))

            if count % 50000 == 0:
                print("Processed %d values" % count)
    finally:
        # Release the input handle even when a regular expression fails.
        io.closeFile(fd)

    io.writeFileContent(outputFile, u"\n".join(processedLines))
def testContractionPrefixes(self):
    """Check the substitutions built from CONTRACTIONPREFIXELIST.

       The formula is first applied to selected patterns of the list
       itself, then to a set of hand written (input, expected) pairs.
    """
    f = RegularExpressionFormula(
        None, RegexList.removeComments(CONTRACTIONPREFIXELIST))

    # Only the first two fields of each entry are used here.
    for pattern, expected, _, _, _ in CONTRACTIONPREFIXELIST:
        # NOTE(review): this reproduces the original
        # 'not pattern.find("gr1")', which is only true when the
        # pattern STARTS with "gr1" (find() == 0). If the intent was
        # to skip patterns containing "gr1", the condition is
        # inverted -- confirm before changing it.
        if pattern.startswith("gr1"):
            resultString = f.apply(pattern, 1, False)
            self.assertEqual(expected.encode('utf-8'),
                             resultString.encode('utf-8'))

    # (input text, expected output) pairs. The literals hold no
    # backslash, so u"" is value-identical to the former ur"".
    testList = [
        (u"d une", u"d' une"),
        (u"j' ai", u"j' ai"),
        (u"l' y ", u"l' y "),
        (u"m' a", u"m' a"),
        (u"n' est", u"n' est"),
        (u"n' a", u"n' a"),
        (u"d' y", u"d' y"),
        (u"c' en", u"c' en"),
        (u"qu' y", u"qu' y"),
        (u"qu' en", u"qu' en"),
        (u"-t-on", u" -t-on"),
    ]

    for text, groundTruth in testList:
        # Second argument is presumably the language id -- confirm
        # against RegularExpressionFormula.apply.
        resultString = f.apply(text, 1, False)
        self.assertEqual(groundTruth.encode('utf-8'),
                         resultString.encode('utf-8'))
def testContractionPrefixes(self):
    """Check the substitutions built from CONTRACTIONPREFIXELIST.

       The formula is first applied to selected patterns of the list
       itself, then to a set of hand written (input, expected) pairs.
    """
    f = RegularExpressionFormula(
        None, RegexList.removeComments(CONTRACTIONPREFIXELIST))

    # Only the first two fields of each entry are used here.
    for pattern, expected, _, _, _ in CONTRACTIONPREFIXELIST:
        # NOTE(review): this reproduces the original
        # 'not pattern.find("gr1")', which is only true when the
        # pattern STARTS with "gr1" (find() == 0). If the intent was
        # to skip patterns containing "gr1", the condition is
        # inverted -- confirm before changing it.
        if pattern.startswith("gr1"):
            resultString = f.apply(pattern, 1, False)
            self.assertEqual(expected.encode('utf-8'),
                             resultString.encode('utf-8'))

    # (input text, expected output) pairs. The literals hold no
    # backslash, so u"" is value-identical to the former ur"".
    testList = [
        (u"d une", u"d' une"),
        (u"j' ai", u"j' ai"),
        (u"l' y ", u"l' y "),
        (u"m' a", u"m' a"),
        (u"n' est", u"n' est"),
        (u"n' a", u"n' a"),
        (u"d' y", u"d' y"),
        (u"c' en", u"c' en"),
        (u"qu' y", u"qu' y"),
        (u"qu' en", u"qu' en"),
        (u"-t-on", u" -t-on"),
    ]

    for text, groundTruth in testList:
        # Second argument is presumably the language id -- confirm
        # against RegularExpressionFormula.apply.
        resultString = f.apply(text, 1, False)
        self.assertEqual(groundTruth.encode('utf-8'),
                         resultString.encode('utf-8'))
def testAcronyms(self):
    """Check the substitutions built from ACRONYMREGEXLIST.

       Each sample is run through the formula, ACRONYMDELIMITER
       matches are removed from the output and the result is compared
       with the expected letter by letter spelling.
    """
    f = RegularExpressionFormula(
        None, RegexList.removeComments(ACRONYMREGEXLIST))

    # (input text, expected output) pairs.
    testList = [
        (u"ADG SPO PS", u"a. d. g. s. p. o. p. s."),
        (u"ADG SPO PS PDCC", u"a. d. g. s. p. o. p. s. p. d. c. c."),
        (u"A ADG SPO PS PDCCC",
         u"A a. d. g. s. p. o. p. s. p. d. c. c. c."),
        (u"ABCDs ABCs ABs", u"a. b. c. d. s. a. b. c. s. a. b. s."),
    ]

    for text, groundTruth in testList:
        resultString = f.apply(text, 0, False)
        # The expected strings are written without delimiter markers.
        resultString = re.sub(ACRONYMDELIMITER, u"", resultString,
                              flags=re.UNICODE)
        self.assertEqual(groundTruth.encode('utf-8'),
                         resultString.encode('utf-8'))
def testRegexTypes(self):
    """Check how a single "ADG" rule behaves in various contexts.

       The rule is applied to "ADG" surrounded by spaces, quotes,
       hyphens, a slash and punctuation, and every output is compared
       with its expected value.
    """
    # Single rule under test. Field order is presumably (pattern,
    # substitution, type, language, comment) -- confirm against
    # RegexList. No backslash in the literals, so u"" is
    # value-identical to the former ur"".
    typeRegexList = [(u"ADG", u"a. d. g.", u"6", u"0", u"")]

    # (input text, expected output) pairs.
    # NOTE(review): SOURCE holds a second copy of this test which
    # expects u"e-ADG-" to stay unchanged; confirm which expectation
    # matches the current rule semantics.
    samples = [
        (u"ADG", u"a. d. g."),
        (u"ADG/LA", u"ADG/LA"),
        (u"a ADG b", u"a a. d. g. b"),
        (u"l ADG ", u"l a. d. g. "),
        (u"l'ADG'", u"l'a. d. g.'"),
        (u"\"ADG\"", u"\"a. d. g.\""),
        (u"\"ADG", u"\"a. d. g."),
        (u"e-ADG-", u"e-a. d. g.-"),
        (u"l'ADG,", u"l'a. d. g.,"),
        (u"l'ADG.", u"l'a. d. g.."),
        (u"l'ADG?", u"l'a. d. g.?"),
        (u"l'ADG!", u"l'a. d. g.!"),
        (u"l'ADG;", u"l'a. d. g.;"),
        (u"l'ADG:", u"l'a. d. g.:"),
    ]

    f = RegularExpressionFormula(
        None, RegexList.removeComments(typeRegexList))

    for text, groundTruth in samples:
        resultString = f.apply(text, 0)
        self.assertEqual(groundTruth.encode('utf-8'),
                         resultString.encode('utf-8'))
def testAcronyms(self):
    """Check the substitutions built from ACRONYMREGEXLIST.

       Each sample is run through the formula, ACRONYMDELIMITER
       matches are removed from the output and the result is compared
       with the expected letter by letter spelling.
    """
    f = RegularExpressionFormula(
        None, RegexList.removeComments(ACRONYMREGEXLIST))

    # (input text, expected output) pairs.
    testList = [
        (u"ADG SPO PS", u"a. d. g. s. p. o. p. s."),
        (u"ADG SPO PS PDCC", u"a. d. g. s. p. o. p. s. p. d. c. c."),
        (u"A ADG SPO PS PDCCC",
         u"A a. d. g. s. p. o. p. s. p. d. c. c. c."),
        (u"ABCDs ABCs ABs", u"a. b. c. d. s. a. b. c. s. a. b. s."),
    ]

    for text, groundTruth in testList:
        resultString = f.apply(text, 0, False)
        # The expected strings are written without delimiter markers.
        resultString = re.sub(ACRONYMDELIMITER, u"", resultString,
                              flags=re.UNICODE)
        self.assertEqual(groundTruth.encode('utf-8'),
                         resultString.encode('utf-8'))
def testRegexTypes(self):
    """Check how a single "ADG" rule behaves in various contexts.

       The rule is applied to "ADG" surrounded by spaces, quotes,
       hyphens, a slash and punctuation, and every output is compared
       with its expected value.
    """
    # Single rule under test. Field order is presumably (pattern,
    # substitution, type, language, comment) -- confirm against
    # RegexList. No backslash in the literals, so u"" is
    # value-identical to the former ur"".
    typeRegexList = [(u"ADG", u"a. d. g.", u"6", u"0", u"")]

    # (input text, expected output) pairs.
    # NOTE(review): SOURCE holds a second copy of this test which
    # expects u"e-ADG-" to become u"e-a. d. g.-"; confirm which
    # expectation matches the current rule semantics.
    samples = [
        (u"ADG", u"a. d. g."),
        (u"ADG/LA", u"ADG/LA"),
        (u"a ADG b", u"a a. d. g. b"),
        (u"l ADG ", u"l a. d. g. "),
        (u"l'ADG'", u"l'a. d. g.'"),
        (u"\"ADG\"", u"\"a. d. g.\""),
        (u"\"ADG", u"\"a. d. g."),
        (u"e-ADG-", u"e-ADG-"),
        (u"l'ADG,", u"l'a. d. g.,"),
        (u"l'ADG.", u"l'a. d. g.."),
        (u"l'ADG?", u"l'a. d. g.?"),
        (u"l'ADG!", u"l'a. d. g.!"),
        (u"l'ADG;", u"l'a. d. g.;"),
        (u"l'ADG:", u"l'a. d. g.:"),
    ]

    f = RegularExpressionFormula(
        None, RegexList.removeComments(typeRegexList))

    for text, groundTruth in samples:
        resultString = f.apply(text, 0)
        self.assertEqual(groundTruth.encode('utf-8'),
                         resultString.encode('utf-8'))
parser.add_argument("-l", "--language", help="language (0=unk,1=fr,2=ge,3=en,4=it)", nargs=1, dest="language", default=[0]) parser.add_argument("-s", "--display", help="display regular expressions", dest="display",action="store_true") parser.add_argument("-d", "--debug", help="enable debug output", dest="debug",action="store_true") #Parse arguments args = parser.parse_args() regexFile = args.regexFile[0] inputText = args.inputText[0] languageId = int(args.language[0]) #Flags display = args.display debug = args.debug setupLogging(logging.INFO) substitutionPatternList = [] for line in RegexList.loadFromFile(regexFile): if int(line[RegexList.TYPEINDICE]) != -1: substitutionPatternList.append(line) f = RegularExpressionFormula(None, substitutionPatternList) if display: f.displayPatterns(languageId) result = f.apply(inputText, languageId, debug) print "Result --------------\n", result.encode('utf-8'),"\n---------------------"
help="enable debug output", dest="debug", action="store_true") # Parse arguments args = parser.parse_args() regexFile = args.regexFile[0] inputText = args.inputText[0] languageId = int(args.language[0]) # Flags display = args.display debug = args.debug setupLogging(logging.INFO) substitutionPatternList = [] for line in RegexList.loadFromFile(regexFile): if int(line[RegexList.TYPEINDICE]) != -1: substitutionPatternList.append(line) f = RegularExpressionFormula(None, substitutionPatternList) if display: f.displayPatterns(languageId) result = f.apply(inputText, languageId, debug) print(("Result --------------\n", result.encode('utf-8'), "\n---------------------"))