def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

       Every line of 'inputFile' is stripped of leading and trailing
       whitespace, passed through 'RegularExpressionFormula.apply'
       together with the module-level FRENCH constant, and the
       transformed lines are written to 'outputFile' joined by
       newline characters.

       params:
        - inputFile   : a text file in 'utf-8' encoding
        - outputFile  : the result text file in 'utf-8' encoding
        - regularFile : the file containing the regular
                        expressions to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)
    io = Ioread()

    linesList = []
    fd = io.openFile(inputFile)
    try:
        # readline() returns an empty string at end of file, which
        # is the sentinel that stops the iteration.
        for count, line in enumerate(iter(fd.readline, ""), 1):
            # Remove punctuation using regular expressions
            linesList.append(regexFormula.apply(line.strip(), FRENCH))

            # Progress report for large inputs. The single-argument
            # parenthesized form prints the same text whether print
            # is a statement or a function.
            if count % 50000 == 0:
                print("Processed %d values" % count)
    finally:
        # Release the input handle even when a rule fails midway.
        io.closeFile(fd)

    strContent = u"\n".join(linesList)
    io.writeFileContent(outputFile, strContent)
class TestIoread(unittest.TestCase):
    """Unit tests for the file helpers exposed by 'Ioread'.

       The tests open and read the fixture files located under
       'scriptsDir + "/resources"' and write one temporary file
       under 'TEMPDIRUNITTEST'.
    """
    logger = logging.getLogger("Asrt.TestIoread")

    # Fixture files read by the tests.
    testFile = scriptsDir + "/resources/ioread_utf8.txt"
    testFileCSV = scriptsDir + "/resources/ioread_utf8.csv"

    # Expected content of 'testFile': the whole text, then its third line.
    testsString = [
        """Utf-8 test\nLatin characters é à ä\nNon latin characters 镕\n""",
        """Non latin characters 镕"""
    ]

    # Expected rows returned when reading 'testFileCSV'.
    testList = [[
        'Utf-8 test',
        'Latin characters é à ä',
        'Non latin characters 镕'
    ]]

    def setUp(self):
        """Create a fresh 'Ioread' instance for every test."""
        self.ioread = Ioread()

    ############
    # Tests
    #
    def testOpenFile(self):
        """Opening and closing the fixture file must not raise."""
        try:
            fd = self.ioread.openFile(self.testFile)
            self.ioread.closeFile(fd)
        except Exception as e:
            # Report the actual exception type so the failure is
            # actionable.
            self.fail("testOpenFile raised %s unexpectedly"
                      % type(e).__name__)

    def testReadFileContent(self):
        """The whole fixture file is returned as a single string."""
        strContent = self.ioread.readFileContent(self.testFile)
        self.assertEqual(self.testsString[0], strContent)

    def testReadFileContentList(self):
        """The fixture file is returned as a list of three lines."""
        strContentList = self.ioread.readFileContentList(self.testFile)
        self.assertEqual(3, len(strContentList))
        self.assertEqual(self.testsString[1], strContentList[2])

    def testReadCSV(self):
        """The CSV fixture is returned as a single row of three fields."""
        strContentList = self.ioread.readCSV(self.testFileCSV)
        self.assertEqual(1, len(strContentList))
        self.assertEqual(strContentList, self.testList)

    def testWriteFileContent(self):
        """Content written to a file can be read back unchanged."""
        strContent = self.testsString[0]
        outputPath = TEMPDIRUNITTEST + "/test.txt"
        self.ioread.writeFileContent(outputPath, strContent)

        # Read back the file that was just written; reading the
        # fixture instead would pass even if nothing was written.
        readStrContent = self.ioread.readFileContent(outputPath)
        self.assertEqual(strContent, readStrContent)