Exemplo n.º 1
0
def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

       params: - inputFile   : a text file in 'utf-8' encoding
               - outputFile  : the result text file in 'utf-8' encoding
               - regularFile : the file containing the regular expressions
                               to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)

    io = Ioread()
    fd = io.openFile(inputFile)

    count, linesList = 0, []

    #Read first line
    l = fd.readline()

    while l != "":
        l = l.rstrip().strip()

        #Remove punctuation using regular expressions
        linesList.append(regexFormula.apply(l, FRENCH))

        count += 1
        if count % 50000 == 0:
            print "Processed %d values" % count

        #Read next line
        l = fd.readline()

    io.closeFile(fd)

    strContent = u"\n".join(linesList)
    io.writeFileContent(outputFile, strContent)
Exemplo n.º 2
0
def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

       params: - inputFile   : a text file in 'utf-8' encoding
               - outputFile  : the result text file in 'utf-8' encoding
               - regularFile : the file containing the regular expressions
                               to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)

    io = Ioread()
    fd = io.openFile(inputFile)

    count, linesList = 0, []

    #Read first line
    l = fd.readline()

    while l != "":
        l = l.rstrip().strip()

        #Remove punctuation using regular expressions
        linesList.append(regexFormula.apply(l, FRENCH))
        
        count += 1
        if count % 50000 == 0:
            print "Processed %d values" % count

        #Read next line
        l = fd.readline()

    io.closeFile(fd)

    strContent = u"\n".join(linesList)
    io.writeFileContent(outputFile, strContent)
Exemplo n.º 3
0
class TestIoread(unittest.TestCase):
    logger = logging.getLogger("Asrt.TestIoread")
    testFile = scriptsDir + "/resources/ioread_utf8.txt"
    testFileCSV = scriptsDir + "/resources/ioread_utf8.csv"

    testsString = [
        """Utf-8 test\nLatin characters é à ä\nNon latin characters 镕\n""",
        """Non latin characters 镕"""
    ]

    testList = [[
        'Utf-8 test', 'Latin characters é à ä', 'Non latin characters 镕'
    ]]

    def setUp(self):
        self.ioread = Ioread()

    ############
    # Tests
    #
    def testOpenFile(self):
        try:
            fd = self.ioread.openFile(self.testFile)
            self.ioread.closeFile(fd)
        except Exception:
            self.fail("testOpenFile raised ExceptionType unexpectedled")

    def testReadFileContent(self):
        strContent = self.ioread.readFileContent(self.testFile)
        self.assertEquals(self.testsString[0], strContent)

    def testReadFileContentList(self):
        strContentList = self.ioread.readFileContentList(self.testFile)
        self.assertEquals(3, len(strContentList))
        self.assertEquals(self.testsString[1], strContentList[2])

    def testReadCSV(self):
        strContentList = self.ioread.readCSV(self.testFileCSV)
        self.assertEquals(1, len(strContentList))
        self.assertEquals(strContentList, self.testList)

    def testWriteFileContent(self):
        strContent = self.testsString[0]
        self.ioread.writeFileContent(TEMPDIRUNITTEST + "/test.txt", strContent)

        readStrContent = self.ioread.readFileContent(self.testFile)
        self.assertEquals(strContent, readStrContent)