# Settings read from the parsed command line; each option value is
# indexed with [0] and the language identifier is converted to an int.
inputFile, outputDir = args.inputFile[0], args.outputDir[0]
language = int(args.language[0])
regexFile = args.regexFile[0]

# Flags, each coerced to a plain boolean
debug = bool(args.debug)
filterSentences, filterSentences2ndStage = bool(args.filter), bool(args.filter2ndStage)
removePunctuation, verbalizePunctuation = bool(args.rmpunct), bool(args.vbpunct)
rawSeg = bool(args.rawseg)
lmModeling = bool(args.lm)
# New words are kept unless the trim flag is set
keepNewWords = not args.trim

# The task log is written inside the output directory
logFile = outputDir + "/task_log.txt"
setupLogging(logging.INFO, logFile)

# Api setup: apply every option in a fixed order
api = DataPreparationAPI(inputFile, outputDir)
for applySetting, settingValue in (
        (api.setRegexFile, regexFile),
        (api.setFilterSentences, filterSentences),
        (api.setFilterSentences2ndStage, filterSentences2ndStage),
        (api.setLMModeling, lmModeling),
        (api.setRemovePunctuation, removePunctuation),
        (api.setVerbalizePunctuation, verbalizePunctuation),
        # NLTK segmentation is the default; the raw segmentation flag disables it
        (api.setSegmentWithNLTK, not rawSeg),
        (api.setKeepNewWords, keepNewWords)):
    applySetting(settingValue)

# Language 0 is the only value that triggers classifier training
if language == 0:
    api.trainClassifier()
count += 1 if count % 50000 == 0: print "Processed %d values" % count #Read next line l = fd.readline() io.closeFile(fd) strContent = u"\n".join(linesList) io.writeFileContent(outputFile, strContent) ################ # main # if __name__ == "__main__" : parser = argparse.ArgumentParser(description=usage) parser.add_argument("-i", "--input", help="input file", nargs=1, dest="inputFile", required=True) parser.add_argument("-o", "--output", help="output file", nargs=1, dest="outputFile", required=True) parser.add_argument("-r", "--regex", help="regular expression file", nargs=1, dest="regexFile", required=True) args = parser.parse_args() inputFile = os.path.abspath(args.inputFile[0]) outputFile = os.path.abspath(args.outputFile[0]) regexFile = os.path.abspath(args.regexFile[0]) setupLogging(logging.INFO) applyRegexes(inputFile, outputFile, regexFile)
# along with asrt. If not, see <http://opensource.org/licenses/>. __author__ = "Alexandre Nanchen" __version__ = "Revision: 1.0 " __date__ = "Date: 2015/09" __copyright__ = "Copyright (c) 2015 Idiap Research Institute" __license__ = "BSD 3-Clause" import unittest, re, string, logging from asrt.common.formula.FormulaLMPreparation import LMPreparationFormula from asrt.common.AsrtConstants import UTF8MAP, SPACEPATTERN, DOTCOMMAEXCLUDE, PUNCTUATIONEXCLUDE from asrt.common.AsrtConstants import ABBREVIATIONS from asrt.common.LoggingSetup import setupLogging setupLogging(logging.INFO, "./output.log") class TestFormulaLMPreparation(unittest.TestCase): allPunctList = DOTCOMMAEXCLUDE + PUNCTUATIONEXCLUDE def verifyEqual(self, testList, f, callback): for t, gt in testList: f.strText = t callback() self.assertEquals(gt.encode('utf-8'), f.strText.encode('utf-8')) ############ #Tests # def testNormalizeUtf8(self):
#
if __name__ == "__main__":
    # Setup parser
    parser = argparse.ArgumentParser(description=usage)

    # Mandatory options: each one takes exactly one value
    for shortOpt, longOpt, helpText, destName in (
            ("-t", "--target", "target directory containing the data.olist and data.omap", "targetDir"),
            ("-o", "--output", "output directory", "outputDir"),
            ("-r", "--regex", "regex file", "regexFile")):
        parser.add_argument(shortOpt, longOpt, help=helpText, nargs=1,
                            dest=destName, required=True)

    # Optional switches: the destination names are the ones argparse
    # derives from the long option, spelled out explicitly here
    for shortOpt, longOpt, helpText, destName in (
            ("-f", "--filter", "filter sentences", "filter"),
            ("-d", "--debug", "enable debug output", "debug"),
            ("-n", "--rmpunctuation", "remove punctuation", "rmpunctuation"),
            ("-p", "--vbpunctuation", "verbalize punctuation", "vbpunctuation"),
            ("-s", "--rawseg", "do not segment sentences with NLTK", "rawseg"),
            ("-m", "--lm", "prepare for lm modeling", "lm")):
        parser.add_argument(shortOpt, longOpt, help=helpText,
                            dest=destName, action="store_true")

    # Parse arguments
    args = parser.parse_args()

    targetDir, outputDir, regexFile = args.targetDir[0], args.outputDir[0], args.regexFile[0]

    # The segmentation choice is passed as text: NLTK is used unless
    # raw segmentation was requested
    segmentWithNLTK = "False" if args.rawseg else "True"

    setupLogging(logging.INFO, outputDir + "/task_log.txt")

    # Values are substituted positionally into STRPARAMETERS (defined
    # elsewhere), so their order must not change
    taskParameters = STRPARAMETERS % (regexFile, str(args.debug),
                                      args.rmpunctuation, args.vbpunctuation,
                                      segmentWithNLTK, args.filter, args.lm)

    task = ImportDocumentTask(TaskInfo(taskParameters, outputDir, targetDir))
    task.execute()
# Settings read from the parsed command line; each option value is
# indexed with [0] and the language identifier is converted to an int.
inputList, outputDir = args.inputList[0], args.outputDir[0]
language = int(args.language[0])
regexFile = args.regexFile[0]

# Flags, each coerced to a plain boolean
debug = bool(args.debug)
filterSentences, filterSentences2ndStage = bool(args.filter), bool(args.filter2ndStage)
removePunctuation, verbalizePunctuation = bool(args.rmpunct), bool(args.vbpunct)
rawSeg = bool(args.rawseg)
lmModeling = bool(args.lm)
# New words are kept unless the trim flag is set
keepNewWords = not args.trim

# The preparation log is written inside the output directory
logFile = outputDir + "/data_preparation_log.txt"
setupLogging(logging.INFO, logFile)

# Api setup: no input file is given to the constructor here; every
# option is then applied in a fixed order
api = DataPreparationAPI(None, outputDir)
for applySetting, settingValue in (
        (api.setRegexFile, regexFile),
        (api.setFilterSentences, filterSentences),
        (api.setFilterSentences2ndStage, filterSentences2ndStage),
        (api.setLMModeling, lmModeling),
        (api.setRemovePunctuation, removePunctuation),
        (api.setVerbalizePunctuation, verbalizePunctuation),
        # NLTK segmentation is the default; the raw segmentation flag disables it
        (api.setSegmentWithNLTK, not rawSeg),
        (api.setKeepNewWords, keepNewWords)):
    applySetting(settingValue)

# Language 0 is the only value that triggers classifier training
if language == 0:
    api.trainClassifier()
if __name__ == "__main__":
    # Command line definition: every option is mandatory and takes
    # exactly one value
    parser = argparse.ArgumentParser(description=usage)
    for shortOpt, longOpt, helpText, destName in (
            ("-i", "--input", "input file", "inputFile"),
            ("-o", "--output", "output file", "outputFile"),
            ("-r", "--regex", "regular expression file", "regexFile")):
        parser.add_argument(shortOpt, longOpt, help=helpText, nargs=1,
                            dest=destName, required=True)

    args = parser.parse_args()

    # nargs=1 yields one-element lists; turn each entry into an absolute path
    inputFile, outputFile, regexFile = [
        os.path.abspath(optionValues[0])
        for optionValues in (args.inputFile, args.outputFile, args.regexFile)]

    setupLogging(logging.INFO)

    applyRegexes(inputFile, outputFile, regexFile)
# along with asrt. If not, see <http://opensource.org/licenses/>.

__author__ = "Alexandre Nanchen"
__version__ = "Revision: 1.0 "
__date__ = "Date: 2015/09"
__copyright__ = "Copyright (c) 2015 Idiap Research Institute"
__license__ = "BSD 3-Clause"

import unittest
import re
import string
import logging

from asrt.common.formula.FormulaLMPreparation import LMPreparationFormula
from asrt.common.AsrtConstants import UTF8MAP, SPACEPATTERN, DOTCOMMAEXCLUDE, PUNCTUATIONEXCLUDE
from asrt.common.AsrtConstants import ABBREVIATIONS
from asrt.common.LoggingSetup import setupLogging

setupLogging(logging.INFO, "./output.log")


class TestFormulaLMPreparation(unittest.TestCase):
    """Unit tests for LMPreparationFormula."""

    # Concatenation of the two punctuation exclusion constants
    allPunctList = DOTCOMMAEXCLUDE + PUNCTUATIONEXCLUDE

    def verifyEqual(self, testList, f, callback):
        """Check that 'callback' turns every input text into its ground truth.

        testList -- iterable of (input text, expected text) pairs
        f        -- object whose 'strText' attribute is set to the input
                    text before the call and compared afterwards
        callback -- callable taking no argument, invoked once per pair
        """
        for t, gt in testList:
            f.strText = t
            callback()
            # assertEqual replaces the deprecated alias assertEquals,
            # which no longer exists in Python 3.12 and later
            self.assertEqual(gt.encode('utf-8'), f.strText.encode('utf-8'))

    ############
    #Tests
    #
    def testNormalizeUtf8(self):
        languages = ['0', '1', '2']
__date__ = "Date: 2015/09" __copyright__ = "Copyright (c) 2015 Idiap Research Institute" __license__ = "BSD 3-Clause" import unittest import re import string import logging from asrt.common.formula.FormulaLMPreparation import LMPreparationFormula from asrt.common.AsrtConstants import UTF8MAP, SPACEPATTERN, DOTCOMMAEXCLUDE, PUNCTUATIONEXCLUDE from asrt.common.AsrtConstants import ABBREVIATIONS from asrt.common.LoggingSetup import setupLogging from asrt.config.AsrtConfig import TEMPDIRUNITTEST setupLogging(logging.INFO, TEMPDIRUNITTEST + "/output.log") class TestFormulaLMPreparation(unittest.TestCase): allPunctList = DOTCOMMAEXCLUDE + PUNCTUATIONEXCLUDE def verifyEqual(self, testList, f, callback): for t, gt in testList: f.strText = t callback() self.assertEqual(gt.encode('utf-8'), f.strText.encode('utf-8')) ############ # Tests # def testNormalizeUtf8(self):
# Settings read from the parsed command line; each option value is
# indexed with [0] and the language identifier is converted to an int.
inputList, outputDir = args.inputList[0], args.outputDir[0]
language = int(args.language[0])
regexFile = args.regexFile[0]

# Flags, each coerced to a plain boolean
debug = bool(args.debug)
filterSentences, filterSentences2ndStage = bool(args.filter), bool(args.filter2ndStage)
removePunctuation, verbalizePunctuation = bool(args.rmpunct), bool(args.vbpunct)
rawSeg = bool(args.rawseg)
lmModeling = bool(args.lm)
# Number expansion is enabled unless the trim flag is set
expandNumberInWords = not args.trim

# The preparation log is written inside the output directory
logFile = outputDir + "/data_preparation_log.txt"
setupLogging(logging.INFO, logFile)

# Api setup: no input file is given to the constructor here; every
# option is then applied in a fixed order
api = DataPreparationAPI(None, outputDir)
for applySetting, settingValue in (
        (api.setRegexFile, regexFile),
        (api.setFilterSentences, filterSentences),
        (api.setFilterSentences2ndStage, filterSentences2ndStage),
        (api.setLMModeling, lmModeling),
        (api.setRemovePunctuation, removePunctuation),
        (api.setVerbalizePunctuation, verbalizePunctuation),
        # NLTK segmentation is the default; the raw segmentation flag disables it
        (api.setSegmentWithNLTK, not rawSeg),
        (api.setExpandNumberInWords, expandNumberInWords)):
    applySetting(settingValue)

# Language 0 is the only value that triggers classifier training
if language == 0:
    api.trainClassifier()