def __init__(self, path):
    """Build the configuration object from the XML file at `path`.

    All data members start out as None; `_readConfig` then fills them in
    from the file and `_validateConfig` checks them.  A ConfigException
    raised by either step is logged and re-raised unchanged.
    """
    # --- project and output ---
    self.project = None              # project name                 :string
    self.exportdir = None            # export directory             :path

    # --- user ---
    self.username = None             # user name                    :string
    self.userpath = None             # user directory               :path

    # --- input files and languages ---
    self.rawfiles = None             # raw file list                :list of path
    self.src = None                  # source lang                  :string
    self.targets = None              # target langs                 :list of string

    # --- Stanford Chinese Word Segmenter extension ---
    self.stanford_execpath = None    # segmenter path               :path
    self.stanford_standard = None    # segmenter standard           :string

    try:
        log_start("Config")
        log_stderr("Config file: '{0}'".format(path))
        self._readConfig(path)
        self._validateConfig()
        log_done("Config")
    except ConfigException as err:
        # Report the failure, then let the caller decide what to do with it.
        log_error(err.message)
        log_fail("Config")
        raise
def _validateConfig(self):
    """Normalize and check the values loaded from the configuration file.

    Names are stripped, directory paths are user-expanded, and raw files
    that do not exist are dropped from `self.rawfiles`.  Every problem is
    reported through `_logValidateError`; if any fatal problem was found,
    CEValidateFailed is raised after all checks have run.
    """
    log_stderr("Config Validating ...")
    ok = True

    # project name must not be blank
    self.project = self.project.strip()
    if not self.project:
        self._logValidateError("Empty project name.")
        ok = False

    # export directory must exist
    self.exportdir = os.path.expanduser(self.exportdir)
    if not os.path.isdir(self.exportdir):
        self._logValidateError("Not existed export directory '{0}'".format(self.exportdir))
        ok = False

    # user name must not be blank
    self.username = self.username.strip()
    if not self.username:
        self._logValidateError("Empty user name.")
        ok = False

    # user directory must exist
    self.userpath = os.path.expanduser(self.userpath)
    if not os.path.isdir(self.userpath):
        self._logValidateError("Not existed user directory '{0}'".format(self.userpath))
        ok = False

    # keep only the raw files that exist; a missing file is reported but is
    # not fatal unless nothing is left afterwards
    existing = []
    for candidate in self.rawfiles:
        if not os.path.isfile(candidate):
            self._logValidateError("Ignored raw file '{0}'".format(candidate))
            continue
        existing.append(candidate)
    self.rawfiles = existing
    if not self.rawfiles:
        self._logValidateError("No valid raw file.")
        ok = False

    # Language support validation is currently disabled:
    # if (not isSupportLang(self.src)):
    #     self._logValidateError("Src lang {0} not supported.".format(self.src))
    #     isValidated = False
    # tmplist = []
    # for lang in self.targets:
    #     if isSupportLang(lang):
    #         tmplist.append(lang)
    #     else:
    #         self._logValidateError("Ignored target lang {0}".format(lang))
    # self.targets = []
    # self.targets.extend(tmplist)

    if not self.targets:
        self._logValidateError("No valid target lang.")
        ok = False

    if ok:
        return
    raise CEValidateFailed()
def process(self, lang):
    """Run every enabled step of the filter description for one target language.

    The XML file located by `self.config.getPCFilePath(src, lang)` is parsed
    and its root `src`/`target` attributes are checked against the configured
    source language and `lang`.  For each child element with `enable="yes"`:

    * the element's XML is stored in `self.pc.xml_frag`;
    * the module `corpustool.common.<tagName>` is imported and its
      `filter(self.pc)` is called;
    * if that raises ImportError, `self._execModule(tagName, ...)` is used
      instead, for the side named by the element's `include` attribute
      ("src" -> source only, any other value -> target only, attribute
      absent -> both);
    * `corpustool.common.commonstep.run(self.pc)` is then run.

    Args:
        lang: the target language to process.

    Raises:
        CEFormatBroken: if the file cannot be parsed, or its root `src` /
            `target` attributes do not match.
    """
    log_stderr(lang + " processing ...")

    # read and verify the src and target.
    src = self.pc.src = self.config.src
    target = self.pc.target = lang
    pcfile = self.config.getPCFilePath(src, lang)
    try:
        doc = xml.dom.minidom.parse(pcfile)
    except Exception:
        # Catch Exception rather than everything, so that an interrupt or
        # interpreter exit is not reported as a broken file.
        raise CEFormatBroken(pcfile)

    root = doc.documentElement
    srcInXML = root.getAttribute("src")
    targetInXML = root.getAttribute("target")
    if (srcInXML != src) or (targetInXML != target):
        raise CEFormatBroken(pcfile)

    # Only element children explicitly enabled in the file are executed.
    nodelist = [
        node for node in root.childNodes
        if node.nodeType == Node.ELEMENT_NODE
        and node.getAttribute("enable") == "yes"
    ]

    for node in nodelist:
        self.pc.xml_frag = node.toxml()
        try:
            moduleName = "corpustool.common." + node.tagName
            # A non-empty fromlist makes __import__ return the submodule
            # itself instead of the top-level package.
            module = __import__(moduleName, globals(), locals(), ['filter'])
            module.filter(self.pc)
            corpustool.common.commonstep.run(self.pc)
        except ImportError:
            # No built-in step with this name: fall back to _execModule.
            if node.hasAttribute("include"):
                if node.getAttribute("include") == "src":
                    self._execModule(node.tagName, src)
                else:
                    self._execModule(node.tagName, target)
            else:
                self._execModule(node.tagName, src)
                self._execModule(node.tagName, target)
            # NOTE(review): the common step is run once after the fallback,
            # mirroring the built-in path above -- confirm this placement.
            corpustool.common.commonstep.run(self.pc)
def _readConfig(self, path):
    """Parse the XML configuration file and load its values into the data members.

    Reads, relative to the document root: `project`, `exportdir`,
    `extensions/StanfordChineseWordSegmenter/{Path,Standard}`,
    `user/{name,configpath}`, every `file` under `rawfiles`, `language/src`
    and every `target` under `language/targetlist`.

    Args:
        path: location of the XML configuration file.

    Raises:
        CEFormatBroken: if `path` cannot be parsed as XML.
    """
    log_stderr("Config Reading ...")
    try:
        doc = xml.dom.minidom.parse(path)
    except Exception:
        # Catch Exception rather than everything, so that an interrupt or
        # interpreter exit is not reported as a broken file.
        raise CEFormatBroken(path)

    root = doc.documentElement
    self.project = _getElemText(_getElem(root, "project"))
    self.exportdir = _getElemText(_getElem(root, "exportdir"))

    extensions = _getElem(root, "extensions")
    stanford = _getElem(extensions, "StanfordChineseWordSegmenter")
    stanford_path = _getElem(stanford, "Path")
    # An empty <Path> element has no child node; treat it as "not configured".
    if stanford_path.firstChild is None:
        self.stanford_execpath = None
    else:
        self.stanford_execpath = _getElemText(stanford_path)
    # NOTE(review): <Standard> is read regardless of whether <Path> is set --
    # confirm this matches the intended behavior.
    self.stanford_standard = _getElemText(_getElem(stanford, "Standard"))

    user = _getElem(root, "user")
    self.username = _getElemText(_getElem(user, "name"))
    self.userpath = _getElemText(_getElem(user, "configpath"))

    rawfiles = _getElem(root, "rawfiles")
    self.rawfiles = [
        _getElemText(afile) for afile in rawfiles.getElementsByTagName("file")
    ]

    language = _getElem(root, "language")
    self.src = _getElemText(_getElem(language, "src"))
    targetlist = _getElem(language, "targetlist")
    self.targets = [
        _getElemText(target) for target in targetlist.getElementsByTagName("target")
    ]
def main():
    """The main function of convert module.

    Parse the command line, create the config from the XML file which
    describes the conversion, then run the conversion to create and filter
    the corpus files according to that config.

    The process always terminates through sys.exit():
      * errno.EINVAL when no config file is given or it does not exist,
      * -1 when the configuration or the conversion fails,
      * 0 on success.
    """
    progname = sys.argv[0]
    usage = """%prog -f command.xml"""
    parser = OptionParser(
        usage,
        version="%prog v0.1 (c) 2010 by Leo Jiang <*****@*****.**>")
    parser.add_option("-f", "--file", dest="filename", metavar="FILE", type="string",
                      help="read the command from file.")
    options, _ = parser.parse_args()  # positional arguments are not used

    log_stderr("convert.py v0.1 (c) 2010 by Leo Jiang <*****@*****.**>")

    if options.filename is None:
        log_stderr("Usage: {0} -f command.xml".format(progname))
        log_stderr(os.strerror(errno.EINVAL) + " : config file not specified.")
        sys.exit(errno.EINVAL)

    path = os.path.abspath(options.filename)
    if not os.path.isfile(path):
        log_error(
            os.strerror(errno.EINVAL) + " : file '{0}' not existed.".format(path))
        log_fail("Convert")
        sys.exit(errno.EINVAL)

    try:
        config = ConversionConfig(path)
        conversion = Conversion(config)
        conversion.run()
    except ConfigException:
        log_fail("Convert: ConfigException")
        sys.exit(-1)
    except Exception as e:
        # Parenthesized form is valid as both a Python 2 statement and a
        # Python 3 function call.
        print("failed.")
        # str(e) is used because `.message` is empty for exceptions built
        # from several arguments (e.g. IOError) and is absent on Python 3.
        log_fail(str(e))
        log_fail("Convert: unknown exception.")
        sys.exit(-1)

    log_done("Convert")
    sys.exit(0)
def main():
    """The main function of convert module.

    Parse the command line, create the config from the XML file which
    describes the conversion, then run the conversion to create and filter
    the corpus files according to that config.

    The process always terminates through sys.exit():
      * errno.EINVAL when no config file is given or it does not exist,
      * -1 when the configuration or the conversion fails,
      * 0 on success.
    """
    progname = sys.argv[0]
    usage = """%prog -f command.xml"""
    parser = OptionParser(usage, version="%prog v0.1 (c) 2010 by Leo Jiang <*****@*****.**>")
    parser.add_option("-f", "--file", dest="filename", metavar="FILE", type="string",
                      help="read the command from file.")
    options, _ = parser.parse_args()  # positional arguments are not used

    log_stderr("convert.py v0.1 (c) 2010 by Leo Jiang <*****@*****.**>")

    if options.filename is None:
        log_stderr("Usage: {0} -f command.xml".format(progname))
        log_stderr(os.strerror(errno.EINVAL) + " : config file not specified.")
        sys.exit(errno.EINVAL)

    path = os.path.abspath(options.filename)
    if not os.path.isfile(path):
        log_error(os.strerror(errno.EINVAL) + " : file '{0}' not existed.".format(path))
        log_fail("Convert")
        sys.exit(errno.EINVAL)

    try:
        config = ConversionConfig(path)
        conversion = Conversion(config)
        conversion.run()
    except ConfigException:
        log_fail("Convert: ConfigException")
        sys.exit(-1)
    except Exception as e:
        # Parenthesized form is valid as both a Python 2 statement and a
        # Python 3 function call.
        print("failed.")
        # str(e) is used because `.message` is empty for exceptions built
        # from several arguments (e.g. IOError) and is absent on Python 3.
        log_fail(str(e))
        log_fail("Convert: unknown exception.")
        sys.exit(-1)

    log_done("Convert")
    sys.exit(0)
def _logValidateError(self, msg):
    """Pass `msg` to log_stderr, prefixed with 'Validate Error: '."""
    line = "Validate Error: " + msg
    log_stderr(line)
def _prepare(self):
    """Create the project/corpus directories and empty the corpus files.

    The project directory, and one corpus directory per target language,
    are created when they do not exist yet.  Both corpus files of every
    language pair are opened for writing so that they end up empty.  When
    that raises IOError, the error is logged and the target language is
    removed from `self.config.targets`.
    """
    log_stderr("Preparing corpus directory hierarchy ...")

    # project directory
    project_dir = self.config.getProjectDir()
    if not os.path.exists(project_dir):
        os.mkdir(project_dir)
        log_stderr("Creating project directory.")

    source_lang = self.config.src
    # Walk a snapshot of the target list: failing targets are removed from
    # the live list while looping.
    for target_lang in list(self.config.targets):
        log_stderr("")
        log_stderr(localePairForm(source_lang, target_lang))

        corpus_dir = self.config.getCorpusDir(source_lang, target_lang)
        if not os.path.exists(corpus_dir):
            os.mkdir(corpus_dir)
            log_stderr("Creating corpus directory '{0}'.".format(corpus_dir))

        log_stderr("Cleaning the corpus files ...")
        source_corpus = self.config.getCorpusFile(source_lang, target_lang, source_lang)
        target_corpus = self.config.getCorpusFile(source_lang, target_lang, target_lang)

        opened = []  # handles successfully opened so far, in opening order
        try:
            # Opening in 'w' mode truncates the file.
            opened.append(open(source_corpus, 'w'))
            opened.append(open(target_corpus, 'w'))
            log_stderr("Cleaned: {0}".format(source_corpus))
            log_stderr("Cleaned: {0}".format(target_corpus))
        except IOError as err:
            self.config.targets.remove(target_lang)
            log_stderr(str(err))
        finally:
            for handle in opened:
                handle.close()