Пример #1
0
    def __init__(self, path):
        """Build the config object from the configuration file at *path*.

        All data members are first reset to None; the file is then read with
        self._readConfig(path) and checked with self._validateConfig().
        A ConfigException raised by either step is logged and re-raised.
        """

        # --- config data members (name: kind) ---
        # project name: string
        self.project = None
        # export directory: path
        self.exportdir = None
        # user name: string
        self.username = None
        # user directory: path
        self.userpath = None
        # raw file list: list of path
        self.rawfiles = None
        # source lang: string
        self.src = None
        # target langs: list of string
        self.targets = None
        # Stanford Chinese Word Segmenter path: path
        self.stanford_execpath = None
        # Stanford Chinese Word Segmenter Standard: string
        self.stanford_standard = None

        try:
            log_start("Config")
            log_stderr("Config file: '{0}'".format(path))

            # Load first, then validate what was loaded.
            self._readConfig(path)
            self._validateConfig()

            log_done("Config")
        except ConfigException as err:
            # Report the failure, then let the caller decide what to do.
            log_error(err.message)
            log_fail("Config")
            raise
Пример #2
0
    def _validateConfig(self):
        """Check the loaded configuration values and normalise them in place.

        Names are stripped, directory paths get ``~`` expanded, and raw files
        that are not regular files are dropped from self.rawfiles. Each problem
        is reported through self._logValidateError(); all checks are run before
        CEValidateFailed is raised, so every problem gets logged.
        """

        log_stderr("Config Validating ...")

        ok = True

        # Project name must not be blank.
        self.project = self.project.strip()
        if not self.project:
            self._logValidateError("Empty project name.")
            ok = False

        # Export directory must already exist.
        self.exportdir = os.path.expanduser(self.exportdir)
        if not os.path.isdir(self.exportdir):
            self._logValidateError("Not existed export directory '{0}'".format(self.exportdir))
            ok = False

        # User name must not be blank.
        self.username = self.username.strip()
        if not self.username:
            self._logValidateError("Empty user name.")
            ok = False

        # User directory must already exist.
        self.userpath = os.path.expanduser(self.userpath)
        if not os.path.isdir(self.userpath):
            self._logValidateError("Not existed user directory '{0}'".format(self.userpath))
            ok = False

        # Keep only the raw files that exist; a missing one is logged but is
        # not fatal on its own.
        existing = []
        for candidate in self.rawfiles:
            if not os.path.isfile(candidate):
                self._logValidateError("Ignored raw file '{0}'".format(candidate))
                continue
            existing.append(candidate)

        self.rawfiles = existing
        if not existing:
            self._logValidateError("No valid raw file.")
            ok = False

        # Language support checks are currently disabled:
        # if (not isSupportLang(self.src)):
        #     self._logValidateError("Src lang {0} not supported.".format(self.src))
        #     isValidated = False
        #
        # tmplist = []
        # for lang in self.targets:
        #     if isSupportLang(lang):
        #         tmplist.append(lang)
        #     else:
        #         self._logValidateError("Ignored target lang {0}".format(lang))
        # self.targets = []
        # self.targets.extend(tmplist)
        if len(self.targets) == 0:
            self._logValidateError("No valid target lang.")
            ok = False

        if ok:
            return
        raise CEValidateFailed()
Пример #3
0
    def process(self, lang):
        """Run every enabled step of the src -> *lang* process file.

        The XML file located by ``self.config.getPCFilePath(src, lang)`` is
        parsed and its root ``src``/``target`` attributes are checked against
        the configured source language and *lang*. Each child element whose
        ``enable`` attribute is ``"yes"`` is then executed in document order:
        the module ``corpustool.common.<tagName>`` is imported and its
        ``filter`` is called with ``self.pc``; when that raises ImportError the
        step is delegated to ``self._execModule`` instead.

        Raises:
            CEFormatBroken: the file could not be parsed, or its root
                attributes do not match the requested language pair.
        """
        log_stderr(lang + " processing ...")
        # read and verify the src and target.
        src = self.pc.src = self.config.src
        target = self.pc.target = lang
        pcfile = self.config.getPCFilePath(src, lang)
        try:
            doc = xml.dom.minidom.parse(pcfile)
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt/SystemExit must not be
            # reported as a broken file.
            raise CEFormatBroken(pcfile)

        root = doc.documentElement
        if root.getAttribute("src") != src or root.getAttribute("target") != target:
            raise CEFormatBroken(pcfile)

        # Only element nodes carry attributes, so test the node type first.
        steps = [
            node for node in root.childNodes
            if node.nodeType == Node.ELEMENT_NODE
            and node.getAttribute("enable") == "yes"
        ]
        for node in steps:
            self.pc.xml_frag = node.toxml()
            try:
                moduleName = "corpustool.common." + node.tagName
                # A non-empty fromlist makes __import__ return the leaf module
                # rather than the top-level package.
                module = __import__(moduleName, globals(), locals(), ['filter'])
                module.filter(self.pc)
                corpustool.common.commonstep.run(self.pc)
            except ImportError:
                # NOTE(review): this also catches an ImportError raised from
                # inside filter()/run(), not only a missing step module.
                if node.hasAttribute("include"):
                    # "include" restricts the step to one side of the pair.
                    side = src if node.getAttribute("include") == "src" else target
                    self._execModule(node.tagName, side)
                else:
                    self._execModule(node.tagName, src)
                    self._execModule(node.tagName, target)
                    corpustool.common.commonstep.run(self.pc)
Пример #4
0
    def process(self, lang):
        """Execute the enabled steps described by the src -> *lang* process file.

        Steps performed:
          1. Parse the XML file returned by
             ``self.config.getPCFilePath(src, lang)``.
          2. Verify that the root element's ``src`` and ``target`` attributes
             equal the configured source language and *lang*.
          3. For each child element with ``enable="yes"``, import
             ``corpustool.common.<tagName>`` and call its ``filter`` with
             ``self.pc``; on ImportError fall back to ``self._execModule``.

        Raises:
            CEFormatBroken: on a parse failure or a src/target mismatch.
        """
        log_stderr(lang + " processing ...")
        # read and verify the src and target.
        src    = self.pc.src = self.config.src
        target = self.pc.target = lang
        pcfile = self.config.getPCFilePath(src, lang)
        try:
            doc = xml.dom.minidom.parse(pcfile)
        except Exception:
            # Catch Exception rather than everything, so that
            # KeyboardInterrupt/SystemExit still propagate unchanged.
            raise CEFormatBroken(pcfile)

        root = doc.documentElement
        srcInXML = root.getAttribute("src")
        targetInXML = root.getAttribute("target")

        if (srcInXML != src) or (targetInXML != target):
            raise CEFormatBroken(pcfile)

        # Keep element children only, then those explicitly enabled.
        elements = [ node for node in root.childNodes if node.nodeType == Node.ELEMENT_NODE ]
        enabled  = [ node for node in elements if node.getAttribute("enable") == "yes" ]
        for node in enabled:
            self.pc.xml_frag = node.toxml()
            try:
                moduleName = "corpustool.common." + node.tagName
                # fromlist must be a non-empty sequence of names for
                # __import__ to return the submodule itself.
                module = __import__(moduleName, globals(), locals(), ['filter'])
                module.filter(self.pc)
                corpustool.common.commonstep.run(self.pc)
            except ImportError:
                # NOTE(review): an ImportError raised inside filter()/run()
                # lands here as well, not just a missing step module.
                if not node.hasAttribute("include"):
                    # No restriction: run the step for both languages.
                    self._execModule(node.tagName, src)
                    self._execModule(node.tagName, target)
                    corpustool.common.commonstep.run(self.pc)
                elif node.getAttribute("include") == "src":
                    self._execModule(node.tagName, src)
                else:
                    self._execModule(node.tagName, target)
Пример #5
0
    def _readConfig(self, path):
        """Parse the XML configuration file at *path* into the data members.

        Reads, relative to the document root: ``project``, ``exportdir``,
        ``extensions/StanfordChineseWordSegmenter/{Path,Standard}``,
        ``user/{name,configpath}``, every ``file`` under ``rawfiles``, and
        ``language/src`` plus every ``target`` under ``language/targetlist``.

        Raises:
            CEFormatBroken: if the file cannot be parsed as XML.
        """

        log_stderr("Config Reading ...")
        try:
            doc = xml.dom.minidom.parse(path)
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt/SystemExit must not be
            # reported as a broken file.
            raise CEFormatBroken(path)

        root = doc.documentElement
        self.project = _getElemText(_getElem(root, "project"))
        self.exportdir = _getElemText(_getElem(root, "exportdir"))

        extensions = _getElem(root, "extensions")
        stanford = _getElem(extensions, "StanfordChineseWordSegmenter")
        stanford_path = _getElem(stanford, "Path")
        # An empty <Path/> element has no child node: leave the path unset.
        if stanford_path.firstChild is None:
            self.stanford_execpath = None
        else:
            self.stanford_execpath = _getElemText(stanford_path)
        self.stanford_standard = _getElemText(_getElem(stanford, "Standard"))

        user = _getElem(root, "user")
        self.username = _getElemText(_getElem(user, "name"))
        self.userpath = _getElemText(_getElem(user, "configpath"))

        rawfiles = _getElem(root, "rawfiles")
        self.rawfiles = [
            _getElemText(afile) for afile in rawfiles.getElementsByTagName("file")
        ]

        language = _getElem(root, "language")
        self.src = _getElemText(_getElem(language, "src"))
        targetlist = _getElem(language, "targetlist")
        self.targets = [
            _getElemText(target) for target in targetlist.getElementsByTagName("target")
        ]
Пример #6
0
def main():
    """Entry point of the convert module.

    Parses the command line, builds a ConversionConfig from the XML file given
    with ``-f/--file``, and runs a Conversion with it.

    The process always ends through sys.exit(): ``errno.EINVAL`` when the file
    option is missing or does not name an existing file, ``-1`` when the
    configuration or the conversion raises, and ``0`` on success.
    """

    progname = sys.argv[0]
    usage = """%prog -f command.xml"""

    parser = OptionParser(
        usage, version="%prog v0.1 (c) 2010 by Leo Jiang <*****@*****.**>")
    parser.add_option("-f",
                      "--file",
                      dest="filename",
                      metavar="FILE",
                      type="string",
                      help="read the command from file.")
    (options, args) = parser.parse_args()

    log_stderr("convert.py v0.1 (c) 2010 by Leo Jiang <*****@*****.**>")

    if options.filename is None:
        log_stderr("Usage: {0} -f command.xml".format(progname))
        log_stderr(os.strerror(errno.EINVAL) + " : config file not specified.")
        sys.exit(errno.EINVAL)

    path = os.path.abspath(options.filename)
    if not os.path.isfile(path):
        log_error(
            os.strerror(errno.EINVAL) +
            " : file '{0}' not existed.".format(path))
        log_fail("Convert")
        sys.exit(errno.EINVAL)

    try:
        config = ConversionConfig(path)
        conversion = Conversion(config)
        conversion.run()
    except ConfigException:
        log_fail("Convert: ConfigException")
        sys.exit(-1)
    except Exception as e:
        # Parenthesised form prints the same text under Python 2 and 3.
        print("failed.")
        # ``message`` is empty for multi-argument exceptions such as IOError;
        # fall back to str(e) so the reason is not lost.
        log_fail(getattr(e, "message", "") or str(e))
        log_fail("Convert: unknown exception.")
        sys.exit(-1)

    log_done("Convert")
    sys.exit(0)
Пример #7
0
def main():
    """The main function of the convert module.

    Parse the command line, create the config from the XML file passed with
    ``-f/--file``, then run the conversion according to that config.

    Exit status (always through sys.exit()):
      * ``errno.EINVAL`` - no file given, or the given path is not a file;
      * ``-1`` - building the config or running the conversion raised;
      * ``0`` - the conversion finished.
    """

    progname = sys.argv[0]
    usage = """%prog -f command.xml"""

    parser = OptionParser(usage, version="%prog v0.1 (c) 2010 by Leo Jiang <*****@*****.**>")
    parser.add_option("-f", "--file", dest="filename", metavar="FILE", type="string",
                      help="read the command from file.")
    (options, args) = parser.parse_args()

    log_stderr("convert.py v0.1 (c) 2010 by Leo Jiang <*****@*****.**>")

    if options.filename is None:
        log_stderr("Usage: {0} -f command.xml".format(progname))
        log_stderr(os.strerror(errno.EINVAL) + " : config file not specified.")
        sys.exit(errno.EINVAL)

    path = os.path.abspath(options.filename)
    if not os.path.isfile(path):
        log_error(os.strerror(errno.EINVAL) + " : file '{0}' not existed.".format(path))
        log_fail("Convert")
        sys.exit(errno.EINVAL)

    try:
        config = ConversionConfig(path)
        conversion = Conversion(config)
        conversion.run()
    except ConfigException:
        log_fail("Convert: ConfigException")
        sys.exit(-1)
    except Exception as e:
        # Written with parentheses so it is valid on Python 2 and Python 3.
        print("failed.")
        # Exceptions built with several arguments (e.g. IOError) have an empty
        # ``message``; use str(e) then so the cause is still logged.
        log_fail(getattr(e, "message", "") or str(e))
        log_fail("Convert: unknown exception.")
        sys.exit(-1)

    log_done("Convert")
    sys.exit(0)
Пример #8
0
 def _logValidateError(self, msg):
     """Write *msg* to stderr via log_stderr, prefixed with 'Validate Error: '."""
     line = "Validate Error: " + msg
     log_stderr(line)
Пример #9
0
    def _prepare(self):
        """Set up the corpus directory tree and empty the corpus files.

        Creates the project directory and, for every target language, the
        corpus directory when they do not exist yet. Both corpus files of the
        pair are then opened for writing, which truncates them. A target whose
        files cannot be opened (IOError) is removed from self.config.targets.
        """

        log_stderr("Preparing corpus directory hierarchy ...")

        # Project directory first.
        projectDir = self.config.getProjectDir()
        if not os.path.exists(projectDir):
            os.mkdir(projectDir)
            log_stderr("Creating project directory.")

        source = self.config.src
        # Iterate over a snapshot: failing targets are removed from the live
        # list inside the loop.
        for target in list(self.config.targets):
            log_stderr("")
            log_stderr(localePairForm(source, target))

            corpusDir = self.config.getCorpusDir(source, target)
            if not os.path.exists(corpusDir):
                os.mkdir(corpusDir)
                log_stderr("Creating corpus directory '{0}'.".format(corpusDir))

            log_stderr("Cleaning the corpus files ...")
            corpusPaths = (
                self.config.getCorpusFile(source, target, source),
                self.config.getCorpusFile(source, target, target),
            )
            opened = []
            try:
                # Opening in 'w' mode is what empties each file.
                for corpusPath in corpusPaths:
                    opened.append(open(corpusPath, 'w'))
                for corpusPath in corpusPaths:
                    log_stderr("Cleaned: {0}".format(corpusPath))
            except IOError as err:
                self.config.targets.remove(target)
                log_stderr(str(err))
            finally:
                # Close whatever was opened, source file first.
                for handle in opened:
                    handle.close()