示例#1
0
 def __init__(self):
     """Create the file pool and the splitter, and link the two together.

     The configuration starts out unset (``None``).
     """
     # Build and wire the collaborators first, then publish them on self.
     pool = CorpusFilePool()
     splitter = Splitter()
     splitter.setFilepool(pool)

     self.filepool = pool
     self.splitter = splitter
     self.config = None
示例#2
0
class SplitTool:
    """Split raw files into per-language corpus files and merge the results.

    The tool owns a ``CorpusFilePool`` and a ``Splitter`` that shares that
    pool.  A configuration object must be supplied through ``setConfig``
    before ``generateCorpus`` is called; the methods below read its ``src``,
    ``targets`` and ``rawfiles`` attributes and call its ``getProjectDir``,
    ``getCorpusDir``, ``getCorpusFile`` and ``getNamedCorpusFile`` methods.
    """

    def __init__(self):
        """Create the file pool and the splitter and link them together."""
        self.filepool = CorpusFilePool()
        self.splitter = Splitter()
        self.splitter.setFilepool(self.filepool)
        self.config = None

    def setConfig(self, config):
        """Store the configuration object used by all other methods."""
        self.config = config

    def _prepare(self):
        """Prepare the corpus directory hierarchy.

        Creates the project directory and one corpus directory per target
        language when they do not exist yet, then truncates the source and
        target corpus files.  A target language whose corpus files cannot be
        opened for writing is removed from ``self.config.targets`` so that
        later steps skip it.
        """

        log_stderr("Preparing corpus directory hierarchy ...")

        # prepare the project directory.
        projPath = self.config.getProjectDir()
        if not os.path.exists(projPath):
            os.mkdir(projPath)
            log_stderr("Creating project directory.")

        # Iterate over a copy: failing targets are removed from the
        # configuration's own list inside the loop.
        srclang = self.config.src
        targets = list(self.config.targets)
        for targetlang in targets:
            log_stderr("")
            log_stderr(localePairForm(srclang, targetlang))
            corpusDirPath = self.config.getCorpusDir(srclang, targetlang)
            if not os.path.exists(corpusDirPath):
                os.mkdir(corpusDirPath)
                log_stderr("Creating corpus directory '{0}'.".format(corpusDirPath))

            log_stderr("Cleaning the corpus files ...")
            srcCorpusFile = self.config.getCorpusFile(srclang, targetlang, srclang)
            targetCorpusFile = self.config.getCorpusFile(srclang, targetlang, targetlang)
            try:
                # Opening in 'w' mode truncates the file; nothing is written.
                for corpusPath in (srcCorpusFile, targetCorpusFile):
                    with open(corpusPath, 'w'):
                        pass
                log_stderr("Cleaned: {0}".format(srcCorpusFile))
                log_stderr("Cleaned: {0}".format(targetCorpusFile))
            except IOError as e:
                self.config.targets.remove(targetlang)
                log_stderr(str(e))

    def fillPool(self, filename):
        """Open the named corpus files for ``filename`` and register them.

        For every target language, the source-side and target-side named
        corpus files are opened for writing and handed to the file pool via
        ``setMapping``.  The pool's ``closeFiles`` is what closes them later
        (see ``generateCorpus``).

        Raises whatever ``open`` raises when a file cannot be created.
        """
        fname = self._pureName(filename)
        srclang = self.config.src
        for targetlang in self.config.targets:
            srcCorpusFile = self.config.getNamedCorpusFile(srclang, targetlang, fname, srclang)
            targetCorpusFile = self.config.getNamedCorpusFile(srclang, targetlang, fname, targetlang)
            srcfile = open(srcCorpusFile, 'w')
            try:
                targetfile = open(targetCorpusFile, 'w')
            except Exception:
                # The pool never saw srcfile, so nobody else would close it.
                srcfile.close()
                raise
            self.filepool.setMapping(srclang, targetlang, srcfile, targetfile)

    def generateCorpus(self):
        """Split every raw file and merge the successful results.

        Raises:
            SplitException: if no target language is left after ``_prepare``.

        A ``SplitException`` raised while processing a single raw file is
        logged and that file is skipped; the remaining files are still
        processed.
        """
        log_start("Split")
        self._prepare()
        if not self.config.targets:
            raise SplitException("Prepare the directory failed.")

        filelist = []
        for afile in self.config.rawfiles:
            step = "Split {0}".format(afile)
            try:
                log_start(step)
                try:
                    self.fillPool(afile)
                    self.splitter.split(afile)
                finally:
                    # Release the pooled files on failure too, so a failed
                    # split does not leak open handles into the next file.
                    self.filepool.closeFiles()
                    self.filepool.clean()
                filelist.append(afile)
                log_done(step)
            except SplitException as e:
                # Exceptions only carry ``message`` when the class defines it
                # (it is gone from BaseException in Python 3).
                log_warning(getattr(e, "message", str(e)))
                # TODO: del the files when failed.
                log_fail(step)

        if not filelist:
            log_error("No corpus file generated.")
            log_fail("Split")
        else:
            self.mergeCorpus(filelist)
            log_done("Split")

    def _pureName(self, filename):
        """Return the base name of ``filename`` without its last extension.

        Only the final extension is dropped (``a.tar.gz`` -> ``a.tar``) and a
        name without any extension is returned unchanged.

        NOTE: the directory part is discarded, so two raw files with the same
        base name in different directories yield the same pure name.
        """
        basename = os.path.basename(filename)
        # splitext keeps extension-less names intact, whereas
        # rpartition('.') would return an empty name for them.
        return os.path.splitext(basename)[0]

    def _mergeFiles(self, filelist, src, target, lang):
        """Concatenate ``filelist`` into the corpus file and keep a backup.

        The files are written, in order, to the path returned by
        ``getCorpusFile(src, target, lang)``; the result is then copied to
        the same path with an ``.orig`` suffix.
        """
        corpus = self.config.getCorpusFile(src, target, lang)
        with open(corpus, "w") as cf:
            for afile in filelist:
                with open(afile, "r") as f:
                    cf.writelines(f)
        # Copy only after the merged file has been closed (and so flushed).
        shutil.copyfile(corpus, corpus + ".orig")

    def mergeCorpus(self, filelist):
        """Merge the per-file corpus files of every target language.

        For each target language, the source-side and target-side named
        corpus files of all entries in ``filelist`` are merged into the two
        corpus files, after which the per-file inputs are deleted.
        """
        srclang = self.config.src
        names = [self._pureName(filename) for filename in filelist]
        for targetlang in self.config.targets:
            slist = [self.config.getNamedCorpusFile(srclang, targetlang, name, srclang)
                     for name in names]
            tlist = [self.config.getNamedCorpusFile(srclang, targetlang, name, targetlang)
                     for name in names]
            self._mergeFiles(slist, srclang, targetlang, srclang)
            self._mergeFiles(tlist, srclang, targetlang, targetlang)
            # The per-file pieces are no longer needed once merged.
            for path in slist + tlist:
                os.remove(path)