Exemplo n.º 1
0
    def __init__(self, path):
        """Build the config object from the configuration file at ``path``.

        All data members are first set to None. The file is then read with
        self._readConfig() and checked with self._validateConfig(). A
        ConfigException raised by either step is logged and re-raised.
        """

        # The config data members, all unset until the file has been read.
        for member in (
            "project",            # project name     :string
            "exportdir",          # export directory :path
            "username",           # user name        :string
            "userpath",           # user directory   :path
            "rawfiles",           # raw file list    :list of path
            "src",                # source lang      :string
            "targets",            # target langs     :list of string
            "stanford_execpath",  # Stanford Chinese Word Segmenter path     :path
            "stanford_standard",  # Stanford Chinese Word Segmenter standard :string
        ):
            setattr(self, member, None)

        try:
            log_start("Config")
            log_stderr("Config file: '{0}'".format(path))

            self._readConfig(path)
            self._validateConfig()

            log_done("Config")
        except ConfigException as err:
            # Report the failure here, then let the caller decide what to do.
            log_error(err.message)
            log_fail("Config")
            raise
Exemplo n.º 2
0
    def generateCorpus(self):
        """Split every raw file of the config and merge the successful ones.

        Raises SplitException when, after self._prepare(), the config has no
        target language. A SplitException raised while handling one raw file
        is logged as a warning and that file is left out of the merge.
        """
        log_start("Split")
        self._prepare()
        if len(self.config.targets) == 0:
            raise SplitException("Prepare the directory failed.")

        finished = []
        for rawfile in self.config.rawfiles:
            label = "Split {0}".format(rawfile)
            try:
                log_start(label)
                self.fillPool(rawfile)
                self.splitter.split(rawfile)
                self.filepool.closeFiles()
                self.filepool.clean()
                finished.append(rawfile)
                log_done(label)
            except SplitException as err:
                log_warning(err.message)
                # TODO: delete the files when the split failed.
                log_fail(label)

        if finished:
            self.mergeCorpus(finished)
            log_done("Split")
        else:
            log_error("No corpus file generated.")
            log_fail("Split")
Exemplo n.º 3
0
def filter(pcconfig, lang):
    """Tokenize the corpus file of ``lang`` in place.

    The corpus file is passed through normalize(), then through the external
    tokenizer.perl script, then through detoken(). The result of each stage
    is copied back over the corpus file.

    pcconfig -- object whose ``config`` and ``target`` attributes are read.
    lang     -- language whose corpus file is processed; langName(lang) is
                handed to the tokenizer as its ``-l`` option.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # Normalize the lines. The with-blocks close both handles even when
    # normalize() raises.
    with open(filename, "r") as infile:
        with open(filename + ".tmp", "w") as outfile:
            normalize(infile, outfile)
    shutil.copyfile(filename + ".tmp", filename)

    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = (" -l " + langName(lang) +
                    ' < "' + filename + '"' +
                    ' > "' + filename + ext + '"' +
                    " 2> /dev/null")
    scriptcmd = scriptpath + scriptname + scriptparams
    # NOTE(review): the command is interpreted by the shell and its exit
    # status is ignored; a failing tokenizer is not detected here.
    print(scriptcmd)
    os.system(scriptcmd)

    with open(filename + ext, "r") as infile:
        with open(filename + ext + ".detok", "w") as outfile:
            detoken(infile, outfile)
    shutil.copyfile(filename + ext + ".detok", filename)
    log_done("tokenize " + lang)
Exemplo n.º 4
0
def filter(pcconfig, lang):
    """Tokenize the corpus file of ``lang`` in place with the chasen binary.

    chasen is started from ./corpustool/engine/ja-JP/ and writes to
    ``<corpus>.tok``. That output is passed through detoken() and the result
    is copied back over the corpus file.

    pcconfig -- object whose ``config`` and ``target`` attributes are read.
    lang     -- language whose corpus file is processed.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    oldcwd = os.getcwd()
    # The command runs after a chdir, so a relative corpus path has to be
    # anchored to the original working directory. os.path.join() returns an
    # absolute ``filename`` unchanged.
    corpuspath = os.path.join(oldcwd, filename)

    scriptpath = "./corpustool/engine/ja-JP/"
    scriptname = "./chasen"
    scriptparams = (' -i w "' + corpuspath + '"' +
                    ' > "' + corpuspath + ext + '"' +
                    " 2> /dev/null")
    scriptcmd = scriptname + scriptparams

    os.chdir(scriptpath)
    try:
        print(scriptcmd)
        os.system(scriptcmd)
    finally:
        # Always return to the original directory, even on error.
        os.chdir(oldcwd)

    # The with-blocks close both handles even when detoken() raises.
    with open(filename + ext, "r") as infile:
        with open(filename + ext + ".detok", "w") as outfile:
            detoken(infile, outfile)
    shutil.copyfile(filename + ext + ".detok", filename)
    log_done("tokenize " + lang)
Exemplo n.º 5
0
def filter(pcconfig, lang):
    """Tokenize the corpus file of ``lang`` in place.

    Stages, each one copying its result back over the corpus file:
    normalize(), the external tokenizer.perl script, and detoken().

    pcconfig -- object whose ``config`` and ``target`` attributes are read.
    lang     -- language whose corpus file is processed; langName(lang) is
                handed to the tokenizer as its ``-l`` option.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    normalized = filename + ".tmp"
    tokenized = filename + ext
    detokenized = tokenized + ".detok"

    # Normalize the lines; the handles are released even if normalize()
    # raises.
    with open(filename, "r") as infile:
        with open(normalized, "w") as outfile:
            normalize(infile, outfile)
    shutil.copyfile(normalized, filename)

    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = (" -l " + langName(lang) +
                    ' < "' + filename + '"' +
                    ' > "' + tokenized + '"' +
                    " 2> /dev/null")
    scriptcmd = scriptpath + scriptname + scriptparams
    # NOTE(review): the exit status of the shell command is not checked.
    print(scriptcmd)
    os.system(scriptcmd)

    # The handles are released even if detoken() raises.
    with open(tokenized, "r") as infile:
        with open(detokenized, "w") as outfile:
            detoken(infile, outfile)
    shutil.copyfile(detokenized, filename)
    log_done("tokenize " + lang)
Exemplo n.º 6
0
def filter(pcconfig, lang):
    """Tokenize the corpus file of ``lang`` in place with the chasen binary.

    The binary is launched from ./corpustool/engine/ja-JP/ with its output
    redirected to ``<corpus>.tok``. That file is passed through detoken()
    and the result replaces the corpus file.

    pcconfig -- object whose ``config`` and ``target`` attributes are read.
    lang     -- language whose corpus file is processed.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    oldcwd = os.getcwd()
    # Anchor a relative corpus path to the directory we start from, because
    # the shell command below runs after a chdir. An absolute ``filename``
    # is returned unchanged by os.path.join().
    corpuspath = os.path.join(oldcwd, filename)

    scriptpath = "./corpustool/engine/ja-JP/"
    scriptname = "./chasen"
    scriptparams = (' -i w "' + corpuspath + '"' +
                    ' > "' + corpuspath + ext + '"' +
                    " 2> /dev/null")
    scriptcmd = scriptname + scriptparams

    os.chdir(scriptpath)
    try:
        print(scriptcmd)
        os.system(scriptcmd)
    finally:
        # Restore the working directory whatever happened above.
        os.chdir(oldcwd)

    # Both handles are closed even if detoken() raises.
    with open(filename + ext, "r") as infile:
        with open(filename + ext + ".detok", "w") as outfile:
            detoken(infile, outfile)
    shutil.copyfile(filename + ext + ".detok", filename)
    log_done("tokenize " + lang)
Exemplo n.º 7
0
def filter(pcconfig, lang):
    """Tokenize the corpus file of ``lang`` in place, using the Stanford
    Chinese segmenter when one is configured.

    Stages, each one writing its result back over the corpus file:
      1. normalize() the lines;
      2. run tokenizer.perl;
      3. if config.stanford_execpath is set, run its segment.sh with
         config.stanford_standard;
      4. run tokenizer.perl again and pass its output through detoken().

    pcconfig -- object whose ``config`` and ``target`` attributes are read.
    lang     -- language whose corpus file is processed; langName(lang) is
                handed to the tokenizer as its ``-l`` option.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    segmenter_execpath = config.stanford_execpath
    segmenter_standard = config.stanford_standard

    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # Normalize the lines. The with-blocks close both handles even when
    # normalize() raises.
    with open(filename, "r") as infile:
        with open(filename + ".tmp", "w") as outfile:
            normalize(infile, outfile)
    shutil.copyfile(filename + ".tmp", filename)

    # If the corpus is not tokenized first, the Stanford Chinese Segmenter
    # will drop the string "}。".
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = (" -l " + langName(lang) +
                    ' < "' + filename + '"' +
                    ' > "' + filename + ".tmp" + '"' +
                    " 2> /dev/null")
    scriptcmd = scriptpath + scriptname + scriptparams
    print(scriptcmd)
    os.system(scriptcmd)
    shutil.move(filename + ".tmp", filename)

    if segmenter_execpath is not None:
        scriptname = os.path.expanduser(segmenter_execpath) + "/segment.sh"
        print("segmenter path : " + scriptname)

        scriptparams = (" " + segmenter_standard +
                        ' "' + filename + '"' +
                        " UTF-8 0" + " 2> /dev/null" +
                        ' > "' + filename + ".cntok" + '"')
        # Wrap the script name in double quotes so that the script can still
        # be executed when its path contains whitespace.
        # #2830571
        scriptcmd = '"' + scriptname + '"' + scriptparams
        print(scriptcmd)
        os.system(scriptcmd)
        shutil.copy(filename + ".cntok", filename)

    # The Stanford Chinese Segmenter combines "{ 1 }" back to "{1}", but not
    # perfectly, so filter the corpus with the tokenizer and detoken again.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = (" -l " + langName(lang) +
                    ' < "' + filename + '"' +
                    ' > "' + filename + ext + '"' +
                    " 2> /dev/null")
    scriptcmd = scriptpath + scriptname + scriptparams
    print(scriptcmd)
    os.system(scriptcmd)

    # Both handles are closed even if detoken() raises.
    with open(filename + ext, "r") as infile:
        with open(filename + ext + ".detok", "w") as outfile:
            detoken(infile, outfile)
    shutil.copyfile(filename + ext + ".detok", filename)
    log_done("tokenize " + lang)
Exemplo n.º 8
0
def filter(pcconfig, lang):
    """Run cleanNum() over the corpus file of ``lang`` and replace the file
    with the result.

    The output is first written to ``<corpus>.numclean`` and then copied
    back over the corpus file.

    pcconfig -- object whose ``config`` and ``target`` attributes are read.
    lang     -- language whose corpus file is processed.
    """
    log_start("num_clean " + lang)
    ext = ".numclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # The with-blocks close both handles even when cleanNum() raises.
    with open(filename, "r") as infile:
        with open(filename + ext, "w") as outfile:
            cleanNum(infile, outfile)

    shutil.copyfile(filename + ext, filename)
    log_done("num_clean " + lang)
Exemplo n.º 9
0
def filter(pcconfig):
    """Call clean_weird_diff_align() on the source and target corpus files.

    The threshold is the integer text of the first <diff> element found in
    pcconfig.xml_frag.

    pcconfig -- object whose ``config``, ``target`` and ``xml_frag``
                attributes are read.
    """
    log_start("diff_align")
    ext = ".diff_align"  # NOTE(review): not referenced again in this function
    cfg = pcconfig.config
    source_path = cfg.getCorpusFile(cfg.src, pcconfig.target, cfg.src)
    target_path = cfg.getCorpusFile(cfg.src, pcconfig.target, pcconfig.target)

    diff_nodes = parseString(pcconfig.xml_frag).getElementsByTagName("diff")
    threshold = int(diff_nodes[0].firstChild.data)

    clean_weird_diff_align(source_path, target_path, threshold)
    log_done("diff_align")
Exemplo n.º 10
0
def filter(pcconfig):
    """Call cleanDup() on the source and target corpus files.

    The restricted flag is True exactly when the text of the first
    <restricted> element in pcconfig.xml_frag equals "yes".

    pcconfig -- object whose ``config``, ``target`` and ``xml_frag``
                attributes are read.
    """
    log_start("dup_clean")
    ext = ".dupclean"  # NOTE(review): not referenced again in this function
    cfg = pcconfig.config
    source_path = cfg.getCorpusFile(cfg.src, pcconfig.target, cfg.src)
    target_path = cfg.getCorpusFile(cfg.src, pcconfig.target, pcconfig.target)

    flag_nodes = parseString(pcconfig.xml_frag).getElementsByTagName("restricted")
    restricted = flag_nodes[0].firstChild.data == "yes"

    cleanDup(source_path, target_path, restricted)
    log_done("dup_clean")
Exemplo n.º 11
0
def filter(pcconfig, lang):
    """Run lowercase.perl over the corpus file of ``lang`` in place.

    The script reads the corpus file on stdin and writes to
    ``<corpus>.low``, which is then copied back over the corpus file.

    pcconfig -- object whose ``config`` and ``target`` attributes are read.
    lang     -- language whose corpus file is processed.
    """
    log_start("lowercase " + lang)
    cfg = pcconfig.config
    corpus = cfg.getCorpusFile(cfg.src, pcconfig.target, lang)
    lowered = corpus + ".low"

    script = "./corpustool/third-party/scripts/" + "lowercase.perl"
    command = (script +
               ' < "' + corpus + '"' +
               ' > "' + lowered + '"' +
               " 2> /dev/null")
    print(command)
    os.system(command)

    shutil.copyfile(lowered, corpus)
    log_done("lowercase " + lang)
Exemplo n.º 12
0
def main():
    """The main function of the convert module.

    Parse the command line, build a ConversionConfig from the XML file given
    with -f/--file and run a Conversion with it.

    The process always ends through sys.exit():
      errno.EINVAL -- no file was given, or the path is not a regular file;
      -1           -- a ConfigException or any other Exception was raised;
      0            -- the conversion finished.
    """

    progname = sys.argv[0]
    usage = """%prog -f command.xml"""

    parser = OptionParser(
        usage, version="%prog v0.1 (c) 2010 by Leo Jiang <*****@*****.**>")
    parser.add_option("-f",
                      "--file",
                      dest="filename",
                      metavar="FILE",
                      type="string",
                      help="read the command from file.")
    options, _args = parser.parse_args()

    log_stderr("convert.py v0.1 (c) 2010 by Leo Jiang <*****@*****.**>")

    if options.filename is None:
        log_stderr("Usage: {0} -f command.xml".format(progname))
        log_stderr(os.strerror(errno.EINVAL) + " : config file not specified.")
        sys.exit(errno.EINVAL)

    path = os.path.abspath(options.filename)
    if not os.path.isfile(path):
        log_error(
            os.strerror(errno.EINVAL) +
            " : file '{0}' not existed.".format(path))
        log_fail("Convert")
        sys.exit(errno.EINVAL)

    try:
        config = ConversionConfig(path)
        conversion = Conversion(config)
        conversion.run()
    except ConfigException:
        log_fail("Convert: ConfigException")
        sys.exit(-1)
    except Exception as e:
        print("failed.")
        # ``e.message`` is deprecated and is empty for exceptions built from
        # several arguments (IOError, OSError, ...). Fall back to str(e) so
        # the reason for the failure is not lost.
        log_fail(getattr(e, "message", "") or str(e))
        log_fail("Convert: unknown exception.")
        sys.exit(-1)

    log_done("Convert")
    sys.exit(0)
Exemplo n.º 13
0
def filter(pcconfig):
    """Run an ExtraLongCleaner over the source and target corpus files.

    The two thresholds are the integer texts of the first <source> and
    <target> elements of pcconfig.xml_frag; both are printed before the
    cleaner runs.

    pcconfig -- object whose ``config``, ``target`` and ``xml_frag``
                attributes are read.
    """
    log_start("extra_long")
    ext = ".extra_long"  # NOTE(review): not referenced again in this function
    cfg = pcconfig.config
    source_path = cfg.getCorpusFile(cfg.src, pcconfig.target, cfg.src)
    target_path = cfg.getCorpusFile(cfg.src, pcconfig.target, pcconfig.target)
    doc = parseString(pcconfig.xml_frag)

    def threshold(tag):
        # Integer text of the first element named ``tag``.
        return int(doc.getElementsByTagName(tag)[0].firstChild.data)

    source_limit = threshold("source")
    target_limit = threshold("target")
    print("{0} {1}".format(source_limit, target_limit))

    ExtraLongCleaner(source_path, target_path, source_limit, target_limit).clean()
    log_done("extra_long")
Exemplo n.º 14
0
def filter(pcconfig):
    """Call cleanDup() on the source and target corpus files.

    cleanDup() receives True as its third argument only when the text of the
    first <restricted> element of pcconfig.xml_frag is exactly "yes".

    pcconfig -- object whose ``config``, ``target`` and ``xml_frag``
                attributes are read.
    """
    log_start("dup_clean")
    ext = ".dupclean"  # NOTE(review): not referenced again in this function
    settings = pcconfig.config
    src_path = settings.getCorpusFile(settings.src, pcconfig.target, settings.src)
    tgt_path = settings.getCorpusFile(settings.src, pcconfig.target, pcconfig.target)

    document = parseString(pcconfig.xml_frag)
    first_flag = document.getElementsByTagName("restricted")[0]
    cleanDup(src_path, tgt_path, first_flag.firstChild.data == "yes")

    log_done("dup_clean")
Exemplo n.º 15
0
def filter(pcconfig):
    """Run an ExtraLongCleaner over the source and target corpus files.

    The thresholds come from pcconfig.xml_frag: the integer text of its
    first <source> element and of its first <target> element. Both values
    are printed before the cleaner runs.

    pcconfig -- object whose ``config``, ``target`` and ``xml_frag``
                attributes are read.
    """
    log_start("extra_long")
    ext = ".extra_long"  # NOTE(review): not referenced again in this function
    settings = pcconfig.config
    src_path = settings.getCorpusFile(settings.src, pcconfig.target, settings.src)
    tgt_path = settings.getCorpusFile(settings.src, pcconfig.target, pcconfig.target)

    document = parseString(pcconfig.xml_frag)
    src_limit, tgt_limit = [
        int(document.getElementsByTagName(tag)[0].firstChild.data)
        for tag in ("source", "target")
    ]
    print("{0} {1}".format(src_limit, tgt_limit))

    cleaner = ExtraLongCleaner(src_path, tgt_path, src_limit, tgt_limit)
    cleaner.clean()
    log_done("extra_long")
Exemplo n.º 16
0
def filter(pcconfig, lang):
    """Replace every ``<ph ...>{N}</ph>`` tag in the corpus file of ``lang``
    with its bare ``{N}`` placeholder.

    The cleaned text is written to ``<corpus>.tagclean`` and then copied
    back over the corpus file.

    pcconfig -- object whose ``config`` and ``target`` attributes are read.
    lang     -- language whose corpus file is processed.
    """
    log_start("phtag_clean " + lang)
    ext = ".tagclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # Raw string, so the regex escapes are not read as string escapes.
    # The attribute part accepts the same text as the former
    # (?:\s+[\w="]*)* (nothing, or whitespace followed by any run of
    # whitespace / word characters / '=' / '"'), but it has no nested
    # quantifier and therefore cannot backtrack exponentially.
    pattern = re.compile(r'<ph(?:\s[\s\w="]*)?>(\{\d+\})<\/ph>')

    # The with-blocks close both handles even if reading or writing fails.
    with open(filename, "r") as infile:
        with open(filename + ext, "w") as outfile:
            for line in infile:
                outfile.write(pattern.sub(r'\1', line))

    shutil.copyfile(filename + ext, filename)
    log_done("phtag_clean " + lang)
Exemplo n.º 17
0
def filter(pcconfig, lang):
    """Strip the ``<ph ...>`` / ``</ph>`` wrapper from every ``{N}``
    placeholder in the corpus file of ``lang``.

    Output goes to ``<corpus>.tagclean`` first and is then copied back over
    the corpus file.

    pcconfig -- object whose ``config`` and ``target`` attributes are read.
    lang     -- language whose corpus file is processed.
    """
    log_start("phtag_clean " + lang)
    ext = ".tagclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    cleaned = filename + ext

    # Written as a raw string so that \s, \w, \d, \{ reach the regex engine
    # untouched. The attribute part matches the same text as the former
    # (?:\s+[\w="]*)*, i.e. nothing or whitespace followed by any mix of
    # whitespace / word characters / '=' / '"'. It does so without a nested
    # quantifier, so long whitespace runs cannot cause exponential
    # backtracking.
    pattern = re.compile(r'<ph(?:\s[\s\w="]*)?>(\{\d+\})<\/ph>')

    # Both handles are closed even when an error interrupts the loop.
    with open(filename, "r") as infile:
        with open(cleaned, "w") as outfile:
            for line in infile:
                outfile.write(pattern.sub(r'\1', line))

    shutil.copyfile(cleaned, filename)
    log_done("phtag_clean " + lang)
Exemplo n.º 18
0
def main():
    """The main function of the convert module.

    Parse the command line, create the ConversionConfig from the XML file
    named by -f/--file, then run the Conversion built from that config.

    The function never returns normally; it leaves through sys.exit() with
      errno.EINVAL -- when no file was given or the path is not a file,
      -1           -- when a ConfigException or another Exception is raised,
      0            -- when the conversion finished.
    """

    progname = sys.argv[0]
    usage = """%prog -f command.xml"""

    parser = OptionParser(usage, version="%prog v0.1 (c) 2010 by Leo Jiang <*****@*****.**>")
    parser.add_option("-f", "--file", dest="filename", metavar="FILE", type="string",
                      help="read the command from file.")
    options, _args = parser.parse_args()

    log_stderr("convert.py v0.1 (c) 2010 by Leo Jiang <*****@*****.**>")

    if options.filename is None:
        log_stderr("Usage: {0} -f command.xml".format(progname))
        log_stderr(os.strerror(errno.EINVAL) + " : config file not specified.")
        sys.exit(errno.EINVAL)

    path = os.path.abspath(options.filename)
    if not os.path.isfile(path):
        log_error(os.strerror(errno.EINVAL) + " : file '{0}' not existed.".format(path))
        log_fail("Convert")
        sys.exit(errno.EINVAL)

    try:
        config = ConversionConfig(path)
        conversion = Conversion(config)
        conversion.run()
    except ConfigException:
        log_fail("Convert: ConfigException")
        sys.exit(-1)
    except Exception as e:
        print("failed.")
        # ``e.message`` is deprecated and stays empty for multi-argument
        # exceptions such as IOError; use str(e) whenever it gives nothing.
        log_fail(getattr(e, "message", "") or str(e))
        log_fail("Convert: unknown exception.")
        sys.exit(-1)

    log_done("Convert")
    sys.exit(0)
Exemplo n.º 19
0
def filter(pcconfig, lang):
    """Delete URLs from the corpus file of ``lang`` in place.

    Every match of the URL pattern is replaced by the empty string. The
    result is written to ``<corpus>.urlclean`` and then copied back over the
    corpus file.

    pcconfig -- object whose ``config`` and ``target`` attributes are read.
    lang     -- language whose corpus file is processed.
    """
    log_start("url_clean " + lang)
    ext = ".urlclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # [Protocol] [Username:Password] Subdomains TopLevelDomains [Port] [Directory] [Query] [Anchor]
    # Please read the pattern carefully to understand it.
    # reference: http://flanders.co.nz/2009/11/08/a-good-url-regular-expression-repost/
    # A blog posted by Ivan Porto Carrero.
    #
    # The URL is matched only when it is followed by $, < or {. Mostly a URL
    # should be ended with $, but it is followed by < before phtag_clean and
    # by { after phtag_clean. The lookahead is written as the alternation
    # ($|<|{) and not as [$<{] because $ is not special inside [].
    # Compiled once, outside the per-line loop.
    urlPattern = re.compile(r'((?#Protocol)(?:(?:ht|f)tp(?:s?)\:\/\/|~\/|\/)?(?#Username:Password)(?:\w+:\w+@)?(?#Subdomains)(?:(?:[-\w]+\.)+(?#TopLevel Domains)(?:com|org|net|gov|mil|biz|info|mobi|name|aero|jobs|museum|travel|[a-z]{2}))(?#Port)(?::[\d]{1,5})?(?#Directories)(?:(?:(?:\/(?:[-\w~!$+|.,=]|%[a-f\d]{2})+)+|\/)+|\?|#)?(?#Query)(?:(?:\?(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)(?:&(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)*)*(?#Anchor)(?:#(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)?)(?=($|<|{))')

    # The with-blocks close both handles even if reading or writing fails.
    with open(filename, "r") as infile:
        with open(filename + ext, "w") as outfile:
            for line in infile:
                # TODO: log the removed URLs together with their line number.
                outfile.write(urlPattern.sub(r'', line))

    shutil.copyfile(filename + ext, filename)
    log_done("url_clean " + lang)
Exemplo n.º 20
0
def filter(pcconfig, lang):
    """Delete URLs from the corpus file of ``lang`` in place.

    Each line has every match of the URL pattern removed. The output is
    collected in ``<corpus>.urlclean`` and then copied back over the corpus
    file.

    pcconfig -- object whose ``config`` and ``target`` attributes are read.
    lang     -- language whose corpus file is processed.
    """
    log_start("url_clean " + lang)
    ext = ".urlclean"
    config = pcconfig.config
    filename = config.getCorpusFile(config.src, pcconfig.target, lang)
    cleaned = filename + ext

    # [Protocol] [Username:Password] Subdomains TopLevelDomains [Port] [Directory] [Query] [Anchor]
    # Please read the pattern carefully to understand it.
    # reference: http://flanders.co.nz/2009/11/08/a-good-url-regular-expression-repost/
    # A blog posted by Ivan Porto Carrero.
    #
    # A URL is matched only when $, < or { follows it. Mostly a URL should be
    # ended with $, but it is followed by < before phtag_clean and by { after
    # phtag_clean. The trailing lookahead uses the alternation ($|<|{)
    # because $ would not be special inside a [] class.
    # The pattern is compiled once, before the loop.
    urlPattern = re.compile(r'((?#Protocol)(?:(?:ht|f)tp(?:s?)\:\/\/|~\/|\/)?(?#Username:Password)(?:\w+:\w+@)?(?#Subdomains)(?:(?:[-\w]+\.)+(?#TopLevel Domains)(?:com|org|net|gov|mil|biz|info|mobi|name|aero|jobs|museum|travel|[a-z]{2}))(?#Port)(?::[\d]{1,5})?(?#Directories)(?:(?:(?:\/(?:[-\w~!$+|.,=]|%[a-f\d]{2})+)+|\/)+|\?|#)?(?#Query)(?:(?:\?(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)(?:&(?:[-\w~!$+|.,*:]|%[a-f\d{2}])+=?(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)*)*(?#Anchor)(?:#(?:[-\w~!$+|.,*:=]|%[a-f\d]{2})*)?)(?=($|<|{))')

    # Both handles are closed even when an error interrupts the loop.
    with open(filename, "r") as infile:
        with open(cleaned, "w") as outfile:
            for line in infile:
                # TODO: log the removed URLs together with their line number.
                outfile.write(urlPattern.sub(r'', line))

    shutil.copyfile(cleaned, filename)
    log_done("url_clean " + lang)
Exemplo n.º 21
0
def filter(pcconfig, lang):
    """Tokenize the corpus file of ``lang`` in place, running the Stanford
    Chinese segmenter in between when config.stanford_execpath is set.

    Order of the stages (each one overwrites the corpus file):
      1. normalize();
      2. tokenizer.perl;
      3. <stanford_execpath>/segment.sh with config.stanford_standard,
         skipped when no path is configured;
      4. tokenizer.perl once more, followed by detoken().

    pcconfig -- object whose ``config`` and ``target`` attributes are read.
    lang     -- language whose corpus file is processed; langName(lang) is
                handed to the tokenizer as its ``-l`` option.
    """
    log_start("tokenize " + lang)
    ext = ".tok"
    config = pcconfig.config
    segmenter_execpath = config.stanford_execpath
    segmenter_standard = config.stanford_standard

    filename = config.getCorpusFile(config.src, pcconfig.target, lang)

    # Normalize the lines; both handles are closed even if normalize()
    # raises.
    with open(filename, "r") as infile:
        with open(filename + ".tmp", "w") as outfile:
            normalize(infile, outfile)
    shutil.copyfile(filename + ".tmp", filename)

    # If the corpus is not tokenized first, the Stanford Chinese Segmenter
    # will drop the string "}。".
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = (" -l " + langName(lang) +
                    ' < "' + filename + '"' +
                    ' > "' + filename + ".tmp" + '"' +
                    " 2> /dev/null")
    scriptcmd = scriptpath + scriptname + scriptparams
    print(scriptcmd)
    os.system(scriptcmd)
    shutil.move(filename + ".tmp", filename)

    if segmenter_execpath is not None:
        scriptpath = os.path.expanduser(segmenter_execpath)
        scriptname = scriptpath + "/segment.sh"
        print("segmenter path : " + scriptname)

        scriptparams = (" " + segmenter_standard +
                        ' "' + filename + '"' +
                        " UTF-8 0" + " 2> /dev/null" +
                        ' > "' + filename + ".cntok" + '"')
        # Wrap the script name in double quotes; otherwise the script cannot
        # be executed when its path contains whitespace.
        # #2830571
        scriptcmd = '"' + scriptname + '"' + scriptparams
        print(scriptcmd)
        os.system(scriptcmd)
        shutil.copy(filename + ".cntok", filename)

    # The Stanford Chinese Segmenter combines "{ 1 }" back to "{1}", but not
    # perfectly, so filter the corpus with the tokenizer and detoken again.
    scriptpath = "./corpustool/third-party/scripts/"
    scriptname = "tokenizer.perl"
    scriptparams = (" -l " + langName(lang) +
                    ' < "' + filename + '"' +
                    ' > "' + filename + ext + '"' +
                    " 2> /dev/null")
    scriptcmd = scriptpath + scriptname + scriptparams
    print(scriptcmd)
    os.system(scriptcmd)

    # Both handles are closed even if detoken() raises.
    with open(filename + ext, "r") as infile:
        with open(filename + ext + ".detok", "w") as outfile:
            detoken(infile, outfile)
    shutil.copyfile(filename + ext + ".detok", filename)
    log_done("tokenize " + lang)