Example #1
def main():
  logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
  parser = optparse.OptionParser("%prog [options]")
  parser.add_option("-e", "--target-language", type="string", dest="target_language")
  parser.add_option("-f", "--source-language", type="string", dest="source_language")
  parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
  parser.add_option("-t", "--tagged-corpus", type="string", dest="tagged_stem")
  parser.add_option("-a", "--align", type="string", dest="align_file")
  parser.add_option("-w", "--working-dir", type="string", dest="working_dir")


  parser.set_defaults(
    target_language = "en",
    source_language = "de",
    corpus_stem  = "test",
    align_file = "test.align",
    working_dir = "working",
  )
  options, args = parser.parse_args(sys.argv)
  if not os.path.exists(options.working_dir):
    LOG.error("Working directory '%s' not found" % working_dir)
    sys.exit(1)

  m, n = None, None
  for line in open(options.working_dir + "/info"):
    name, value = line[:-1].split()
    if name == "m": m = int(value)
    if name == "n": n = int(value)
  if m is None or n is None:
    LOG.error("info file is incomplete")
    sys.exit(1)

  tvocab, offset = read_vocab(options.working_dir + "/vocab.target")
  svocab, offset = read_vocab(options.working_dir + "/vocab.source", offset+1)

  file_stem = os.path.basename(options.corpus_stem)
  ofh = open(options.working_dir + "/" + file_stem + ".ngrams", "w")
  extract.get_ngrams(options.corpus_stem,
                     options.align_file,
                     options.tagged_stem,
                     svocab,
                     tvocab,
                     options.source_language,
                     options.target_language,
                     m,
                     n,
                     ofh)
  # Close (and flush) the n-grams file before re-opening it below.
  ofh.close()

  numberized_file = options.working_dir + "/" + file_stem + ".numberized"
  ngrams_file_handle = open(options.working_dir + "/" + file_stem + ".ngrams", 'r')
  numberized_file_handle = open(numberized_file, 'w')

  # Numberize the file.
  for line in ngrams_file_handle:
    numberized_file_handle.write(extract.numberize(line, m, n, svocab, tvocab))

  numberized_file_handle.close()
  ngrams_file_handle.close()
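
Note: Examples #1 and #2 are pasted without their module preamble; they rely on the stdlib (logging, optparse, os, sys), a module-level LOG, the project-local extract module, and a read_vocab helper. A minimal sketch of that preamble, inferred only from the call sites above and not taken from the original project, might look like this:

import logging
import optparse
import os
import sys

import extract  # project-local module providing get_ngrams() and numberize()

LOG = logging.getLogger(__name__)

def read_vocab(filename, offset=0):
    # Assign consecutive ids, starting at `offset`, to one token per line.
    # Returning (token -> id map, last id assigned) is an assumption, based
    # only on the chained calls `tvocab, offset = read_vocab(...)` and
    # `read_vocab(..., offset + 1)` in the examples.
    vocab = {}
    last_id = offset - 1
    with open(filename) as fh:
        for last_id, line in enumerate(fh, start=offset):
            vocab[line.rstrip("\n")] = last_id
    return vocab, last_id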
Example #2
File: extract_test.py Project: j0ma/lmm
def main():
    logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.DEBUG)
    parser = optparse.OptionParser("%prog [options]")
    parser.add_option("-e",
                      "--target-language",
                      type="string",
                      dest="target_language")
    parser.add_option("-f",
                      "--source-language",
                      type="string",
                      dest="source_language")
    parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
    parser.add_option("-t",
                      "--tagged-corpus",
                      type="string",
                      dest="tagged_stem")
    parser.add_option("-a", "--align", type="string", dest="align_file")
    parser.add_option("-w", "--working-dir", type="string", dest="working_dir")

    parser.set_defaults(target_language="en",
                        source_language="de",
                        corpus_stem="test",
                        align_file="test.align",
                        working_dir="working")
    options, args = parser.parse_args(sys.argv)
    if not os.path.exists(options.working_dir):
        raise Exception("Working directory '%s' not found" %
                        options.working_dir)

    m, n = None, None
    for line in open(options.working_dir + "/info"):
        name, value = line[:-1].split()
        if name == "m":
            m = int(value)
        if name == "n":
            n = int(value)
    if m is None or n is None:
        raise Exception("Info file is incomplete.")

    tvocab, offset = read_vocab(options.working_dir + "/vocab.target")
    svocab, offset = read_vocab(options.working_dir + "/vocab.source",
                                offset + 1)

    file_stem = os.path.basename(options.corpus_stem)
    ofh = open(options.working_dir + "/" + file_stem + ".ngrams", "w")
    extract.get_ngrams(options.corpus_stem, options.align_file,
                       options.tagged_stem, svocab, tvocab,
                       options.source_language, options.target_language, m, n,
                       ofh)
    # Close (and flush) the n-grams file before re-opening it below.
    ofh.close()

    numberized_file = options.working_dir + "/" + file_stem + ".numberized"
    ngrams_file_handle = open(
        os.path.join(options.working_dir, file_stem + ".ngrams"), 'r')
    numberized_file_handle = open(numberized_file, 'w')

    # Numberize the file.
    for line in ngrams_file_handle:
        numberized_file_handle.write(
            extract.numberize(line, m, n, svocab, tvocab))

    numberized_file_handle.close()
    ngrams_file_handle.close()
Example #3
def main():
    logging.basicConfig(
        format='%(asctime)s %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
    parser = optparse.OptionParser("%prog [options]")
    parser.add_option(
        "-e", "--target-language", type="string", dest="target_language")
    parser.add_option(
        "-f", "--source-language", type="string", dest="source_language")
    parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
    parser.add_option(
        "-t", "--tagged-corpus", type="string", dest="tagged_stem")
    parser.add_option("-a", "--align", type="string", dest="align_file")
    parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
    parser.add_option("-n", "--target-context", type="int", dest="n")
    parser.add_option("-m", "--source-context", type="int", dest="m")
    parser.add_option("-s", "--prune-source-vocab", type="int", dest="sprune")
    parser.add_option("-p", "--prune-target-vocab", type="int", dest="tprune")

    parser.set_defaults(
        target_language="en",
        source_language="de",
        corpus_stem="train.10k",
        align_file="train.10k.align",
        n=5,
        m=4,
        working_dir="working",
        sprune=16000,
        tprune=16000
        )
    options, args = parser.parse_args(sys.argv)

    if not os.path.exists(options.working_dir):
        os.makedirs(options.working_dir)
    else:
        LOG.warning("Directory %s already exists, re-using" % options.working_dir)

    info_file = options.working_dir + "/info"
    if os.path.exists(info_file):
        for line in open(info_file):
            name, value = line[:-1].split()
            n_mismatch = (name == 'n' and int(value) != options.n)
            m_mismatch = (name == 'm' and int(value) != options.m)
            if n_mismatch or m_mismatch:
                LOG.error(
                    "info file exists, but parameters do not match. "
                    "Delete working directory and rerun.")
                sys.exit(1)
    else:
        ifh = open(info_file, "w")
        print>>ifh, "m", options.m
        print>>ifh, "n", options.n
        ifh.close()

    scorpus = options.corpus_stem + "." + options.source_language
    tcorpus = options.corpus_stem + "." + options.target_language

    tvocab, svocab = None, None
    # Extract vocabulary, and prune, if required.
    svocab = get_pruned_vocab(scorpus, options.sprune)
    tvocab = get_pruned_vocab(tcorpus, options.tprune)

    file_stem = os.path.basename(options.corpus_stem)
    ngram_file = options.working_dir + "/" + file_stem + ".ngrams"
    ofh = open(ngram_file, "w")

    tags = extract.get_ngrams(
        options.corpus_stem,
        options.align_file,
        options.tagged_stem,
        svocab,
        tvocab,
        options.source_language,
        options.target_language,
        options.m,
        options.n,
        ofh)
    # Close (and flush) the n-grams file before re-opening it for reading.
    ofh.close()

    # Save vocabularies.
    del svocab["<null>"]
    del tvocab["<null>"]
    del svocab["<unk>"]
    del tvocab["<unk>"]
    svocab_list = [item[0] for item in svocab.most_common()]
    tvocab_list = [item[0] for item in tvocab.most_common()]

    # <unk> is always the first vocabulary element; make sure
    # it appears in position 0.
    # The <null> token is needed in the chart decoder in order
    # to correctly estimate the probabilities of incomplete subphrases
    # that are not sentence-initial.

    tvocab_list.insert(0, "<null>")
    tvocab_list.insert(0, "<unk>")
    svocab_list.insert(0, "<unk>")

    # Get tags:
    tag_list = [item[0] for item in tags.most_common()]
    svocab_list = svocab_list + tag_list
    tvocab_list = tvocab_list + tag_list

    save_vocab(options.working_dir, "vocab.source", svocab_list)
    save_vocab(options.working_dir, "vocab.target", tvocab_list)

    # Create vocab dictionaries that map word to ID.
    tvocab_idmap = {}
    for i in range(len(tvocab_list)):
        tvocab_idmap[tvocab_list[i]] = i

    svocab_idmap = {}
    for i in range(len(svocab_list)):
        svocab_idmap[svocab_list[i]] = i + len(tvocab_idmap)

    numberized_file = options.working_dir + "/" + file_stem + ".numberized"
    ngrams_file_handle = open(ngram_file, 'r')
    numberized_file_handle = open(numberized_file, 'w')

    # Numberize the file.
    for line in ngrams_file_handle:
        numberized_file_handle.write(
            extract.numberize(
                line, options.m, options.n, svocab_idmap, tvocab_idmap))
    numberized_file_handle.close()
    ngrams_file_handle.close()
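
Examples #3 and #4 additionally call get_pruned_vocab and save_vocab. Judging from the call sites (a .most_common() method, del of the <unk> and <null> entries, and vocab files that read_vocab later loads one token per line), a plausible sketch is below; this is an assumption about their behavior, not the project's actual code:

import collections
import os

def get_pruned_vocab(corpus, prune):
    # Count whitespace-separated tokens, keeping the `prune` most frequent.
    counts = collections.Counter()
    with open(corpus) as fh:
        for line in fh:
            counts.update(line.split())
    if prune:
        counts = collections.Counter(dict(counts.most_common(prune)))
    # Ensure the special tokens exist so the callers' del statements succeed;
    # how the real code introduces them is not visible from these snippets.
    counts["<unk>"] += 1
    counts["<null>"] += 1
    return counts

def save_vocab(directory, filename, vocab_list):
    # Write one token per line, in id order, for read_vocab() to load back.
    with open(os.path.join(directory, filename), "w") as fh:
        for word in vocab_list:
            fh.write(word + "\n")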
Example #4
def main():
    logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.DEBUG)
    parser = optparse.OptionParser("%prog [options]")
    parser.add_option("-e",
                      "--target-language",
                      type="string",
                      dest="target_language")
    parser.add_option("-f",
                      "--source-language",
                      type="string",
                      dest="source_language")
    parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
    parser.add_option("-t",
                      "--tagged-corpus",
                      type="string",
                      dest="tagged_stem")
    parser.add_option("-a", "--align", type="string", dest="align_file")
    parser.add_option("-w", "--working-dir", type="string", dest="working_dir")
    parser.add_option("-n", "--target-context", type="int", dest="n")
    parser.add_option("-m", "--source-context", type="int", dest="m")
    parser.add_option("-s", "--prune-source-vocab", type="int", dest="sprune")
    parser.add_option("-p", "--prune-target-vocab", type="int", dest="tprune")

    parser.set_defaults(target_language="en",
                        source_language="de",
                        corpus_stem="train.10k",
                        align_file="train.10k.align",
                        n=5,
                        m=4,
                        working_dir="working",
                        sprune=16000,
                        tprune=16000)
    options, args = parser.parse_args(sys.argv)

    if not os.path.exists(options.working_dir):
        os.makedirs(options.working_dir)
    else:
        LOG.warning("Directory %s already exists, re-using" % options.working_dir)

    info_file = options.working_dir + "/info"
    if os.path.exists(info_file):
        for line in open(info_file):
            name, value = line[:-1].split()
            if name == "n" and int(value) != options.n or \
                name == "m" and int(value) != options.m:
                LOG.error(
                    "info file exists, but parameters do not match. Delete working directory and rerun"
                )
                sys.exit(1)
    else:
        ifh = open(info_file, "w")
        print >> ifh, "m", options.m
        print >> ifh, "n", options.n
        ifh.close()

    scorpus = options.corpus_stem + "." + options.source_language
    tcorpus = options.corpus_stem + "." + options.target_language

    tvocab, svocab = None, None
    # Extract vocabulary, and prune, if required
    svocab = get_pruned_vocab(scorpus, options.sprune)
    tvocab = get_pruned_vocab(tcorpus, options.tprune)

    file_stem = os.path.basename(options.corpus_stem)
    ngram_file = options.working_dir + "/" + file_stem + ".ngrams"
    ofh = open(ngram_file, "w")

    tags = extract.get_ngrams(options.corpus_stem, options.align_file,
                              options.tagged_stem, svocab, tvocab,
                              options.source_language, options.target_language,
                              options.m, options.n, ofh)
    # Close (and flush) the n-grams file before re-opening it for reading.
    ofh.close()

    # Save vocabularies
    del svocab["<null>"]
    del tvocab["<null>"]
    del svocab["<unk>"]
    del tvocab["<unk>"]
    svocab_list = [item[0] for item in svocab.most_common()]
    tvocab_list = [item[0] for item in tvocab.most_common()]

    # <unk> is always the first vocabulary element; make sure
    # it appears in position 0.
    # The <null> token is needed in the chart decoder in order
    # to correctly estimate the probabilities of incomplete subphrases
    # that are not sentence-initial.

    tvocab_list.insert(0, "<null>")
    tvocab_list.insert(0, "<unk>")
    svocab_list.insert(0, "<unk>")

    # Get tags:
    tag_list = [item[0] for item in tags.most_common()]
    svocab_list = svocab_list + tag_list
    tvocab_list = tvocab_list + tag_list

    save_vocab(options.working_dir, "vocab.source", svocab_list)
    save_vocab(options.working_dir, "vocab.target", tvocab_list)

    # Create vocab dictionaries that map word to ID.
    tvocab_idmap = {}
    for i in range(len(tvocab_list)):
        tvocab_idmap[tvocab_list[i]] = i

    svocab_idmap = {}
    for i in range(len(svocab_list)):
        svocab_idmap[svocab_list[i]] = i + len(tvocab_idmap)

    numberized_file = options.working_dir + "/" + file_stem + ".numberized"
    ngrams_file_handle = open(ngram_file, 'r')
    numberized_file_handle = open(numberized_file, 'w')

    # Numberize the file.
    for line in ngrams_file_handle:
        numberized_file_handle.write(
            extract.numberize(
                line, options.m, options.n, svocab_idmap, tvocab_idmap))
    numberized_file_handle.close()
    ngrams_file_handle.close()
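
Taken together, the snippets look like a training-time extractor (Examples #3 and #4, which build the vocabularies, the info file, and the numberized n-grams) and a test-time extractor (Examples #1 and #2, which reuse them). The project path under Example #2 confirms the latter's file name; assuming the training script is named extract_training.py (a guess), a typical sequence using nothing but the scripts' own defaults for the file names would be:

python extract_training.py -f de -e en -c train.10k -a train.10k.align -w working
python extract_test.py -f de -e en -c test -a test.align -w working

The order matters: the test-time script exits immediately if the working directory or its info file is missing, and it expects vocab.source and vocab.target to have been written by the training run.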