예제 #1
0
def main(options):
    """Train a syntactic NPLM: extract n-grams, train, then average null words."""

    # Each context position contributes two slots to the n-gram.
    context_slots = (options.up_context_size +
                     options.left_context_size +
                     options.right_context_size)
    options.ngram_size = 2 * context_slots
    if options.mode == 'head':
        options.ngram_size += 2
    elif options.mode == 'label':
        options.ngram_size += 1

    # Build the vocabulary from the training text unless both were supplied.
    if options.input_words_file is None or options.output_words_file is None:
        sys.stderr.write(
            "Either input vocabulary or output vocabulary not specified: "
            "extracting vocabulary from training text.\n")
        prepare_vocabulary(options)

    numberized_path = os.path.join(
        options.working_dir,
        os.path.basename(options.corpus_stem) + '.numberized')
    extract_argv = [
        '--input', options.corpus_stem,
        '--output', numberized_path,
        '--vocab', options.input_words_file,
        '--output_vocab', options.output_words_file,
        '--right_context', str(options.right_context_size),
        '--left_context', str(options.left_context_size),
        '--up_context', str(options.up_context_size),
        '--mode', options.mode,
    ]
    ngram_opts = extract_syntactic_ngrams.create_parser().parse_args(extract_argv)
    sys.stderr.write('extracting syntactic n-grams\n')
    extract_syntactic_ngrams.main(ngram_opts)

    # Numberize the validation corpus with the same settings, if one is given.
    if options.validation_corpus:
        ngram_opts.input = open(options.validation_corpus)
        options.validation_file = os.path.join(
            options.working_dir, os.path.basename(options.validation_corpus))
        ngram_opts.output = open(
            options.validation_file + '.numberized', 'w')
        sys.stderr.write('extracting syntactic n-grams (validation file)\n')
        extract_syntactic_ngrams.main(ngram_opts)
        ngram_opts.output.close()

    sys.stderr.write('training neural network\n')
    train_nplm.main(options)

    # Replace the null-word embedding by the average of all embeddings.
    sys.stderr.write('averaging null words\n')
    epoch_model = os.path.join(
        options.output_dir,
        options.output_model + '.model.nplm.' + str(options.epochs))
    final_model = os.path.join(
        options.output_dir, options.output_model + '.model.nplm')
    rc = subprocess.call([
        os.path.join(sys.path[0], 'average_null_embedding.py'),
        options.nplm_home,
        epoch_model,
        numberized_path,
        final_model,
    ])
    if rc:
        raise Exception("averaging null words failed")
예제 #2
0
def main(options):
    """Prepare NPLM n-gram data, train the network, and average null words.

    Runs nplm's prepareNeuralLM over the training (and optional validation)
    corpus, trains via train_nplm, then replaces the <null> embedding by the
    average of all input embeddings.
    """

    options.ngram_size = options.order

    if options.output_dir is None:
        options.output_dir = options.working_dir
    else:
        # Create output dir if necessary
        if not os.path.exists(options.output_dir):
            os.makedirs(options.output_dir)

    # Full paths of the files prepareNeuralLM writes into working_dir.
    vocab_file = os.path.join(options.working_dir, options.words_file)
    numberized_file = os.path.join(
        options.working_dir,
        os.path.basename(options.corpus_stem) + '.numberized')

    extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
                      '--train_text', options.corpus_stem,
                      '--ngramize', '1',
                      '--ngram_size', str(options.ngram_size),
                      '--vocab_size', str(options.vocab_size),
                      '--write_words_file', vocab_file,
                      '--train_file', numberized_file
                      ]

    sys.stderr.write('extracting n-grams\n')
    ret = subprocess.call(extraction_cmd)
    if ret:
        raise Exception("preparing neural LM failed")

    if options.validation_corpus:

        extraction_cmd = [os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
                          '--train_text', options.validation_corpus,
                          '--ngramize', '1',
                          '--ngram_size', str(options.ngram_size),
                          '--vocab_size', str(options.vocab_size),
                          '--words_file', vocab_file,
                          '--train_file', os.path.join(options.working_dir, os.path.basename(options.validation_corpus) + '.numberized')
                          ]

        sys.stderr.write('extracting n-grams (validation file)\n')
        ret = subprocess.call(extraction_cmd)
        if ret:
            raise Exception("preparing neural LM failed")
        # BUG FIX: validation_file was previously left unset on this branch,
        # so the trainer could not locate the numberized validation data.
        options.validation_file = os.path.join(
            options.working_dir, os.path.basename(options.validation_corpus))

    else:
        options.validation_file = None

    # BUG FIX: hand the trainer the full path of the vocabulary file (it is
    # written into working_dir above); the bare file name would only resolve
    # relative to the current directory.
    options.input_words_file = vocab_file
    options.output_words_file = vocab_file
    options.input_vocab_size = options.vocab_size
    options.output_vocab_size = options.vocab_size

    sys.stderr.write('training neural network\n')
    train_nplm.main(options)

    sys.stderr.write('averaging null words\n')
    average_options = averageNullEmbedding.parser.parse_args(
        ['-i', os.path.join(options.output_dir, options.output_model + '.model.nplm.' + str(options.epochs)),
         '-o', os.path.join(options.output_dir, options.output_model + '.model.nplm'),
         '-t', numberized_file,
         '-p', os.path.join(options.nplm_home, 'python')])
    averageNullEmbedding.main(average_options)
예제 #3
0
def main(options):
    """Extract syntactic n-grams, train the NPLM, then average null words."""

    # Two n-gram slots per context position, plus mode-dependent extras.
    window = (options.up_context_size +
              options.left_context_size +
              options.right_context_size)
    options.ngram_size = 2 * window
    if options.mode == 'head':
        options.ngram_size += 2
    elif options.mode == 'label':
        options.ngram_size += 1

    if options.input_words_file is None or options.output_words_file is None:
        sys.stderr.write(
            'either input vocabulary or output vocabulary not specified: extracting vocabulary from training text\n'
        )
        prepare_vocabulary(options)

    numberized = os.path.join(
        options.working_dir,
        os.path.basename(options.corpus_stem) + '.numberized')
    parser = extract_syntactic_ngrams.create_parser()
    ngram_opts = parser.parse_args([
        '--input', options.corpus_stem,
        '--output', numberized,
        '--vocab', options.input_words_file,
        '--output_vocab', options.output_words_file,
        '--right_context', str(options.right_context_size),
        '--left_context', str(options.left_context_size),
        '--up_context', str(options.up_context_size),
        '--mode', options.mode,
    ])
    sys.stderr.write('extracting syntactic n-grams\n')
    extract_syntactic_ngrams.main(ngram_opts)

    if options.validation_corpus:
        # Reuse the parsed options, swapping in validation input/output.
        ngram_opts.input = open(options.validation_corpus)
        options.validation_file = os.path.join(
            options.working_dir, os.path.basename(options.validation_corpus))
        ngram_opts.output = open(options.validation_file + '.numberized', 'w')
        sys.stderr.write('extracting syntactic n-grams (validation file)\n')
        extract_syntactic_ngrams.main(ngram_opts)
        ngram_opts.output.close()

    sys.stderr.write('training neural network\n')
    train_nplm.main(options)

    # Replace the null-word embedding by the average of all embeddings.
    sys.stderr.write('averaging null words\n')
    cmd = [
        os.path.join(sys.path[0], 'average_null_embedding.py'),
        options.nplm_home,
        os.path.join(
            options.output_dir,
            options.output_model + '.model.nplm.' + str(options.epochs)),
        numberized,
        os.path.join(options.output_dir, options.output_model + '.model.nplm'),
    ]
    if subprocess.call(cmd):
        raise Exception("averaging null words failed")
예제 #4
0
파일: train-neurallm.py 프로젝트: j0ma/lmm
def main(options):
    """Prepare NPLM data (optionally over ssh), train, and average <null>.

    Steps: run prepareNeuralLM (prefixed with ssh when options.train_host is
    set), look up the <null> vocabulary id when dropout is requested, optionally
    convert the numberized file to a memory-mapped file, numberize the
    validation corpus, train via train_nplm, then average the null embedding.
    """

    options.ngram_size = options.order

    if options.output_dir is None:
        options.output_dir = options.working_dir
    # Create dirs if necessary
    if not os.path.exists(options.working_dir):
        os.makedirs(options.working_dir)
    if not os.path.exists(options.output_dir):
        os.makedirs(options.output_dir)

    numberized_file = os.path.basename(options.corpus_stem) + '.numberized'
    vocab_file = os.path.join(options.working_dir, options.words_file)
    train_file = numberized_file
    if options.mmap:
        train_file += '.mmap'

    extraction_cmd = []
    if options.train_host:
        extraction_cmd = ["ssh", options.train_host]
    extraction_cmd += [
        os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
        '--train_text', options.corpus_stem, '--ngramize', '1', '--ngram_size',
        str(options.ngram_size), '--vocab_size',
        str(options.vocab_size), '--write_words_file', vocab_file,
        '--train_file',
        os.path.join(options.working_dir, numberized_file)
    ]

    sys.stderr.write('extracting n-grams\n')
    sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n')
    subprocess.check_call(extraction_cmd)

    # if dropout enabled, need to check which is the <null> vocab id
    null_id = None
    if options.dropout or options.input_dropout:
        # BUG FIX: read the vocabulary as bytes so .decode() works on both
        # Python 2 and 3 (text-mode lines are str in py3 and have no .decode).
        with open(vocab_file, 'rb') as vfh:
            for i, line in enumerate(vfh):
                if line.rstrip(b'\n').decode("utf8") == "<null>":
                    null_id = i
                    break
        if null_id is None:  # BUG FIX: identity comparison instead of ==
            sys.stderr.write(
                "WARN: could not identify null token, cannot enable dropout\n")
        else:
            if not options.extra_settings:
                options.extra_settings = ""
            # At least one of dropout/input_dropout is set on this path.
            options.extra_settings += " --null_index %d " % null_id
            if options.dropout:
                options.extra_settings += " --dropout %s " % options.dropout
            if options.input_dropout:
                options.extra_settings += " --input_dropout %s " % options.input_dropout

    if options.mmap:
        # Regenerate the memory-mapped training file from scratch.
        try:
            os.remove(os.path.join(options.working_dir, train_file))
        except OSError:
            pass
        mmap_cmd = []
        if options.train_host:
            mmap_cmd = ["ssh", options.train_host]
        mmap_cmd += [
            os.path.join(options.nplm_home, 'src',
                         'createMmap'), '--input_file',
            os.path.join(options.working_dir, numberized_file),
            '--output_file',
            os.path.join(options.working_dir, train_file)
        ]
        sys.stderr.write('creating memory-mapped file\n')
        sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n')
        ret = subprocess.call(mmap_cmd)
        if ret:
            raise Exception("creating memory-mapped file failed")

    if options.validation_corpus:

        extraction_cmd = []
        if options.train_host:
            extraction_cmd = ["ssh", options.train_host]
        extraction_cmd += [
            os.path.join(options.nplm_home, 'src',
                         'prepareNeuralLM'), '--train_text',
            options.validation_corpus, '--ngramize', '1', '--ngram_size',
            str(options.ngram_size), '--vocab_size',
            str(options.vocab_size), '--words_file', vocab_file,
            '--train_file',
            os.path.join(
                options.working_dir,
                os.path.basename(options.validation_corpus) + '.numberized')
        ]

        sys.stderr.write('extracting n-grams (validation file)\n')
        sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n')
        ret = subprocess.call(extraction_cmd)
        if ret:
            raise Exception("preparing neural LM failed")
        options.validation_file = os.path.join(
            options.working_dir, os.path.basename(options.validation_corpus))

    else:
        options.validation_file = None

    # train_nplm reads the vocabulary settings from the options object.
    options.input_words_file = vocab_file
    options.output_words_file = vocab_file
    options.input_vocab_size = options.vocab_size
    options.output_vocab_size = options.vocab_size

    sys.stderr.write('training neural network\n')
    train_nplm.main(options)

    sys.stderr.write('averaging null words\n')
    # Prefer the '.best' model if the trainer saved one; otherwise fall back
    # to the model from the final epoch.
    output_model_file = os.path.join(options.output_dir,
                                     options.output_model + '.model.nplm.best')
    if not os.path.exists(output_model_file):
        output_model_file = os.path.join(
            options.output_dir,
            options.output_model + '.model.nplm.' + str(options.epochs))
    average_options = averageNullEmbedding.parser.parse_args([
        '-i',
        output_model_file,
        '-o',
        os.path.join(options.output_dir, options.output_model + '.model.nplm'),
        '-t',
        os.path.join(options.working_dir, numberized_file),
        '-p',
        os.path.join(options.nplm_home, 'python'),
    ])
    averageNullEmbedding.main(average_options)
예제 #5
0
def main(options):
    """Syntactic NPLM pipeline: extract n-grams, optionally mmap, train, average."""

    if options.output_dir is None:
        options.output_dir = options.working_dir
    elif not os.path.exists(options.output_dir):
        # Create output dir if necessary
        os.makedirs(options.output_dir)

    # Two n-gram slots per context position, plus mode-dependent extras.
    options.ngram_size = 2 * (options.up_context_size +
                              options.left_context_size +
                              options.right_context_size)
    if options.mode == 'head':
        options.ngram_size += 2
    elif options.mode == 'label':
        options.ngram_size += 1

    if options.input_words_file is None or options.output_words_file is None:
        sys.stderr.write(
            "Either input vocabulary or output vocabulary not specified: "
            "extracting vocabulary from training text.\n")
        prepare_vocabulary(options)

    numberized_file = os.path.basename(options.corpus_stem) + '.numberized'
    train_file = numberized_file + '.mmap' if options.mmap else numberized_file

    ngram_opts = extract_syntactic_ngrams.create_parser().parse_args([
        '--input', options.corpus_stem,
        '--output', os.path.join(options.working_dir, numberized_file),
        '--vocab', options.input_words_file,
        '--output_vocab', options.output_words_file,
        '--right_context', str(options.right_context_size),
        '--left_context', str(options.left_context_size),
        '--up_context', str(options.up_context_size),
        '--mode', options.mode,
    ])
    sys.stderr.write('extracting syntactic n-grams\n')
    extract_syntactic_ngrams.main(ngram_opts)

    if options.validation_corpus:
        # Numberize the validation corpus with the same parsed settings.
        ngram_opts.input = open(options.validation_corpus)
        options.validation_file = os.path.join(
            options.working_dir, os.path.basename(options.validation_corpus))
        ngram_opts.output = open(options.validation_file + '.numberized', 'w')
        sys.stderr.write('extracting syntactic n-grams (validation file)\n')
        extract_syntactic_ngrams.main(ngram_opts)
        ngram_opts.output.close()
    else:
        options.validation_file = None

    if options.mmap:
        # Remove any stale mmap output before regenerating it.
        try:
            os.remove(os.path.join(options.working_dir, train_file))
        except OSError:
            pass
        mmap_cmd = [
            os.path.join(options.nplm_home, 'src', 'createMmap'),
            '--input_file', os.path.join(options.working_dir, numberized_file),
            '--output_file', os.path.join(options.working_dir, train_file),
        ]
        sys.stderr.write('creating memory-mapped file\n')
        sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n')
        if subprocess.call(mmap_cmd):
            raise Exception("creating memory-mapped file failed")

    sys.stderr.write('training neural network\n')
    train_nplm.main(options)

    sys.stderr.write('averaging null words\n')
    status = subprocess.call([
        os.path.join(sys.path[0], 'average_null_embedding.py'),
        options.nplm_home,
        os.path.join(
            options.output_dir,
            options.output_model + '.model.nplm.' + str(options.epochs)),
        os.path.join(options.working_dir, numberized_file),
        os.path.join(options.output_dir, options.output_model + '.model.nplm'),
    ])
    if status:
        raise Exception("averaging null words failed")
예제 #6
0
def main(options):
    """Run the syntactic NPLM training pipeline end to end."""

    # Fall back to the working directory for output, creating it on demand.
    if options.output_dir is None:
        options.output_dir = options.working_dir
    else:
        if not os.path.exists(options.output_dir):
            os.makedirs(options.output_dir)

    # Every context position occupies two n-gram slots.
    options.ngram_size = 2 * sum([options.up_context_size,
                                  options.left_context_size,
                                  options.right_context_size])
    if options.mode == 'head':
        options.ngram_size += 2
    elif options.mode == 'label':
        options.ngram_size += 1

    missing_vocab = (options.input_words_file is None or
                     options.output_words_file is None)
    if missing_vocab:
        sys.stderr.write(
            "Either input vocabulary or output vocabulary not specified: "
            "extracting vocabulary from training text.\n")
        prepare_vocabulary(options)

    numberized_file = os.path.basename(options.corpus_stem) + '.numberized'
    train_file = numberized_file
    if options.mmap:
        train_file = train_file + '.mmap'

    argv = ['--input', options.corpus_stem,
            '--output', os.path.join(options.working_dir, numberized_file),
            '--vocab', options.input_words_file,
            '--output_vocab', options.output_words_file,
            '--right_context', str(options.right_context_size),
            '--left_context', str(options.left_context_size),
            '--up_context', str(options.up_context_size),
            '--mode', options.mode]
    syntactic_opts = extract_syntactic_ngrams.create_parser().parse_args(argv)
    sys.stderr.write('extracting syntactic n-grams\n')
    extract_syntactic_ngrams.main(syntactic_opts)

    if options.validation_corpus:
        # Rerun extraction over the validation corpus with the same options.
        syntactic_opts.input = open(options.validation_corpus)
        options.validation_file = os.path.join(
            options.working_dir, os.path.basename(options.validation_corpus))
        syntactic_opts.output = open(
            options.validation_file + '.numberized', 'w')
        sys.stderr.write('extracting syntactic n-grams (validation file)\n')
        extract_syntactic_ngrams.main(syntactic_opts)
        syntactic_opts.output.close()
    else:
        options.validation_file = None

    if options.mmap:
        stale = os.path.join(options.working_dir, train_file)
        try:
            os.remove(stale)
        except OSError:
            pass
        mmap_cmd = [os.path.join(options.nplm_home, 'src', 'createMmap'),
                    '--input_file',
                    os.path.join(options.working_dir, numberized_file),
                    '--output_file',
                    os.path.join(options.working_dir, train_file)]
        sys.stderr.write('creating memory-mapped file\n')
        sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n')
        ret = subprocess.call(mmap_cmd)
        if ret:
            raise Exception("creating memory-mapped file failed")

    sys.stderr.write('training neural network\n')
    train_nplm.main(options)

    sys.stderr.write('averaging null words\n')
    averager = os.path.join(sys.path[0], 'average_null_embedding.py')
    trained_model = os.path.join(
        options.output_dir,
        options.output_model + '.model.nplm.' + str(options.epochs))
    final_model = os.path.join(
        options.output_dir, options.output_model + '.model.nplm')
    ret = subprocess.call([
        averager, options.nplm_home, trained_model,
        os.path.join(options.working_dir, numberized_file), final_model])
    if ret:
        raise Exception("averaging null words failed")
예제 #7
0
def main(options):
    """Prepare n-grams with prepareNeuralLM, train the NPLM, average <null>."""

    options.ngram_size = options.order

    if options.output_dir is None:
        options.output_dir = options.working_dir
    # Create dirs if necessary
    for directory in (options.working_dir, options.output_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    numberized_file = os.path.basename(options.corpus_stem) + ".numberized"
    train_file = numberized_file
    if options.mmap:
        train_file += ".mmap"

    vocab_path = os.path.join(options.working_dir, options.words_file)
    extraction_cmd = [
        os.path.join(options.nplm_home, "src", "prepareNeuralLM"),
        "--train_text", options.corpus_stem,
        "--ngramize", "1",
        "--ngram_size", str(options.ngram_size),
        "--vocab_size", str(options.vocab_size),
        "--write_words_file", vocab_path,
        "--train_file", os.path.join(options.working_dir, numberized_file),
    ]

    sys.stderr.write("extracting n-grams\n")
    sys.stderr.write("executing: " + ", ".join(extraction_cmd) + "\n")
    if subprocess.call(extraction_cmd):
        raise Exception("preparing neural LM failed")

    if options.mmap:
        # Regenerate the memory-mapped training file from scratch.
        try:
            os.remove(os.path.join(options.working_dir, train_file))
        except OSError:
            pass
        mmap_cmd = [
            os.path.join(options.nplm_home, "src", "createMmap"),
            "--input_file", os.path.join(options.working_dir, numberized_file),
            "--output_file", os.path.join(options.working_dir, train_file),
        ]
        sys.stderr.write("creating memory-mapped file\n")
        sys.stderr.write("executing: " + ", ".join(mmap_cmd) + "\n")
        if subprocess.call(mmap_cmd):
            raise Exception("creating memory-mapped file failed")

    if options.validation_corpus:
        validation_base = os.path.basename(options.validation_corpus)
        extraction_cmd = [
            os.path.join(options.nplm_home, "src", "prepareNeuralLM"),
            "--train_text", options.validation_corpus,
            "--ngramize", "1",
            "--ngram_size", str(options.ngram_size),
            "--vocab_size", str(options.vocab_size),
            "--words_file", vocab_path,
            "--train_file",
            os.path.join(options.working_dir, validation_base + ".numberized"),
        ]

        sys.stderr.write("extracting n-grams (validation file)\n")
        sys.stderr.write("executing: " + ", ".join(extraction_cmd) + "\n")
        if subprocess.call(extraction_cmd):
            raise Exception("preparing neural LM failed")
        options.validation_file = os.path.join(
            options.working_dir, validation_base)
    else:
        options.validation_file = None

    # train_nplm reads the vocabulary settings from the options object.
    options.input_words_file = vocab_path
    options.output_words_file = vocab_path
    options.input_vocab_size = options.vocab_size
    options.output_vocab_size = options.vocab_size

    sys.stderr.write("training neural network\n")
    train_nplm.main(options)

    sys.stderr.write("averaging null words\n")
    # Prefer the ".best" model if the trainer produced one; otherwise fall
    # back to the model from the final epoch.
    output_model_file = os.path.join(
        options.output_dir, options.output_model + ".model.nplm.best")
    if not os.path.exists(output_model_file):
        output_model_file = os.path.join(
            options.output_dir,
            options.output_model + ".model.nplm." + str(options.epochs))
    average_options = averageNullEmbedding.parser.parse_args([
        "-i", output_model_file,
        "-o", os.path.join(options.output_dir,
                           options.output_model + ".model.nplm"),
        "-t", os.path.join(options.working_dir, numberized_file),
        "-p", os.path.join(options.nplm_home, "python"),
    ])
    averageNullEmbedding.main(average_options)
예제 #8
0
def main(options):
    """Standard NPLM training driver: extract, (mmap), train, average nulls."""

    options.ngram_size = options.order

    if options.output_dir is None:
        options.output_dir = options.working_dir
    # Create dirs if necessary
    if not os.path.exists(options.working_dir):
        os.makedirs(options.working_dir)
    if not os.path.exists(options.output_dir):
        os.makedirs(options.output_dir)

    corpus_base = os.path.basename(options.corpus_stem)
    numberized_file = corpus_base + '.numberized'
    train_file = numberized_file + '.mmap' if options.mmap else numberized_file

    words_path = os.path.join(options.working_dir, options.words_file)
    numberized_path = os.path.join(options.working_dir, numberized_file)

    prepare_bin = os.path.join(options.nplm_home, 'src', 'prepareNeuralLM')
    extraction_cmd = [
        prepare_bin,
        '--train_text', options.corpus_stem,
        '--ngramize', '1',
        '--ngram_size', str(options.ngram_size),
        '--vocab_size', str(options.vocab_size),
        '--write_words_file', words_path,
        '--train_file', numberized_path,
    ]

    sys.stderr.write('extracting n-grams\n')
    sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n')
    ret = subprocess.call(extraction_cmd)
    if ret:
        raise Exception("preparing neural LM failed")

    if options.mmap:
        try:
            os.remove(os.path.join(options.working_dir, train_file))
        except OSError:
            pass  # no previous mmap file to clean up
        mmap_cmd = [
            os.path.join(options.nplm_home, 'src', 'createMmap'),
            '--input_file', numberized_path,
            '--output_file', os.path.join(options.working_dir, train_file),
        ]
        sys.stderr.write('creating memory-mapped file\n')
        sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n')
        ret = subprocess.call(mmap_cmd)
        if ret:
            raise Exception("creating memory-mapped file failed")

    if options.validation_corpus:
        valid_base = os.path.basename(options.validation_corpus)
        extraction_cmd = [
            prepare_bin,
            '--train_text', options.validation_corpus,
            '--ngramize', '1',
            '--ngram_size', str(options.ngram_size),
            '--vocab_size', str(options.vocab_size),
            '--words_file', words_path,
            '--train_file',
            os.path.join(options.working_dir, valid_base + '.numberized'),
        ]

        sys.stderr.write('extracting n-grams (validation file)\n')
        sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n')
        ret = subprocess.call(extraction_cmd)
        if ret:
            raise Exception("preparing neural LM failed")
        options.validation_file = os.path.join(options.working_dir, valid_base)
    else:
        options.validation_file = None

    # Hand the trainer the paths/sizes of the vocabulary just written.
    options.input_words_file = words_path
    options.output_words_file = words_path
    options.input_vocab_size = options.vocab_size
    options.output_vocab_size = options.vocab_size

    sys.stderr.write('training neural network\n')
    train_nplm.main(options)

    sys.stderr.write('averaging null words\n')
    # Use the '.best' model when available, else the final-epoch model.
    best_model = os.path.join(options.output_dir,
                              options.output_model + '.model.nplm.best')
    if os.path.exists(best_model):
        output_model_file = best_model
    else:
        output_model_file = os.path.join(
            options.output_dir,
            options.output_model + '.model.nplm.' + str(options.epochs))
    average_options = averageNullEmbedding.parser.parse_args([
        '-i', output_model_file,
        '-o', os.path.join(options.output_dir,
                           options.output_model + '.model.nplm'),
        '-t', numberized_path,
        '-p', os.path.join(options.nplm_home, 'python'),
    ])
    averageNullEmbedding.main(average_options)
예제 #9
0
def main(options):
    """Prepare NPLM data, resolve the <null> id for dropout, train, average.

    Like the basic pipeline, but additionally scans the written vocabulary
    for the <null> token when dropout is requested and passes its index to
    the trainer via options.extra_settings.
    """

    options.ngram_size = options.order

    if options.output_dir is None:
        options.output_dir = options.working_dir
    # Create dirs if necessary
    if not os.path.exists(options.working_dir):
        os.makedirs(options.working_dir)
    if not os.path.exists(options.output_dir):
        os.makedirs(options.output_dir)

    numberized_file = os.path.basename(options.corpus_stem) + '.numberized'
    vocab_file = os.path.join(options.working_dir, options.words_file)
    train_file = numberized_file
    if options.mmap:
        train_file += '.mmap'

    extraction_cmd = [
        os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
        '--train_text', options.corpus_stem,
        '--ngramize', '1',
        '--ngram_size', str(options.ngram_size),
        '--vocab_size', str(options.vocab_size),
        '--write_words_file', vocab_file,
        '--train_file', os.path.join(options.working_dir, numberized_file)
        ]

    sys.stderr.write('extracting n-grams\n')
    sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n')
    subprocess.check_call(extraction_cmd)

    # if dropout enabled, need to check which is the <null> vocab id
    null_id = None
    if options.dropout or options.input_dropout:
        # BUG FIX: read the vocabulary as bytes so .decode() works on both
        # Python 2 and 3 (text-mode lines are str in py3 and have no .decode).
        with open(vocab_file, 'rb') as vfh:
            for i, line in enumerate(vfh):
                if line.rstrip(b'\n').decode("utf8") == "<null>":
                    null_id = i
                    break
        if null_id is None:  # BUG FIX: identity comparison instead of ==
            sys.stderr.write(
                "WARN: could not identify null token, cannot enable dropout\n")
        else:
            if not options.extra_settings:
                options.extra_settings = ""
            # At least one of dropout/input_dropout is set on this path.
            options.extra_settings += " --null_index %d " % null_id
            if options.dropout:
                options.extra_settings += " --dropout %s " % options.dropout
            if options.input_dropout:
                options.extra_settings += " --input_dropout %s " % options.input_dropout

    if options.mmap:
        # Regenerate the memory-mapped training file from scratch.
        try:
            os.remove(os.path.join(options.working_dir, train_file))
        except OSError:
            pass
        mmap_cmd = [
            os.path.join(options.nplm_home, 'src', 'createMmap'),
            '--input_file',
            os.path.join(options.working_dir, numberized_file),
            '--output_file',
            os.path.join(options.working_dir, train_file)
            ]
        sys.stderr.write('creating memory-mapped file\n')
        sys.stderr.write('executing: ' + ', '.join(mmap_cmd) + '\n')
        ret = subprocess.call(mmap_cmd)
        if ret:
            raise Exception("creating memory-mapped file failed")

    if options.validation_corpus:

        extraction_cmd = [
            os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
            '--train_text', options.validation_corpus,
            '--ngramize', '1',
            '--ngram_size', str(options.ngram_size),
            '--vocab_size', str(options.vocab_size),
            '--words_file', vocab_file,
            '--train_file', os.path.join(
                options.working_dir,
                os.path.basename(options.validation_corpus) + '.numberized')
            ]

        sys.stderr.write('extracting n-grams (validation file)\n')
        sys.stderr.write('executing: ' + ', '.join(extraction_cmd) + '\n')
        ret = subprocess.call(extraction_cmd)
        if ret:
            raise Exception("preparing neural LM failed")
        options.validation_file = os.path.join(
            options.working_dir, os.path.basename(options.validation_corpus))

    else:
        options.validation_file = None

    options.input_words_file = vocab_file
    options.output_words_file = vocab_file
    options.input_vocab_size = options.vocab_size
    options.output_vocab_size = options.vocab_size

    sys.stderr.write('training neural network\n')
    train_nplm.main(options)

    sys.stderr.write('averaging null words\n')
    # Prefer the '.best' model if present, else the final-epoch model.
    output_model_file = os.path.join(
        options.output_dir, options.output_model + '.model.nplm.best')
    if not os.path.exists(output_model_file):
        output_model_file = os.path.join(
            options.output_dir,
            options.output_model + '.model.nplm.' + str(options.epochs))
    average_options = averageNullEmbedding.parser.parse_args([
        '-i', output_model_file,
        '-o', os.path.join(
            options.output_dir, options.output_model + '.model.nplm'),
        '-t', os.path.join(options.working_dir, numberized_file),
        '-p', os.path.join(options.nplm_home, 'python'),
    ])
    averageNullEmbedding.main(average_options)
예제 #10
0
def main(options):
    """Numberize a corpus with NPLM tools, train an n-gram LM, and average
    the null-word embedding into the final model.

    Steps: run prepareNeuralLM on the training corpus (writing the vocabulary
    and a numberized file), optionally convert it to a memory-mapped file,
    numberize the validation corpus if one was given, train via
    train_nplm.main, and finally run averageNullEmbedding on the trained
    model.

    Args:
        options: parsed argparse namespace; mutated in place (ngram_size,
            output_dir, validation_file, input/output words file and vocab
            size fields are filled in for train_nplm).

    Raises:
        Exception: if any external NPLM command exits with non-zero status.
    """
    options.ngram_size = options.order

    # Default the output directory to the working directory.
    if options.output_dir is None:
        options.output_dir = options.working_dir
    # Create dirs if necessary
    for directory in (options.working_dir, options.output_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    numberized_file = os.path.basename(options.corpus_stem) + '.numberized'
    train_file = numberized_file
    if options.mmap:
        train_file += '.mmap'

    # Vocabulary written by prepareNeuralLM and reused everywhere below.
    vocab_file = os.path.join(options.working_dir, options.words_file)

    extraction_cmd = [
        os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
        '--train_text', options.corpus_stem,
        '--ngramize', '1',
        '--ngram_size', str(options.ngram_size),
        '--vocab_size', str(options.vocab_size),
        '--write_words_file', vocab_file,
        '--train_file', os.path.join(options.working_dir, numberized_file)
        ]

    sys.stderr.write('extracting n-grams\n')
    # Join with spaces so the logged command can be copy-pasted into a shell.
    sys.stderr.write('executing: ' + ' '.join(extraction_cmd) + '\n')
    ret = subprocess.call(extraction_cmd)
    if ret:
        raise Exception("preparing neural LM failed")

    if options.mmap:
        # Remove any stale mmap file before regenerating it.
        try:
            os.remove(os.path.join(options.working_dir, train_file))
        except OSError:
            pass
        mmap_cmd = [
            os.path.join(options.nplm_home, 'src', 'createMmap'),
            '--input_file',
            os.path.join(options.working_dir, numberized_file),
            '--output_file',
            os.path.join(options.working_dir, train_file)
            ]
        sys.stderr.write('creating memory-mapped file\n')
        sys.stderr.write('executing: ' + ' '.join(mmap_cmd) + '\n')
        ret = subprocess.call(mmap_cmd)
        if ret:
            raise Exception("creating memory-mapped file failed")

    if options.validation_corpus:
        # Numberize the validation corpus with the *existing* vocabulary
        # (--words_file, not --write_words_file).
        extraction_cmd = [
            os.path.join(options.nplm_home, 'src', 'prepareNeuralLM'),
            '--train_text', options.validation_corpus,
            '--ngramize', '1',
            '--ngram_size', str(options.ngram_size),
            '--vocab_size', str(options.vocab_size),
            '--words_file', vocab_file,
            '--train_file', os.path.join(
                options.working_dir,
                os.path.basename(options.validation_corpus) + '.numberized')
            ]

        sys.stderr.write('extracting n-grams (validation file)\n')
        sys.stderr.write('executing: ' + ' '.join(extraction_cmd) + '\n')
        ret = subprocess.call(extraction_cmd)
        if ret:
            raise Exception("preparing neural LM failed")
        options.validation_file = os.path.join(
            options.working_dir, os.path.basename(options.validation_corpus))
    else:
        options.validation_file = None

    # train_nplm reads the vocabulary configuration from the options object.
    options.input_words_file = vocab_file
    options.output_words_file = vocab_file
    options.input_vocab_size = options.vocab_size
    options.output_vocab_size = options.vocab_size

    sys.stderr.write('training neural network\n')
    train_nplm.main(options)

    sys.stderr.write('averaging null words\n')
    # Prefer the best-validation checkpoint when training produced one;
    # otherwise fall back to the final-epoch model (same policy as the
    # syntactic-LM variant of this pipeline).
    output_model_file = os.path.join(
        options.output_dir, options.output_model + '.model.nplm.best')
    if not os.path.exists(output_model_file):
        output_model_file = os.path.join(
            options.output_dir,
            options.output_model + '.model.nplm.' + str(options.epochs))
    average_options = averageNullEmbedding.parser.parse_args([
        '-i', output_model_file,
        '-o', os.path.join(
            options.output_dir, options.output_model + '.model.nplm'),
        '-t', os.path.join(options.working_dir, numberized_file),
        '-p', os.path.join(options.nplm_home, 'python'),
    ])
    averageNullEmbedding.main(average_options)