Example #1
def single_train(max_epoch=args.epochs):
    train_manager, test_manager, transfer_train_manager, transfer_test_manager, id2char, id2tag, transfer_id2tag = get_train_data()
    with tf.Session(config=config) as sess:
        transfer_model = SpecModel(args=args,
                                   num_tags=len(transfer_id2tag),
                                   vocab_size=len(id2char),
                                   name='transfer')
        transfer_model.build()
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
        ckpt = tf.train.get_checkpoint_state(args.model_path)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("Creating model with random parameters")
            sess.run(tf.global_variables_initializer())
            # Initialise the embedding matrix with pretrained word vectors
            embeddings = sess.run(transfer_model.get_embeddings().read_value())
            embeddings = load_wordvec(args.wiki_path, id2char, args.embedding_size, embeddings)
            sess.run(transfer_model.get_embeddings().assign(embeddings))
        print("========== Start training ==========")
        for i in range(max_epoch):
            transfer_loss = []
            for transfer_batch in transfer_train_manager.iter_batch():
                transfer_step, transfer_batch_loss = transfer_model.run_one_step(sess, True, transfer_batch)
                transfer_loss.append(transfer_batch_loss)
                if transfer_step % 1000 == 0:
                    print(f"Step: {transfer_step} Transfer Loss: {transfer_batch_loss}")
            print("Epoch: {} Transfer Loss: {:>9.6f}".format(i, np.mean(transfer_loss)))
            results = transfer_model.evaluate(sess, transfer_test_manager, transfer_id2tag)
            for line in test_ner(results, "data/transfer_test_result"):
                print(f"<<Test NER res>>: \n\t\t---> {line}")
            ckpt_file = os.path.join(args.model_path, str(i) + "ner.ckpt")
            saver.save(sess, ckpt_file)
        print("========== Finish training ==========")
Example #2
def main(args):
    """Main function for lattice preprocessing."""
    global LOGGER
    LOGGER = utils.get_logger(args.verbose)

    dst_dir = args.dst_dir
    utils.check_dir(dst_dir)
    file_list_dir = args.file_list_dir
    utils.check_dir(file_list_dir)

    wordvec_path = args.wordvec
    wordvec = utils.load_wordvec(wordvec_path)
    subword_embedding_path = args.embedding
    subword_embedding = utils.load_wordvec(subword_embedding_path)

    subset_list = ['train.lat.txt', 'cv.lat.txt', 'test.lat.txt']
    processed_subset_list = []

    for subset in subset_list:
        subset_name = subset.split('.')[0] + '.' + subset.split('.')[2]
        preprocessed_list_file = os.path.join(args.processed_file_list_dir,
                                              subset_name)
        utils.remove_file(preprocessed_list_file)
        processed_subset_list.append(preprocessed_list_file)

    for i, subset in enumerate(subset_list):
        lat_file_list = os.path.join(file_list_dir, subset)

        # Compile the list of lat.gz files to process
        lattice_list = []
        with open(os.path.abspath(lat_file_list), 'r') as file_in:
            for line in file_in:
                lattice_list.append(line.strip())

        with Pool(args.num_threads) as pool:
            pool.starmap(
                process_one_lattice,
                zip(lattice_list, repeat(dst_dir), repeat(wordvec),
                    repeat(subword_embedding), repeat(args.embed_apostrophe),
                    repeat(processed_subset_list[i])))
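process_one_lattice is defined elsewhere in the repository; the call above relies on zip together with itertools.repeat so that each lattice path is paired with the same shared arguments on every worker invocation. A self-contained sketch of that fan-out pattern, with a hypothetical worker standing in for process_one_lattice:

from itertools import repeat
from multiprocessing import Pool

def worker(lattice_path, dst_dir, wordvec):
    # Hypothetical stand-in for process_one_lattice: each call receives its
    # own path plus the same shared objects.
    return '{} -> {} ({} vectors)'.format(lattice_path, dst_dir, len(wordvec))

if __name__ == '__main__':
    paths = ['a.lat.gz', 'b.lat.gz', 'c.lat.gz']
    with Pool(2) as pool:
        out = pool.starmap(worker, zip(paths, repeat('out/'), repeat({'the': None})))
    print(out)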
Example #3
def main():
    """Main function for converting CN into `.npz` lattices."""
    parser = argparse.ArgumentParser(
        description='confusion network pre-processing')
    parser.add_argument(
        '-d',
        '--dst-dir',
        type=str,
        help='Location to save the processed confusion network files (*.npz)')
    parser.add_argument(
        '-e',
        '--embedding',
        type=str,
        required=True,
        help=
        'Full path to the file containing a dictionary with the grapheme / phone embeddings'
    )
    parser.add_argument(
        '-w',
        '--wordvec',
        type=str,
        required=True,
        help=
        'Full path to the file containing a dictionary with the word vector embeddings'
    )
    parser.add_argument(
        '-f',
        '--file-list-dir',
        type=str,
        help=
        'The directory containing the files with the lists of lattice absolute paths for each subset (*.cn.txt)'
    )
    parser.add_argument(
        '-p',
        '--processed-file-list-dir',
        type=str,
        help=
        'The directory in which to save files with paths to the processed confusion networks (*.txt).'
    )
    parser.add_argument('-l',
                        '--log',
                        default=False,
                        action='store_true',
                        help='Use posterior probabilities in log domain')
    parser.add_argument(
        '-v', '--verbose',
        help='Set logging level: ERROR (default), '\
             'WARNING (-v), INFO (-vv), DEBUG (-vvv)',
        action='count', default=0
    )
    parser.add_argument('-n',
                        '--num_threads',
                        help='number of threads to use for concurrency',
                        type=int,
                        default=30)
    parser.add_argument('--decision-tree',
                        type=str,
                        dest='dec_tree',
                        required=False,
                        default='NONE')

    parser.add_argument('--uniform-subword-durations',
                        dest='uniform_subword_durations',
                        action='store_true')
    parser.set_defaults(uniform_subword_durations=False)

    parser.add_argument('--embed-apostrophe',
                        dest='embed_apostrophe',
                        action='store_true')
    parser.set_defaults(embed_apostrophe=False)

    parser.add_argument('--keep-pronunciation',
                        dest='keep_pronunciation',
                        action='store_true')
    parser.set_defaults(keep_pronunciation=False)

    parser.add_argument('--ignore_time_seg',
                        dest='ignore_time_seg',
                        action='store_true',
                        default=False)
    args = parser.parse_args()

    global LOGGER
    LOGGER = utils.get_logger(args.verbose)

    dst_dir = args.dst_dir
    utils.check_dir(dst_dir)
    file_list_dir = args.file_list_dir
    utils.check_dir(file_list_dir)

    wordvec_path = args.wordvec
    wordvec = utils.load_wordvec(wordvec_path)
    subword_embedding_path = args.embedding
    subword_embedding = utils.load_wordvec(subword_embedding_path)

    subset_list = ['train.cn.txt', 'cv.cn.txt', 'test.cn.txt']
    processed_subset_list = []

    for subset in subset_list:
        subset_name = subset.split('.')[0] + '.' + subset.split('.')[2]
        preprocessed_list_file = os.path.join(args.processed_file_list_dir,
                                              subset_name)
        utils.remove_file(preprocessed_list_file)
        processed_subset_list.append(preprocessed_list_file)

    all_oov = set()
    for i, subset in enumerate(subset_list):
        lat_file_list = os.path.join(file_list_dir, subset)

        # Compile the list of CN files to process
        cn_list = []
        with open(os.path.abspath(lat_file_list), 'r') as file_in:
            for line in file_in:
                cn_list.append(line.strip())
        for cn in cn_list:
            file_name = cn.split('/')[-1]
            print('Processing {}'.format(file_name[:-7]))
            oov = process_one_cn(cn, args.dst_dir, wordvec, subword_embedding,
                                 args.log, args.dec_tree, args.ignore_time_seg,
                                 processed_subset_list[i],
                                 args.embed_apostrophe,
                                 args.keep_pronunciation,
                                 args.uniform_subword_durations)
            all_oov.update(oov)
    if all_oov:
        print('OOV words were detected which could not be mapped to an embedding.\n'
              'These are the words:\n{}'.format(all_oov))
        with open('oov-words.txt', 'w') as oov_file:
            oov_file.write('\n'.join(list(all_oov)))
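The OOV handling at the end assumes that process_one_cn returns the set of words it failed to map to a vector, and that utils.load_wordvec returns a dict-like mapping from word to numpy vector. A hedged sketch of that lookup step (the helper name and the zero-vector fallback are assumptions):

import numpy as np

def lookup_words(words, wordvec, dim=300):
    """Hypothetical helper: map words to vectors, collecting OOV words."""
    vectors, oov = [], set()
    for word in words:
        if word in wordvec:
            vectors.append(np.asarray(wordvec[word], dtype=np.float32))
        else:
            vectors.append(np.zeros(dim, dtype=np.float32))  # fallback for OOV words
            oov.add(word)
    if not vectors:
        return np.zeros((0, dim), dtype=np.float32), oov
    return np.stack(vectors), oov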
Example #4
                        help="number of semantic groups to construct")
    parser.add_argument("--postprocessing",
                        default=1,
                        type=int,
                        help="principal component removal")
    args = parser.parse_args()

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Load text file
    sentences = load_file(PATH_TO_SENTENCE)

    # Load dictionary
    args.id2word, args.word2id = create_dictionary(sentences)

    # Load word vectors
    args.word_vec_np = load_wordvec(PATH_TO_VEC, args.word2id)
    args.wvec_dim = args.word_vec_np.shape[1]

    # Load word weights
    args.word_weight = load_word_weight(PATH_TO_WORD_WEIGHTS,
                                        args.word2id,
                                        a=1e-3)

    # Construct semantic groups
    semantic_construction(args)

    # Generate embedding
    sentence_emb = compute_embedding(args, sentences)

    # Provide Example
    index1 = int(input("\nThe index for the first sentence:"))
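The a=1e-3 default passed to load_word_weight suggests SIF-style weighting (Arora et al.), where each word receives weight a / (a + p(w)), and the --postprocessing flag controls the principal component removal step from the same method. The actual helper is not shown; a hedged sketch assuming the weight file lists "word count" pairs:

import numpy as np

def load_word_weight(weight_path, word2id, a=1e-3):
    """Hypothetical sketch of SIF-style word weights: a / (a + p(w))."""
    counts = {}
    with open(weight_path, encoding='utf-8') as weight_file:
        for line in weight_file:
            parts = line.split()
            if len(parts) < 2:
                continue  # skip malformed lines
            counts[parts[0]] = float(parts[1])
    total = sum(counts.values())
    weights = np.ones(len(word2id), dtype=np.float32)  # unseen words keep weight 1.0
    for word, idx in word2id.items():
        if word in counts:
            weights[idx] = a / (a + counts[word] / total)
    return weights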