Example #1: end-to-end TensorFlow training script: load vocabularies and trimmed GloVe vectors, build CoNLL datasets, then train, evaluate, and open an interactive shell.
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags  = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)

    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                    lowercase=True, chars=config.chars)
    processing_tag  = get_processing_word(vocab_tags, 
                    lowercase=False, allow_unk=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev   = CoNLLDataset(config.dev_filename, processing_word,
                        processing_tag, config.max_iter)
    test  = CoNLLDataset(config.test_filename, processing_word,
                        processing_tag, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                        processing_tag, config.max_iter)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags),
                                         nchars=len(vocab_chars))
    model.build()

    # train, evaluate and interact
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
    model.interactive_shell(vocab_tags, processing_word)
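
Many of the TensorFlow examples in this collection rely on the same unshown helpers (load_vocab, get_processing_word, get_trimmed_glove_vectors). The sketch below is only an assumption about how such helpers typically behave, inferred from how the examples call them; the exact signatures in the original repositories may differ.

import numpy as np

def load_vocab(filename):
    # assumed format: one token per line; maps token -> line index
    with open(filename, encoding="utf-8") as f:
        return {line.strip(): idx for idx, line in enumerate(f)}

def get_trimmed_glove_vectors(filename):
    # assumed format: a .npz archive holding a single "embeddings" matrix
    with np.load(filename) as data:
        return data["embeddings"]

def get_processing_word(vocab_words=None, vocab_chars=None,
                        lowercase=False, chars=False, unk="$UNK$"):
    # assumed behaviour: return a closure mapping a raw word to its id,
    # optionally paired with a list of character ids (unk key is an assumption)
    def process(word):
        char_ids = [vocab_chars[c] for c in word if c in vocab_chars] \
            if (vocab_chars is not None and chars) else None
        if lowercase:
            word = word.lower()
        word_id = vocab_words.get(word, vocab_words.get(unk)) if vocab_words else word
        return (char_ids, word_id) if char_ids is not None else word_id
    return process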
Example #2: restore a trained TensorFlow NERModel from a checkpoint and tag user input in a simple REPL.
def do_shell(args):
    config = Config(args)
    helper = ModelHelper.load(args.model_path)
    embeddings = load_embeddings(args, helper)
    config.embed_size = embeddings.shape[1]

    with tf.Graph().as_default():
        logger.info("Building model...",)
        start = time.time()
        model = NERModel(helper, config, embeddings)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            saver.restore(session, model.config.model_output)

            while True:
                # Create simple REPL
                try:
                    sentence = eval(input("input> "))
                    tokens = sentence.strip().split(" ")
                    for sentence, _, predictions in model.output(session, [(tokens, ["O"] * len(tokens))]):
                        predictions = [LBLS[l] for l in predictions]
                        print_sentence(sys.stdout, sentence,
                                       [""] * len(tokens), predictions)
                except EOFError:
                    print("Closing session.")
                    break
Example #3: argparse entry point for a PyTorch BiLSTM-CRF NERModel on CLUENER data, dispatching to train, evaluate, or predict.
def main():
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument("--do_train", default=False, action='store_true')
    parser.add_argument('--do_eval', default=False, action='store_true')
    parser.add_argument("--do_predict", default=False, action='store_true')

    parser.add_argument('--markup',
                        default='bios',
                        type=str,
                        choices=['bios', 'bio'])
    parser.add_argument("--arch", default='bilstm_crf', type=str)
    parser.add_argument('--learning_rate', default=0.001, type=float)
    parser.add_argument('--seed', default=1234, type=int)
    # parser.add_argument('--gpu',default='0',type=str)
    parser.add_argument('--gpu', default='', type=str)
    parser.add_argument('--epochs', default=50, type=int)
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--embedding_size', default=128, type=int)
    parser.add_argument('--hidden_size', default=384, type=int)
    parser.add_argument("--grad_norm",
                        default=5.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--task_name", type=str, default='ner')
    args = parser.parse_args()
    args.data_dir = config.data_dir
    if not config.output_dir.exists():
        config.output_dir.mkdir()
    args.output_dir = config.output_dir / '{}'.format(args.arch)
    if not args.output_dir.exists():
        args.output_dir.mkdir()
    init_logger(log_file=str(args.output_dir /
                             '{}-{}.log'.format(args.arch, args.task_name)))
    seed_everything(args.seed)
    if args.gpu != '':
        args.device = torch.device(f"cuda:{args.gpu}")
    else:
        args.device = torch.device("cpu")
    args.id2label = {i: label for i, label in enumerate(config.label2id)}
    args.label2id = config.label2id
    processor = CluenerProcessor(data_dir=config.data_dir)
    processor.get_vocab()
    model = NERModel(vocab_size=len(processor.vocab),
                     embedding_size=args.embedding_size,
                     hidden_size=args.hidden_size,
                     device=args.device,
                     label2id=args.label2id)
    model.to(args.device)
    if args.do_train:
        train(args, model, processor)
    if args.do_eval:
        model_path = args.output_dir / 'best-model.bin'
        model = load_model(model, model_path=str(model_path))
        evaluate(args, model, processor)
    if args.do_predict:
        predict(args, model, processor)
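
The seed_everything helper called in Example #3 is not shown. A minimal sketch of what such a helper commonly does, given here only as an assumption about the missing code:

import os
import random

import numpy as np
import torch

def seed_everything(seed=1234):
    # fix all relevant random number generators so runs are reproducible
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)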
Example #4: training script with extra unigram, POS, and NE-dictionary embeddings; a module-level state flag selects training, evaluation, or tagging of a converted test file.
def main(config):
    # load vocabs
    vocab_words, idx2words = load_vocab(config.words_filename)
    vocab_tags, _  = load_vocab(config.tags_filename)
    vocab_chars, _ = load_vocab(config.chars_filename)
    vocab_pos, _ = load_vocab(config.pos_filename)


    # get processing functions
    processing_word = get_processing_word(vocab_words, vocab_chars,
                    lowercase=True, chars=config.chars)

    processing_tag  = get_processing_word(vocab_tags, 
                    lowercase=False)

    processing_pos = get_processing_word(vocab_pos,
                                         lowercase=False)

    # get pre-trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
    embeddings_uni = get_trimmed_glove_vectors(config.uni_trimmed_filename)
    pos_embeddings = get_trimmed_glove_vectors(config.feature_trimmed_filename)
    NE_dic = get_trimmed_glove_vectors(config.trimmed_dic)


    # create dataset
    dev   = CoNLLDataset(config.dev_filename, processing_word,
                        processing_tag, processing_pos, config.max_iter)

    train = CoNLLDataset(config.train_filename, processing_word,
                        processing_tag, processing_pos, config.max_iter)
    
    # build model
    model = NERModel(config, embeddings, embeddings_uni,
                     pos_embeddings, ntags=len(vocab_tags), nchars=len(vocab_chars), vocab_words=idx2words,
                    NE_dic=NE_dic)
    model.build()

    # train, evaluate and interact
    if state == "train":
        model.train(train, dev, vocab_tags)

    elif state == "evaluate":
        model.evaluate(dev, vocab_tags)

    else: #state == predict
        convert(file)
        t2o("data_format/test_convert.txt","data_format/test.txt")
        test = CoNLLDataset(config.test_filename, processing_word,
                            processing_tag, processing_pos, config.max_iter)

        model.evaluate(test, vocab_tags)

        tagging("data_format/test_convert.txt")
Example #5: class-level setup that restores a TensorFlow session once and exposes a static rec() method for tagging single sentences.
class nlu():

    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)

    # get processing functions

    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # get logger
    # logger = get_logger(config.log_path)

    # build model
    model = NERModel(config, embeddings, ntags=len(vocab_tags), logger=None)
    model.build()

    idx_to_tag = {idx: tag for tag, idx in vocab_tags.items()}
    saver = tf.train.Saver()
    sess = tf.Session()
    saver.restore(sess, config.model_output)
    # model.logger.info("This is an interactive mode, enter a sentence:")

    @staticmethod
    def rec(sentence):
        try:

            processing_word = get_processing_word(nlu.vocab_words,
                                                  lowercase=config.lowercase)
            # print character_separation(sentence)[0]

            words_raw = character_separation(sentence)[0].split(' ')
            # decode to unicode; needed under Python 2 (str is already unicode in Python 3)
            words_raw = [unicode(word, 'utf-8') for word in words_raw]

            words = map(processing_word, words_raw)
            words = list(words)
            pred_ids, _ = nlu.model.predict_batch(nlu.sess, [words])
            preds = map(lambda idx: nlu.idx_to_tag[idx], list(pred_ids[0]))
            # print(list(preds))
            print_sentence(nlu.model.logger, {"x": words_raw, "y": preds})
            return list(preds)
        except EOFError:
            print("Closing session.")


# nlu.rec('请播放电视剧三生三世十里桃花')
Example #6: evaluation script that restores a checkpoint and prints predictions for a CoNLL-formatted input file.
def do_evaluate(args):
    config = Config(args)
    helper = ModelHelper.load(args.model_path)
    input_data = read_conll(args.data)
    embeddings = load_embeddings(args, helper)
    config.embed_size = embeddings.shape[1]

    with tf.Graph().as_default():
        logger.info("Building model...",)
        start = time.time()
        model = NERModel(helper, config, embeddings)

        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            saver.restore(session, model.config.model_output)
            for sentence, labels, predictions in model.output(session, input_data):
                predictions = [LBLS[l] for l in predictions]
                print_sentence(args.output, sentence, labels, predictions)
Example #7: command-line tagger built with optparse that reloads a pickled model and writes predictions in IOB or src/tgt format.
def main(args):

    # User parameters
    parser = OptionParser()
    parser.add_option(
        "-m", "--model", default="",
        help="Model location"
    )
    parser.add_option(
        "-i", "--input", default="",
        help="Input file, one sample per line"
    )
    parser.add_option(
        "-o", "--output", default="",
        help="Output file location"
    )
    parser.add_option(
        "--output_format", default="iob",
        help="Whether to output predicted tokens in IOB format or src/tgt format. [iob|st]"
    )
    parser.add_option('--get_probs', default=0, help="Get normalized log likelihoods of each sample")
    parser.add_option('--get_vectors', default=0,
                      help="Get output vectors of second-to-last layer in the network. Currently only tested with the CNN-BLSTM-CRF configuration")

    opts = parser.parse_args(args)[0]

    # Check parameters validity
    assert opts.output_format in ["iob", "st"]
    assert os.path.isfile(opts.model)
    assert os.path.isfile(opts.model + "_parameters.pkl")  # need params file to reload model
    assert os.path.isfile(opts.input)

    # Add parameters
    parameters = {'reload': True, 'tag': True, 'repickle_data': True}

    # Load existing model
    print "Loading model..."
    model = NERModel(model_path=opts.model, parameters=parameters)
    parameters = model.parameters
    parameters['input'] = opts.input
    parameters['output'] = opts.output
    parameters['output_format'] = opts.output_format
    parameters['model'] = model.model
    parameters['get_probs'] = int(opts.get_probs) == 1
    parameters['get_vectors'] = int(opts.get_vectors) == 1

    print('Tagging...')
    start = time.time()
    load_data_and_predict(parameters)
    print('---- lines tagged in %.4fs ----' % (time.time() - start))
Example #8: training script that adds separate IOB and entity-type vocabularies and times evaluation.
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    vocab_iob = {"O": 0, "B": 1, "I": 2}
    vocab_type = {"LOC": 0, "PER": 1, "ORG": 2, "MISC": 3}

    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          lowercase=True,
                                          chars=config.chars)
    processing_tag = get_processing_word(vocab_tags, lowercase=False)
    processing_iob = get_processing_word(vocab_iob, lowercase=False)
    processing_type = get_processing_word(vocab_type, lowercase=False)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       processing_iob, processing_type, config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        processing_iob, processing_type, config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, processing_iob, processing_type,
                         config.max_iter)

    model = NERModel(config,
                     embeddings,
                     ntags=len(vocab_tags),
                     nchars=len(vocab_chars),
                     niob=3,
                     ntype=4)

    model.build()

    # train, evaluate and interact
    print(vocab_tags)
    model.train(train, dev, vocab_tags)
    stime = time.time()

    model.evaluate(test, vocab_tags)

    etime = time.time()
    print(etime - stime)
Example #9: training script that feeds prefix/suffix vocabularies into the word-processing function.
def main(config):
    # load vocabs
    vocab_words = load_vocab(config.words_filename)
    vocab_tags = load_vocab(config.tags_filename)
    vocab_chars = load_vocab(config.chars_filename)
    # prefix and suffix vocabularies
    vocab_pref_suff = load_vocab(config.PS_filename)
    vocab_pref_suff_2 = load_vocab(config.PS_filename_2)
    vocab_pref_suff_4 = load_vocab(config.PS_filename_4)
    # get processing functions
    processing_word = get_processing_word(vocab_words,
                                          vocab_chars,
                                          vocab_pref_suff,
                                          vocab_pref_suff_2,
                                          vocab_pref_suff_4,
                                          lowercase=True,
                                          chars=config.chars,
                                          Pref_Suff=config.pref_suff)
    processing_tag = get_processing_word(vocab_tags,
                                         lowercase=False,
                                         Geoparser=True)

    # get pre trained embeddings
    embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

    # create dataset: dev, test and train hold the raw words and tags; the
    # processing functions map them to word and tag indices, so model.evaluate
    # below ends up calling run_evaluate inside run_epoch on indexed data
    dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                       config.max_iter)
    test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                        config.max_iter)
    train = CoNLLDataset(config.train_filename, processing_word,
                         processing_tag, config.max_iter)

    # build model
    model = NERModel(config,
                     embeddings,
                     ntags=len(vocab_tags),
                     nchars=len(vocab_chars))
    model.build()

    # train, evaluate and interact
    model.train(train, dev, vocab_tags)
    model.evaluate(test, vocab_tags)
Example #10: training entry point that fits the model and writes dev-set predictions to CoNLL and evaluation output files.
def do_train(args):
    # Set up some parameters.
    config = Config(args)
    helper, train, dev, train_raw, dev_raw = load_and_preprocess_data(args)
    embeddings = load_embeddings(args, helper)
    config.embed_size = embeddings.shape[1]
    helper.save(config.output_path)

    handler = logging.FileHandler(config.log_output)
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(logging.Formatter(
        '%(asctime)s:%(levelname)s: %(message)s'))
    logging.getLogger().addHandler(handler)

    report = None  # Report(Config.eval_output)

    with tf.Graph().as_default():
        logger.info("Building model...",)
        start = time.time()
        model = NERModel(helper, config, embeddings)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            model.fit(session, saver, train, dev)
            if report:
                report.log_output(model.output(session, dev_raw))
                report.save()
            else:
                # Save predictions in a text file.
                output = model.output(session, dev_raw)
                sentences, labels, predictions = list(zip(*output))
                predictions = [[LBLS[l] for l in preds]
                               for preds in predictions]
                output = list(zip(sentences, labels, predictions))

                with open(model.config.conll_output, 'w') as f:
                    write_conll(f, output)
                with open(model.config.eval_output, 'w') as f:
                    for sentence, labels, predictions in output:
                        print_sentence(f, sentence, labels, predictions)
Example #11: module-level variant of the same train/evaluate/interactive-shell pattern.
# get processing functions
processing_word = get_processing_word(vocab_words,
                                      vocab_chars,
                                      lowercase=True,
                                      chars=config.chars)
processing_tag = get_processing_word(vocab_tags, lowercase=False)

# get pre trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create dataset
dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                   config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                    config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word, processing_tag,
                     config.max_iter)

# build model
model = NERModel(config,
                 embeddings,
                 ntags=len(vocab_tags),
                 nchars=len(vocab_chars))
model.build()

# train, evaluate and interact
model.train(train, dev, vocab_tags)
model.evaluate(test, vocab_tags)
model.interactive_shell(vocab_tags, processing_word)
Example #12: model construction with dictionary, POS, syllable, and morpheme embeddings in addition to word and character features.
# create dataset
dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                   processing_pos, config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                    processing_pos, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word, processing_tag,
                     processing_pos, config.max_iter)

# build model
lmwords = len(vocab_words)
lmposs = len(pos_tags)

model = NERModel(config,
                 embeddings,
                 dic_embeddings,
                 pos_embeddings,
                 syl_embeddings,
                 morph_embeddings,
                 ntags=len(vocab_tags),
                 nchars=len(vocab_chars),
                 nsyls=len(vocab_syls),
                 nmorphs=len(vocab_morphs),
                 nwords=lmwords,
                 nposs=lmposs)
model.build()

# train, evaluate and interact
model.train(train, dev, vocab_tags)
model.evaluate(test, vocab_tags, test_flag=1)
#model.interactive_shell(vocab_tags, processing_word)
Example #13: module-level training script using word and character vocabularies.
vocab_tags  = load_vocab(config.tags_filename)
vocab_chars = load_vocab(config.chars_filename)

# get processing functions
processing_word = get_processing_word(vocab_words, vocab_chars,
                lowercase=True, chars=config.chars)
processing_tag  = get_processing_word(vocab_tags, 
                lowercase=False)

# get pre trained embeddings
embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create dataset
dev   = CoNLLDataset(config.dev_filename, processing_word,
                    processing_tag, config.max_iter)
test  = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                    processing_tag, config.max_iter)

# build model
model = NERModel(config, embeddings, ntags=len(vocab_tags),
                                     nchars=len(vocab_chars))
model.build()

# train, evaluate and interact
model.train(train, dev, vocab_tags)
model.evaluate(test, vocab_tags)
model.interactive_shell(vocab_tags, processing_word)

Example #14: PyTorch DataLoader setup and a word/character/morpheme NERModel trained with NLLLoss and the RAdam optimizer.
    pairs_batch_train = DataLoader(dataset=data_train,
                    batch_size=batch_size,
                    shuffle=True,
                    collate_fn=prepare_data.collate,
                    pin_memory=True)

    pairs_batch_dev = DataLoader(dataset=data_dev,
                    batch_size=batch_size,
                    shuffle=True,
                    collate_fn=prepare_data.collate,
                    pin_memory=True)


    # initialize the model
    model = NERModel(word_embedding_dim, char_embedding_dim, morph_embedding_dim, word_hidden_size, char_hidden_size, morph_hidden_size, 
                len(char2idx), len(morph2idx), len(tag2idx)+1, word_num_layers, char_num_layers, morph_num_layers, dropout_prob).to(device)
    model.train()

    criterion = nn.NLLLoss()

    optimizer = radam.RAdam(model.parameters(), lr=learning_rate) 
    print(model)
    
    total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('The number of trainable parameters is: %d' % (total_trainable_params))



    # train the model
    if not skip_training:
        train(model, word_num_layers, char_num_layers, morph_num_layers, num_epochs, pairs_batch_train, pairs_batch_dev, word_hidden_size, 
Example #15: training driver with checkpoint resumption, ReduceLROnPlateau scheduling, and final F1 reporting to a perf file.
def main():
    hp = parse_args()

    # Setup model directories
    model_name = get_model_name(hp)
    model_path = path.join(hp.model_dir, model_name)
    best_model_path = path.join(model_path, 'best_models')
    if not path.exists(model_path):
        os.makedirs(model_path)
    if not path.exists(best_model_path):
        os.makedirs(best_model_path)

    # Set random seed
    torch.manual_seed(hp.seed)

    # Hacky way of assigning the number of labels.
    encoder = Encoder(
        model=hp.model,
        model_size=hp.model_size,
        fine_tune=hp.fine_tune,
        # CASE-PRESERVED!!
        cased=True)
    # Load data
    logging.info("Loading data")
    train_iter, val_iter, test_iter, num_labels = NERDataset.iters(
        hp.data_dir,
        encoder,
        batch_size=hp.batch_size,
        eval_batch_size=hp.eval_batch_size,
        train_frac=hp.train_frac)
    logging.info("Data loaded")

    # Initialize the model
    model = NERModel(encoder, num_labels=num_labels, **vars(hp)).cuda()
    sys.stdout.flush()

    if not hp.fine_tune:
        optimizer = torch.optim.Adam(model.get_other_params(), lr=hp.lr)
    else:
        optimizer = torch.optim.Adam(model.parameters(), lr=hp.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode='max',
                                                           patience=5,
                                                           factor=0.5,
                                                           verbose=True)
    steps_done = 0
    max_f1 = 0
    init_num_stuck_evals = 0
    num_steps = (hp.n_epochs * len(train_iter.data())) // hp.real_batch_size
    # Quantize the number of training steps to eval steps
    num_steps = (num_steps // hp.eval_steps) * hp.eval_steps
    logging.info("Total training steps: %d" % num_steps)

    location = path.join(model_path, "model.pt")
    if path.exists(location):
        logging.info("Loading previous checkpoint")
        checkpoint = torch.load(location)
        model.encoder.weighing_params = checkpoint['weighing_params']
        if hp.fine_tune:
            model.encoder.model.load_state_dict(checkpoint['encoder'])
        model.span_net.load_state_dict(checkpoint['span_net'])
        model.label_net.load_state_dict(checkpoint['label_net'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        steps_done = checkpoint['steps_done']
        init_num_stuck_evals = checkpoint['num_stuck_evals']
        max_f1 = checkpoint['max_f1']
        torch.set_rng_state(checkpoint['rng_state'])
        logging.info("Steps done: %d, Max F1: %.3f" % (steps_done, max_f1))

    if not hp.eval:
        train(hp,
              model,
              train_iter,
              val_iter,
              optimizer,
              scheduler,
              model_path,
              best_model_path,
              init_steps=steps_done,
              max_f1=max_f1,
              eval_steps=hp.eval_steps,
              num_steps=num_steps,
              init_num_stuck_evals=init_num_stuck_evals)

    val_f1, test_f1 = final_eval(hp, model, best_model_path, val_iter,
                                 test_iter)
    perf_dir = path.join(hp.model_dir, "perf")
    if not path.exists(perf_dir):
        os.makedirs(perf_dir)
    if hp.slurm_job_id and hp.slurm_array_id:
        perf_file = path.join(
            perf_dir, hp.slurm_job_id + "_" + hp.slurm_array_id + ".txt")
    else:
        perf_file = path.join(model_path, "perf.txt")
    with open(perf_file, "w") as f:
        f.write("%s\n" % (model_path))
        f.write("%s\t%.4f\n" % ("Valid", val_f1))
        f.write("%s\t%.4f\n" % ("Test", test_f1))
Example #16: module-level script that builds gensim Word2Vec embeddings and trains and tests a span-based NERModel.
corpus = read_lines('/eng.txt')
datax, datay, tag_to_int = read_corpus(corpus)

corpus_test = read_lines('/eng_test.txt')
testx, testy, _ = read_corpus(corpus_test)

corpus_validate = read_lines('/eng_validate.txt')
validatex, validatey, _ = read_corpus(corpus_validate)

embed_size = 50
scrf_size = 100
allowed_span_length = 6
epochs = 100
validate_epochs = len(validatex)
test_epochs = len(testx)

model = NERModel(embed_size, scrf_size, tag_to_int, tag_to_int['<STOP>'],
                 tag_to_int['<START>'], allowed_span_length)

optimizer = optim.Adagrad(model.parameters(), lr=0.009)

word_dict = gs.Word2Vec(datax + validatex + testx,
                        min_count=1,
                        size=embed_size)

data_loader = DataLoader(word_dict, datax, datay, testx, testy, validatex,
                         validatey)

train(model, data_loader, optimizer, epochs, validate_epochs)

test(model, data_loader, test_epochs)
Example #17: argparse setup, data cleaning, and training of a PyTorch NERModel, reporting F1 and accuracy.
    parser.add_argument('--decay_rate', type=float, default=0.05, help='decay rate')
    parser.add_argument('--plot_interval', type=int, default=2000, help='plot every # steps')

    args = parser.parse_args()
    torch.manual_seed(args.seed)

    # =============== Load device ===============
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    device = torch.device("cuda" if args.cuda else "cpu")

    # =============== Load data ===============
    cleaner = data.Cleaner(args)
    raw_train_data, raw_dev_data, raw_test_data = cleaner.clean()
    dataset = data.Dataset(raw_train_data, raw_dev_data, raw_test_data, args)
    word2idx, tag2idx, char2idx = dataset.word_to_id, dataset.tag_to_id, dataset.char_to_id
    train_data, dev_data, test_data = dataset.train_data, dataset.dev_data, dataset.test_data
    print("{} / {} / {} sentences in train / dev / test.".format(len(train_data), len(dev_data), len(test_data)))

    # =============== Build the model ===============
    model = NERModel(word2idx, tag2idx, char2idx, args)
    if args.cuda:
        model.to(device)
    print('Model Initialized!!, n_params = {}'.format(sum(p.numel() for p in model.parameters() if p.requires_grad)))

    # =============== Train the model ===============
    all_f1, all_acc = create_and_train_model(model, train_data, dev_data, test_data, tag2idx, args)
    print('f1 = {}'.format(all_f1))
    print('acc = {}'.format(all_acc))
Example #18: TensorFlow training loop with shared-variable train/valid/test graphs, learning-rate decay, and periodic test evaluation.
def train():
    # configuration
    options.init(FLAGS)

    # load the data
    print("Preparing data...")
    data = data_loader.ConllLoader()
    options.opts.vocab_size = data.vocab_size
    options.opts.num_tags = data.num_tags
    options.opts.dim_handcraft = data.dim_handcraft
    options.opts.char_vocab_size = data.char_vocab_size
    opts = options.opts

    # print the configuration
    for item in opts.__dict__:
        print("{:20s}: {}".format(item, opts.__dict__[item]))

    with tf.Graph().as_default(), tf.Session() as session:

        initializer = tf.random_uniform_initializer(-opts.init_scale,
                                                    opts.init_scale)

        # build the graphs
        print("\n\nBuilding graphs...")
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = NERModel(data.dwords, is_training=True, dtype=tf.float32)
            if opts.restore:
                m.restore(session, opts.restore)

        with tf.variable_scope("model", reuse=True, initializer=initializer):
            mvalid = NERModel(data.dwords, is_training=False, dtype=tf.float32)
            mtest = NERModel(data.dwords, is_training=False, dtype=tf.float32)

        tf.global_variables_initializer().run()

        best_valid = -np.inf
        best_test = -np.inf
        start_time = time.time()
        print("\n\nRunning epoches...")
        try:
            for i in range(opts.max_max_epoch):
                lr_decay = opts.learning_rate_decay**max(
                    i - opts.max_epoch, 0.0)
                m.assign_lr(session, opts.learning_rate * lr_decay)  # set the learning rate

                print("Epoch: %d Learning rate: %f" %
                      (i + 1, session.run(m.lr)))
                run_epoch(session, m, data, "train", display=1)

                print("Validating...")
                valid_score = run_epoch(session, mvalid, data, "valid")
                if valid_score > best_valid:
                    print("New best score on validation dataset:", valid_score)
                    best_valid = valid_score
                    # mvalid.save(session, name="model")

                if (i + 1) % 10 == 0:
                    print("Test...")
                    test_score = run_epoch(session, mtest, data, "test")
                    if test_score > best_test:
                        print("New best score on test dataset:", test_score)
                        best_test = test_score
        except KeyboardInterrupt:
            record.logging("epoches finished = {}".format(i + 1))
            record.record(opts, best_valid, best_test, start_time)
Example #19: PyTorch train/eval loop supporting both a BERT-based and an embedding-based NERModel, with gradient accumulation, periodic validation, and best-model saving.
def train_eval(args, train_data_path, valid_data_path):

    index = read_pickle(args.index_path)
    word2index, tag2index = index['word2id'], index['tag2id']
    args.num_labels = len(tag2index)
    args.vocab_size = len(word2index)+1
    set_seed(args.seed_num)
    train_dataloader, train_samples = get_dataloader(train_data_path, args.train_batch_size, True)
    valid_dataloader, _ = get_dataloader(valid_data_path, args.valid_batch_size, False)

    if args.model == 'bert':
        bert_config = BertConfig(args.bert_config_path)
        model = NERBert(bert_config, args)
        model.load_state_dict(torch.load(args.bert_model_path), strict=False)
        # model = NERBert.from_pretrained('bert_chinese',
        #                                 # cache_dir='/home/dutir/yuetianchi/.pytorch_pretrained_bert',
        #                                 num_labels=args.num_labels)
    else:
        if args.embedding:
            word_embedding_matrix = read_pickle(args.embedding_data_path)
            model = NERModel(args, word_embedding_matrix)
        else:
            model = NERModel(args)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.model == 'bert':
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if 'bert' not in n], 'lr': 5e-5, 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and ('bert' in n)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and ('bert' in n)],
             'weight_decay': 0.0}
        ]
        warmup_proportion = 0.1
        num_train_optimization_steps = int(
            train_samples / args.train_batch_size / args.gradient_accumulation_steps) * args.epochs

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=warmup_proportion,
                             t_total=num_train_optimization_steps)
    else:
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=current_learning_rate
        )

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(os.path.join(args.init_checkpoint, 'checkpoint'))
        init_step = checkpoint['step']
        model.load_state_dict(checkpoint['model_state_dict'])
        if args.do_train:
            current_learning_rate = checkpoint['current_learning_rate']
            warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 0

    global_step = init_step
    best_score = 0.0

    logging.info('Start Training...')
    logging.info('init_step = %d' % global_step)
    for epoch_id in range(int(args.epochs)):

        tr_loss = 0
        model.train()
        for step, train_batch in enumerate(train_dataloader):


            batch = tuple(t.to(device) for t in train_batch)
            _, loss = model(batch[0], batch[1])
            if n_gpu > 1:
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            loss.backward()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

            if (step + 1) % 500 == 0:
                print(loss.item())

            if args.do_valid and global_step % args.valid_step == 1:
                true_res = []
                pred_res = []
                len_res = []
                model.eval()
                for valid_step, valid_batch in enumerate(valid_dataloader):
                    valid_batch = tuple(t.to(device) for t in valid_batch)

                    with torch.no_grad():
                        logit = model(valid_batch[0])
                    if args.model == 'bert':
                        # the first token is '[CLS]', so skip it below
                        len_res.extend(torch.sum(valid_batch[0].gt(0), dim=-1).detach().cpu().numpy()-1)
                        true_res.extend(valid_batch[1].detach().cpu().numpy()[:,1:])
                        pred_res.extend(logit.detach().cpu().numpy()[:,1:])
                    else:
                        len_res.extend(torch.sum(valid_batch[0].gt(0),dim=-1).detach().cpu().numpy())
                        true_res.extend(valid_batch[1].detach().cpu().numpy())
                        pred_res.extend(logit.detach().cpu().numpy())
                acc, score = cal_score(true_res, pred_res, len_res, tag2index)
                score = f1_score(true_res, pred_res, len_res, tag2index)
                logging.info('Evaluation:step:{},acc:{},fscore:{}'.format(str(epoch_id), acc, score))
                if score>=best_score:
                    best_score = score
                    if args.model == 'bert':
                        model_to_save = model.module if hasattr(model,
                                                                'module') else model  # only save the model itself
                        output_dir = '{}_{}'.format('bert', str(global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
                        torch.save(model_to_save.state_dict(), output_model_file)
                        output_config_file = os.path.join(output_dir, CONFIG_NAME)
                        with open(output_config_file, 'w') as f:
                            f.write(model_to_save.config.to_json_string())
                    else:
                        save_variable_list = {
                            'step': global_step,
                            'current_learning_rate': args.learning_rate,
                            'warm_up_steps': step
                        }
                        save_model(model, optimizer, save_variable_list, args)
                model.train()
Example #20: evaluation-only run that builds the model without pre-trained embeddings (embeddings = None) and times evaluation on the dev set.
                                      lowercase=True,
                                      chars=config.chars)
processing_tag = get_processing_word(vocab_tags, lowercase=False)

# get pre trained embeddings
# embeddings = get_trimmed_glove_vectors(config.trimmed_filename)
embeddings = None

# create dataset
dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                   config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                    config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word, processing_tag,
                     config.max_iter)

# build model
model = NERModel(config,
                 embeddings,
                 ntags=len(vocab_tags),
                 nchars=len(vocab_chars))
model.build()
#x=raw_input('xxxxxxx')
# train, evaluate and interact
# model.train(train, dev, vocab_tags)
import time
start = time.time()
model.evaluate(dev, vocab_tags)
print(time.time() - start)
#model.interactive_shell(vocab_tags, processing_word, test)
Example #21: module-level training script using word-level features and a logger.
vocab_words = load_vocab(config.words_filename)
vocab_tags = load_vocab(config.tags_filename)

# get processing functions
processing_word = get_processing_word(vocab_words, lowercase=config.lowercase)
processing_tag = get_processing_word(vocab_tags, lowercase=False)

# get pre trained embeddings

embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create dataset
dev = CoNLLDataset(config.dev_filename, processing_word, processing_tag,
                   config.max_iter)
test = CoNLLDataset(config.test_filename, processing_word, processing_tag,
                    config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word, processing_tag,
                     config.max_iter)

# get logger
logger = get_logger(config.log_path)

# build model
model = NERModel(config, embeddings, ntags=len(vocab_tags), logger=logger)
model.build()

# train, evaluate and interact
model.train(train, dev, vocab_tags)
#model.evaluate(test, vocab_tags)
#model.interactive_shell(vocab_tags, processing_word)
Example #22: argparse setup for a BERT-based NERModel wrapped in DataParallel, plus tokenizer and DataLoader construction.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", type=int, default=128)
    parser.add_argument("--lr", type=float, default=0.0001)
    parser.add_argument("--n_epochs", type=int, default=1)
    parser.add_argument("--finetuning", dest="finetuning", action="store_true")
    parser.add_argument("--logdir", type=str, default="checkpoints/01")
    parser.add_argument("--trainset", type=str, default="data/train.txt")
    parser.add_argument("--validset", type=str, default="data/valid.txt")
    parser.add_argument("--model", type=str, default="bert-base-cased")
    hp = parser.parse_args()

    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device('cpu')
    model = NERModel(tag_size=len(TAGS),
                     device='cpu',
                     finetuning=True,
                     bert_model=hp.model)
    model = nn.DataParallel(model)

    tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                              do_lower_case=False)
    if tokenizer is not None:
        print("success")

    train_dataset = NerDataset(hp.trainset, tokenizer=tokenizer)
    eval_dataset = NerDataset(hp.validset, tokenizer=tokenizer)

    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=hp.batch_size,
                                 shuffle=True,
                                 num_workers=4,
Example #23: inference setup that loads pickled character and morpheme dictionaries, indexes the input, and restores trained weights on CPU.
            char2idx = pickle.load(f)
        with open('weights/morph_dict_lower.pkl', 'rb') as f:
            morph2idx = pickle.load(f)

    word2morph = word_to_morph(whole_data_morphs)

    indexed_data = data_to_idx(whole_data, embeddings)
    indexed_char = char_to_idx(whole_data, char2idx)
    indexed_morph = morph_to_idx(whole_data, morph2idx, word2morph)
    indexed_whole_data = combine_data(indexed_data, indexed_char,
                                      indexed_morph, MAX_SEQ_LENGTH)

    # initialize the model
    model = NERModel(word_embedding_dim, char_embedding_dim,
                     morph_embedding_dim, word_hidden_size,
                     char_hidden_size, morph_hidden_size, len(char2idx),
                     len(morph2idx), num_tags, word_num_layers,
                     char_num_layers, morph_num_layers,
                     dropout_prob).to(device)

    # load the model
    if not lowercase_model:
        model.load_state_dict(
            torch.load('weights/model_upper.pt',
                       map_location=torch.device('cpu')))
    else:
        model.load_state_dict(
            torch.load('weights/model_lower.pt',
                       map_location=torch.device('cpu')))

    model.eval()
    batch_size = 1
Example #24: module-level training script with word-level features, a logger, and an interactive shell.

# get processing functions
processing_word = get_processing_word(vocab_words,
                lowercase=config.lowercase)
processing_tag  = get_processing_word(vocab_tags, lowercase=False)

# get pre trained embeddings

embeddings = get_trimmed_glove_vectors(config.trimmed_filename)

# create dataset
dev   = CoNLLDataset(config.dev_filename, processing_word,
                    processing_tag, config.max_iter)
test  = CoNLLDataset(config.test_filename, processing_word,
                    processing_tag, config.max_iter)
train = CoNLLDataset(config.train_filename, processing_word,
                    processing_tag, config.max_iter)

# get logger
logger = get_logger(config.log_path)

# build model
model = NERModel(config, embeddings, ntags=len(vocab_tags),
                 logger=logger)
model.build()

# train, evaluate and interact
model.train(train, dev, vocab_tags)
# model.evaluate(test, vocab_tags)
model.interactive_shell(vocab_tags, processing_word)
Example #25: PyTorch DataLoaders, device setup, and grouped weight-decay optimizer parameters for an NERModel.
    train_dataset = dataset.NERdataset(train_sentences, train_tags)
    val_dataset = dataset.NERdataset(val_sentences, val_tags)

    train_dataloader = torch.utils.data.DataLoader(
        dataset = train_dataset,
        batch_size = config.TRAIN_BATCH_SIZE
    )

    val_dataloader = torch.utils.data.DataLoader(
        dataset = val_dataset,
        batch_size = config.VALID_BATCH_SIZE
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = NERModel(num_tags)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
Example #26: evaluation script that decodes predicted label ids back to tags and prints a seqeval classification report.
            if arr[i][j] < 3:
                arr[i][j] = enc_tags.inverse_transform([arr[i][j]])[0]
            elif arr[i][j] == 3:
                arr[i][j] = 'X'
            else:
                raise KeyError(str(arr[i][j]) + ' not found in label encoder')
    return arr

if __name__ == "__main__":
    my_parser = argparse.ArgumentParser()
    my_parser.version = '1.0'
    my_parser.add_argument('-g', '--grouped_entities', action='store_true', help='evaluate metrics with exact entity-level matching instead of wordpiece-level tokens')
    args = my_parser.parse_args()
    grouped_entities = args.grouped_entities

    meta_data = joblib.load(config.METADATA_PATH)
    enc_tags = meta_data['enc_tags']

    num_tags = len(list(enc_tags.classes_))
    sentences, tags = preprocess_data(enc_tags)
    test_dataloader = get_dataloader(sentences, tags)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = NERModel(num_tags)
    model.load_state_dict(torch.load(config.MODEL_PATH,map_location=device))
    tags_ypred, tags_ytrue = evaluate(test_dataloader, model, device, num_tags, grouped_entities=grouped_entities)
    # tags_ypred = enc_tags.inverse_transform(tags_ypred)
    # tags_ytrue = enc_tags.inverse_transform(tags_ytrue)
    tags_ypred = decode_transform(tags_ypred, enc_tags)
    tags_ytrue = decode_transform(tags_ytrue, enc_tags)
    # print(tags_ytrue,tags_ypred)
    print(seqeval_classification_report(tags_ytrue, tags_ypred))
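
Example #26's report call (seqeval_classification_report is presumably an alias for seqeval.metrics.classification_report) expects lists of per-sentence tag sequences. A small self-contained illustration:

from seqeval.metrics import classification_report

# toy gold and predicted tag sequences, one list of string tags per sentence
y_true = [["B-PER", "I-PER", "O"], ["B-LOC", "O"]]
y_pred = [["B-PER", "I-PER", "O"], ["O", "O"]]

print(classification_report(y_true, y_pred))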