예제 #1
0
def main_train():
    """Load the data sets, build or load the id mappings, and start training.

    Reads every setting from the module-level ``FLAGS`` object. The mappings
    (char, target and feature dictionaries) are created from the training
    data the first time and pickled to ``FLAGS.map_file``; later runs load
    them from that file instead. The model config is loaded from
    ``FLAGS.config_file`` when that file exists, otherwise it is created via
    ``create_config_model``. Finally everything is handed to ``train``.
    """
    # Load data sets.
    # sentences = [[(word11, tag11), ...], [(word21, tag21), ...], ...]
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use the selected tagging scheme (IOB / IOBES). The sentence lists are
    # updated in place (the return value is not used). The dev set is
    # converted as well so that all three splits share one scheme.
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # Create the char / target / feature mapping dictionaries, or load them
    # if they were already stored.
    if not os.path.isfile(FLAGS.map_file):
        # Create the dictionary for chars.
        if FLAGS.pre_emb:
            # A pre-trained embedding file is used: build the dictionary from
            # both the pre-trained embeddings and the training set.
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            # Augment the training-set char dictionary with pre-trained chars;
            # the chars of the test sentences are passed as a flat list.
            test_chars = [w[0] for s in test_sentences for w in s]
            _dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                test_chars,
            )
        else:
            # Otherwise build the dictionary from the training set only.
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags.
        _t, target_to_id, id_to_target = tag_mapping(train_sentences)

        # Create the mapping dictionaries for the additional features.
        _f, feature_to_id, id_to_feature = feature_mapping(train_sentences, FLAGS.features)

        # Store the char, target and feature mappings.
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, target_to_id, id_to_target, feature_to_id, id_to_feature], f)
    else:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point FLAGS.map_file at files produced by this program.
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, target_to_id, id_to_target, feature_to_id, id_to_feature = pickle.load(f)

    # Make the paths for storing the log and model config if they do not exist.
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = create_config_model(FLAGS, char_to_id, target_to_id, feature_to_id)
    logger = get_logger(FLAGS.log_file)
    print_config(config, logger)

    train(config, train_sentences, dev_sentences, test_sentences, char_to_id, feature_to_id, target_to_id, id_to_char,
          id_to_target, logger)
예제 #2
0
    def load_data(self):
        """Load the annotated and raw data and initialise the counters.

        The annotation file lives at ``data/annotated/<name>.json``. When it
        does not exist yet the annotated data starts out as an empty list.
        The raw data is always read from ``config.processed_path``.
        """
        self.data_path = "data/annotated/" + self.name + ".json"
        # make_path is given the file path before the existence check below;
        # what exactly it creates is not visible from this block.
        make_path(self.data_path)

        already_annotated = os.path.exists(self.data_path)
        self.ann_data = load_json(self.data_path) if already_annotated else []
        self.raw_data = load_json(config.processed_path)

        self.total_num = len(self.raw_data)
        self.annotated_num = len(self.ann_data)
        # The page being shown starts right after the last annotated item.
        self.position = self.annotated_num
예제 #3
0
    def __init__(self, song_length: int, dim, n_channels: int, batch_size: int,
                 args):
        """Store the hyper-parameters, build the model and set up callbacks.

        Args:
            song_length: stored as ``self.song_length``.
            dim: iterable of dimensions; unpacked together with
                ``n_channels`` to form ``self.input_shape``.
            n_channels: number of channels, appended to ``dim``.
            batch_size: stored as ``self.batch_size``.
            args: object providing ``d`` (dataset name), ``logging`` and
                ``gpu`` attributes.
        """
        self.path = "../sdb/data/%s/%s.npz"

        self.song_length = song_length
        self.dimension = dim
        self.n_channels = n_channels
        # The input shape is read back from an (uninitialised) array of the
        # requested dimensions plus the channel axis.
        shape_probe = np.empty((*self.dimension, self.n_channels))
        self.input_shape = shape_probe.shape
        # 'gtzan' has 10 labels; every other dataset name is given 50.
        if args.d == 'gtzan':
            self.n_labels = 10
        else:
            self.n_labels = 50
        self.batch_size = batch_size

        # build_model runs after the attributes above have been assigned.
        self.model = self.build_model()
        self.model.summary()

        self.workers = multiprocessing.cpu_count()
        print('Using ' + str(self.workers) + ' workers')

        # Callbacks
        self.callbacks = [
            LearningRateTracker(),
            EarlyStopping(monitor='val_loss',
                          patience=3,
                          verbose=0,
                          mode='auto'),
        ]
        if args.logging:
            # NOTE(review): self.model_name is not assigned in this method;
            # presumably it is provided by the class or a subclass - confirm.
            csv_name = "%s-%s_%s.csv" % (args.d, self.model_name,
                                         datetime.now())
            csv_path = utils.make_path(args.logging, csv_name)
            self.callbacks.append(CSVLogger(filename=csv_path))

        # Any falsy args.gpu value is normalised to None.
        self.gpu = args.gpu if args.gpu else None

        self.dataset = args.d
예제 #4
0
파일: main.py 프로젝트: memoiry/NLU
def train_ner():
    """Train the NER model for 25 epochs, evaluating after each epoch.

    Reads every setting from the module-level ``FLAGS`` object. The char and
    tag mappings are built from the training data on the first run and
    pickled to ``FLAGS.map_file``; later runs load them from that file. After
    each epoch the model is evaluated on the dev set (and saved when
    ``evaluate`` reports a new best) and then on the test set.
    """
    clean(FLAGS)
    # Load data sets.
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use the selected tagging scheme (IOB / IOBES); updates in place.
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # Create the mappings if they do not exist yet.
    if not os.path.isfile(FLAGS.map_file):
        # Create the dictionary for chars.
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            # The chars of the test sentences are passed as a flat list.
            test_chars = [w[0] for s in test_sentences for w in s]
            _dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file, test_chars)
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences,
                                                      FLAGS.lower)

        # Create a dictionary and a mapping for tags.
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point FLAGS.map_file at files produced by this program.
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # Prepare data: get collections of lists containing indices.
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id,
                                 FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id,
                               FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id,
                                FLAGS.lower)
    # Report the real dev-set size (this used to print a hard-coded 0).
    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # Make the paths for storing the log and model if they do not exist.
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    # NOTE(review): second make_path call kept from the original; it looks
    # redundant with the call above - confirm before removing.
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # Let TensorFlow allocate GPU memory on demand.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        # Batch losses collected since the last progress log line.
        loss = []
        for _ in range(25):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            # Save only when the dev evaluation reports a new best result.
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
예제 #5
0
def train():
    """Train the joint slot (NER) / intent model for 100 epochs.

    Reads every setting from the module-level ``args`` object. The char, tag
    and intent mappings are loaded from ``args.map_file`` when that file
    exists; otherwise they are built from the training data and pickled
    there. After each epoch the model is evaluated on the dev set and saved
    when ``evaluate`` reports a new best. The test set is evaluated once,
    after the last epoch.
    """
    # Load the three data sets.
    train_sentences = load_sentences(args.train_file, args.lower, args.zeros)
    dev_sentences = load_sentences(args.dev_file, args.lower, args.zeros)
    test_sentences = load_sentences(args.test_file, args.lower, args.zeros)

    # Check and maintain the tag annotation (IOB / IOBES) of every data set,
    # in the order train, test, dev.
    for corpus in (train_sentences, test_sentences, dev_sentences):
        update_tag_scheme(corpus, args.tag_schema)

    # Load the char / tag / intent dictionaries from the pickle file, or
    # create them from the data and store them there.
    if os.path.isfile(args.map_file):
        with open(args.map_file, "rb") as map_in:
            (char_to_id, id_to_char, tag_to_id, id_to_tag, intent_to_id,
             id_to_intent) = pickle.load(map_in)
    else:
        # Create the dictionary for chars.
        if args.pre_emb:
            train_char_dico = char_mapping(train_sentences, args.lower)[0]
            # Augment the char dictionary with the pre-trained embeddings,
            # then get back the char <-> id mappings.
            test_chars = [token[0] for sent in test_sentences for token in sent]
            _, char_to_id, id_to_char = augment_with_pretrained(
                train_char_dico.copy(), args.emb_file, test_chars)
        else:
            _, char_to_id, id_to_char = char_mapping(train_sentences,
                                                     args.lower)

        # Get the tag <-> id and intent <-> id mappings.
        mappings = tag_mapping(train_sentences)
        tag_to_id, id_to_tag, intent_to_id, id_to_intent = mappings

        with open(args.map_file, "wb") as map_out:
            pickle.dump(
                [char_to_id, id_to_char, tag_to_id, id_to_tag, intent_to_id,
                 id_to_intent],
                map_out)

    # Extract sentence features: collections of lists containing indices.
    train_data = prepare_dataset(
        train_sentences, char_to_id, tag_to_id, intent_to_id, args.lower)
    dev_data = prepare_dataset(
        dev_sentences, char_to_id, tag_to_id, intent_to_id, args.lower)
    test_data = prepare_dataset(
        test_sentences, char_to_id, tag_to_id, intent_to_id, args.lower)

    split_sizes = (len(train_data), len(dev_data), len(test_data))
    print("%i / %i / %i sentences in train / dev / test." % split_sizes)

    # Batches that can be fed to the model.
    train_manager = BatchManager(train_data, args.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # Make the paths for storing the log and model if they do not exist.
    make_path(args)
    if not os.path.isfile(args.config_file):
        config = config_model(char_to_id, tag_to_id, intent_to_id)
        save_config(config, args.config_file)
    else:
        config = load_config(args.config_file)
    make_path(args)

    logger = get_logger(args.log_file)
    print_config(config, logger)

    # Let TensorFlow allocate GPU memory on demand.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True

    # Number of iterations needed for one full pass over the training set.
    steps_per_epoch = train_manager.len_data

    with tf.Session(config=tf_config) as sess:
        # Model creation is the core of this project.
        model = create_model(sess, Model, args.ckpt_path, load_word2vec,
                             config, id_to_char, logger)
        logger.info("start training")
        # Batch losses collected since the last progress log line.
        loss_slot, loss_intent = [], []

        for _epoch in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, slot_loss, intent_loss = model.run_step(
                    sess, True, batch)
                loss_slot.append(slot_loss)
                loss_intent.append(intent_loss)

                if step % args.steps_check != 0:
                    continue

                iteration = step // steps_per_epoch + 1
                mean_intent = np.mean(loss_intent)
                mean_slot = np.mean(loss_slot)
                logger.info("iteration:{} step:{}/{}, "
                            "INTENT loss:{:>9.6f}, "
                            "NER loss:{:>9.6f}".format(
                                iteration, step % steps_per_epoch,
                                steps_per_epoch, mean_intent, mean_slot))
                loss_slot, loss_intent = [], []

            # Save only when the dev evaluation reports a new best result.
            is_best = evaluate(sess, model, "dev", dev_manager, id_to_tag,
                               logger)
            if is_best:
                save_model(sess, model, args.ckpt_path, logger)

        # The test set is evaluated once, after all epochs have finished.
        evaluate(sess, model, "test", test_manager, id_to_tag, logger)
예제 #6
0
def _collect_split_labels(sess, model, manager, id_to_tag):
    """Run the model over every batch of ``manager``; return (gold, predicted) labels."""
    gold_labels = []
    predicted_labels = []
    for batch in manager.iter_batch(shuffle=False):
        lengths, logits = model.run_step(sess, False, batch)
        _, _, _, tags = batch
        transition = model.transition.eval(session=sess)
        pre_seq = model.predict(logits, transition, lengths)
        predicted_labels.extend(recover_label(pre_seq, lengths, id_to_tag))
        gold_labels.extend(recover_label(tags, lengths, id_to_tag))
    return gold_labels, predicted_labels


def train():
    """Train the Transformer-CRF NER model for 100 epochs.

    Reads every setting from the module-level ``FLAGS`` object. The char and
    tag mappings are built from the training data on the first run and
    pickled to ``FLAGS.map_file``; later runs load them from that file.

    After each epoch the dev and test sets are scored separately with
    ``get_ner_fmeasure`` and the model is saved whenever the dev f-score
    beats the best value seen so far in this run.
    """
    # Load data sets.
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower,
                                     FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # NOTE: the tagging-scheme conversion (update_tag_scheme) is disabled in
    # this function; the sentences are used with the scheme they were loaded
    # with.

    # Create the mappings if they do not exist yet.
    if not os.path.isfile(FLAGS.map_file):
        # Create the dictionary for chars.
        _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags.
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        # The save directory may already exist even though the map file does
        # not; only create it when it is missing so this does not raise.
        save_dir = '%s' % FLAGS.save_path
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point FLAGS.map_file at files produced by this program.
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # Prepare data: get collections of lists containing indices.
    train_data = prepare_padding_dataset(train_sentences, FLAGS.max_seq_len,
                                         char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_padding_dataset(dev_sentences, FLAGS.max_seq_len,
                                       char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_padding_dataset(test_sentences, FLAGS.max_seq_len,
                                        char_to_id, tag_to_id, FLAGS.lower)

    print("%i / %i / %i sentences in train / dev / test." %
          (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # Make the paths for storing the log and model if they do not exist.
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    # NOTE(review): second make_path call kept from the original; it looks
    # redundant with the call above - confirm before removing.
    make_path(FLAGS)

    log_path = os.path.join(FLAGS.save_path, "log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # Let TensorFlow allocate GPU memory on demand.
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = TransformerCRFModel(config, is_training=True)
        sess.run(tf.global_variables_initializer())
        logger.info("start training")
        # Batch losses collected since the last progress log line.
        loss = []
        # Best scores are tracked across epochs, so they must be initialised
        # once, before the epoch loop; resetting them per epoch would make
        # every epoch with f > 0 count as "best".
        best_dev_f1 = 0.0
        best_test_f1 = 0.0
        for i in range(100):
            # Losses of every batch in this epoch, for the epoch summary. The
            # progress window above is emptied at every steps_check and so
            # cannot be used for that.
            epoch_loss = []
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                epoch_loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            # Dev evaluation.
            source_tag, predict_lists = _collect_split_labels(
                sess, model, dev_manager, id_to_tag)
            if epoch_loss:
                train_loss_v = np.round(float(np.mean(epoch_loss)), 4)
            else:
                # No training batches at all: report NaN without a warning.
                train_loss_v = float("nan")
            print('****************************************************')
            acc, p, r, f = get_ner_fmeasure(source_tag, predict_lists,
                                            config["tag_schema"])
            logger.info('epoch:\t{}\ttrain loss:\t{}\t'.format(
                i + 1, train_loss_v))
            logger.info('dev acc:\t{}\tp:\t{}\tr:\t{}\tf:\t{}'.format(
                acc, p, r, f))

            # Test evaluation uses fresh label lists so that the test metrics
            # are not mixed with the dev predictions.
            test_source_tag, test_predict_lists = _collect_split_labels(
                sess, model, test_manager, id_to_tag)
            acc_t, p_t, r_t, f_t = get_ner_fmeasure(test_source_tag,
                                                    test_predict_lists,
                                                    config["tag_schema"])
            logger.info('test acc:\t{}\tp:\t{}\tr:\t{}\tf:\t{}'.format(
                acc_t, p_t, r_t, f_t))
            if f > best_dev_f1:
                save_model(sess, model, FLAGS.ckpt_path, logger)
                best_dev_f1 = f
                # Test score of the checkpoint that is best on dev.
                best_test_f1 = f_t
                logger.info(
                    'save epoch:\t{} model with best dev f1-score'.format(i +
                                                                          1))

            print('****************************************************\n\n')