    np.random.seed(0)

    # 4. Set the logger
    utils.set_logger(os.path.join(args.model_dir, 'train.log'))

    # 5. Create the input data pipeline
    logging.info("Loading the datasets...")
    # 5.1 specify features
    from collections import OrderedDict

    data_encoder = utils.load_obj(
        os.path.join(args.model_dir, 'data_encoder.pkl'))
    label_encoder = utils.load_obj(
        os.path.join(args.model_dir, 'label_encoder.pkl'))
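    # (both encoder objects are assumed to have been fitted and pickled into model_dir at training time)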
    # 5.2 load data
    data_loader = DataLoader(params, args.data_dir, data_encoder,
                             label_encoder)
    data = data_loader.load_data(['test'])
    test_data = data['test']
    # 5.3 specify the test dataset size
    params.test_size = test_data['size']
    test_data_iterator = data_loader.batch_iterator(test_data,
                                                    params,
                                                    shuffle=False)
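    # shuffle is off so the evaluation order stays deterministic across runs.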
    logging.info("- done.")

    # 6. Modeling
    # 6.1 Define the model
    model = LSTMCRF(
        params=params,
        char_vocab_length=data_encoder[CharEncoder.FEATURE_NAME].vocab_length,
        num_tags=label_encoder[EntityEncoder.FEATURE_NAME].num_tags,
    )
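
# The snippets in this listing restore their fitted encoders with utils.load_obj.
# The project's helper is not shown here; a minimal pickle-based sketch of such a
# save/load pair (the bodies below are an assumption, not the project's code):
import pickle

def save_obj(obj, path):
    # Serialize any picklable object (e.g. a fitted encoder) to disk.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_obj(path):
    # Restore an object previously written with save_obj.
    with open(path, 'rb') as f:
        return pickle.load(f)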
Example #2
    data_encoder = utils.load_obj(
        os.path.join(pretrained_model_dir, 'data_encoder.pkl'))
    pretrained_label_encoder = utils.load_obj(
        os.path.join(pretrained_model_dir, 'label_encoder.pkl'))
    label_encoder = OrderedDict()
    label_encoder[ClassEncoder.FEATURE_NAME] = ClassEncoder(
        os.path.join(args.data_dir, 'feats'))
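    # The pretrained data encoder is reused as-is, while a fresh label encoder is built from the new dataset's classes.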

    # 5.2 load data

    # Run configuration: no k-fold cross-validation; the dev set is neither merged into train nor trained on directly.
    k_fold = None
    combine_train_dev = False
    train_on_dev = False

    data_loader = DataLoader(params, args.data_dir, data_encoder,
                             label_encoder)
    if k_fold:
        logging.info('K-Fold turned on with folds: {}'.format(k_fold))
        splits_dir = [
            os.path.join(args.data_dir, 'split_' + str(split_num))
            for split_num in range(1, k_fold + 1)
        ]
    else:
        splits_dir = [args.data_dir]
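    # splits_dir now holds either <data_dir>/split_1 ... <data_dir>/split_<k_fold>,
    # or just the original data_dir when k-fold is disabled.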

    for split_dir in splits_dir:
        logging.info('training for: {}'.format(split_dir))
        args.data_dir = split_dir
        if k_fold:
            split_model_dir = os.path.join(args.model_dir,
                                           os.path.basename(split_dir))
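
# The snippets above and below set up logging through utils.set_logger. The project's
# helper is not reproduced here; a minimal sketch of such a function, assuming it
# mirrors log records both to the given file and to the console:
import logging

def set_logger(log_path):
    # Attach a file handler and a console handler to the root logger (once).
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        file_handler = logging.FileHandler(log_path)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
        logger.addHandler(file_handler)
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(logging.Formatter('%(message)s'))
        logger.addHandler(stream_handler)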
    np.random.seed(0)

    # 4. Set the logger
    utils.set_logger(os.path.join(args.model_dir, 'train.log'))

    # 5. Create the input data pipeline
    logging.info("Loading the datasets...")
    # 5.1 specify features

    data_encoder = utils.load_obj(
        os.path.join(args.model_dir, 'data_encoder.pkl'))
    label_encoder = utils.load_obj(
        os.path.join(args.model_dir, 'label_encoder.pkl'))

    # 5.2 load data
    data_loader = DataLoader(params, args.data_dir, data_encoder,
                             label_encoder)
    data = data_loader.load_data([data_to_use])
    test_data = data[data_to_use]
    # 5.3 specify the test dataset size
    params.test_size = test_data['size']
    test_data_iterator = data_loader.batch_iterator(test_data,
                                                    params,
                                                    shuffle=False,
                                                    sort_by_legth=False)
    logging.info("- done.")

    # 6. Modeling
    # 6.1 Define the model
    from src.tc.model.net import CNNTC

    model = CNNTC(
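
# The project's CNNTC (src.tc.model.net) is not shown in this listing. For orientation,
# a minimal convolutional text classifier in the same spirit; every name, layer size,
# and the forward interface below are assumptions rather than the project's implementation:
import torch
import torch.nn as nn

class SimpleCNNTC(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes,
                 num_filters=100, kernel_sizes=(3, 4, 5)):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList(
            [nn.Conv1d(embedding_dim, num_filters, k) for k in kernel_sizes])
        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)

    def forward(self, token_ids):
        # token_ids: (batch, seq_len) of word indices -> logits: (batch, num_classes)
        x = self.embedding(token_ids).transpose(1, 2)  # (batch, emb, seq)
        pooled = [conv(x).relu().max(dim=2).values for conv in self.convs]
        return self.fc(torch.cat(pooled, dim=1))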