Example #1
def export(model, config):
    runner = KerasModelDatasetRunner(model,
                                     model_dir=config['model_dir'],
                                     model_name='dssm',
                                     configs=config,
                                     logger_name='dssm')
    runner.export(os.path.join(config['model_dir'], 'export', '1'), ckpt=None)
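A minimal driver for this helper might look as follows. This is a sketch: `build_dssm_model` is a hypothetical model factory, and the config is reduced to the one key the snippet reads.

config = {'model_dir': '/tmp/dssm'}
model = build_dssm_model(config)  # hypothetical factory; any compiled Keras model fits
export(model, config)
# The SavedModel lands in <model_dir>/export/1 -- an integer version
# directory, which is the layout TensorFlow Serving expects.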
Example #2
def train(model, config):
    runner = KerasModelDatasetRunner(model,
                                     model_dir=config['model_dir'],
                                     model_name='dssm',
                                     configs=config,
                                     logger_name='dssm')
    train_dataset = datasets.build_train_dataset(config['train_files'], config)
    eval_dataset = None
    if config['eval_files']:
        eval_dataset = datasets.build_eval_dataset(config['eval_files'],
                                                   config)
    runner.train(train_dataset, val_dataset=eval_dataset)
    weights = runner.model.get_weights()
    vocab = []
    with open(config['vocab_file'], "r", encoding="utf-8") as f:
        for word in f.readlines():
            vocab.append(word.strip("\n"))
    if len(vocab) != weights[0].shape[0]:
        print("vocab size:{} != weights[0].shape[0]:{}".format(
            len(vocab), weights[0].shape[0]))
    with open(os.path.join(config['model_dir'], "embedding.txt"),
              "w",
              encoding="utf-8") as f:
        f.write(" ".join([str(weights[0].shape[0]),
                          str(weights[0].shape[1])]) + "\n")
        for i, vec in enumerate(weights[0]):
            f.write(" ".join([vocab[i]] + [str(x) for x in vec]) + "\n")
    print('weights:{}, {}'.format(len(weights), weights[0].shape))
    print('weights:{}, {}'.format(weights[1].shape, weights[2].shape))
    print('weights:{}, {}'.format(weights[1], weights[2]))
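The `embedding.txt` written above follows the plain-text word2vec format: a `count dim` header line, then one `token v1 ... vn` line per vocabulary entry. It can therefore be read back with standard tooling; a sketch using gensim, assuming it is installed and that `model_dir` was `/tmp/dssm`:

import os

from gensim.models import KeyedVectors

# binary=False because train() writes the text variant of the format.
vectors = KeyedVectors.load_word2vec_format(
    os.path.join('/tmp/dssm', 'embedding.txt'), binary=False)
print(vectors[vectors.index_to_key[0]])  # embedding of the first vocab token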
Example #3
def predict(model, config):
    runner = KerasModelDatasetRunner(model,
                                     model_dir=config['model_dir'],
                                     model_name='dssm',
                                     configs=config,
                                     logger_name='dssm')
    predict_dataset, uuid_key_list = datasets.build_predict_dataset(
        config['predict_files'], config)
    res = runner.predict(predict_dataset)
    print("predict data count: {}".format(len(res)))
    print(model.output_names)
    print(model.output)
    rt = dict(zip(model.output_names, res))
    rrt = {}
    for k, v in rt.items():
        rrt[k] = dict(zip(uuid_key_list, v))
    print(rt)
    print("uuid_key_list: {}, out:{}".format(len(uuid_key_list), len(res[-1])))
    with open(config['total_files'], "r",
              encoding="utf-8") as f, open(config['predict_files_rt'],
                                           "w",
                                           encoding="utf-8") as f_o:
        for line in tqdm(f.readlines()):
            line = json.loads(line)
            uuid_key = line.get("uuid_key")
            if uuid_key in uuid_key_list:
                for name in rrt.keys():
                    line[name] = float(list(rrt.get(name).get(uuid_key))[0])
                f_o.write(json.dumps(line, ensure_ascii=False) + "\n")
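One note on the join loop above: `uuid_key in uuid_key_list` is a linear scan for every line of `total_files`, so the whole join is quadratic. A set makes each lookup constant-time; a minimal sketch of the same loop, reusing the snippet's names (`config`, `rrt`, and `uuid_key_list` come from `predict` above):

import json

from tqdm import tqdm

uuid_key_set = set(uuid_key_list)  # built once; O(1) membership tests
with open(config['total_files'], "r", encoding="utf-8") as f, \
        open(config['predict_files_rt'], "w", encoding="utf-8") as f_o:
    for raw in tqdm(f):
        line = json.loads(raw)
        uuid_key = line.get("uuid_key")
        if uuid_key in uuid_key_set:
            for name, by_uuid in rrt.items():
                line[name] = float(list(by_uuid[uuid_key])[0])
            f_o.write(json.dumps(line, ensure_ascii=False) + "\n")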
Example #4
def evaluate(model, config):
    if not config.get('eval_files', None):
        raise ValueError('`eval_files` must be provided.')
    runner = KerasModelDatasetRunner(model,
                                     model_dir=config['model_dir'],
                                     model_name='dssm',
                                     configs=config,
                                     logger_name='dssm')
    eval_dataset = datasets.build_eval_dataset(config['eval_files'], config)
    runner.eval(eval_dataset)
Example #5
def predict(model, config):
    runner = KerasModelDatasetRunner(model,
                                     model_dir=config['model_dir'],
                                     model_name='dssm',
                                     configs=config,
                                     logger_name='dssm')
    predict_dataset = datasets.build_predict_dataset(config['predict_files'],
                                                     config)
    res = runner.predict(predict_dataset)
    print(res[0])
    print(res[1])
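As Example #3 shows, the list returned by `runner.predict` lines up with `model.output_names`, so the raw `res[0]` / `res[1]` prints above can be labeled by output name:

outputs = dict(zip(model.output_names, res))  # keys are the model's output layer names
print(outputs)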
Example #6
def train(model, config):
    runner = KerasModelDatasetRunner(model,
                                     model_dir=config['model_dir'],
                                     model_name='dssm',
                                     configs=config,
                                     logger_name='dssm')
    train_dataset = datasets.build_train_dataset(config['train_files'], config)
    eval_dataset = None
    if config['eval_files']:
        eval_dataset = datasets.build_eval_dataset(config['eval_files'],
                                                   config)
    runner.train(train_dataset, val_dataset=eval_dataset)
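For reference, the `config` dict these DSSM helpers read can be collected in one place. The keys below all appear in the snippets above; the values are placeholders:

config = {
    'model_dir': '/tmp/dssm',                  # checkpoints, export/1, embedding.txt
    'train_files': ['train.jsonl'],            # input to datasets.build_train_dataset
    'eval_files': ['eval.jsonl'],              # optional; enables validation
    'predict_files': ['predict.jsonl'],        # input to datasets.build_predict_dataset
    'predict_files_rt': 'predict_out.jsonl',   # where Example #3 writes scored lines
    'total_files': 'total.jsonl',              # JSON-lines file joined in Example #3
    'vocab_file': 'vocab.txt',                 # one token per line; row i of the embedding
}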
Example #7
                        default='/tmp/matchpyramid',
                        help='File path to save model.')

    args, _ = parser.parse_known_args()

    if args.model == 'indicator':
        model = models.build_indicator_model(models.model_config)
    elif args.model == 'dot':
        model = models.build_dot_model(models.model_config)
    elif args.model == 'cosine':
        model = models.build_cosine_model(models.model_config)
    else:
        raise ValueError('Invalid model: %s' % args.model)

    runner = KerasModelDatasetRunner(model=model,
                                     model_name='mp',
                                     model_dir=args.model_dir,
                                     configs=None)

    if args.action == 'train':
        train_files = [os.path.join(utils.testdat_dir(), 'train.txt')]
        # use train files as validation files; not recommended in actual use
        valid_files = [os.path.join(utils.testdat_dir(), 'train.txt')]
        train_dataset = dataset.build_train_dataset(train_files)
        valid_dataset = dataset.build_eval_dataset(valid_files)
        runner.train(dataset=train_dataset,
                     val_dataset=valid_dataset,
                     ckpt=args.model_dir)
    elif args.action == 'eval':
        eval_files = [os.path.join(utils.testdat_dir(), 'train.txt')]
        eval_dataset = dataset.build_eval_dataset(eval_files)
        runner.eval(dataset=eval_dataset)
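Example #7 begins mid-way through building its argument parser. The reads of `args.model`, `args.action`, and `args.model_dir` suggest a preamble along these lines; this is a reconstruction, so the flag names and defaults are assumptions (except for the visible `--model_dir` tail):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--action', type=str, default='train',
                    help='What to do: train or eval.')
parser.add_argument('--model', type=str, default='dot',
                    help='Model variant: indicator, dot or cosine.')
parser.add_argument('--model_dir', type=str,
                    default='/tmp/matchpyramid',
                    help='File path to save model.')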
Example #8
    logging.info('vocab size of tokenizer: %d' % tokenizer.vocab_size)
    config['vocab_size'] = tokenizer.vocab_size

    args, _ = parser.parse_known_args()
    if 'mlp' == args.model:
        model = models.build_mlp_model(config)
    elif 'lstm' == args.model:
        model = models.build_lstm_model(config)
    else:
        raise ValueError('Invalid model: %s' % args.model)

    dataset = XYZSameFileDataset(x_tokenizer=tokenizer, y_tokenizer=tokenizer, config=None)

    runner = KerasModelDatasetRunner(
        model=model,
        model_name='dssm',
        model_dir=config['model_dir'],
        configs=config)

    if 'train' == args.action:
        train_files = config['train_files']
        train_dataset = dataset.build_train_dataset(train_files=train_files)
        eval_dataset = dataset.build_eval_dataset(eval_files=config['eval_files']) if config['eval_files'] else None
        runner.train(dataset=train_dataset, val_dataset=eval_dataset, ckpt=None)

    elif 'eval' == args.action:
        if not config['eval_files']:
            raise ValueError('eval_files must not be None in eval mode.')
        eval_dataset = dataset.build_eval_dataset(eval_files=config['eval_files'])
        runner.eval(dataset=eval_dataset, ckpt=None)
        logging.info('Finished evaluating model.')
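Example #8 handles only `train` and `eval`; a `predict` branch would follow the same pattern. A sketch, where `build_predict_dataset` and the `predict_files` key are assumptions modeled on Examples #3 and #5:

    elif 'predict' == args.action:
        # Hypothetical branch: mirrors the train/eval branches above.
        predict_dataset = dataset.build_predict_dataset(predict_files=config['predict_files'])
        res = runner.predict(predict_dataset)
        logging.info('Predicted %d outputs.', len(res))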