import json
import logging
import os

from tqdm import tqdm

# The names below (KerasModelDatasetRunner, datasets, dataset, models, utils,
# tokenizer, parser, XYZSameFileDataset) are project-local and assumed to be
# importable from the surrounding package.


def export(model, config):
    runner = KerasModelDatasetRunner(model,
                                     model_dir=config['model_dir'],
                                     model_name='dssm',
                                     configs=config,
                                     logger_name='dssm')
    runner.export(os.path.join(config['model_dir'], 'export', '1'), ckpt=None)
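# A minimal sketch (not in the original script) of consuming the exported
# artifact, assuming runner.export writes a TensorFlow 2.x SavedModel under
# <model_dir>/export/1, the versioned layout TF Serving expects:
def load_exported_model(model_dir):
    import tensorflow as tf
    return tf.saved_model.load(os.path.join(model_dir, 'export', '1'))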
def train(model, config):
    runner = KerasModelDatasetRunner(model,
                                     model_dir=config['model_dir'],
                                     model_name='dssm',
                                     configs=config,
                                     logger_name='dssm')
    train_dataset = datasets.build_train_dataset(config['train_files'], config)
    eval_dataset = None
    if config['eval_files']:
        eval_dataset = datasets.build_eval_dataset(config['eval_files'], config)
    runner.train(train_dataset, val_dataset=eval_dataset)

    # Dump the learned embedding matrix (assumed to be the first weight tensor)
    # in word2vec text format: a "<vocab_size> <dim>" header line, then one
    # "<token> <v1> ... <vn>" line per vocabulary entry.
    weights = runner.model.get_weights()
    vocab = []
    with open(config['vocab_file'], 'r', encoding='utf-8') as f:
        for word in f:
            vocab.append(word.strip('\n'))
    if len(vocab) != weights[0].shape[0]:
        # A mismatch here misaligns (or crashes) the export loop below.
        print('vocab size: {} != weights[0].shape[0]: {}'.format(
            len(vocab), weights[0].shape[0]))
    with open(os.path.join(config['model_dir'], 'embedding.txt'),
              'w', encoding='utf-8') as f:
        f.write(' '.join([str(weights[0].shape[0]),
                          str(weights[0].shape[1])]) + '\n')
        for i, vec in enumerate(weights[0]):
            f.write(' '.join([vocab[i]] + [str(x) for x in vec]) + '\n')
    print('weights: {}, {}'.format(len(weights), weights[0].shape))
    print('weights: {}, {}'.format(weights[1].shape, weights[2].shape))
    print('weights: {}, {}'.format(weights[1], weights[2]))
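# Because the dump above follows the word2vec text format, it can be read
# back with gensim -- a sketch assuming gensim is installed (it is not
# otherwise used here):
def load_embedding_txt(model_dir):
    from gensim.models import KeyedVectors
    return KeyedVectors.load_word2vec_format(
        os.path.join(model_dir, 'embedding.txt'), binary=False)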
def predict(model, config):
    runner = KerasModelDatasetRunner(model,
                                     model_dir=config['model_dir'],
                                     model_name='dssm',
                                     configs=config,
                                     logger_name='dssm')
    predict_dataset, uuid_key_list = datasets.build_predict_dataset(
        config['predict_files'], config)
    res = runner.predict(predict_dataset)
    print('predict data count: {}'.format(len(res)))
    print(model.output_names)
    print(model.output)
    # Map each model output name to a {uuid_key: prediction} dict.
    rt = dict(zip(model.output_names, res))
    rrt = {}
    for k, v in rt.items():
        rrt[k] = dict(zip(uuid_key_list, v))
    print(rt)
    print('uuid_key_list: {}, out: {}'.format(len(uuid_key_list), len(res[-1])))
    # Re-read the full input file and attach one float per output name to each
    # record whose uuid_key was predicted. A set makes the membership test O(1).
    uuid_keys = set(uuid_key_list)
    with open(config['total_files'], 'r', encoding='utf-8') as f, \
            open(config['predict_files_rt'], 'w', encoding='utf-8') as f_o:
        for line in tqdm(f):
            line = json.loads(line)
            uuid_key = line.get('uuid_key')
            if uuid_key in uuid_keys:
                for name in rrt:
                    line[name] = float(list(rrt[name][uuid_key])[0])
            f_o.write(json.dumps(line, ensure_ascii=False) + '\n')
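# For reference, config['total_files'] is expected to be a JSON Lines file
# whose records carry a "uuid_key" field. A hypothetical input line (every
# field other than "uuid_key" is illustrative only):
#
#   {"uuid_key": "q-00042", "query": "...", "doc": "..."}
#
# The matching output line gains one float field per model output name.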
def evaluate(model, config):
    if not config.get('eval_files', None):
        raise ValueError('`eval_files` must be provided.')
    runner = KerasModelDatasetRunner(model,
                                     model_dir=config['model_dir'],
                                     model_name='dssm',
                                     configs=config,
                                     logger_name='dssm')
    eval_dataset = datasets.build_eval_dataset(config['eval_files'], config)
    runner.eval(eval_dataset)
def predict(model, config):
    runner = KerasModelDatasetRunner(model,
                                     model_dir=config['model_dir'],
                                     model_name='dssm',
                                     configs=config,
                                     logger_name='dssm')
    predict_dataset = datasets.build_predict_dataset(config['predict_files'],
                                                     config)
    res = runner.predict(predict_dataset)
    print(res[0])
    print(res[1])
def train(model, config):
    runner = KerasModelDatasetRunner(model,
                                     model_dir=config['model_dir'],
                                     model_name='dssm',
                                     configs=config,
                                     logger_name='dssm')
    train_dataset = datasets.build_train_dataset(config['train_files'], config)
    eval_dataset = None
    if config['eval_files']:
        eval_dataset = datasets.build_eval_dataset(config['eval_files'], config)
    runner.train(train_dataset, val_dataset=eval_dataset)
parser.add_argument('--model_dir',
                    default='/tmp/matchpyramid',
                    help='File path to save model.')
args, _ = parser.parse_known_args()

if args.model == 'indicator':
    model = models.build_indicator_model(models.model_config)
elif args.model == 'dot':
    model = models.build_dot_model(models.model_config)
elif args.model == 'cosine':
    model = models.build_cosine_model(models.model_config)
else:
    raise ValueError('Invalid model: %s' % args.model)

runner = KerasModelDatasetRunner(model=model,
                                 model_name='mp',
                                 model_dir=args.model_dir,
                                 configs=None)

if args.action == 'train':
    train_files = [os.path.join(utils.testdat_dir(), 'train.txt')]
    # Use the train files as validation files; not recommended in actual use.
    valid_files = [os.path.join(utils.testdat_dir(), 'train.txt')]
    train_dataset = dataset.build_train_dataset(train_files)
    valid_dataset = dataset.build_eval_dataset(valid_files)
    runner.train(dataset=train_dataset,
                 val_dataset=valid_dataset,
                 ckpt=args.model_dir)
elif args.action == 'eval':
    eval_files = [os.path.join(utils.testdat_dir(), 'train.txt')]
    eval_dataset = dataset.build_eval_dataset(eval_files)
    runner.eval(dataset=eval_dataset)
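# Hypothetical invocation of this matchpyramid script (the flag names are
# inferred from the args.model / args.action / args.model_dir reads above,
# and the file name is an assumption):
#
#   python run_matchpyramid.py --model=dot --action=train --model_dir=/tmp/matchpyramid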
logging.info('vocab size of tokenizer: %d' % tokenizer.vocab_size)
config['vocab_size'] = tokenizer.vocab_size
args, _ = parser.parse_known_args()

if 'mlp' == args.model:
    model = models.build_mlp_model(config)
elif 'lstm' == args.model:
    model = models.build_lstm_model(config)
else:
    raise ValueError('Invalid model: %s' % args.model)

dataset = XYZSameFileDataset(x_tokenizer=tokenizer,
                             y_tokenizer=tokenizer,
                             config=None)
runner = KerasModelDatasetRunner(model=model,
                                 model_name='dssm',
                                 model_dir=config['model_dir'],
                                 configs=config)

if 'train' == args.action:
    train_files = config['train_files']
    train_dataset = dataset.build_train_dataset(train_files=train_files)
    eval_dataset = (dataset.build_eval_dataset(eval_files=config['eval_files'])
                    if config['eval_files'] else None)
    runner.train(dataset=train_dataset, val_dataset=eval_dataset, ckpt=None)
elif 'eval' == args.action:
    if not config['eval_files']:
        raise ValueError('eval_files must not be None in eval mode.')
    eval_dataset = dataset.build_eval_dataset(eval_files=config['eval_files'])
    runner.eval(dataset=eval_dataset, ckpt=None)
    logging.info('Finished evaluating model.')
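# Hypothetical invocation of this dssm script (flag names inferred from the
# args.model / args.action reads above; the file name is an assumption):
#
#   python run_dssm.py --model=lstm --action=train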