def run_model(config, save_dir, evaluate_only=False): tf.logging.set_verbosity('INFO') config = load_config([config]) model = load_custom_model(save_dir, NMTCustom()) session_config = tf.ConfigProto(intra_op_parallelism_threads=0, inter_op_parallelism_threads=0) runner = Runner(model, config, seed=None, num_devices=1, gpu_allow_growth=False, session_config=session_config) if evaluate_only: runner.evaluate() else: runner.train_and_evaluate()
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "run", choices=["train_and_eval", "train", "eval", "infer", "export"], help="Run type.") parser.add_argument("--config", required=True, nargs="+", help="List of configuration files.") parser.add_argument("--model", default="", help="Model configuration file.") parser.add_argument( "--run_dir", default="", help="If set, model_dir will be created relative to this location.") parser.add_argument( "--data_dir", default="", help="If set, data files are expected to be relative to this location." ) parser.add_argument("--features_file", default=[], nargs="+", help="Run inference on this file.") parser.add_argument( "--predictions_file", default="", help= ("File used to save predictions. If not set, predictions are printed " "on the standard output.")) parser.add_argument( "--checkpoint_path", default=None, help=("Checkpoint or directory to use for inference or export " "(when a directory is set, the latest checkpoint is used).")) parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to use for in-graph replication.") parser.add_argument( "--chief_host", default="", help="hostname:port of the chief worker (for distributed training).") parser.add_argument( "--worker_hosts", default="", help=("Comma-separated list of hostname:port of workers " "(for distributed training).")) parser.add_argument( "--ps_hosts", default="", help=("Comma-separated list of hostname:port of parameter servers " "(for distributed training).")) parser.add_argument( "--task_type", default="chief", choices=["chief", "worker", "ps", "evaluator"], help="Type of the task to run (for distributed training).") parser.add_argument("--task_index", type=int, default=0, help="ID of the task (for distributed training).") parser.add_argument("--log_level", default="INFO", choices=["DEBUG", "ERROR", "FATAL", "INFO", "WARN"], help="Logs verbosity.") parser.add_argument("--seed", type=int, default=None, help="Random seed.") parser.add_argument("--gpu_allow_growth", default=False, action="store_true", help="Allocate GPU memory dynamically.") args = parser.parse_args() tf.logging.set_verbosity(getattr(tf.logging, args.log_level)) # Setup cluster if defined. if args.chief_host: os.environ["TF_CONFIG"] = json.dumps({ "cluster": { "chief": [args.chief_host], "worker": args.worker_hosts.split(","), "ps": args.ps_hosts.split(",") }, "task": { "type": args.task_type, "index": args.task_index } }) # Load and merge run configurations. config = load_config(args.config) if args.run_dir: config["model_dir"] = os.path.join(args.run_dir, config["model_dir"]) if args.data_dir: config["data"] = _prefix_paths(args.data_dir, config["data"]) if not os.path.isdir(config["model_dir"]): tf.logging.info("Creating model directory %s", config["model_dir"]) os.makedirs(config["model_dir"]) model = load_model(config["model_dir"], model_file=args.model) runner = Runner(model, config, seed=args.seed, num_devices=args.num_gpus, gpu_allow_growth=args.gpu_allow_growth) if args.run == "train_and_eval": runner.train_and_evaluate() elif args.run == "train": runner.train() elif args.run == "eval": runner.evaluate(checkpoint_path=args.checkpoint_path) elif args.run == "infer": if not args.features_file: parser.error("--features_file is required for inference.") elif len(args.features_file) == 1: args.features_file = args.features_file[0] runner.infer(args.features_file, predictions_file=args.predictions_file, checkpoint_path=args.checkpoint_path) elif args.run == "export": runner.export(checkpoint_path=args.checkpoint_path)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("run", choices=[ "train_and_eval", "train", "eval", "infer", "export", "score" ], help="Run type.") parser.add_argument("--config", required=True, nargs="+", help="List of configuration files.") parser.add_argument("--model_type", default="", choices=list(classes_in_module(catalog)), help="Model type from the catalog.") parser.add_argument("--model", default="", help="Custom model configuration file.") parser.add_argument( "--run_dir", default="", help="If set, model_dir will be created relative to this location.") parser.add_argument( "--data_dir", default="", help="If set, data files are expected to be relative to this location." ) parser.add_argument("--features_file", default=[], nargs="+", help="Run inference on this file.") parser.add_argument( "--predictions_file", default="", help= ("File used to save predictions. If not set, predictions are printed " "on the standard output.")) parser.add_argument("--log_prediction_time", default=False, action="store_true", help="Logs some prediction time metrics.") parser.add_argument( "--checkpoint_path", default=None, help=("Checkpoint or directory to use for inference or export " "(when a directory is set, the latest checkpoint is used).")) parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to use for in-graph replication.") parser.add_argument( "--chief_host", default="", help="hostname:port of the chief worker (for distributed training).") parser.add_argument( "--worker_hosts", default="", help=("Comma-separated list of hostname:port of workers " "(for distributed training).")) parser.add_argument( "--ps_hosts", default="", help=("Comma-separated list of hostname:port of parameter servers " "(for distributed training).")) parser.add_argument( "--task_type", default="chief", choices=["chief", "worker", "ps", "evaluator"], help="Type of the task to run (for distributed training).") parser.add_argument("--task_index", type=int, default=0, help="ID of the task (for distributed training).") parser.add_argument("--log_level", default="INFO", choices=["DEBUG", "ERROR", "FATAL", "INFO", "WARN"], help="Logs verbosity.") parser.add_argument("--seed", type=int, default=None, help="Random seed.") parser.add_argument("--gpu_allow_growth", default=False, action="store_true", help="Allocate GPU memory dynamically.") parser.add_argument( "--intra_op_parallelism_threads", type=int, default=0, help=("Number of intra op threads (0 means the system picks " "an appropriate number).")) parser.add_argument( "--inter_op_parallelism_threads", type=int, default=0, help=("Number of inter op threads (0 means the system picks " "an appropriate number).")) parser.add_argument( "--session_config", default=None, help=( "Path to a file containing a tf.ConfigProto message in text format " "and used to create the TensorFlow sessions.")) args = parser.parse_args() tf.logging.set_verbosity(getattr(tf.logging, args.log_level)) # Setup cluster if defined. if args.chief_host: os.environ["TF_CONFIG"] = json.dumps({ "cluster": { "chief": [args.chief_host], "worker": args.worker_hosts.split(","), "ps": args.ps_hosts.split(",") }, "task": { "type": args.task_type, "index": args.task_index } }) # Load and merge run configurations. config = load_config(args.config) if args.run_dir: config["model_dir"] = os.path.join(args.run_dir, config["model_dir"]) if args.data_dir: config["data"] = _prefix_paths(args.data_dir, config["data"]) is_chief = args.task_type == "chief" if is_chief and not tf.gfile.Exists(config["model_dir"]): tf.logging.info("Creating model directory %s", config["model_dir"]) tf.gfile.MakeDirs(config["model_dir"]) model = load_model(config["model_dir"], model_file=args.model, model_name=args.model_type, serialize_model=is_chief) session_config = tf.ConfigProto( intra_op_parallelism_threads=args.intra_op_parallelism_threads, inter_op_parallelism_threads=args.inter_op_parallelism_threads) if args.session_config is not None: with open(args.session_config, "rb") as session_config_file: text_format.Merge(session_config_file.read(), session_config) runner = Runner(model, config, seed=args.seed, num_devices=args.num_gpus, gpu_allow_growth=args.gpu_allow_growth, session_config=session_config) if args.run == "train_and_eval": runner.train_and_evaluate() elif args.run == "train": runner.train() elif args.run == "eval": runner.evaluate(checkpoint_path=args.checkpoint_path) elif args.run == "infer": if not args.features_file: parser.error("--features_file is required for inference.") elif len(args.features_file) == 1: args.features_file = args.features_file[0] runner.infer(args.features_file, predictions_file=args.predictions_file, checkpoint_path=args.checkpoint_path, log_time=args.log_prediction_time) elif args.run == "export": runner.export(checkpoint_path=args.checkpoint_path) elif args.run == "score": if not args.features_file: parser.error("--features_file is required for scoring.") if not args.predictions_file: parser.error("--predictions_file is required for scoring.") runner.score(args.features_file, args.predictions_file, checkpoint_path=args.checkpoint_path)