def finalize_config(config, override=None, options=None):
    """Returns the configuration to use once override and options are applied.

    Args:
      config: The base configuration, or None.
      override: An optional configuration override merged into the copy.
      options: Optional options applied on the copy after the override.

    Returns:
      ``config`` itself when it is None or when there is nothing to apply,
      otherwise a deep copy of it that was handed to
      ``config_util.merge_config`` and/or
      ``config_util.update_config_with_options``.
    """
    nothing_to_apply = not (override or options)
    if config is None or nothing_to_apply:
        return config

    # Work on a copy so the caller's configuration is left untouched.
    finalized = copy.deepcopy(config)
    if override:
        config_util.merge_config(finalized, override)
    if options:
        config_util.update_config_with_options(finalized, options)
    return finalized
def process_input(self, source, target=None, target_name=None, metadata=None, config=None, options=None):
    """Processes one translation example at inference.

    Args:
      source: In preprocess, a string. In postprocess, a (possibly multipart)
        list of tokens.
      target: In preprocess, a string. In postprocess, a (possibly multipart)
        list of tokens.
      target_name: The name of the target that is passed during inference.
      metadata: Additional metadata of the input.
      config: A configuration override for this example.
      options: A dictionary with operators options.

    Returns:
      - In preprocess, a tuple (source_tokens, target_tokens, metadata).
      - In postprocess, a string (the postprocessed target)

    Raises:
      ValueError: If a configuration override is passed while the processor
        configuration is a V2 configuration.
    """
    # This method should be thread-safe as the inference server is starting a new
    # thread for each request.

    # Rebuild pipeline if the example has its own configuration.
    if config:
        if config_util.is_v2_config(self._config):
            raise ValueError("Configuration override is not supported for V2 "
                             "configurations")
        config = config_util.merge_config(copy.deepcopy(self._config), config)
        pipeline = self.build_pipeline(config)
    else:
        pipeline = self._pipeline

    tu = TranslationUnit(
        source=source,
        metadata=metadata,
        source_tokenizer=pipeline.start_state.get('src_tokenizer'),
    )

    if target is not None:
        tu.add_target(
            target,
            name=target_name,
            tokenizer=pipeline.start_state.get('tgt_tokenizer'))

    # The pipeline works on batches: wrap the single unit, then unwrap it.
    tu_batch = ([tu], {})
    tu_batch = pipeline(tu_batch, options=options)
    tu = tu_batch[0][0]

    if self._postprocess:
        return tu.tgt_detok

    src_tokens = tu.src_tok.tokens
    # Diagnostics go through the logger: spawning un-waited "echo" child
    # processes on every request leaks processes and bypasses log levels.
    logger.debug("Preprocess source output: %s", src_tokens)
    # Without a target, return one None placeholder per entry of src_tokens.
    tgt_tokens = (
        tu.tgt_tok.tokens
        if tu.tgt_tok is not None
        else [None for _ in src_tokens])
    return src_tokens, tgt_tokens, tu.metadata
def get_operator_params(config, override_label=None):
    """Returns the operator parameters from the configuration.

    The input configuration is deep-copied first, so it is not modified. The
    "op" and "overrides" entries are removed from the returned parameters.

    Args:
      config: The operator configuration.
      override_label: An optional label selecting one entry of "overrides".

    Returns:
      The parameters, merged with the override selected by ``override_label``
      when the configuration defines one for that label.
    """
    params = copy.deepcopy(config)
    params.pop("op", None)
    overrides = params.pop("overrides", None)

    # TODO: implement multiple override labels per batch/corpus.
    if not overrides or not override_label or override_label not in overrides:
        return params
    return merge_config(params, overrides[override_label])
def finalize_config(config, override=None, options=None):
    """Finalizes the configuration with possible override and options.

    Args:
      config: The base configuration, or None.
      override: An optional configuration override.
      options: Optional options.

    Returns:
      A tuple (config, options, supported_features):
      - without a configuration: (None, options unchanged, None);
      - for a V2 configuration: config is None and non-empty options are
        replaced by the result of ``config_util.read_options``;
      - otherwise: config is the input configuration, or a deep copy of it
        when an override or options were given, and options is None.

    Raises:
      ValueError: If an override is given for a V2 configuration.
    """
    if config is None:
        return config, options, None

    supported_features = config.get('supported_features')

    if config_util.is_v2_config(config):
        if override:
            raise ValueError(
                "Configuration override is not supported for V2 "
                "configurations")
        if options:
            options = config_util.read_options(config, options)
        return None, options, supported_features

    if override or options:
        # Work on a copy so the caller's configuration is left untouched.
        config = copy.deepcopy(config)
        if override:
            config_util.merge_config(config, override)
        if options:
            # NOTE(review): the return value is discarded here; presumably
            # read_options is expected to update config in place -- confirm.
            config_util.read_options(config, options)
    return config, None, supported_features
def get_operator_params(config, operator_type, override_label=None):
    """Returns the operator parameters from the configuration.

    The input configuration is deep-copied first, so it is not modified. The
    "op" and "overrides" entries are removed from the returned parameters.

    Args:
      config: The operator configuration.
      operator_type: The operator type, only used in the error message.
      override_label: An optional iterable of labels; the one that is a key of
        "overrides" (if any) selects the override to merge.

    Returns:
      The operator parameters, merged with the selected override if any.

    Raises:
      RuntimeError: If more than one label matches an entry of "overrides".
    """
    params = copy.deepcopy(config)
    params.pop("op", None)
    overrides = params.pop("overrides", None)
    if not (override_label and overrides):
        return params

    matching_labels = [label for label in override_label if label in overrides]
    if len(matching_labels) > 1:
        raise RuntimeError(
            "One corpus requires different overrides (%s) for the same operator (%s)."
            % (overrides, operator_type))
    if matching_labels:
        selected = overrides[matching_labels[0]]
        params = merge_config(params, selected)
    return params
def build_operator(
    operator_type,
    operator_cls,
    operator_params,
    global_config,
    process_type,
    build_state,
    index,
    shared_state=None,
    inference_config=None,
):
    """Creates an operator instance from its configuration.

    Args:
      operator_type: The operator type, used to build the default name.
      operator_cls: The operator class to instantiate.
      operator_params: The operator parameters.
      global_config: The global configuration passed to ``_add_lang_info``.
      process_type: The process type; parameters are validated for training.
      build_state: The build state forwarded to the operator constructor.
      index: The 0-based operator index, used to build the default name.
      shared_state: Optional state passed as an extra positional argument.
      inference_config: Optional mapping from operator name to parameters
        merged into ``operator_params``.

    Returns:
      The operator instance, with ``_name``, ``_verbose`` and
      ``_process_type`` set.
    """
    # Propagate source and target languages
    for side in ("source", "target"):
        _add_lang_info(operator_params, global_config, side)

    default_name = "%s_%d" % (operator_type, index + 1)
    name = operator_params.get("name", default_name)

    op_inference_config = inference_config.get(name) if inference_config else None
    if op_inference_config:
        operator_params = merge_config(operator_params, op_inference_config)

    if process_type == ProcessType.TRAINING:
        operator_cls.validate_parameters(operator_params, name)

    logger.debug("Building operator %s", name)
    extra_args = [shared_state] if shared_state else []
    operator = operator_cls(operator_params, process_type, build_state, *extra_args)

    # We set common private attributes here so that operators do not need to call
    # the base constructor.
    operator._name = name
    operator._verbose = operator_params.get("verbose", False)
    operator._process_type = process_type
    return operator
def exec_function(self, args):
    """Main entrypoint: dispatches the parsed command line to a command.

    Unless the instance is stateless, the parent model (``self._model`` or the
    "model" entry of the configuration) is fetched locally and its
    configuration is merged with the user configuration before dispatching on
    ``args.cmd`` ('train', 'buildvocab', 'trans', 'release', 'serve' or
    'preprocess').

    Args:
      args: The parsed command line arguments.

    Returns:
      The result of the train, trans and preprocess-into-model wrappers;
      None for the other commands.

    Raises:
      ValueError: If the parent model type is not compatible with the
        requested command.
    """
    if self._config is None and self._model is None:
        self.parser.error(
            'at least one of --config or --model options must be set')

    config = self._config or {}
    parent_model = self._model or config.get('model')
    if parent_model is not None and not self._stateless:
        # Download model locally and merge the configuration.
        remote_model_path = self._storage.join(self._model_storage_read,
                                               parent_model)
        model_path = os.path.join(self._models_dir, parent_model)
        model_config = utility.fetch_model(self._storage,
                                           remote_model_path,
                                           model_path,
                                           should_check_integrity)
        if 'modelType' not in model_config:
            # No explicit type: infer it from the model name suffix.
            if parent_model.endswith('_release'):
                model_config['modelType'] = 'release'
            else:
                model_config['modelType'] = 'checkpoint'
        config = config_util.merge_config(copy.deepcopy(model_config), config)
    else:
        model_path = None
        model_config = None

    if args.cmd == 'train':
        if parent_model is not None and config['modelType'] not in (
                'checkpoint', 'base', 'preprocess'):
            raise ValueError(
                'cannot train from a model that is not a training checkpoint, '
                'a base model, or a preprocess model')
        return self.train_wrapper(self._task_id,
                                  config,
                                  self._storage,
                                  self._model_storage_write,
                                  self._image,
                                  parent_model=parent_model,
                                  model_path=model_path,
                                  model_config=model_config,
                                  gpuid=self._gpuid,
                                  push_model=not self._no_push)
    elif args.cmd == 'buildvocab':
        self.build_vocab(self._task_id,
                         config,
                         self._storage,
                         self._model_storage_write,
                         self._image,
                         push_model=not self._no_push)
    elif args.cmd == 'trans':
        if not self._stateless and (parent_model is None
                                    or config['modelType'] != 'checkpoint'):
            raise ValueError('translation requires a training checkpoint')
        return self.trans_wrapper(
            config,
            model_path,
            self._storage,
            args.input,
            args.output,
            as_release=args.as_release,
            release_optimization_level=args.release_optimization_level,
            gpuid=self._gpuid,
            copy_source=args.copy_source,
            add_bt_tag=args.add_bt_tag,
            no_postprocess=args.no_postprocess)
    elif args.cmd == 'release':
        if not self._stateless and (parent_model is None
                                    or config['modelType'] != 'checkpoint'):
            raise ValueError('releasing requires a training checkpoint')
        if args.destination is None:
            args.destination = self._model_storage_write
        self.release_wrapper(config,
                             model_path,
                             self._image,
                             storage=self._storage,
                             destination=args.destination,
                             optimization_level=args.optimization_level,
                             gpuid=self._gpuid,
                             push_model=not self._no_push)
    elif args.cmd == 'serve':
        if (not self._stateless
                and (parent_model is None
                     or config['modelType'] not in ('checkpoint', 'release'))):
            raise ValueError(
                'serving requires a training checkpoint or a released model')
        # A stateless run is allowed to reach this point without any model,
        # in which case the configuration may carry no "modelType" at all:
        # look it up with .get() instead of failing with a KeyError.
        if config.get('modelType') == 'checkpoint':
            # A checkpoint is released locally first and the released model
            # configuration is the one that gets served.
            model_path = self.release_wrapper(
                config,
                model_path,
                self._image,
                local_destination=self._output_dir,
                optimization_level=args.release_optimization_level,
                gpuid=self._gpuid,
                push_model=False)
            config = utility.load_model_config(model_path)
        self.serve_wrapper(config,
                           model_path,
                           args.host,
                           args.port,
                           gpuid=self._gpuid)
    elif args.cmd == 'preprocess':
        if not args.build_model:
            self.preprocess(config, self._storage)
        else:
            if parent_model is not None and config['modelType'] not in (
                    'checkpoint', 'base'):
                raise ValueError(
                    'cannot preprocess from a model that is not a training '
                    'checkpoint or a base model')
            return self.preprocess_into_model(self._task_id,
                                              config,
                                              self._storage,
                                              self._model_storage_write,
                                              self._image,
                                              parent_model=parent_model,
                                              model_path=model_path,
                                              model_config=model_config,
                                              push_model=not self._no_push)
def test_key_override():
    """A None value in the override replaces the whole nested section."""
    base = {"a": {"b": 42, "c": "d"}, "e": "f"}
    override = {"a": None}
    expected = {"a": None, "e": "f"}
    merged = config.merge_config(base, override)
    assert merged == expected
def test_key_replace():
    """A dict value in the override replaces a scalar value of the base."""
    base = {"a": {"b": 42, "c": "d"}, "e": "f"}
    override = {"e": {"x": "y"}}
    expected = {"a": {"b": 42, "c": "d"}, "e": {"x": "y"}}
    merged = config.merge_config(base, override)
    assert merged == expected
def preprocess_example(preprocessor, index, raw_example, config=None, config_override=None):
    """Applies preprocessing function on example.

    Args:
      preprocessor: The object whose ``process_input`` is called, or None to
        pass the source text through unchanged.
      index: The example index, used in error messages and in the result.
      raw_example: The example as a dict; the fields "text", "mode", "config",
        "options", "target_prefix" and "fuzzy" are read.
      config: The model configuration, or None.
      config_override: An optional batch-level configuration override.

    Returns:
      A ``TranslationExample`` whose tokens and metadata use the multipart
      (list of parts) representation.

    Raises:
      InvalidRequest: If the example is not a dict, has no "text" field, or
        sets both "target_prefix" and "fuzzy".
    """
    if not isinstance(raw_example, dict):
        raise InvalidRequest("example %d is not a JSON object" % index)
    source_text = raw_example.get("text")
    if source_text is None:
        raise InvalidRequest("missing text field in example %d" % index)
    mode = raw_example.get("mode", "default")

    options = None
    example_override = raw_example.get("config")

    # Resolve example options (only meaningful with a model configuration).
    example_options = raw_example.get("options") if config is not None else None
    if example_options:
        resolved = config_util.read_options(config, example_options)
        if config_util.is_v2_config(config):
            options = resolved
        else:
            # For non-V2 configurations the resolved options are merged into
            # the example-level override.
            example_override = config_util.merge_config(
                example_override or {}, resolved)

    # Merge example-level config override into batch-level config override.
    if example_override and config_override:
        config_override = config_util.merge_config(
            copy.deepcopy(config_override), example_override)
    elif example_override:
        config_override = example_override

    target_prefix = raw_example.get("target_prefix")
    target_fuzzy = raw_example.get("fuzzy")
    if target_prefix is not None and target_fuzzy is not None:
        raise InvalidRequest(
            "Using both a target prefix and a fuzzy target is currently unsupported"
        )

    target_text, target_name = None, None
    if target_prefix is not None:
        target_text = target_prefix
    elif target_fuzzy is not None:
        supported_features = config.get("supported_features") if config else None
        nfa_supported = (
            supported_features is not None
            and supported_features.get("NFA", False))
        if nfa_supported:
            target_text, target_name = target_fuzzy, "fuzzy"
        else:
            logger.warning(
                "The fuzzy target is ignored because this model does not "
                "support Neural Fuzzy Adaptation")

    if preprocessor is None:
        source_tokens, target_tokens, metadata = source_text, None, None
    else:
        source_tokens, target_tokens, metadata = preprocessor.process_input(
            source_text,
            target=target_text,
            target_name=target_name,
            config=config_override,
            options=options,
        )

    # Move to the general multiparts representation.
    is_multipart = bool(source_tokens) and isinstance(source_tokens[0], list)
    if not is_multipart:
        source_tokens = [source_tokens]
        target_tokens = [target_tokens]
        metadata = [metadata]

    return TranslationExample(
        index=index,
        config=config_override,
        options=options,
        source_tokens=source_tokens,
        target_tokens=target_tokens,
        mode=mode,
        metadata=metadata,
    )
def preprocess_example(preprocessor, index, raw_example, config=None, config_override=None):
    """Applies preprocessing function on example.

    Args:
      preprocessor: The object whose ``process_input`` is called, or None to
        pass the source text through unchanged.
      index: The example index, used in error messages and in the result.
      raw_example: The example as a dict; the fields "text", "mode", "config",
        "options", "target_prefix" and "fuzzy" are read.
      config: The model configuration, or None.
      config_override: An optional batch-level configuration override.

    Returns:
      A ``TranslationExample`` whose tokens and metadata use the multipart
      (list of parts) representation.

    Raises:
      ValueError: If the example is not a dict, has no "text" field, or sets
        both "target_prefix" and "fuzzy".
    """
    if not isinstance(raw_example, dict):
        raise ValueError('example %d is not a JSON object' % index)
    source_text = raw_example.get('text')
    if source_text is None:
        raise ValueError('missing text field in example %d' % index)
    mode = raw_example.get('mode', 'default')

    # Merge the example-level override into the batch-level one.
    example_override = raw_example.get('config')
    if example_override and config_override:
        config_override = config_util.merge_config(
            copy.deepcopy(config_override), example_override)
    elif example_override:
        config_override = example_override

    config, options, supported_features = finalize_config(
        config,
        override=config_override,
        options=raw_example.get('options'))

    target_prefix = raw_example.get('target_prefix')
    target_fuzzy = raw_example.get('fuzzy')
    if target_prefix is not None and target_fuzzy is not None:
        raise ValueError(
            "Using both a target prefix and a fuzzy target is currently unsupported"
        )

    target_text, target_name = None, None
    if target_prefix is not None:
        target_text = target_prefix
    elif target_fuzzy is not None:
        nfa_supported = (
            supported_features is not None
            and supported_features.get("NFA", False))
        if nfa_supported:
            target_text, target_name = target_fuzzy, "fuzzy"
        else:
            logger.warning(
                "The fuzzy target is ignored because this model does not "
                "support Neural Fuzzy Adaptation")

    if preprocessor is None:
        source_tokens, target_tokens, metadata = source_text, None, None
    else:
        source_tokens, target_tokens, metadata = preprocessor.process_input(
            source_text,
            target=target_text,
            target_name=target_name,
            config=config,
            options=options,
        )

    # Move to the general multiparts representation.
    is_multipart = bool(source_tokens) and isinstance(source_tokens[0], list)
    if not is_multipart:
        source_tokens = [source_tokens]
        target_tokens = [target_tokens]
        metadata = [metadata]

    return TranslationExample(
        index=index,
        config=config,
        options=options,
        source_tokens=source_tokens,
        target_tokens=target_tokens,
        mode=mode,
        metadata=metadata)
def handle_request(self, request):
    """Handles one translation request and sends the result or an error.

    The request must have a "src" list of dicts with a "text" field and
    optional "config" and "options" fields. Request-level "options" may set
    "timeout", "max_batch_size" and "config". Each source is preprocessed, all
    parts are translated (in chunks of at most ``max_batch_size``), and each
    hypothesis is postprocessed into ``results['tgt']``.

    NOTE(review): ``config``, ``global_timeout``, ``global_max_batch_size``,
    ``serving_state``, ``backend_info``, ``preprocess_fn``, ``translate_fn``,
    ``postprocess_fn`` and ``_build_result`` are not defined in this block;
    presumably they come from the enclosing scope -- confirm.

    Args:
      request: The decoded request, as a dict.
    """
    if 'src' not in request:
        self.send_error(400, 'missing src field')
        return
    results = {'tgt': []}
    if not request['src']:
        self.send_result(results)
        return
    if not isinstance(request['src'], list):
        self.send_error(400, 'src field must be a list')
        return

    timeout = global_timeout
    max_batch_size = global_max_batch_size
    batch_config = config
    request_options = request.get('options')
    if request_options is not None and isinstance(request_options, dict):
        timeout = request_options.get('timeout', timeout)
        max_batch_size = request_options.get('max_batch_size', max_batch_size)
        if 'config' in request_options:
            batch_config = config_util.merge_config(
                copy.deepcopy(config), request_options['config'])

    extra_config = []
    batch_metadata = []
    batch_offsets = []
    batch_tokens = []
    offset = 0
    for src in request['src']:
        local_config = batch_config
        if 'config' in src or 'options' in src:
            # Never modify the configuration shared by the whole batch.
            local_config = copy.deepcopy(local_config)
        if 'config' in src:
            local_config = config_util.merge_config(local_config, src['config'])
        if 'options' in src:
            try:
                config_util.update_config_with_options(
                    local_config, src['options'])
            except ValueError as e:
                # Exceptions have no "message" attribute on Python 3.
                self.send_error(400, str(e))
                return
        data = preprocess_fn(serving_state, src['text'], local_config)
        # Preprocessing may return additional metadata.
        if isinstance(data, tuple):
            tokens, metadata = data
        else:
            tokens, metadata = data, None
        # Preprocessing may split input text into multiple parts.
        if tokens and isinstance(tokens[0], list):
            size = len(tokens)
            # Flatten the parts in the batch collection.
            batch_tokens.extend(tokens)
            # Keep one metadata slot per part even when none was returned.
            batch_metadata.extend(
                metadata if metadata is not None else [None] * size)
        else:
            size = 1
            batch_tokens.append(tokens)
            batch_metadata.append(metadata)
        extra_config.append(local_config)
        batch_offsets.append((offset, offset + size))
        offset += size

    if max_batch_size is not None and len(batch_tokens) > max_batch_size:
        offset = 0
        batch_hypotheses = []
        while offset < len(batch_tokens):
            lower_bound = offset
            upper_bound = min(offset + max_batch_size, len(batch_tokens))
            chunk_hypotheses = translate_fn(
                batch_tokens[lower_bound:upper_bound],
                backend_info,
                timeout=timeout)
            if chunk_hypotheses is None:
                # A chunk timed out: report it below like the unchunked case
                # instead of failing on list.extend(None).
                batch_hypotheses = None
                break
            batch_hypotheses.extend(chunk_hypotheses)
            offset = upper_bound
    else:
        batch_hypotheses = translate_fn(
            batch_tokens, backend_info, timeout=timeout)

    if batch_hypotheses is None:
        self.send_error(504, 'translation request timed out')
        return

    for local_config, offset in zip(extra_config, batch_offsets):
        hypotheses = batch_hypotheses[offset[0]:offset[1]]
        num_parts = offset[1] - offset[0]
        num_hypotheses = len(hypotheses[0])
        src_tokens = batch_tokens[offset[0]:offset[1]]
        src_metadata = batch_metadata[offset[0]:offset[1]]
        result = []
        for h in range(num_hypotheses):
            if num_parts == 1:
                src = src_tokens[0]
                if src_metadata[0] is not None:
                    src = (src, src_metadata[0])
                tgt = hypotheses[0][h].output
                scores = hypotheses[0][h].score
                attention = hypotheses[0][h].attention
            else:
                # For multi parts inputs, send all result parts to the postprocessing.
                src = (src_tokens, src_metadata)
                tgt = []
                scores = []
                attention = None
                for j in range(num_parts):
                    tgt.append(hypotheses[j][h].output)
                    scores.append(hypotheses[j][h].score)
            result.append(
                _build_result(
                    lambda src, tgt: postprocess_fn(
                        serving_state, src, tgt, local_config),
                    src,
                    tgt,
                    scores=scores,
                    attention=attention,
                    num_parts=num_parts))
        results['tgt'].append(result)
    self.send_result(results)
def process_input(
    self,
    source,
    target=None,
    target_name=None,
    metadata=None,
    config=None,
    options=None,
):
    """Processes one translation example at inference.

    Args:
      source: In preprocess, a string. In postprocess, a (possibly multipart)
        list of tokens.
      target: In preprocess, a string. In postprocess, a (possibly multipart)
        list of tokens.
      target_name: The name of the target that is passed during inference.
      metadata: Additional metadata of the input.
      config: A configuration override for this example.
      options: A dictionary with operators options.

    Returns:
      - In preprocess, a tuple (source_tokens, target_tokens, metadata).
      - In postprocess, a string (the postprocessed target)

    Raises:
      ValueError: If a configuration override is passed while the processor
        configuration is a V2 configuration.
    """
    # This method should be thread-safe as the inference server is starting a new
    # thread for each request.

    # Rebuild pipeline if the example has its own configuration.
    if not config:
        pipeline = self._pipeline
    else:
        if config_util.is_v2_config(self._config):
            raise ValueError(
                "Configuration override is not supported for V2 "
                "configurations")
        config = config_util.merge_config(copy.deepcopy(self._config), config)
        pipeline = self.build_pipeline(config)

    start_state = pipeline.start_state
    tu = TranslationUnit(
        source=source,
        metadata=metadata,
        source_tokenizer=start_state.get("src_tokenizer"),
    )

    # Every debug line is prefixed with the identifier of the current thread.
    thread_id = threading.current_thread().ident
    if self._postprocess:
        proc = "Postprocess"
    else:
        proc = "Preprocess"
    logger.debug("[%d] %s source input: %s", thread_id, proc, source)

    if target is not None:
        tu.add_target(
            target,
            name=target_name,
            tokenizer=start_state.get("tgt_tokenizer"),
        )
        logger.debug("[%d] %s target input: %s", thread_id, proc, target)

    # The pipeline works on batches: wrap the single unit, then unwrap it.
    processed_units, _ = pipeline(([tu], {}), options=options)[:2]
    tu = processed_units[0]

    if self._postprocess:
        logger.debug("[%d] %s target output: %s", thread_id, proc, tu.tgt_detok)
        return tu.tgt_detok

    src_tokens = tu.src_tok.tokens
    has_target = tu.tgt_tok is not None
    if has_target:
        tgt_tokens = tu.tgt_tok.tokens
    else:
        # Without a target, return one None placeholder per entry of src_tokens.
        tgt_tokens = [None for _ in src_tokens]

    logger.debug("[%d] %s source output: %s", thread_id, proc, src_tokens)
    if has_target:
        logger.debug("[%d] %s target output: %s", thread_id, proc, tgt_tokens)
    return src_tokens, tgt_tokens, tu.metadata