def verify(self):
    """Validate the Concat3D configuration.

    After running the parent-class checks, the following are verified in
    this order:

    1. every entry of ``self.input_ranks`` equals the first entry;
    2. when ``self.concat3D_axis`` is 1, the last dimension of every entry
       of ``self.input_dims`` equals that of the first entry;
    3. ``self.concat3D_axis`` is 1 or 2.

    Raises:
        ConfigurationError: if the ranks differ, or the axis is not 1 or 2.
        Exception: if the axis is 1 and the last dimensions differ.
    """
    super(Concat3DConf, self).verify()

    # All inputs must share the rank of the first input.
    if any(rank != self.input_ranks[0] for rank in self.input_ranks):
        raise ConfigurationError(
            "For layer Concat3D, the ranks of each inputs should be equal!"
        )

    # Only checked for axis 1: every input must share the same last dimension.
    if self.concat3D_axis == 1:
        all_dims = list(self.input_dims)
        if any(dims[-1] != all_dims[0][-1] for dims in all_dims):
            raise Exception(
                "Concat3D with axis = 1 require that the input dimensions should be the same!"
            )

    # The axis legality check deliberately stays last, as in the original order.
    if self.concat3D_axis not in [1, 2]:
        raise ConfigurationError(
            "For layer Concat3D, the concat axis must be 1 or 2!")
def inference(self):
    """Run dimension inference for the wrapped encoder and decoder confs.

    The inference is performed here rather than in the Model: the encoder
    conf receives a deep copy of ``self.input_dims`` and is inferred first;
    its output rank / dimension / context dimension are then fed into the
    decoder conf, which is inferred next.  Finally ``self.output_dim`` is
    taken from the decoder and ``self.output_rank`` is set to 3.

    Raises:
        ConfigurationError: if a declared decoder input rank is neither -1
            nor equal to the encoder output rank.
    """
    encoder_conf = self.encoder_conf_cls
    decoder_conf = self.decoder_conf_cls

    encoder_conf.use_gpu = self.use_gpu
    decoder_conf.use_gpu = self.use_gpu

    # The encoder works on its own copy of the input dimensions.
    encoder_conf.input_dims = copy.deepcopy(self.input_dims)
    encoder_conf.inference()

    # Rank verification between encoder output and decoder input
    # (-1 in the decoder means "any rank").
    encoder_ranks = [encoder_conf.output_rank]
    for declared_rank, produced_rank in zip(decoder_conf.input_ranks, encoder_ranks):
        if declared_rank != -1 and declared_rank != produced_rank:
            raise ConfigurationError(
                "Input ranks of decoder %s are inconsistent with former encoder %s"
                % (self.decoder_name, self.encoder_name))
    decoder_conf.input_ranks = copy.deepcopy(encoder_ranks)

    # Some decoder dimensions are inferred from the encoder.
    decoder_conf.input_dims = [encoder_conf.output_dim]
    decoder_conf.input_context_dims = [encoder_conf.output_context_dim]
    decoder_conf.inference()

    self.output_dim = decoder_conf.output_dim
    self.output_rank = 3
def forward(self, x, x_len, y, y_len):
    """Compute a distance / similarity between two batches of vectors.

    The metric is selected by ``self.layer_conf.operations``; the first
    match among "cos", "euclidean", "manhattan" and "chebyshev" is used.

    Args:
        x: [batch_size, dim]
        x_len: [batch_size] (not used by this layer)
        y: [batch_size, dim]
        y_len: [batch_size] (not used by this layer)

    Returns:
        Tensor: [batch_size, 1], None

    Raises:
        ConfigurationError: if none of the supported operations is configured.
    """
    batch_size = x.size()[0]
    operations = self.layer_conf.operations

    if "cos" in operations:
        result = F.cosine_similarity(x, y)
    elif "euclidean" in operations:
        result = torch.sqrt(torch.sum((x - y) ** 2, dim=1))
    elif "manhattan" in operations:
        result = torch.sum(torch.abs(x - y), dim=1)
    elif "chebyshev" in operations:
        # Tensor.max(dim=...) returns a (values, indices) pair; only the
        # values are the distance. Using the pair directly broke .view().
        result = torch.abs(x - y).max(dim=1).values
    else:
        raise ConfigurationError("This operation is not supported!")

    result = result.view(batch_size, 1)
    return result, None
def forward(self, *args):
    """Multiply two 3D inputs element-wise (with broadcasting).

    Args:
        *args: (Tensor): string, string_len, string2, string2_len
            e.g. string (Tensor): [batch_size, seq_len, dim], string_len (Tensor): [batch_size]

    Returns:
        Tensor: [batch_size, seq_len, output_dim], [batch_size]
        (the product of ``string`` and ``string2``, and ``string_len`` unchanged)

    Raises:
        ConfigurationError: if the per-sample element counts differ and the
            inputs are not broadcastable under the layer's rule (same
            seq_len and one of the configured last dimensions equal to 1).
    """
    first, first_len, second = args[0], args[1], args[2]
    input_dims = list(self.layer_conf.input_dims)

    same_element_count = (
        first.shape[1] * first.shape[2] == second.shape[1] * second.shape[2])
    if not same_element_count:
        broadcastable = first.shape[1] == second.shape[1] and (
            input_dims[1][-1] == 1 or input_dims[0][-1] == 1)
        if not broadcastable:
            raise ConfigurationError(
                "For layer ElementWisedMultiply3D, the dimensions of each inputs should be equal or 1 ,or the elements number of two inputs (expect for the first dimension) should be equal"
            )

    # A plain broadcasting multiply is equivalent to the former
    # addcmul(zeros, 1, a, b), but it stays on the inputs' own device
    # (no hard-coded 'cuda') and avoids the deprecated addcmul signature.
    return torch.mul(first, second), first_len
def get_topological_sequence(self):
    """Compute a topological ordering of the model's layers.

    Every layer id in ``self.layers`` except ``EMBED_LAYER_ID`` is cycled
    through a queue.  A layer whose dependency set is empty is emitted and
    removed from the dependency sets of all other layers; otherwise it is
    re-queued.  Note that ``self.layer_dependencies`` is consumed in the
    process.

    Returns:
        list: layer ids in an order in which they can be evaluated.

    Raises:
        ConfigurationError: if a full pass over the queue resolves nothing,
            i.e. there is a circular dependency or an unresolvable layer.
    """
    pending = Queue()
    for candidate in self.layers.keys():
        if candidate != EMBED_LAYER_ID:
            pending.put(candidate)

    ordered = []
    # Number of consecutive layers re-queued without progress; used to
    # detect that no legal topological ordering exists.
    stalled = 0
    while not pending.empty():
        current = pending.get()

        if len(self.layer_dependencies[current]) != 0:
            # Still waiting on something: try again later.
            pending.put(current)
            stalled += 1
            if stalled >= pending.qsize():
                unresolved = []
                while not pending.empty():
                    unresolved.append(pending.get())
                raise ConfigurationError("The model architecture is illegal because there is a circular dependency "
                                         "or there are some isolated layers. The layers can not be resolved: [%s]" % (", ".join(unresolved)))
            continue

        # Resolved: release every layer that was waiting on it.
        for dependents in self.layer_dependencies.values():
            if current in dependents:
                dependents.remove(current)
        stalled = 0
        ordered.append(current)

    logging.debug("Topological sequence of nodes: %s" % (",".join(ordered)))
    return ordered
def get_item(self, keys, default=None, use_default=False):
    """Look up a nested item of ``self.conf`` by a path of keys.

    Args:
        keys: sequence of keys / indices applied one after another to
            ``self.conf`` (entries may be str or int, e.g.
            ``['architecture', 0, 'conf']``).
        default: value returned when the path cannot be resolved. If it is
            None, a missing path raises unless ``use_default`` is True.
        use_default: set to True when None itself is the wanted default.

    Returns:
        The item found at the path, or ``default`` when the path is missing
        and a default applies (in which case a notice is printed).

    Raises:
        ConfigurationError: if the path is missing, ``default`` is None and
            ``use_default`` is False.
    """
    item = self.conf
    resolved_keys = []
    try:
        for key in keys:
            item = item[key]
            resolved_keys.append(key)
    except Exception:
        # Keys may be ints (list indices), so stringify before joining;
        # joining raw keys raised TypeError and hid the real problem.
        key_path = "][".join(str(k) for k in resolved_keys + [key])
        if default is None and use_default is False:
            raise ConfigurationError(
                "The configuration file %s is illegal. There should be an item configuration[%s], "
                "but the item %s is not found." % (self.conf_path, key_path, key))
        print(
            "configuration[%s] is not found in %s, use default value %s" %
            (key_path, self.conf_path, repr(default)))
        item = default

    return item
def verify(self):
    """Validate the Combination configuration.

    After the parent-class checks:

    * every entry of ``self.input_ranks`` must equal the first entry;
    * the "difference" operation requires exactly two inputs;
    * "difference" and "dot_multiply" require all entries of
      ``self.input_dims`` to be equal to the first entry.

    Raises:
        ConfigurationError: if the input ranks are not consistent.
        AssertionError: if "difference" is requested without two inputs.
        Exception: if "difference"/"dot_multiply" inputs differ in dimension.
    """
    super(CombinationConf, self).verify()

    # Rank consistency across all inputs.
    if any(rank != self.input_ranks[0] for rank in self.input_ranks):
        raise ConfigurationError(
            "For layer Combination, the ranks of each inputs should be consistent!"
        )

    wants_difference = "difference" in self.operations
    if wants_difference:
        assert len(self.input_dims) == 2, \
            "Difference operation requires that there should be two inputs"

    if wants_difference or "dot_multiply" in self.operations:
        all_dims = list(self.input_dims)
        if any(dims != all_dims[0] for dims in all_dims):
            raise Exception(
                "Difference and dot multiply require that the input dimensions should be the same"
            )
def forward(self, *args):
    """Subtract the second 3D input from the first (optionally absolute).

    Args:
        *args: (Tensor): string, string_len, string2, string2_len
            e.g. string (Tensor): [batch_size, seq_len, dim], string_len (Tensor): [batch_size]

    Returns:
        Tensor: [batch_size, seq_len, output_dim], [batch_size]
        ``string - string2`` when ``layer_conf.abs_flag`` equals False,
        its absolute value when ``abs_flag`` equals True; ``string_len`` is
        passed through unchanged.

    Raises:
        ConfigurationError: if the per-sample element counts differ and the
            inputs do not share seq_len with one configured last dim of 1.
    """
    minuend, minuend_len, subtrahend = args[0], args[1], args[2]
    configured_dims = list(self.layer_conf.input_dims)

    # Dimension legality check for Minus3D.
    left_count = minuend.shape[1] * minuend.shape[2]
    right_count = subtrahend.shape[1] * subtrahend.shape[2]
    legal = True
    if left_count != right_count:
        legal = bool(
            minuend.shape[1] == subtrahend.shape[1]
            and (configured_dims[1][-1] == 1 or configured_dims[0][-1] == 1))
    if not legal:
        raise ConfigurationError(
            "For layer Minus3D, the dimensions of each inputs should be equal or 1 ,or the elements number of two inputs (expect for the first dimension) should be equal"
        )

    use_abs = self.layer_conf.abs_flag
    if use_abs == False:
        return (minuend - subtrahend), minuend_len
    if use_abs == True:
        return torch.abs(minuend - subtrahend), minuend_len
def check_version_compat(self, nb_version, conf_version):
    """Check that the toolkit and configuration versions are compatible.

    Two versions are compatible when they have the same number of
    dot-separated fields and their first two fields (x and y of x.y.z)
    are identical.

    Args:
        nb_version: toolkit version string, x.y.z
        conf_version: configuration version string, x.y.z

    Returns:
        None. Incompatibility is reported by raising, not by a return value.

    Raises:
        ConfigurationError: if the versions have a different number of
            fields, or if the x or y fields differ.
    """
    nb_fields = nb_version.split('.')
    conf_fields = conf_version.split('.')

    if len(nb_fields) != len(conf_fields):
        raise ConfigurationError('The tool_version field of your configuration is illegal!')

    major_differs = nb_fields[0] != conf_fields[0]
    if major_differs or nb_fields[1] != conf_fields[1]:
        raise ConfigurationError('The NeuronBlocks version is %s, but the configuration version is %s, please update your configuration to %s.%s.X' % (nb_version, conf_version, nb_fields[0], nb_fields[1]))
def inference(self):
    """Infer the output dimension of the Match layer.

    The output dimension is a deep copy of the first input's dimension with
    its last entry replaced by index 1 of the second input's dimension
    (y_len in the original comment).

    Raises:
        ConfigurationError: if index 1 of either input dimension is -1,
            i.e. the sequence length is not fixed.
    """
    first_dims = self.input_dims[0]
    self.output_dim = copy.deepcopy(first_dims)

    if first_dims[1] == -1 or self.input_dims[1][1] == -1:
        raise ConfigurationError(
            "For Match layer, the sequence length should be fixed")

    y_len = self.input_dims[1][1]
    self.output_dim[-1] = y_len

    # The parent inference must stay the last statement of inference().
    super(MatchConf, self).inference()
def inference(self):
    """Infer the output dimension of the Expand_plus layer.

    The output dimension is a deep copy of the first input's dimension with
    index 1 of the second input's dimension (y_len in the original comment)
    inserted at position 2.

    Raises:
        ConfigurationError: if index 1 of either input dimension is -1,
            i.e. the sequence length is not fixed.
    """
    first_dims = self.input_dims[0]
    self.output_dim = copy.deepcopy(first_dims)

    if first_dims[1] == -1 or self.input_dims[1][1] == -1:
        raise ConfigurationError(
            "For Expand_plus layer, the sequence length should be fixed")

    y_len = self.input_dims[1][1]
    self.output_dim.insert(2, y_len)

    # The parent inference must stay the last statement of inference().
    super(Expand_plusConf, self).inference()
def inference(self):
    """Infer the output dimension of the Flatten layer.

    The output keeps index 0 of the first input's dimension and replaces
    the rest with the product of its index-1 and last entries.

    Raises:
        ConfigurationError: if index 1 of the first input's dimension is -1,
            i.e. the sequence length is not fixed.
    """
    self.output_dim = []
    source_dims = self.input_dims[0]

    if source_dims[1] == -1:
        raise ConfigurationError("For Flatten layer, the sequence length should be fixed")

    self.output_dim.append(source_dims[0])
    self.output_dim.append(source_dims[1] * source_dims[-1])

    super(FlattenConf, self).inference()
def verify(self):
    """Validate the Concat2D configuration.

    After the parent-class checks, every entry of ``self.input_ranks`` must
    equal the first entry, and ``self.concat2D_axis`` must be 1.

    Raises:
        ConfigurationError: if the ranks differ or the axis is not 1.
    """
    super(Concat2DConf, self).verify()

    # All inputs must share the rank of the first input.
    if any(rank != self.input_ranks[0] for rank in self.input_ranks):
        raise ConfigurationError(
            "For layer Concat2D, the ranks of each inputs should be equal!"
        )

    # Only axis 1 is a legal concatenation axis.
    if self.concat2D_axis != 1:
        raise ConfigurationError(
            "For layer Concat2D, the concat axis must be 1!")
def add_attr_exist_assertion_for_user(self, attr):
    """Ensure an attribute the user must configure is present on this object.

    Args:
        attr (str): the attribute name

    Returns:
        None

    Raises:
        ConfigurationError: if ``self`` has no attribute named ``attr``.
    """
    if hasattr(self, attr):
        return

    owner_name = type(self).__name__
    raise ConfigurationError("For layer %s, please configure %s attribute for %s in the configuration file!" % (owner_name, attr, owner_name))
def inference(self):
    """Infer the output dimension of the Flatten layer.

    Every entry of the first input's dimension after index 0 is multiplied
    together; the output dimension is ``[input_dims[0][0], product]``.

    Raises:
        ConfigurationError: if any entry after index 0 is -1 (not fixed).
    """
    self.output_dim = []
    source_dims = self.input_dims[0]

    flattened = 1
    for size in source_dims[1:]:
        if size == -1:
            raise ConfigurationError("For Flatten layer, the sequence length should be fixed")
        flattened *= size

    self.output_dim = [source_dims[0], flattened]

    super(FlattenConf, self).inference()
def add_layer(self, layer_id, layer):
    """Register a layer under a unique id.

    Args:
        layer_id: key under which the layer is stored in ``self.layers``.
        layer: the layer object to register.

    Returns:
        None

    Raises:
        ConfigurationError: if ``layer_id`` is already registered.
    """
    if layer_id in self.layers:
        # The message used to be emitted with a raw, unformatted "%s".
        raise ConfigurationError("The layer id %s is not unique!" % layer_id)
    self.layers[layer_id] = layer
def verify(self):
    """Validate the MatrixMultiply configuration.

    Runs the parent-class checks, then verifies that ``self.operation`` is
    one of 'common', 'seq_based' or 'dim_based'.

    This check was previously defined only under the misspelled name
    ``varify``, so callers that invoke ``conf.verify()`` never ran it.

    Raises:
        ConfigurationError: if ``self.operation`` is not a supported value.
    """
    super(MatrixMultiplyConf, self).verify()

    # NOTE: an "all input ranks are equal" check was intentionally left
    # disabled in the original code and stays disabled here.

    if self.operation not in ['common', 'seq_based', 'dim_based']:
        raise ConfigurationError(
            "the operation must be one of the 'common', 'seq_based' and 'dim_based'"
        )

def varify(self):
    """Backward-compatible alias for :meth:`verify` (historical misspelling)."""
    return self.verify()
def verify(self):
    """Validate the CalculateDistance configuration.

    After the parent-class checks, exactly two inputs are required and every
    entry of ``self.input_ranks`` must equal the first entry and be 2.

    Raises:
        AssertionError: if the number of input dimensions is not two.
        ConfigurationError: if any input rank differs from the first or is
            not 2.
    """
    super(CalculateDistanceConf, self).verify()

    assert len(self.input_dims) == 2, \
        "Operation requires that there should be two inputs"

    # Every input must have the same rank, and that rank must be 2.
    has_bad_rank = any(
        rank != self.input_ranks[0] or rank != 2
        for rank in self.input_ranks)
    if has_bad_rank:
        raise ConfigurationError(
            "For layer CalculateDistance, the ranks of each inputs should be equal and 2!"
        )
def get_value_by_key(json, key, key_prefix='', use_default=False, default=None):
    """Fetch ``json[key]``, optionally falling back to a default.

    Args:
        json: a json object (anything supporting ``json[key]``)
        key: a key pointing to the value wanted to acquire
        key_prefix: text prepended to ``key`` in the error message only
        use_default: if you really want to use default value when key can not
            be found in json object, set use_default=True
        default: value returned when the lookup fails and ``use_default`` is
            True

    Returns:
        value: ``json[key]``, or ``default`` when the lookup fails and
        ``use_default`` is True.

    Raises:
        ConfigurationError: if the lookup fails and ``use_default`` is False.
    """
    try:
        return json[key]
    except Exception:
        # `except Exception` keeps the catch-all behaviour for lookup errors
        # without swallowing KeyboardInterrupt / SystemExit.
        if use_default:
            return default
        # str() so that non-string keys (e.g. list indices) cannot turn the
        # error report itself into a TypeError.
        raise ConfigurationError(
            "key[%s] can not be found in configuration file" % (str(key_prefix) + str(key)))
def inference(self):
    """Infer the output dimension of the Interaction layer.

    With ``shape1``/``shape2`` the dimensions of the two inputs, the output
    dimension depends on ``self.matching_type``:

    * 'mul', 'plus', 'minus': ``[shape1[0], shape1[1], shape2[1], shape1[2]]``
    * 'dot', 'general':       ``[shape1[0], shape1[1], shape2[1], 1]``
    * 'concat':               ``[shape1[0], shape1[1], shape2[1], shape1[2] + shape2[2]]``

    Raises:
        ConfigurationError: if index 1 of either input dimension is -1,
            i.e. the sequence length is not fixed.
        ValueError: if ``self.matching_type`` is not one of the above.
    """
    shape1 = self.input_dims[0]
    shape2 = self.input_dims[1]
    if shape1[1] == -1 or shape2[1] == -1:
        raise ConfigurationError(
            "For Interaction layer, the sequence length should be fixed")

    self.output_dim = None
    if self.matching_type in ['mul', 'plus', 'minus']:
        self.output_dim = [shape1[0], shape1[1], shape2[1], shape1[2]]
    elif self.matching_type in ['dot', 'general']:
        self.output_dim = [shape1[0], shape1[1], shape2[1], 1]
    elif self.matching_type == 'concat':
        self.output_dim = [
            shape1[0], shape1[1], shape2[1], shape1[2] + shape2[2]
        ]
    else:
        # The old message contained an un-interpolated
        # "{self.matching_type}" placeholder and was missing separators.
        raise ValueError(
            "Invalid `matching_type`. %r received. "
            "Must be in `mul`, `general`, `plus`, `minus`, "
            "`dot` and `concat`." % (self.matching_type,))

    # The parent inference must stay the last statement of inference().
    super(InteractionConf, self).inference()
def verify(self):
    """Validate the Minus2D configuration.

    After the parent-class checks, index 1 of every input dimension must be
    equal to that of the first input, unless either of the two is 1.

    Raises:
        ConfigurationError: if some input's index-1 dimension differs from
            the first input's and neither of them is 1.
    """
    super(Minus2DConf, self).verify()

    # NOTE: an "all input ranks are equal" check was intentionally left
    # disabled in the original code and stays disabled here.

    all_dims = list(self.input_dims)
    has_mismatch = any(
        dims[1] != all_dims[0][1] and dims[1] != 1 and all_dims[0][1] != 1
        for dims in all_dims)
    if has_mismatch:
        raise ConfigurationError(
            "For layer Minus2D, the dimensions of each inputs should be equal or 1"
        )
def _check_command_executor_is_set(self):
    """Ensure a command executor has been assigned.

    Returns:
        None

    Raises:
        ConfigurationError: if ``self._command_executor`` is falsy.
    """
    if self._command_executor:
        return
    raise ConfigurationError("Command_executor is required property!")
def get_conf(layer_id, layer_name, input_layer_ids, all_layer_configs, model_input_ids, use_gpu,
             conf_dict=None, shared_conf=None, succeed_embedding_flag=False, output_layer_flag=False,
             target_num=None, fixed_lengths=None):
    """Build, wire up and validate the configuration object of one layer.

    Args:
        layer_id: layer identifier
        layer_name: name of layer such as BiLSTM; ``<layer_name>Conf`` is the
            configuration class that gets instantiated
        input_layer_ids (list): the inputs of current layer
        all_layer_configs (dict): records the conf class of each layer.
        model_input_ids (set): the inputs of the model, e.g. ['query', 'passage']
        use_gpu: stored into ``conf_dict['use_gpu']`` before instantiation
        conf_dict: keyword arguments for the configuration class (ignored
            when ``shared_conf`` is given). It is modified in place.
        shared_conf: an existing configuration to deep-copy instead of
            building a new one
        succeed_embedding_flag: accepted for interface compatibility; not
            used in this function
        output_layer_flag: whether this is an output layer; a 'Linear' layer
            may only use ``hidden_dim == -1`` when this is True
        target_num: used for inference the dimension of output space if
            someone declare a dimension of -1
        fixed_lengths: mapping from model input id to a fixed sequence
            length, or None

    Returns:
        configuration class corresponds to the layer

    Raises:
        LayerConfigUndefinedError: if a NameError occurs while creating the
            configuration (typically ``<layer_name>Conf`` is undefined).
        ConfigurationError: if an input of the layer does not exist, the
            number of inputs does not match, or the input ranks are
            inconsistent with the producing layers.
        AssertionError: if a 'Linear' layer uses ``hidden_dim == -1`` while
            not being an output layer, or ``target_num`` is missing.
    """
    if shared_conf:
        conf = copy.deepcopy(shared_conf)
    else:
        try:
            conf_dict['use_gpu'] = use_gpu

            # For classification tasks, we usually add a Linear layer to
            # project the output to dimension of number of classes. If we
            # don't know the #classes, we can use '-1' instead and we would
            # calculate the number of classes from the corpus.
            if layer_name == 'Linear':
                if isinstance(conf_dict['hidden_dim'], list) and conf_dict['hidden_dim'][-1] == -1:
                    assert output_layer_flag is True, "Only in the last layer, hidden_dim == -1 is allowed!"
                    assert target_num is not None, "Number of targets should be given!"
                    conf_dict['hidden_dim'][-1] = target_num
                elif isinstance(conf_dict['hidden_dim'], int) and conf_dict['hidden_dim'] == -1:
                    assert output_layer_flag is True, "Only in the last layer, hidden_dim == -1 is allowed!"
                    assert target_num is not None, "Number of targets should be given!"
                    conf_dict['hidden_dim'] = target_num

            # NOTE(security): eval() on a name taken from the configuration
            # file; acceptable only while configuration files are trusted.
            conf = eval(layer_name + "Conf")(**conf_dict)
        except NameError:
            raise LayerConfigUndefinedError("\"%sConf\" has not been defined" % layer_name)

    # verify the rank consistence of joint layers
    if layer_name == EMBED_LAYER_NAME:
        # the embedding layer has no upstream layers to check against
        pass
    else:
        # make sure all the inputs to current layer exist
        for input_layer_id in input_layer_ids:
            if not (input_layer_id in all_layer_configs or input_layer_id in model_input_ids):
                raise ConfigurationError(
                    "The input %s of layer %s does not exist. Please define it before "
                    "defining layer %s!" % (input_layer_id, layer_id, layer_id))

        # Inputs that are not known layers are model inputs and therefore
        # take the rank / dimension of the embedding layer's output.
        former_output_ranks = [
            all_layer_configs[input_layer_id].output_rank
            if input_layer_id in all_layer_configs
            else all_layer_configs[EMBED_LAYER_ID].output_rank
            for input_layer_id in input_layer_ids
        ]
        # inference input_dim
        conf.input_dims = [
            all_layer_configs[input_layer_id].output_dim
            if input_layer_id in all_layer_configs
            else all_layer_configs[EMBED_LAYER_ID].output_dim
            for input_layer_id in input_layer_ids
        ]

        # If the inputs come from embedding layer and fixed_lengths exist,
        # set the length to input_dims
        if len(input_layer_ids) == 1 and input_layer_ids[0] in model_input_ids and fixed_lengths:
            conf.input_dims[0][1] = fixed_lengths[input_layer_ids[0]]

        # check and verify input ranks
        if conf.num_of_inputs > 0:
            if conf.num_of_inputs != len(input_layer_ids):
                raise ConfigurationError("%s only accept %d inputs but you feed %d inputs to it!" % \
                    (layer_name, conf.num_of_inputs, len(input_layer_ids)))
        elif conf.num_of_inputs == -1:
            # -1 means "any number of inputs": replicate the declared rank
            # pattern once per actual input.
            conf.num_of_inputs = len(input_layer_ids)
            if isinstance(conf.input_ranks, list):
                conf.input_ranks = conf.input_ranks * conf.num_of_inputs
            else:
                logging.warning(
                    "[For developer of %s] The input_ranks attribute should be a list!"
                    % (layer_name))
                # The replicated list used to be computed and thrown away,
                # leaving a non-list input_ranks for the zip() below.
                conf.input_ranks = [conf.input_ranks] * conf.num_of_inputs

        for input_rank, former_output_rank in zip(conf.input_ranks, former_output_ranks):
            if input_rank != -1 and input_rank != former_output_rank:
                raise ConfigurationError(
                    "Input ranks of %s are inconsistent with former layers" % layer_id)
        conf.input_ranks = copy.deepcopy(former_output_ranks)

    # inference and verification inside the layer
    conf.inference()    # update some attributes which relies on input dimension or something else
    conf.verify()       # verify if the configuration is legal

    logging.debug(
        'Layer id: %s; name: %s; input_dims: %s; input_ranks: %s; output_dim: %s; output_rank: %s'
        % (layer_id, layer_name,
           conf.input_dims if layer_id != 'embedding' else 'None',
           conf.input_ranks, conf.output_dim, conf.output_rank))

    return conf
def __init__(self, conf, problem, vocab_info, use_gpu):
    """Build the model's layers from the architecture in ``conf``.

    Args:
        conf: model configuration. This method reads
            ``conf.object_inputs_names`` (names of the model inputs, e.g.
            ['string1', 'string2']), ``conf.architecture`` (list of layer
            descriptions; the first one is the embedding layer) and
            ``conf.fixed_lengths``.

            Each non-embedding layer description is a dict with 'layer_id',
            'layer', 'inputs', and optionally 'conf' and
            'output_layer_flag'. A description whose 'layer' names an
            already registered layer id and that has no 'conf' shares that
            layer (same parameters).
        problem: provides ``output_target_num()`` and ``with_bos_eos``.
        vocab_info: per input cluster information, e.g.
            {
                'word': {
                    'vocab_size': xxx,
                    'init_weights': np matrix
                }
                'postag': {
                    'vocab_size': xxx,
                    'init_weights': None
                }
            }
        use_gpu: stored on the model and forwarded to every layer conf.

    Raises:
        ConfigurationError: if no layer is flagged as an output layer.
    """
    super(Model, self).__init__()

    inputs = conf.object_inputs_names
    layer_archs = conf.architecture
    target_num = problem.output_target_num()

    # correct the real fixed length if begin/end of sentence are added
    if conf.fixed_lengths:
        fixed_lengths_corrected = copy.deepcopy(conf.fixed_lengths)
        for seq in fixed_lengths_corrected:
            if problem.with_bos_eos:
                fixed_lengths_corrected[seq] += 2
    else:
        fixed_lengths_corrected = None

    self.use_gpu = use_gpu

    all_layer_configs = dict()
    self.layers = nn.ModuleDict()
    self.layer_inputs = dict()
    self.layer_dependencies = dict()
    self.layer_dependencies[EMBED_LAYER_ID] = set()
    # output_layer_id is a list in order to support multiple outputs
    self.output_layer_id = []

    for layer_index, layer_arch in enumerate(layer_archs):
        output_layer_flag = (
            'output_layer_flag' in layer_arch
            and layer_arch['output_layer_flag'] is True)
        # True when every input of a non-first layer is a raw model input.
        succeed_embedding_flag = (
            layer_index > 0 and 'inputs' in layer_arch
            and all(name in inputs for name in layer_arch['inputs']))

        if output_layer_flag:
            self.output_layer_id.append(layer_arch['layer_id'])

        if layer_index == 0:
            # embedding layer
            emb_conf = copy.deepcopy(vocab_info)
            for input_cluster in emb_conf:
                emb_conf[input_cluster]['dim'] = layer_arch['conf'][input_cluster]['dim']
                emb_conf[input_cluster]['fix_weight'] = layer_arch['conf'][input_cluster].get('fix_weight', False)

            all_layer_configs[EMBED_LAYER_ID] = get_conf(
                EMBED_LAYER_ID, layer_arch['layer'], None, all_layer_configs,
                inputs, self.use_gpu, conf_dict={'conf': emb_conf},
                shared_conf=None, succeed_embedding_flag=False,
                output_layer_flag=output_layer_flag, target_num=target_num,
                fixed_lengths=fixed_lengths_corrected)
            self.add_layer(
                EMBED_LAYER_ID,
                get_layer(layer_arch['layer'], all_layer_configs[EMBED_LAYER_ID]))
        else:
            if layer_arch['layer'] in self.layers and not 'conf' in layer_arch:
                # reuse formerly defined layers (share the same parameters)
                logging.debug(
                    "Layer id: %s; Sharing configuration with layer %s" %
                    (layer_arch['layer_id'], layer_arch['layer']))
                conf_dict = None
                shared_conf = all_layer_configs[layer_arch['layer']]
            else:
                conf_dict = layer_arch['conf']
                shared_conf = None

            # if the layer is EncoderDecoder, inference the vocab size
            if layer_arch['layer'] == 'EncoderDecoder':
                layer_arch['conf']['decoder_conf']['decoder_vocab_size'] = target_num

            all_layer_configs[layer_arch['layer_id']] = get_conf(
                layer_arch['layer_id'], layer_arch['layer'],
                layer_arch['inputs'], all_layer_configs, inputs, self.use_gpu,
                conf_dict=conf_dict, shared_conf=shared_conf,
                succeed_embedding_flag=succeed_embedding_flag,
                output_layer_flag=output_layer_flag, target_num=target_num,
                fixed_lengths=fixed_lengths_corrected)

            if layer_arch['layer'] in self.layers and not 'conf' in layer_arch:
                self.add_layer(layer_arch['layer_id'], self.layers[layer_arch['layer']])
            else:
                self.add_layer(
                    layer_arch['layer_id'],
                    get_layer(layer_arch['layer'],
                              all_layer_configs[layer_arch['layer_id']]))

            self.layer_inputs[layer_arch['layer_id']] = layer_arch['inputs']

            # register dependencies, except embeddings
            cur_layer_depend = set()
            for layer_depend_id in layer_arch['inputs']:
                if not layer_depend_id in inputs:
                    cur_layer_depend.add(layer_depend_id)
            self.add_dependency(layer_arch['layer_id'], cur_layer_depend)

    logging.debug("Layer dependencies: %s" % repr(self.layer_dependencies))

    # output_layer_id is always set (to a list) above, so the former
    # hasattr() test could never fire; an empty list is the real
    # "no output layer" condition.
    if not self.output_layer_id:
        raise ConfigurationError("Please define an output layer")

    self.layer_topological_sequence = self.get_topological_sequence()
def load_from_file(self, conf_path): with codecs.open(conf_path, 'r', encoding='utf-8') as fin: try: self.conf = json.load(fin) except Exception as e: raise ConfigurationError( "%s is not a legal JSON file, please check your JSON format!" % conf_path) self.tool_version = self.get_item(['tool_version']) self.language = self.get_item(['language'], default='english').lower() self.problem_type = self.get_item(['inputs', 'dataset_type']).lower() #if ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging: self.tagging_scheme = self.get_item(['inputs', 'tagging_scheme'], default=None, use_default=True) if self.mode == 'normal': self.use_cache = self.get_item(['inputs', 'use_cache'], True) elif self.mode == 'philly': self.use_cache = True # OUTPUTS if hasattr(self.params, 'model_save_dir') and self.params.model_save_dir: self.save_base_dir = self.params.model_save_dir else: self.save_base_dir = self.get_item(['outputs', 'save_base_dir']) if self.phase == 'train': # in train.py, it is called pretrained_model_path if hasattr(self.params, 'pretrained_model_path' ) and self.params.pretrained_model_path: self.pretrained_model_path = self.previous_model_path = self.params.pretrained_model_path else: self.pretrained_model_path = self.previous_model_path = self.get_item( ['inputs', 'data_paths', 'pretrained_model_path'], default=None, use_default=True) elif self.phase == 'test' or self.phase == 'predict': # in test.py and predict.py, it is called pretrained_model_path if hasattr( self.params, 'previous_model_path') and self.params.previous_model_path: self.previous_model_path = self.pretrained_model_path = self.params.previous_model_path else: self.previous_model_path = self.pretrained_model_path = os.path.join( self.save_base_dir, self.get_item(['outputs', 'model_name' ])) # namely, the model_save_path if hasattr( self, 'pretrained_model_path' ) and self.pretrained_model_path: # namely self.previous_model_path tmp_saved_problem_path = os.path.join( 
os.path.dirname(self.pretrained_model_path), '.necessary_cache', 'problem.pkl') self.saved_problem_path = tmp_saved_problem_path if os.path.isfile(tmp_saved_problem_path) \ else os.path.join(os.path.dirname(self.pretrained_model_path), 'necessary_cache', 'problem.pkl') if not (os.path.isfile(self.pretrained_model_path) and os.path.isfile(self.saved_problem_path)): raise Exception( 'Previous trained model %s or its dictionaries %s does not exist!' % (self.pretrained_model_path, self.saved_problem_path)) if self.phase != 'cache': prepare_dir( self.save_base_dir, True, allow_overwrite=self.params.force or self.mode == 'philly', extra_info='will overwrite model file and train.log' if self.phase == 'train' else 'will add %s.log and predict file' % self.phase) if hasattr(self.params, 'log_dir') and self.params.log_dir: self.log_dir = self.params.log_dir if self.phase != 'cache': prepare_dir(self.log_dir, True, allow_overwrite=True) else: self.log_dir = self.save_base_dir if self.phase == 'train': self.train_log_path = os.path.join( self.log_dir, self.get_item(['outputs', 'train_log_name'])) if self.mode == 'philly' or self.params.debug: log_set(self.train_log_path, console_level='DEBUG', console_detailed=True, disable_log_file=self.params.disable_log_file) else: log_set(self.train_log_path, disable_log_file=self.params.disable_log_file) elif self.phase == 'test': self.test_log_path = os.path.join( self.log_dir, self.get_item(['outputs', 'test_log_name'])) if self.mode == 'philly' or self.params.debug: log_set(self.test_log_path, console_level='DEBUG', console_detailed=True, disable_log_file=self.params.disable_log_file) else: log_set(self.test_log_path, disable_log_file=self.params.disable_log_file) elif self.phase == 'predict': self.predict_log_path = os.path.join( self.log_dir, self.get_item(['outputs', 'predict_log_name'])) if self.mode == 'philly' or self.params.debug: log_set(self.predict_log_path, console_level='DEBUG', console_detailed=True, 
disable_log_file=self.params.disable_log_file) else: log_set(self.predict_log_path, disable_log_file=self.params.disable_log_file) if self.phase != 'cache': self.predict_output_path = self.params.predict_output_path if self.params.predict_output_path else os.path.join( self.save_base_dir, self.get_item(['outputs', 'predict_output_name'], default='predict.tsv')) logging.debug('Prepare dir for: %s' % self.predict_output_path) prepare_dir(self.predict_output_path, False, allow_overwrite=self.params.force or self.mode == 'philly') self.predict_fields = self.get_item( ['outputs', 'predict_fields'], default=DefaultPredictionFields[ProblemTypes[self.problem_type]]) self.model_save_path = os.path.join( self.save_base_dir, self.get_item(['outputs', 'model_name'])) # INPUTS if hasattr(self.params, 'train_data_path') and self.params.train_data_path: self.train_data_path = self.params.train_data_path else: if self.mode == 'normal': self.train_data_path = self.get_item( ['inputs', 'data_paths', 'train_data_path'], default=None, use_default=True) else: self.train_data_path = None if hasattr(self.params, 'valid_data_path') and self.params.valid_data_path: self.valid_data_path = self.params.valid_data_path else: if self.mode == 'normal': self.valid_data_path = self.get_item( ['inputs', 'data_paths', 'valid_data_path'], default=None, use_default=True) else: self.valid_data_path = None if hasattr(self.params, 'test_data_path') and self.params.test_data_path: self.test_data_path = self.params.test_data_path else: if self.mode == 'normal': self.test_data_path = self.get_item( ['inputs', 'data_paths', 'test_data_path'], default=None, use_default=True) else: self.test_data_path = None if self.phase == 'predict': if self.params.predict_data_path: self.predict_data_path = self.params.predict_data_path else: if self.mode == 'normal': self.predict_data_path = self.get_item( ['inputs', 'data_paths', 'predict_data_path'], default=None, use_default=True) else: self.predict_data_path = None if 
self.phase == 'train' or self.phase == 'cache': if self.valid_data_path is None and self.test_data_path is not None: # We support test_data_path == None, if someone set valid_data_path to None while test_data_path is not None, # swap the valid_data_path and test_data_path self.valid_data_path = self.test_data_path self.test_data_path = None elif self.phase == 'predict': if self.predict_data_path is None and self.test_data_path is not None: self.predict_data_path = self.test_data_path self.test_data_path = None if self.phase == 'train' or self.phase == 'test' or self.phase == 'cache': self.file_columns = self.get_item(['inputs', 'file_header']) else: self.file_columns = self.get_item(['inputs', 'file_header'], default=None, use_default=True) if self.phase == 'predict': if self.file_columns is None: self.predict_file_columns = self.get_item( ['inputs', 'predict_file_header']) else: self.predict_file_columns = self.get_item( ['inputs', 'predict_file_header'], default=None, use_default=True) if self.predict_file_columns is None: self.predict_file_columns = self.file_columns if self.phase != 'predict': if self.phase == 'cache': self.answer_column_name = self.get_item(['inputs', 'target'], default=None, use_default=True) else: self.answer_column_name = self.get_item(['inputs', 'target']) self.input_types = self.get_item(['architecture', 0, 'conf']) # add extra feature feature_all = set([_.lower() for _ in self.input_types.keys()]) formal_feature = set(['word', 'char']) self.extra_feature = len(feature_all - formal_feature) != 0 # add char embedding config # char_emb_type = None # char_emb_type_cols = None # for single_type in self.input_types: # if single_type.lower() == 'char': # char_emb_type = single_type # char_emb_type_cols = [single_col.lower() for single_col in self.input_types[single_type]['cols']] # break self.object_inputs = self.get_item(['inputs', 'model_inputs']) # if char_emb_type and char_emb_type_cols: # for single_input in self.object_inputs: # for 
single_col in char_emb_type_cols: # if single_input.lower() in single_col: # self.object_inputs[single_input].append(single_col) self.object_inputs_names = [name for name in self.object_inputs] # vocabulary setting self.max_vocabulary = self.get_item( ['training_params', 'vocabulary', 'max_vocabulary'], default=800000, use_default=True) self.min_word_frequency = self.get_item( ['training_params', 'vocabulary', 'min_word_frequency'], default=3, use_default=True) # file column header setting self.file_with_col_header = self.get_item( ['inputs', 'file_with_col_header'], default=False, use_default=True) if ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging: self.add_start_end_for_seq = self.get_item( ['inputs', 'add_start_end_for_seq'], default=True) else: self.add_start_end_for_seq = self.get_item( ['inputs', 'add_start_end_for_seq'], default=False) if hasattr(self.params, 'pretrained_emb_path') and self.params.pretrained_emb_path: self.pretrained_emb_path = self.params.pretrained_emb_path else: if self.mode == 'normal': self.pretrained_emb_path = self.get_item( ['inputs', 'data_paths', 'pre_trained_emb'], default=None, use_default=True) else: self.pretrained_emb_path = None if 'word' in self.get_item(['architecture', 0, 'conf' ]) and self.pretrained_emb_path: if hasattr(self.params, 'involve_all_words_in_pretrained_emb' ) and self.params.involve_all_words_in_pretrained_emb: self.involve_all_words_in_pretrained_emb = self.params.involve_all_words_in_pretrained_emb else: self.involve_all_words_in_pretrained_emb = self.get_item( ['inputs', 'involve_all_words_in_pretrained_emb'], default=False) if hasattr( self.params, 'pretrained_emb_type') and self.params.pretrained_emb_type: self.pretrained_emb_type = self.params.pretrained_emb_type else: self.pretrained_emb_type = self.get_item( ['inputs', 'pretrained_emb_type'], default='glove') if hasattr(self.params, 'pretrained_emb_binary_or_text' ) and self.params.pretrained_emb_binary_or_text: 
self.pretrained_emb_binary_or_text = self.params.pretrained_emb_binary_or_text else: self.pretrained_emb_binary_or_text = self.get_item( ['inputs', 'pretrained_emb_binary_or_text'], default='text') self.pretrained_emb_dim = self.get_item( ['architecture', 0, 'conf', 'word', 'dim']) else: self.pretrained_emb_path = None self.involve_all_words_in_pretrained_emb = None self.pretrained_emb_binary_or_text = None self.pretrained_emb_dim = None self.pretrained_emb_type = None if self.phase == 'train': if hasattr(self.params, 'cache_dir') and self.params.cache_dir: # for aether self.cache_dir = self.params.cache_dir else: if self.mode == 'normal': if self.use_cache: self.cache_dir = self.get_item( ['outputs', 'cache_dir']) else: self.cache_dir = os.path.join( tempfile.gettempdir(), 'neuron_blocks', ''.join( random.sample( string.ascii_letters + string.digits, 16))) else: # for philly mode, we can only save files in model_path or scratch_path self.cache_dir = os.path.join(self.save_base_dir, 'cache') self.problem_path = os.path.join(self.cache_dir, 'problem.pkl') if self.pretrained_emb_path is not None: self.emb_pkl_path = os.path.join(self.cache_dir, 'emb.pkl') else: self.emb_pkl_path = None else: tmp_problem_path = os.path.join(self.save_base_dir, '.necessary_cache', 'problem.pkl') self.problem_path = tmp_problem_path if os.path.isfile( tmp_problem_path) else os.path.join( self.save_base_dir, 'necessary_cache', 'problem.pkl') # training params self.training_params = self.get_item(['training_params']) if self.phase == 'train': self.optimizer_name = self.get_item( ['training_params', 'optimizer', 'name']) self.optimizer_params = self.get_item( ['training_params', 'optimizer', 'params']) self.clip_grad_norm_max_norm = self.get_item( ['training_params', 'clip_grad_norm_max_norm'], default=5) if hasattr(self.params, 'learning_rate') and self.params.learning_rate: self.optimizer_params['lr'] = self.params.learning_rate if hasattr(self.params, 'batch_size') and 
self.params.batch_size: self.batch_size_each_gpu = self.params.batch_size else: self.batch_size_each_gpu = self.get_item([ 'training_params', 'batch_size' ]) #the batch_size in conf file is the batch_size on each GPU self.lr_decay = self.get_item(['training_params', 'lr_decay'], default=1) # by default, no decay self.minimum_lr = self.get_item(['training_params', 'minimum_lr'], default=0) self.epoch_start_lr_decay = self.get_item( ['training_params', 'epoch_start_lr_decay'], default=1) if hasattr(self.params, 'max_epoch') and self.params.max_epoch: self.max_epoch = self.params.max_epoch else: self.max_epoch = self.get_item(['training_params', 'max_epoch'], default=float('inf')) self.valid_times_per_epoch = self.get_item( ['training_params', 'valid_times_per_epoch'], default=1) self.batch_num_to_show_results = self.get_item( ['training_params', 'batch_num_to_show_results'], default=10) self.max_lengths = self.get_item(['training_params', 'max_lengths'], default=None, use_default=True) self.fixed_lengths = self.get_item( ['training_params', 'fixed_lengths'], default=None, use_default=True) if self.fixed_lengths: self.max_lengths = None if torch.cuda.device_count() > 1: self.batch_size_total = torch.cuda.device_count( ) * self.training_params['batch_size'] self.batch_num_to_show_results = self.batch_num_to_show_results // torch.cuda.device_count( ) else: self.batch_size_total = self.batch_size_each_gpu self.cpu_num_workers = self.get_item( ['training_params', 'cpu_num_workers'], default=-1) #by default, use all workers cpu supports # text preprocessing self.__text_preprocessing = self.get_item( ['training_params', 'text_preprocessing'], default=list()) self.DBC2SBC = True if 'DBC2SBC' in self.__text_preprocessing else False self.unicode_fix = True if 'unicode_fix' in self.__text_preprocessing else False self.remove_stopwords = True if 'remove_stopwords' in self.__text_preprocessing else False # tokenzier if self.language == 'chinese': self.tokenizer = 
self.get_item(['training_params', 'tokenizer'], default='jieba') else: self.tokenizer = self.get_item(['training_params', 'tokenizer'], default='nltk') if self.extra_feature: if self.DBC2SBC: logging.warning( "Detect the extra feature %s, set the DBC2sbc is False." % ''.join(list(feature_all - formal_feature))) if self.unicode_fix: logging.warning( "Detect the extra feature %s, set the unicode_fix is False." % ''.join(list(feature_all - formal_feature))) if self.remove_stopwords: logging.warning( "Detect the extra feature %s, set the remove_stopwords is False." % ''.join(list(feature_all - formal_feature))) if ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging: if self.unicode_fix: logging.warning( 'For sequence tagging task, unicode_fix may change the number of words.' ) if self.remove_stopwords: self.remove_stopwords = True logging.warning( 'For sequence tagging task, remove stopwords is forbidden! It is disabled now.' ) if self.phase != 'cache': if torch.cuda.is_available( ) and torch.cuda.device_count() > 0 and self.training_params.get( 'use_gpu', True): self.use_gpu = True logging.info( "Activating GPU mode, there are %d GPUs available" % torch.cuda.device_count()) else: self.use_gpu = False logging.info("Activating CPU mode") self.architecture = self.get_item(['architecture']) self.output_layer_id = [] for single_layer in self.architecture: if 'output_layer_flag' in single_layer and single_layer[ 'output_layer_flag']: self.output_layer_id.append(single_layer['layer_id']) # check CNN layer & change min sentence length cnn_rele_layers = ['Conv', 'ConvPooling'] self.min_sentence_len = 0 for layer_index, single_layer in enumerate(self.architecture): if layer_index == 0: continue if sum([_ == single_layer['layer'] for _ in cnn_rele_layers]): # get window_size conf: type maybe int or list for single_conf, single_conf_value in single_layer[ 'conf'].items(): if 'window' in single_conf.lower(): self.min_sentence_len = max( self.min_sentence_len, 
np.max(np.array([single_conf_value]))) break if self.phase == 'train' or self.phase == 'test': self.loss = BaseLossConf.get_conf(**self.get_item(['loss'])) self.metrics = self.get_item(['metrics']) if 'auc' in self.metrics and ProblemTypes[ self.problem_type] == ProblemTypes.classification: self.pos_label = self.get_item(['inputs', 'positive_label'], default=None, use_default=True)
def raise_configuration_error(self, key):
    """Report a missing configuration item by raising ConfigurationError.

    Args:
        key: the configuration item that could not be found; it is
            interpolated into the error message next to ``self.conf_path``.

    Raises:
        ConfigurationError: always; this method never returns normally.
    """
    # Build the message first so the raise statement stays short.
    message = "The configuration file %s is illegal. the item [%s] is not found." % (
        self.conf_path, key)
    raise ConfigurationError(message)