def __init__(self, **kwargs):
    """Build the model: load parameters and dataset, create the TensorFlow
    session, instantiate the EntityLSTM, and either restore a pretrained
    model or initialize fresh token embeddings.

    kwargs are forwarded verbatim to load_parameters(); the resulting
    parameter dicts are kept on self for later use.
    """
    # Set parameters
    self.parameters, self.conf_parameters = load_parameters(**kwargs)
    # Resolve dataset file paths (and brat annotation folders) and make sure
    # the chosen parameters are consistent with the datasets found.
    self.dataset_filepaths, self.dataset_brat_folders = self._get_valid_dataset_filepaths(self.parameters)
    self._check_param_compatibility(self.parameters, self.dataset_filepaths)
    # Load dataset
    self.modeldata = dataset.Dataset(verbose=self.parameters['verbose'], debug=self.parameters['debug'])
    # token_to_vector maps tokens to pretrained embedding vectors; it is
    # consumed below when (re)loading embeddings into the model.
    token_to_vector = self.modeldata.load_dataset(self.dataset_filepaths, self.parameters)
    # Launch session. Automatically choose a device
    # if the specified one doesn't exist
    session_conf = tf.ConfigProto(
        intra_op_parallelism_threads=self.parameters['number_of_cpu_threads'],
        inter_op_parallelism_threads=self.parameters['number_of_cpu_threads'],
        device_count={'CPU': 1, 'GPU': self.parameters['number_of_gpus']},
        allow_soft_placement=True,
        log_device_placement=False)
    self.sess = tf.Session(config=session_conf)
    with self.sess.as_default():
        # Initialize or load pretrained model
        self.model = EntityLSTM(self.modeldata, self.parameters)
        self.sess.run(tf.global_variables_initializer())
        if self.parameters['use_pretrained_model']:
            # Restoring also returns the trained CRF transition parameters.
            self.transition_params_trained = self.model.restore_from_pretrained_model(self.parameters, self.modeldata, self.sess, token_to_vector=token_to_vector)
        else:
            self.model.load_pretrained_token_embeddings(self.sess, self.modeldata, self.parameters, token_to_vector)
            # No trained CRF transitions yet: start from random values.
            # The "+2" presumably accounts for implicit start/end states — TODO confirm against EntityLSTM.
            self.transition_params_trained = np.random.rand(len(self.modeldata.unique_labels)+2, len(self.modeldata.unique_labels)+2)
def __init__(self, **kwargs):
    """Build the model: load parameters, load the dataset (cached on disk as
    'dataset.pickle'), create the TensorFlow session, instantiate the
    EntityLSTM, and either restore a pretrained model or initialize fresh
    token embeddings.

    kwargs are forwarded verbatim to load_parameters().
    """
    # Set parameters
    self.parameters, self.conf_parameters = load_parameters(**kwargs)
    self.dataset_filepaths, self.dataset_brat_folders = self._get_valid_dataset_filepaths(self.parameters)
    self._check_param_compatibility(self.parameters, self.dataset_filepaths)

    # Load dataset, using 'dataset.pickle' in the working directory as a cache.
    # NOTE(review): the cache is never invalidated — if the dataset files or
    # parameters change, 'dataset.pickle' must be deleted manually. Also,
    # unpickling executes arbitrary code: only load a pickle you produced.
    try:
        # 'with' guarantees the file handle is closed even if unpickling
        # fails (passing open() directly to pickle.load leaks the handle).
        with open('dataset.pickle', 'rb') as pickle_file:
            dataset = pickle.load(pickle_file)
        print("Loading dataset from pickle file")
    except OSError:  # no cache yet; IOError is an alias of OSError in Python 3
        dataset = ds.Dataset(verbose=self.parameters['verbose'], debug=self.parameters['debug'])
        dataset.load_dataset(self.dataset_filepaths, self.parameters)
        with open('dataset.pickle', 'wb') as pickle_file:
            pickle.dump(dataset, pickle_file)
    self.modeldata = dataset
    # The (possibly unpickled) dataset already carries the embedding map
    # computed by load_dataset(), so reuse it instead of reloading.
    token_to_vector = self.modeldata.token_to_vector

    # dataset statistics:
    print("Total sequence no. in train : {}".format(len(dataset.token_indices['train'])))
    print("Total sequence no. in valid : {}".format(len(dataset.token_indices['valid'])))
    print("Total sequence no. in test : {}".format(len(dataset.token_indices['test'])))

    # Launch session. Automatically choose a device
    # if the specified one doesn't exist
    gpu_options = tf.GPUOptions(visible_device_list=self.parameters['which_gpu'])
    session_conf = tf.ConfigProto(
        intra_op_parallelism_threads=self.parameters['number_of_cpu_threads'],
        inter_op_parallelism_threads=self.parameters['number_of_cpu_threads'],
        device_count={'CPU': 2, 'GPU': self.parameters['number_of_gpus']},
        gpu_options=gpu_options,
        allow_soft_placement=True,
        log_device_placement=False)
    self.sess = tf.Session(config=session_conf)
    with self.sess.as_default():
        # Initialize or load pretrained model
        self.model = EntityLSTM(self.modeldata, self.parameters)
        self.sess.run(tf.global_variables_initializer())
        if self.parameters['use_pretrained_model']:
            # Restoring also returns the trained CRF transition parameters.
            self.transition_params_trained = self.model.restore_from_pretrained_model(
                self.parameters, self.modeldata, self.sess, token_to_vector=token_to_vector)
        else:
            self.model.load_pretrained_token_embeddings(
                self.sess, self.modeldata, self.parameters, token_to_vector)
            # No trained CRF transitions yet: start from random values.
            # The "+2" presumably accounts for implicit start/end states — TODO confirm against EntityLSTM.
            self.transition_params_trained = np.random.rand(
                len(self.modeldata.unique_labels) + 2, len(self.modeldata.unique_labels) + 2)
def _get_dataset_for_predict(self):
    """Return a lightweight Dataset holding only the vocabulary mappings
    needed at prediction time, decoupled from the training dataset.
    """
    predict_dataset = ds.Dataset()
    # Copy each index mapping so mutating the prediction dataset can never
    # corrupt the training-time vocabularies.
    for mapping_name in ('token_to_index', 'character_to_index', 'label_to_index',
                         'index_to_label', 'index_to_character'):
        setattr(predict_dataset, mapping_name, getattr(self.dataset, mapping_name).copy())
    # Sentinel indices are scalars; plain assignment suffices.
    predict_dataset.UNK_TOKEN_INDEX = self.dataset.UNK_TOKEN_INDEX
    predict_dataset.PADDING_CHARACTER_INDEX = self.dataset.PADDING_CHARACTER_INDEX
    return predict_dataset
def __init__(self,
             parameters_filepath=argument_default_value,
             pretrained_model_folder=argument_default_value,
             dataset_text_folder=argument_default_value,
             character_embedding_dimension=argument_default_value,
             character_lstm_hidden_state_dimension=argument_default_value,
             check_for_digits_replaced_with_zeros=argument_default_value,
             check_for_lowercase=argument_default_value,
             debug=argument_default_value,
             dropout_rate=argument_default_value,
             experiment_name=argument_default_value,
             freeze_token_embeddings=argument_default_value,
             gradient_clipping_value=argument_default_value,
             learning_rate=argument_default_value,
             load_only_pretrained_token_embeddings=argument_default_value,
             load_all_pretrained_token_embeddings=argument_default_value,
             main_evaluation_mode=argument_default_value,
             maximum_number_of_epochs=argument_default_value,
             number_of_cpu_threads=argument_default_value,
             number_of_gpus=argument_default_value,
             optimizer=argument_default_value,
             output_folder=argument_default_value,
             patience=argument_default_value,
             plot_format=argument_default_value,
             reload_character_embeddings=argument_default_value,
             reload_character_lstm=argument_default_value,
             reload_crf=argument_default_value,
             reload_feedforward=argument_default_value,
             reload_token_embeddings=argument_default_value,
             reload_token_lstm=argument_default_value,
             remap_unknown_tokens_to_unk=argument_default_value,
             spacylanguage=argument_default_value,
             tagging_format=argument_default_value,
             token_embedding_dimension=argument_default_value,
             token_lstm_hidden_state_dimension=argument_default_value,
             token_pretrained_embedding_filepath=argument_default_value,
             tokenizer=argument_default_value,
             train_model=argument_default_value,
             use_character_lstm=argument_default_value,
             use_crf=argument_default_value,
             use_pretrained_model=argument_default_value,
             verbose=argument_default_value,
             argument_default_value=argument_default_value):
    """Build the model from explicit keyword arguments.

    Every parameter defaults to the sentinel ``argument_default_value`` so
    that _load_parameters can tell "caller supplied a value" apart from
    "fall back to the parameters file". All arguments are stringified and
    passed through as overrides.
    """
    # Parse arguments: snapshot locals() (all kwargs, minus self) as strings.
    # This must stay the FIRST statement — any extra local defined before it
    # would leak into the arguments dict.
    arguments = dict(
        (k, str(v)) for k, v in locals().items() if k != 'self')
    # Initialize parameters
    parameters, conf_parameters = self._load_parameters(
        arguments['parameters_filepath'], arguments=arguments)
    dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths(
        parameters)
    # NOTE(review): "compatiblity" is misspelled — presumably the method is
    # defined with the same spelling elsewhere; verify before renaming.
    self._check_parameter_compatiblity(parameters, dataset_filepaths)
    # Load dataset
    dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])
    # token_to_vector maps tokens to pretrained embedding vectors.
    token_to_vector = dataset.load_dataset(dataset_filepaths, parameters)
    # Launch session
    session_conf = tf.ConfigProto(
        intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
        inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
        device_count={
            'CPU': 1,
            'GPU': parameters['number_of_gpus']
        },
        allow_soft_placement=
        True,  # automatically choose an existing and supported device to run the operations in case the specified one doesn't exist
        log_device_placement=False)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Create model and initialize or load pretrained model
        ### Instantiate the model
        model = EntityLSTM(dataset, parameters)
        ### Initialize the model and restore from pretrained model if needed
        sess.run(tf.global_variables_initializer())
        if not parameters['use_pretrained_model']:
            model.load_pretrained_token_embeddings(sess, dataset,
                                                   parameters, token_to_vector)
            # No trained CRF transitions yet: start from random values.
            self.transition_params_trained = np.random.rand(
                len(dataset.unique_labels) + 2,
                len(dataset.unique_labels) + 2)
        else:
            # Restoring also returns the trained CRF transition parameters.
            self.transition_params_trained = model.restore_from_pretrained_model(
                parameters, dataset, sess, token_to_vector=token_to_vector)
        # Embeddings are now inside the TF graph; free the (large) dict.
        del token_to_vector
    self.dataset = dataset
    self.dataset_brat_folders = dataset_brat_folders
    self.dataset_filepaths = dataset_filepaths
    self.model = model
    self.parameters = parameters
    self.conf_parameters = conf_parameters
    self.sess = sess