Example no. 1
    def __init__(self, **kwargs):

        # Set parameters
        self.parameters, self.conf_parameters = load_parameters(**kwargs)
        self.dataset_filepaths, self.dataset_brat_folders = self._get_valid_dataset_filepaths(self.parameters)
        self._check_param_compatibility(self.parameters, self.dataset_filepaths)

        # Load dataset
        self.modeldata = dataset.Dataset(verbose=self.parameters['verbose'], debug=self.parameters['debug'])
        token_to_vector = self.modeldata.load_dataset(self.dataset_filepaths, self.parameters)

        # Launch session. Automatically choose a device
        # if the specified one doesn't exist
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=self.parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=self.parameters['number_of_cpu_threads'],
            device_count={'CPU': 1, 'GPU': self.parameters['number_of_gpus']},
            allow_soft_placement=True,
            log_device_placement=False)

        self.sess = tf.Session(config=session_conf)
        with self.sess.as_default():

            # Initialize or load pretrained model
            self.model = EntityLSTM(self.modeldata, self.parameters)
            self.sess.run(tf.global_variables_initializer())

            if self.parameters['use_pretrained_model']:
                self.transition_params_trained = self.model.restore_from_pretrained_model(self.parameters, 
                    self.modeldata, self.sess, token_to_vector=token_to_vector)
            else:
                self.model.load_pretrained_token_embeddings(self.sess, self.modeldata, 
                    self.parameters, token_to_vector)
                self.transition_params_trained = np.random.rand(len(self.modeldata.unique_labels)+2,
                    len(self.modeldata.unique_labels)+2)
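Note that all of these examples target the TensorFlow 1.x API (tf.ConfigProto, tf.Session, tf.global_variables_initializer). Under TensorFlow 2.x these names live in the tf.compat.v1 namespace and graph mode must be re-enabled explicitly. A minimal sketch of the equivalent session setup, with illustrative thread counts standing in for the parameters dictionary:

import tensorflow as tf

# TF 2.x defaults to eager execution; the 1.x session API requires graph mode.
tf.compat.v1.disable_eager_execution()

session_conf = tf.compat.v1.ConfigProto(
    intra_op_parallelism_threads=8,  # illustrative value
    inter_op_parallelism_threads=8,  # illustrative value
    allow_soft_placement=True,       # fall back to an available device
    log_device_placement=False)

sess = tf.compat.v1.Session(config=session_conf)
with sess.as_default():
    sess.run(tf.compat.v1.global_variables_initializer())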
Example no. 2
    def __init__(self, **kwargs):

        # Set parameters
        self.parameters, self.conf_parameters = load_parameters(**kwargs)
        self.dataset_filepaths, self.dataset_brat_folders = self._get_valid_dataset_filepaths(self.parameters)
        self._check_param_compatibility(self.parameters, self.dataset_filepaths)

        # Load dataset, caching it as a pickle so later runs can skip the parse
        try:
            with open('dataset.pickle', 'rb') as f:
                dataset = pickle.load(f)
            print("Loading dataset from pickle file")
        except (OSError, IOError):
            dataset = ds.Dataset(verbose=self.parameters['verbose'], debug=self.parameters['debug'])
            dataset.load_dataset(self.dataset_filepaths, self.parameters)
            with open('dataset.pickle', 'wb') as f:
                pickle.dump(dataset, f)

        self.modeldata = dataset
        token_to_vector = self.modeldata.token_to_vector

        # Dataset statistics
        print("Total sequences in train: {}".format(len(dataset.token_indices['train'])))
        print("Total sequences in valid: {}".format(len(dataset.token_indices['valid'])))
        print("Total sequences in test:  {}".format(len(dataset.token_indices['test'])))

        # Launch session. Automatically choose a device
        # if the specified one doesn't exist
        gpu_options = tf.GPUOptions(visible_device_list=self.parameters['which_gpu'])

        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=self.parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=self.parameters['number_of_cpu_threads'],
            device_count={'CPU': 2, 'GPU': self.parameters['number_of_gpus']},
            gpu_options=gpu_options,
            allow_soft_placement=True,
            log_device_placement=False)

        self.sess = tf.Session(config=session_conf)
        with self.sess.as_default():

            # Initialize or load pretrained model
            self.model = EntityLSTM(self.modeldata, self.parameters)
            self.sess.run(tf.global_variables_initializer())

            if self.parameters['use_pretrained_model']:
                self.transition_params_trained = self.model.restore_from_pretrained_model(self.parameters,
                                                                                          self.modeldata, self.sess,
                                                                                          token_to_vector=token_to_vector)
            else:
                self.model.load_pretrained_token_embeddings(self.sess, self.modeldata,
                                                            self.parameters, token_to_vector)
                self.transition_params_trained = np.random.rand(len(self.modeldata.unique_labels) + 2,
                                                                len(self.modeldata.unique_labels) + 2)
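Example no. 2 differs from the first mainly in the pickle cache around dataset loading and in pinning a specific GPU through tf.GPUOptions(visible_device_list=...). The caching pattern generalizes to any object that is expensive to build; a minimal standalone sketch, where the cache path and the build callable are placeholders rather than NeuroNER API:

import pickle

def load_or_build(cache_path, build):
    """Return the pickled object at cache_path if it exists; otherwise
    build it, cache it, and return it."""
    try:
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    except (OSError, IOError):
        obj = build()
        with open(cache_path, 'wb') as f:
            pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
        return obj

# Hypothetical usage mirroring the example above:
# dataset = load_or_build('dataset.pickle', build_dataset)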
Example no. 3
    def _get_dataset_for_predict(self):
        # Build a lightweight dataset carrying only the vocabulary mappings
        # needed at prediction time; the copies keep the originals untouched.
        dataset = ds.Dataset()

        dataset.token_to_index = self.dataset.token_to_index.copy()
        dataset.character_to_index = self.dataset.character_to_index.copy()
        dataset.label_to_index = self.dataset.label_to_index.copy()
        dataset.index_to_label = self.dataset.index_to_label.copy()
        dataset.index_to_character = self.dataset.index_to_character.copy()

        dataset.UNK_TOKEN_INDEX = self.dataset.UNK_TOKEN_INDEX
        dataset.PADDING_CHARACTER_INDEX = self.dataset.PADDING_CHARACTER_INDEX

        return dataset
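Example no. 3 clones only the vocabulary lookup tables, not the training data itself, which yields a lightweight dataset object for prediction. Since dict.copy() is shallow and the stored indices are immutable ints, the copy can be extended with new tokens without mutating the original tables. A minimal sketch of the lookup pattern (the names here are illustrative, not NeuroNER's API):

# Illustrative vocabulary as copied from a trained model
token_to_index = {'the': 1, 'cat': 2}
UNK_TOKEN_INDEX = 0

predict_vocab = token_to_index.copy()  # additions here don't touch the original

def encode(tokens, vocab, unk=UNK_TOKEN_INDEX):
    # Map each token to its index, falling back to the unknown-token index
    return [vocab.get(t, unk) for t in tokens]

print(encode(['the', 'dog'], predict_vocab))  # [1, 0]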
Example no. 4
    def __init__(self,
                 parameters_filepath=argument_default_value,
                 pretrained_model_folder=argument_default_value,
                 dataset_text_folder=argument_default_value,
                 character_embedding_dimension=argument_default_value,
                 character_lstm_hidden_state_dimension=argument_default_value,
                 check_for_digits_replaced_with_zeros=argument_default_value,
                 check_for_lowercase=argument_default_value,
                 debug=argument_default_value,
                 dropout_rate=argument_default_value,
                 experiment_name=argument_default_value,
                 freeze_token_embeddings=argument_default_value,
                 gradient_clipping_value=argument_default_value,
                 learning_rate=argument_default_value,
                 load_only_pretrained_token_embeddings=argument_default_value,
                 load_all_pretrained_token_embeddings=argument_default_value,
                 main_evaluation_mode=argument_default_value,
                 maximum_number_of_epochs=argument_default_value,
                 number_of_cpu_threads=argument_default_value,
                 number_of_gpus=argument_default_value,
                 optimizer=argument_default_value,
                 output_folder=argument_default_value,
                 patience=argument_default_value,
                 plot_format=argument_default_value,
                 reload_character_embeddings=argument_default_value,
                 reload_character_lstm=argument_default_value,
                 reload_crf=argument_default_value,
                 reload_feedforward=argument_default_value,
                 reload_token_embeddings=argument_default_value,
                 reload_token_lstm=argument_default_value,
                 remap_unknown_tokens_to_unk=argument_default_value,
                 spacylanguage=argument_default_value,
                 tagging_format=argument_default_value,
                 token_embedding_dimension=argument_default_value,
                 token_lstm_hidden_state_dimension=argument_default_value,
                 token_pretrained_embedding_filepath=argument_default_value,
                 tokenizer=argument_default_value,
                 train_model=argument_default_value,
                 use_character_lstm=argument_default_value,
                 use_crf=argument_default_value,
                 use_pretrained_model=argument_default_value,
                 verbose=argument_default_value,
                 argument_default_value=argument_default_value):

        # Parse arguments
        arguments = {k: str(v) for k, v in locals().items() if k != 'self'}

        # Initialize parameters
        parameters, conf_parameters = self._load_parameters(
            arguments['parameters_filepath'], arguments=arguments)
        dataset_filepaths, dataset_brat_folders = self._get_valid_dataset_filepaths(
            parameters)
        self._check_parameter_compatiblity(parameters, dataset_filepaths)

        # Load dataset
        dataset = ds.Dataset(verbose=parameters['verbose'],
                             debug=parameters['debug'])
        token_to_vector = dataset.load_dataset(dataset_filepaths, parameters)

        # Launch session
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
            inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
            device_count={
                'CPU': 1,
                'GPU': parameters['number_of_gpus']
            },
            allow_soft_placement=True,  # fall back to an existing, supported device if the specified one doesn't exist
            log_device_placement=False)
        sess = tf.Session(config=session_conf)

        with sess.as_default():
            # Create model and initialize or load pretrained model
            ### Instantiate the model
            model = EntityLSTM(dataset, parameters)
            ### Initialize the model and restore from pretrained model if needed
            sess.run(tf.global_variables_initializer())
            if not parameters['use_pretrained_model']:
                model.load_pretrained_token_embeddings(sess, dataset,
                                                       parameters,
                                                       token_to_vector)
                self.transition_params_trained = np.random.rand(
                    len(dataset.unique_labels) + 2,
                    len(dataset.unique_labels) + 2)
            else:
                self.transition_params_trained = model.restore_from_pretrained_model(
                    parameters, dataset, sess, token_to_vector=token_to_vector)
            del token_to_vector

        self.dataset = dataset
        self.dataset_brat_folders = dataset_brat_folders
        self.dataset_filepaths = dataset_filepaths
        self.model = model
        self.parameters = parameters
        self.conf_parameters = conf_parameters
        self.sess = sess
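In all three __init__ variants, transition_params_trained is an (L + 2) x (L + 2) matrix for L unique labels; the two extra rows and columns presumably correspond to the CRF's start and end states. When no pretrained model is restored, the matrix begins as random noise and only becomes meaningful once training replaces it with learned transition scores. A small sketch of the shape logic, with an illustrative label set:

import numpy as np

unique_labels = ['O', 'B-PER', 'I-PER']  # illustrative label set
n = len(unique_labels) + 2               # +2 assumed to cover CRF start/end states

# Random placeholder, as in the examples above; after training,
# transition[i, j] holds the learned score for moving from tag i to tag j.
transition_params = np.random.rand(n, n)
print(transition_params.shape)           # (5, 5)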