Example #1
    def train(self, texts: List[str], target: List[int]) -> None:

        # Use the public tf.keras API consistently (mixing tensorflow.python.keras
        # with tensorflow.keras can yield incompatible layer/model classes).
        from tensorflow.keras.models import Sequential  #type: ignore
        from tensorflow.keras.layers import Embedding, Dense, LSTM, GlobalMaxPool1D  #type: ignore
        from tensorflow.keras.optimizers import Adam  #type: ignore
        from tensorflow.keras.callbacks import History  #type: ignore

        if self.downsampling:
            texts, target = downsample(texts, target, self.downsampling_ratio)

        if self.verbose:
            print('1. Vectorizing texts')

        NUMBER_OF_FEATURES: int = 20000
        self.tokenizer = text.Tokenizer(num_words=NUMBER_OF_FEATURES)
        self.tokenizer.fit_on_texts(texts)
        vocabulary: Dict[str, int] = self.tokenizer.word_index

        if self._max_sequence_length == 0:
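            # Fall back to the length (in characters) of the longest training text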
            self._max_sequence_length = len(max(texts, key=len))

        vectorized_texts: array = self.vectorize_texts(texts)

        if self.embedding_location == '':
            if self.verbose:
                print('2. Skip (no embeddings)')
                print('3. Skip (no embeddings)')
        else:
            if self.verbose:
                print('2. Loading word embeddings')

            embedding_dictionary: Dict[
                str, List[float]] = load_embedding_dictionary(
                    self.embedding_location)
            # Embedding dimensionality: the number of values stored for the first word
            nr_of_embedding_features: int = len(
                next(iter(embedding_dictionary.values())))

            if self.verbose:
                print('3. Creating embedding matrix')

            embedding_matrix: array = create_embedding_matrix_for_vocabulary(
                embedding_dictionary, vocabulary)

        if self.verbose:
            print('4. Building up model')

        # Define a simple LSTM model with an (optionally pretrained) embedding layer
        model: Sequential = Sequential()

        if self.embedding_location == '':
            #Add an empty embedding layer if we have no pretrained embeddings
            EMPTY_EMBEDDING_LAYER_SIZE: int = 300
            model.add(
                Embedding(len(vocabulary) + 1, EMPTY_EMBEDDING_LAYER_SIZE))

        else:
            model.add(
                Embedding(input_dim=len(vocabulary) + 1,
                          output_dim=nr_of_embedding_features,
                          input_length=vectorized_texts.shape[1],
                          weights=[embedding_matrix],
                          trainable=False))

        model.add(LSTM(16, return_sequences=True))
        model.add(LSTM(16, return_sequences=True))
        model.add(LSTM(16, return_sequences=True))
        model.add(GlobalMaxPool1D())

        model.add(Dense(256))
        model.add(Dense(256))

        model.add(Dense(1, activation='sigmoid'))

        #Compile the model
        optimizer: Adam = Adam(learning_rate=self.learning_rate)  # 'lr' is deprecated in tf.keras
        model.compile(optimizer=optimizer,
                      loss='binary_crossentropy',
                      metrics=['acc'])

        if self.verbose:
            print('5. Training the model')

        history: History = model.fit(
            vectorized_texts,
            target,
            epochs=self.learning_epochs,
            #validation_data=(test_vectors, test_target),
            verbose=1,  # Logs once per epoch.
            batch_size=self.learning_batch_size)

        self.model = model
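
A minimal usage sketch for this example. The class name `LSTMTextClassifier` and its constructor arguments are assumptions for illustration; only the `train(texts, target)` signature and the attributes it reads (`downsampling`, `verbose`, `embedding_location`, `learning_rate`, `learning_epochs`, `learning_batch_size`) come from the code above.

# Hypothetical usage; the class name and constructor are assumptions,
# only train()'s signature comes from the example above.
texts = ['great product, works as advertised',
         'broke after one day, very disappointed',
         'decent value for the price']
target = [1, 0, 1]

classifier = LSTMTextClassifier(
    embedding_location='',     # '' -> train the embedding layer from scratch
    learning_rate=0.001,
    learning_epochs=5,
    learning_batch_size=32,
    downsampling=False,
    verbose=True)

classifier.train(texts, target)  # fits the tokenizer and LSTM, stores them on the instance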
Example #2
    def train(self, texts: List[str], target: List[int]) -> None:

        # Use the public tf.keras API consistently instead of the private
        # tensorflow.python.keras path.
        from tensorflow.keras.models import Model  #type: ignore
        from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Bidirectional, GlobalMaxPool1D, concatenate  #type: ignore
        from tensorflow.keras.optimizers import Adam  #type: ignore
        from tensorflow.keras.callbacks import History  #type: ignore

        if self.downsampling:
            texts, target = downsample(texts, target, self.downsampling_ratio)

        if self.verbose:
            print('1. Vectorizing texts')

        NUMBER_OF_FEATURES: int = 20000
        self.tokenizer = text.Tokenizer(num_words=NUMBER_OF_FEATURES)
        self.tokenizer.fit_on_texts(texts)
        vocabulary: Dict[str, int] = self.tokenizer.word_index

        if self._max_sequence_length == 0:
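            # Fall back to the length (in characters) of the longest training text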
            self._max_sequence_length = len(max(texts, key=len))

        vectorized_texts: array = self.vectorize_texts(texts)

        if self.include_casing_information:
            casing_information: array = self.texts_to_casing_information(texts)

        if self.embedding_location == '':
            if self.verbose:
                print('2. Skip (no embeddings)')
                print('3. Skip (no embeddings)')
        else:
            if self.verbose:
                print('2. Loading word embeddings')

            embedding_dictionary: Dict[
                str, List[float]] = load_embedding_dictionary(
                    self.embedding_location)
            # Embedding dimensionality: the number of values stored for the first word
            nr_of_embedding_features: int = len(
                next(iter(embedding_dictionary.values())))

            if self.verbose:
                print('3. Creating embedding matrix')

            embedding_matrix: array = create_embedding_matrix_for_vocabulary(
                embedding_dictionary, vocabulary)

        if self.verbose:
            print('4. Building up model')

        # Define a simple GRU model, optionally bidirectional, with an (optionally pretrained) embedding layer
        word_input: Input = Input(shape=(self._max_sequence_length, ))

        if self.embedding_location == '':
            #Add an empty embedding layer if we have no pretrained embeddings
            EMPTY_EMBEDDING_LAYER_SIZE: int = 300
            layers = Embedding(
                len(vocabulary) + 1, EMPTY_EMBEDDING_LAYER_SIZE)(word_input)

        else:
            layers = Embedding(input_dim=len(vocabulary) + 1,
                               output_dim=nr_of_embedding_features,
                               input_length=vectorized_texts.shape[1],
                               weights=[embedding_matrix],
                               trainable=False)(word_input)

        #Add a separate 'entrance' for the casing information
        if self.include_casing_information:
            word_model: Model = Model(inputs=word_input, outputs=layers)

            casing_input: Input = Input(shape=(self._max_sequence_length, 1))

            casing_model: Model = Model(inputs=casing_input,
                                        outputs=casing_input)
            layers = concatenate([word_model.output, casing_model.output])

        if self.bidirectional:
            layers = Bidirectional(
                GRU(16, activation='tanh', return_sequences=True))(layers)
            layers = Bidirectional(
                GRU(16, activation='tanh', return_sequences=True))(layers)
        else:
            layers = GRU(16, activation='tanh', return_sequences=True)(layers)
            layers = GRU(16, activation='tanh', return_sequences=True)(layers)

        layers = GlobalMaxPool1D()(layers)

        layers = Dense(256)(layers)
        layers = Dense(256)(layers)

        layers = Dense(1, activation='sigmoid')(layers)

        if self.include_casing_information:
            model: Model = Model([word_model.input, casing_model.input],
                                 layers)
        else:
            model: Model = Model(word_input, layers)

        #Compile the model
        optimizer: Adam = Adam(learning_rate=self.learning_rate)  # 'lr' is deprecated in tf.keras
        model.compile(optimizer=optimizer,
                      loss='binary_crossentropy',
                      metrics=['acc'])

        if self.verbose:
            print('5. Training the model')

        # Avoid shadowing the built-in input()
        if self.include_casing_information:
            training_input = [vectorized_texts, casing_information]
        else:
            training_input = vectorized_texts

        history: History = model.fit(
            training_input,
            target,
            epochs=self.learning_epochs,
            #validation_data=(test_vectors, test_target),
            verbose=1,  # Logs once per epoch.
            batch_size=self.learning_batch_size)

        self.model = model
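
Once `train` has run, the fitted `Tokenizer` and Keras model live on the instance, so the same preprocessing can be reused at prediction time. A sketch, assuming `classifier` is a trained instance of the (hypothetical) class containing this second example, constructed with `include_casing_information=True`; `vectorize_texts` and `texts_to_casing_information` are the helper methods referenced above, and `predict` is the standard Keras call.

# Inference sketch; `classifier` is assumed to be a trained instance of the class above.
new_texts = ['the package arrived damaged', 'excellent customer support']

vectorized = classifier.vectorize_texts(new_texts)              # same tokenizer/padding as training
casing = classifier.texts_to_casing_information(new_texts)      # second input branch for casing

probabilities = classifier.model.predict([vectorized, casing])  # sigmoid outputs in [0, 1]
labels = (probabilities > 0.5).astype(int).ravel()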