Example #1
# Imports this snippet relies on (scikit-learn and Keras):
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def prepareData(self, processed_features, labels):
    # Split into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(
        processed_features, labels, test_size=self.TEST_SIZE, random_state=1)
    # Tokenize and map each word to an integer index; only the top
    # NUM_WORDS words are kept when converting texts to sequences
    tokenizer = Tokenizer(num_words=self.NUM_WORDS)
    tokenizer.fit_on_texts(x_train)
    x_train = tokenizer.texts_to_sequences(x_train)
    x_test = tokenizer.texts_to_sequences(x_test)
    # Pad every sequence to the length of the longest training sequence
    self.INPUT_LENGTH = find_max_length(x_train)
    x_train = pad_sequences(x_train, maxlen=self.INPUT_LENGTH)
    x_test = pad_sequences(x_test, maxlen=self.INPUT_LENGTH)
    # Adding 1 because of the reserved 0 index; note that word_index holds
    # every word seen, not just the top NUM_WORDS, so this is an upper bound
    self.VOCAB_SIZE = len(tokenizer.word_index) + 1
    return x_train, x_test, y_train, y_test
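
All three examples call a find_max_length helper that is not shown. A minimal sketch, assuming it simply returns the length of the longest tokenized sequence (which is how it is used as maxlen above):

def find_max_length(sequences):
    # Length of the longest sequence in a list of token-id lists
    return max(len(seq) for seq in sequences)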
Example #2
# Imports this snippet relies on (scikit-learn and Keras):
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding, Dropout, Conv1D,
                                     GlobalMaxPooling1D, Dense)
from tensorflow.keras.callbacks import EarlyStopping

def ft_model(self, processed_features, labels, embedding_matrix,
             vocab_size, model):
    classes_num = self.dataset.getParameters()["classes_num"]
    X_train, X_test, y_train, y_test = train_test_split(
        processed_features,
        labels,
        test_size=self.TEST_SIZE,
        random_state=0)
    tokenizer = Tokenizer(num_words=self.NUM_WORDS)
    tokenizer.fit_on_texts(X_train)
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)
    length = find_max_length(X_train)
    # vocab_size = len(tokenizer.word_index) + 1
    X_train = pad_sequences(X_train, padding='post', maxlen=length)
    X_test = pad_sequences(X_test, padding='post', maxlen=length)
    # Note: the model argument is discarded; a fresh Sequential model is built
    model = Sequential()
    # Pretrained embedding weights, fine-tuned during training (trainable=True)
    model.add(
        Embedding(vocab_size,
                  64,
                  input_length=length,
                  weights=[embedding_matrix],
                  trainable=True))
    model.add(Dropout(0.5))
    # 1-D convolution over word windows, then max-pool over the sequence
    model.add(Conv1D(128, 5, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(classes_num, activation=self.ACTIVATION))
    model.compile(optimizer='adam',
                  loss=self.LOSSFUNC,
                  metrics=['accuracy'])
    # Stop early once validation loss stops improving for 4 epochs
    es_callback = EarlyStopping(monitor='val_loss', patience=4)
    model.summary()

    history = model.fit(X_train,
                        y_train,
                        epochs=self.EPOCHS,
                        verbose=1,
                        validation_data=(X_test, y_test),
                        batch_size=self.BATCH_SIZE,
                        callbacks=[es_callback])
    loss, accuracy = model.evaluate(X_train, y_train, verbose=1)
    print("Training Accuracy: {:.4f}".format(accuracy))
    loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
    print("Testing Accuracy:  {:.4f}".format(accuracy))
    return history
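
ft_model takes an embedding_matrix whose construction is not shown. A hypothetical sketch of how it could be built, assuming an embeddings dict mapping words to 64-dimensional pretrained vectors (e.g., loaded from GloVe) and the same tokenizer/vocab_size convention used above; build_embedding_matrix and embeddings are illustrative names, not part of the original code:

import numpy as np

def build_embedding_matrix(tokenizer, embeddings, dim=64):
    # One row per word index; index 0 is reserved, hence the +1
    vocab_size = len(tokenizer.word_index) + 1
    matrix = np.zeros((vocab_size, dim))
    for word, idx in tokenizer.word_index.items():
        vector = embeddings.get(word)
        if vector is not None:
            matrix[idx] = vector  # words without a pretrained vector stay zero
    return matrix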
Example #3
# Imports this snippet relies on (scikit-learn and Keras):
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.callbacks import EarlyStopping

def ol_model(self, processed_features, labels):
    classes_num = self.dataset.getParameters()["classes_num"]
    X_train, X_test, y_train, y_test = train_test_split(
        processed_features,
        labels,
        test_size=self.TEST_SIZE,
        random_state=0)
    tokenizer = Tokenizer(num_words=self.NUM_WORDS)
    tokenizer.fit_on_texts(X_train)
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)
    length = find_max_length(X_train)
    vocab_size = len(tokenizer.word_index) + 1
    X_train = pad_sequences(X_train, padding='post', maxlen=length)
    X_test = pad_sequences(X_test, padding='post', maxlen=length)
    # Simple baseline: embedding, flatten, linear classifier head
    model = Sequential()
    model.add(Embedding(vocab_size, 64, input_length=length))
    model.add(Flatten())
    # model.add(Dense(4, activation='relu'))
    # model.add(Dropout(0.4))
    model.add(Dense(classes_num, activation=self.ACTIVATION))
    model.compile(loss=self.LOSSFUNC,
                  optimizer='adam',
                  metrics=['accuracy'])
    # Stop early once validation loss stops improving for 3 epochs
    es_callback = EarlyStopping(monitor='val_loss', patience=3)
    model.summary()

    history = model.fit(X_train,
                        y_train,
                        validation_data=(X_test, y_test),
                        epochs=self.EPOCHS,
                        batch_size=self.BATCH_SIZE,
                        verbose=1,
                        callbacks=[es_callback])

    # Class-probability predictions (computed but not otherwise used here)
    predicted_sentiment = model.predict(X_test)
    scores = model.evaluate(X_test, y_test, verbose=1)
    print("Accuracy: %.2f%%" % (scores[1] * 100))
    return history
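
The Dense(classes_num) output layer implies that the labels passed in must match self.ACTIVATION and self.LOSSFUNC, which these snippets leave unspecified. A hypothetical sanity check, assuming ACTIVATION='softmax' and LOSSFUNC='categorical_crossentropy', in which case integer class ids would need one-hot encoding first:

import numpy as np
from tensorflow.keras.utils import to_categorical

integer_labels = np.array([0, 2, 1, 2])                 # example class ids
labels = to_categorical(integer_labels, num_classes=3)  # one-hot encode
print(labels.shape)                                     # (4, 3): one column per class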