Exemplo n.º 1
0
    def _prepare_data(self, data_path, test_size, random_state):
        """Load the ticket data, encode it and store the train/test splits.

        Rows whose ``issue`` is an empty string are dropped, the ticket text
        is cleaned with ``CleanText.prepare_text``, the issue labels are
        one-hot encoded with ``pd.get_dummies`` and the data is split with
        stratification on those labels. The sequence length and vocabulary
        size are computed from the training split only; the test split is
        encoded with ``test_data=True``.

        The results are stored on the instance as ``weights``, ``labels``,
        ``X_train``, ``X_test``, ``y_train``, ``y_test``, ``length`` and
        ``vocab_size``. Nothing is returned.

        Args:
            data_path (str): File path to the pickled DataFrame; it must
                provide the ``issue`` and ``ticket_text`` columns
            test_size (float): Passed to ``train_test_split`` as the size of
                the test set (e.g. ``0.2`` for 20% of the rows)
            random_state (int): Seed for randomly splitting data for train and test sets
        """
        ct = CleanText()

        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        df = pd.read_pickle(data_path)
        # Take an explicit copy of the filtered frame so that the column
        # assignment below writes to an independent DataFrame instead of a
        # slice of the loaded one (avoids pandas' SettingWithCopyWarning).
        df = df[df['issue'] != ''].copy()

        df['clean_text'] = df['ticket_text'].apply(ct.prepare_text)

        weights = self._weights_helper(df['issue'])

        # One indicator column per distinct issue.
        labels = pd.get_dummies(df['issue'])

        X_train, X_test, y_train, y_test = train_test_split(
            df['clean_text'],
            labels,
            test_size=test_size,
            random_state=random_state,
            stratify=labels)

        # Length and vocabulary size come from the training texts only, and
        # the training split is encoded before the test split.
        encoder = EncodeText()
        length = encoder.max_length(X_train)
        vocab_size = encoder.vocab_size(X_train)
        X_train = encoder.encode_text(X_train)
        X_test = encoder.encode_text(X_test, test_data=True)

        self.weights = weights
        self.labels = labels
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.length = length
        self.vocab_size = vocab_size
Exemplo n.º 2
0
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Embedding, MaxPooling1D, Conv1D, concatenate
from tensorflow.keras import metrics
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from numpy import array
import pandas as pd

# NOTE: unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle('./data.pkl')
ct = CleanText()
encoder = EncodeText()

# Keep only tickets whose issue matches one of the three target categories
# (the pattern is a regex alternation). na=False treats a missing issue as
# "no match" instead of putting NaN into the mask, which boolean indexing
# rejects. .copy() makes the filtered frame independent, so the column
# assignment below does not write to a slice of the loaded frame.
df = df[df.issue.str.contains(
    'cant_add_bank|refund_e_|transactions_not_importing', na=False)].copy()

df['clean_text'] = df['ticket_text'].apply(ct.prepare_text)

trainLines, trainLabels = df['clean_text'], df['issue']

# Integer-encode the issue names, then one-hot encode the integers.
lb = LabelEncoder()
transformed_labels = lb.fit_transform(trainLabels)
transformed_labels = to_categorical(transformed_labels)

X_train, X_test, y_train, y_test = train_test_split(
    trainLines,
    transformed_labels,
    test_size=.2,
    random_state=42,
    stratify=transformed_labels,
)


# Sequence length and vocabulary size are computed from the training split
# only; the test split is encoded with test_data=True.
length = encoder.max_length(X_train)
vocab_size = encoder.vocab_size(X_train)
X_train = encoder.encode_text(X_train)
X_test = encoder.encode_text(X_test, test_data=True)
Exemplo n.º 3
0
from tensorflow.keras.preprocessing.sequence import pad_sequences
from numpy import array
import numpy as np
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Embedding, Conv1D, MaxPooling1D, concatenate
from tensorflow.keras.layers import Bidirectional, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import AUC
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.utils import compute_class_weight

# Load the pickled ticket DataFrame from the working directory.
df = pd.read_pickle('./data.pkl')

clean = CleanText()

# Run every ticket through the text cleaner and keep the result alongside
# the raw text in a new column.
df['clean_text'] = df['ticket_text'].apply(clean.prepare_text)


def create_tokenizer(lines):
    """Return a new ``Tokenizer`` that has been fitted on ``lines``."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted


def max_length(lines):
    """Return the largest whitespace-separated token count among ``lines``.

    Raises ``ValueError`` when ``lines`` is empty.
    """
    token_counts = (len(text.split()) for text in lines)
    return max(token_counts)


def encode_text(tokenizer, lines, length):
    """Convert ``lines`` to integer sequences and pad them to ``length``.

    NOTE(review): no ``return`` statement is visible in this excerpt, so as
    shown the function returns ``None``; the padded result is presumably
    meant to be returned -- confirm against the full file.
    """
    # Map each text to a list of integer token ids using the given tokenizer.
    encoded = tokenizer.texts_to_sequences(lines)
    # padding='post' puts the padding values after each sequence.
    padded = pad_sequences(encoded, maxlen=length, padding='post')
Exemplo n.º 4
0
# Restore the encoder state from the two files saved on disk.
# NOTE(review): `encoder`, `X_test`, `y_test`, `load_model` and `np` are
# defined outside this excerpt; what the two load_* calls restore is
# presumably the fitted tokenizer and its settings -- confirm.
encoder.load_encoder('./encoder_files/encoder.pkl')
encoder.load_encoder_variables('./encoder_files/encoder_variables.json')

# Encode the held-out texts with the restored encoder.
X_test = encoder.encode_text(X_test, test_data=True)

# Load the three saved classification models.
cnn = load_model('./model_files/cnn_classification_model.h5')
rnn = load_model('./model_files/rnn_classification_model.h5')
hybrid = load_model('./model_files/hybrid_attention_classification_model.h5')

clean = CleanText()

# A single hand-written ticket used to try the models out.
test_text = ['''I cant get my morgan stanley account to connect to EveryDollar. If I cant get it to connect, 
    Im going to need to get a refund. Its the only value I get from the app''']

# Clean the sample and encode it with the same encoder call used for X_test.
tt = [clean.prepare_text(t) for t in test_text]
tt = encoder.encode_text(tt, test_data=True)

# For each model: predict on the sample, then look up the y_test column at
# the argmax index of the prediction. The lookups are bare expressions, so
# their values are only shown in an interactive session (e.g. a notebook
# cell); in a plain script they are discarded.
# NOTE(review): assumes y_test is a DataFrame whose columns are the label
# names -- confirm.
cnn_res = cnn.predict(tt)
y_test.columns[np.argmax(cnn_res)]

rnn_res = rnn.predict(tt)
y_test.columns[np.argmax(rnn_res)]

hybrid_res = hybrid.predict(tt)
y_test.columns[np.argmax(hybrid_res)]



# Predict on the whole test set (this overwrites the single-sample cnn_res
# above) and compare every output against 0.5.
cnn_res = cnn.predict(X_test)
cnn_res_t = (cnn_res > .5)