Example #1
    def tokenize(self, comments):
        # comments: an array/Series of raw text strings
        print('Comments shape is {}'.format(comments.shape))

        # Build the vocabulary, keeping the self.vocab_size most frequent words
        token = Tokenizer(num_words=self.vocab_size)
        token.fit_on_texts(comments)

        # Convert each comment into a list of integer word indices
        tokenized_comments = token.texts_to_sequences(comments)

        # Pad (or truncate) every sequence to self.max_sentence_len,
        # zero-padding at the end
        tokenized_comments = sequence.pad_sequences(
            sequences=tokenized_comments,
            maxlen=self.max_sentence_len,
            padding='post',
            value=0)

        return tokenized_comments
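The method above assumes that `Tokenizer` and `sequence` come from Keras' preprocessing utilities and that `vocab_size` and `max_sentence_len` are attributes of the instance. A minimal sketch of that surrounding setup; the class name and default values are assumptions, not taken from the original source:

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

class CommentPreprocessor:  # hypothetical wrapper class for the method above
    def __init__(self, vocab_size=20000, max_sentence_len=100):
        self.vocab_size = vocab_size
        self.max_sentence_len = max_sentence_len

    # ... the tokenize(self, comments) method shown above goes here ...

# Usage, once tokenize is attached to the class:
# padded = CommentPreprocessor().tokenize(np.array(['good', 'bad service']))
# padded.shape -> (2, 100)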
Example #2
    def prepare(self,
                X,
                Y,
                emb_model,
                seq_length=200,
                stratify='n',
                test_split=0.2,
                emb_dim=100):
        # Prepare data for use in a neural network:
        # convert text to integer sequences and build the word index that the
        # embedding matrix is created from.
        # Note: the tensorflow.contrib path only exists in TF 1.x; on TF 2.x the
        # equivalent import is tensorflow.keras.preprocessing.text.Tokenizer.
        from tensorflow.contrib.keras.api.keras.preprocessing.text import Tokenizer
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(X)
        X_seq = tokenizer.texts_to_sequences(X)
        word_idx = tokenizer.word_index
        from tensorflow.contrib.keras.api.keras.preprocessing import sequence
        X_seq = sequence.pad_sequences(X_seq, maxlen=seq_length)

        # One-hot encode the labels
        from sklearn.preprocessing import LabelBinarizer
        label_encoder = LabelBinarizer()
        Y_coded = label_encoder.fit_transform(Y)

        # Create the train/test split
        from sklearn.model_selection import train_test_split
        if stratify == 'y':
            x_train, x_test, y_train, y_test = train_test_split(
                X_seq,
                Y_coded,
                test_size=test_split,
                random_state=141289,
                stratify=Y_coded)
        else:
            x_train, x_test, y_train, y_test = train_test_split(
                X_seq, Y_coded, test_size=test_split, random_state=141289)

        # Build the embedding matrix from the passed embedding model: row i
        # holds the vector for the word whose tokenizer index is i.
        import numpy as np
        embedding_mat = np.zeros((len(word_idx) + 1, emb_dim))
        for w, i in word_idx.items():
            try:
                embedding_vector = emb_model[w]
                embedding_mat[i] = embedding_vector
            except KeyError:
                pass  # word w is not in the embedding model; its row stays all zeros
        return x_train, x_test, y_train, y_test, embedding_mat, tokenizer, label_encoder
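A sketch of how `prepare` might be called; the toy corpus and the plain dict standing in for `emb_model` are illustrative, relying only on the fact that `prepare` indexes the model by word (`emb_model[w]`) and falls back to a zero row on KeyError:

import numpy as np

X = ['good movie', 'bad movie', 'great plot', 'awful acting'] * 10
Y = ['pos', 'neg', 'pos', 'neg'] * 10

# Any mapping word -> 100-dim vector works; missing words keep a zero row.
toy_emb_model = {'good': np.random.rand(100), 'bad': np.random.rand(100)}

# Assuming `prep` is an instance of the class that defines prepare():
# x_tr, x_te, y_tr, y_te, emb_mat, tok, lbl_enc = prep.prepare(
#     X, Y, toy_emb_model, seq_length=50, stratify='y')
# emb_mat.shape -> (len(tok.word_index) + 1, 100)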
Example #3

                # This block sits inside a loop over label directories and the
                # files within them; the enclosing loop and the setup of
                # `texts`, `labels`, `label_id` and `fpath` are not shown in
                # this fragment.
                if sys.version_info < (3,):
                    f = open(fpath)
                else:
                    f = open(fpath, encoding='latin-1')
                t = f.read()
                i = t.find('\n\n')  # skip everything before the first blank line (file header)
                if i > 0:
                    t = t[i:]
                texts.append(t)
                f.close()
                labels.append(label_id)

print('Found %s texts.' % len(texts))

# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
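
The fragment stops right after shuffling the indices; a typical continuation (the `VALIDATION_SPLIT` constant and the `x_val`/`y_val` names are assumptions, not from the fragment) applies the permutation and slices off a validation set:

VALIDATION_SPLIT = 0.2  # assumed constant, defined elsewhere in the full script

data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
x_val = data[-num_validation_samples:]
y_val = labels[-num_validation_samples:]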
Example #4

# Fill in missing question text with a placeholder token
train_X = train_df["question_text"].fillna("_##_").values
test_X = test_df["question_text"].fillna("_##_").values

# Get the response
train_y = train_df['target'].values
train_y = train_y.reshape(len(train_y), 1)

# Build a mapping from words to their embedding vectors.
# get_coefs (defined elsewhere in the original script) is assumed to parse one
# line of the GloVe file into a (word, vector) pair.
embeddings_index = dict(
    get_coefs(*o.split(" ")) for o in open(FLAGS.glove_path, encoding='utf-8'))
vocab_size = len(embeddings_index.keys())
print('vocab size :', vocab_size)

tokenizer = Tokenizer(num_words=vocab_size, filters='', lower=False)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
# val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

train_X = pad_sequences(train_X, maxlen=FLAGS.max_sentence_len)
# val_X = pad_sequences(val_X, maxlen=FLAGS.max_sentence_len)
test_X = pad_sequences(test_X, maxlen=FLAGS.max_sentence_len)

all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]
del all_embs

word_index = tokenizer.word_index
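
The fragment ends after retrieving `word_index`; the usual next step, consistent with the `emb_mean`/`emb_std` statistics computed above, is to fill an embedding matrix aligned with the tokenizer's indices (the `nb_words` cap and the random initialisation for out-of-vocabulary words are assumptions about how the full script continues):

nb_words = min(vocab_size, len(word_index)) + 1
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector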
Example #5
                                   # Tail of a call whose opening is not shown
                                   # in this fragment; loss, train_op and
                                   # eval_metric_ops are the keyword arguments a
                                   # model_fn typically passes to
                                   # tf.estimator.EstimatorSpec.
                                   loss=loss,
                                   train_op=train_op,
                                   eval_metric_ops=eval_metric_ops)


text_col = 'consumer_complaint_narrative'
target_col = 'product'

data_set = pd.read_csv('dataset/complaints.csv')

features = data_set[text_col].values
targets = data_set[target_col].values

# to_categorical expects integer class ids in [0, 11), so the `product` column
# is assumed to already be integer-coded at this point.
targets = to_categorical(targets, 11)

# max_features (the vocabulary cap) is defined elsewhere in the original script.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(features)

features_seq = tokenizer.texts_to_sequences(features)
word_index = tokenizer.word_index

X_train, X_test, y_train, y_test = train_test_split(features_seq,
                                                    targets,
                                                    random_state=55,
                                                    test_size=0.20)

X_train = sequence.pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = sequence.pad_sequences(X_test, maxlen=maxlen, padding='post')

print('X - Train ', np.shape(X_train))
print('X - Test ', np.shape(X_test))
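
A minimal sketch of a classifier these padded sequences could feed into; the layer sizes and pooling choice are illustrative assumptions, and only the 11-way softmax follows from the `to_categorical(targets, 11)` call above:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

model = Sequential([
    Embedding(input_dim=max_features, output_dim=64),
    GlobalAveragePooling1D(),
    Dense(11, activation='softmax'),  # 11 product classes, matching to_categorical above
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)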