Example #1
class DLModel(BenchmarkedModel):
    def __init__(self):
        super().__init__()
        max_features = 1024

        model = Sequential()
        model.add(Embedding(max_features, output_dim=256))
        model.add(LSTM(128))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation="sigmoid"))

        model.compile(loss="binary_crossentropy",
                      optimizer="rmsprop",
                      metrics=["accuracy"])
        self.clf = model
        self.vectorizer = Tokenizer()

    def fit(self, data, labels):
        self.vectorizer.fit_on_texts(data)
        processed_data = self.vectorizer.texts_to_matrix(data, mode="count")

        self.clf.fit(processed_data, labels, batch_size=16, epochs=10)

    def predict(self, data):
        processed_data = self.vectorizer.texts_to_matrix(data, mode="count")
        return self.clf.predict(processed_data)
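
A minimal usage sketch for the class above, on toy data (this assumes the Keras imports the snippet relies on plus the BenchmarkedModel base class; note that texts_to_matrix(mode="count") yields count vectors while Embedding expects integer word indices, so this only exercises the API):

texts = ["spam spam spam", "hello old friend", "buy now cheap", "see you tomorrow"]
labels = [1, 0, 1, 0]

clf = DLModel()
clf.fit(texts, labels)
print(clf.predict(["cheap spam now"]))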
Example #2
def model(classes, num_classes, epochs, verbose):
    tokenizer = Tokenizer()
    train_documents = list()
    train_labels = list()
    test_documents = list()
    test_labels = list()
    for c in classes:
        train_documents += c[0]
        train_labels += c[1]
        test_documents += c[2]
        test_labels += c[3]

    train_labels = to_categorical(train_labels, num_classes)
    test_labels = to_categorical(test_labels, num_classes)

    tokenizer.fit_on_texts(train_documents)
    train = tokenizer.texts_to_matrix(train_documents, "tfidf")
    test = tokenizer.texts_to_matrix(test_documents, "tfidf")
    n_words = test.shape[1]

    model = Sequential()
    model.add(Dense(50, input_shape=(n_words,), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(train, train_labels, epochs=epochs, verbose=verbose)
    loss, acc = model.evaluate(test, test_labels, verbose=verbose)
    print(str(acc * 100))
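
A hypothetical call for the function above, assuming each entry of `classes` is a (train_docs, train_labels, test_docs, test_labels) tuple for one class (this structure is inferred from the unpacking loop, not stated in the original):

classes = [
    (["good film", "great plot"], [0, 0], ["nice movie"], [0]),
    (["bad film", "awful plot"], [1, 1], ["terrible movie"], [1]),
]
model(classes, num_classes=2, epochs=5, verbose=1)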
Example #3
def one_hot_word_with_keras(samples):
    tokenizer = Tokenizer(num_words=1000)
    tokenizer.fit_on_texts(samples)
    sequences = tokenizer.texts_to_sequences(samples)
    one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
    word_index = tokenizer.word_index
    print("Found {} unique tokens.".format(len(word_index)))
    return sequences, one_hot_results
Example #4
    def build_matrix(self):
        """
        Transform the data frame to a matrix for the CNN training.
        :return: matrix for: x_train, x_test, y_train, y_test
        """
        self.lb_make = LabelEncoder()
        self.lb_make.fit(self.Y_train)
        tokenizer = Tokenizer(num_words=2000)
        x_array_train = numpy.asarray(self.train['text'])
        x_array_test = numpy.asarray(self.test['text'])
        tokenizer.fit_on_texts(x_array_train)
        x_train_matrix = tokenizer.texts_to_matrix(x_array_train, mode='count')
        x_test_matrix = tokenizer.texts_to_matrix(x_array_test, mode='count')
        y_train_numbers = self.lb_make.transform(self.Y_train)
        y_test_numbers = self.lb_make.transform(self.Y_test)
        y_train_matrix = keras.utils.to_categorical(y_train_numbers, 3)
        y_test_matrix = keras.utils.to_categorical(y_test_numbers, 3)
        self.tokenizer = tokenizer
        return x_train_matrix, x_test_matrix, y_train_matrix, y_test_matrix
Example #5
def get_data_as_one_hot(num_words, data_location='data/data', labels_location='data/labels'):
    data, labels = read_data_and_labels(data_location, labels_location)

    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(data)
    one_hot = tokenizer.texts_to_matrix(data, mode='binary')
    encoded_labels = np.asarray(labels).astype('float32')

    print('Returning encoded text, labels and tokenizer')
    return one_hot, encoded_labels, tokenizer
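
A hypothetical call (the default data/labels paths are whatever read_data_and_labels expects; num_words is arbitrary here):

one_hot, encoded_labels, tokenizer = get_data_as_one_hot(num_words=10000)
print(one_hot.shape)   # (n_samples, 10000)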
Example #6
class ArticleThemeTokenizer:
    '''
    List of themes in the same order as in the tokenizer; this order also
    corresponds to the index of each theme in the prediction.
    '''
    orderedThemes: List[str]
    themes_count: int
    tokenizer: Tokenizer

    def __init__(self, articles: Articles):
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(articles.themes())

        self.one_hot_matrix = self.tokenizer.texts_to_matrix(articles.themes())

        # Remove the first column, which contains only 0s (index 0 is reserved).
        self.one_hot_matrix = np.delete(arr=self.one_hot_matrix, obj=0, axis=1)

        # Build the ordered list of themes, in tokenizer index order
        self.orderedThemes: List[str] = []

        # word_index starts at 1; index 0 is reserved.
        for i in range(1, len(self.tokenizer.word_index) + 1):
            self.orderedThemes.append(self.tokenizer.index_word[i])

        self.themes_count = len(self.tokenizer.word_index)

    def index_of_theme(self, theme: str):
        return self.tokenizer.word_index[theme] - 1

    def theme_at_index(self, index: int):
        return self.tokenizer.index_word[index + 1]

    def boolean_vector_to_themes(self,
                                 prediction_vector: List[bool]) -> List[str]:

        themes: List[str] = []

        for idx in range(0, len(prediction_vector)):
            if prediction_vector[idx]:
                # +1 because the first index (0) is reserved by default.
                themes.append(self.tokenizer.index_word[idx + 1])

        return themes

    def save(self, path: str):
        tokenizer_json = self.tokenizer.to_json()
        with io.open(path, 'w', encoding='utf-8') as f:
            f.write(tokenizer_json)
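
A standalone sketch (not from the project) of the index shift this class relies on: column 0 of texts_to_matrix is always zero because the Tokenizer reserves index 0, so after deleting it, column i corresponds to word_index i + 1:

from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

tok = Tokenizer()
tok.fit_on_texts(["sport", "politics", "sport culture"])

matrix = tok.texts_to_matrix(["sport culture"])
matrix = np.delete(matrix, 0, axis=1)   # drop the always-zero column 0

print(tok.index_word)   # e.g. {1: 'sport', 2: 'politics', 3: 'culture'}
print(matrix)           # column i now marks the theme with word_index i + 1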
Example #7
File: nlp.py  Project: fish895623/pandas1
# %%
from keras_preprocessing.text import Tokenizer


samples = [
    "the cat sat on the mat",
    "the dog ate my homework",
    "the the ate ate dog dog",
    "가 나 다",
]

tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)
one_hot_results = tokenizer.texts_to_matrix(samples, mode="binary")
word_index = tokenizer.word_index
print("%s 개의 토큰" % len(word_index))
word_index

# %%
sequences

# %%
one_hot_results.shape
# %%
one_hot_results[:10, :10]
# %%
samples = ["그 고양이는 맽 위에 앉았다", "그 개는 숙제를 먹었다"]
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)
            

print(tokenizer.word_index)


print()

# Split with the Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(samples)
token_seq = tokenizer.texts_to_sequences(samples)  # integer-index the texts
print(token_seq)

print()
token_mat = tokenizer.texts_to_matrix(samples, mode='binary')  # binary mode; mode can be 'binary', 'count', 'tfidf'
print(token_mat)
word_index = tokenizer.word_index
print(word_index)
print('found %s unique tokens' % len(word_index))  # found 9 unique tokens
print(tokenizer.word_counts)
print(tokenizer.document_count)
print(tokenizer.word_docs)


print()
docs = [
    '먼저 텍스트의 각 단어를 나누어 토큰화 한다.',
    '텍스트의 단어로 토큰화 해야 딥러닝에서 인식된다.',
    '토큰화 한 결과는 딥러닝에서 사용할수 있다'
    ]
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers  # needed for layers.Dense below

newsgroups_train = fetch_20newsgroups(subset='train', shuffle=True)
sentences = newsgroups_train.data
y = newsgroups_train.target

tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(sentences)
max_len = max([len(s.split()) for s in sentences])
vocab_len = len(tokenizer.word_index) + 1
sentences = tokenizer.texts_to_matrix(sentences)
# texts_to_matrix already yields fixed-width rows; pad_sequences only pads/truncates them to max_len
padded_docs = pad_sequences(sentences, maxlen=max_len)

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(padded_docs,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)
model = Sequential()

model.add(layers.Dense(300, input_dim=X_train.shape[1], activation='relu'))  # input_dim must match the width of the padded matrix
model.add(layers.Dense(20, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
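
The snippet above stops right after compile; a plausible training call (assumed, not part of the original file) would be:

model.fit(X_train, y_train, epochs=5, batch_size=128,
          validation_data=(X_test, y_test))
print(model.evaluate(X_test, y_test, verbose=0))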
Example #10
Character = data['Character'].values.tolist()
Dialogue = data['Dialogue'].values.tolist()

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(Dialogue)
for i in range(len(Dialogue)):
    if Character[i] in character_dialog:
        character_dialog[Character[i]].append(Dialogue[i])
    else:
        character_dialog[Character[i]] = []
        character_dialog[Character[i]].append(Dialogue[i])

for key, value in character_dialog.items():
    random.shuffle(value)  # the second argument to random.shuffle was removed in Python 3.11
    value = value[:6664]
    value = tokenizer.texts_to_matrix(value)
    dataX.extend(value)
    for i in range(len(value)):
        dataY.append(key)

c = list(zip(dataX, dataY))
random.shuffle(c)
dataX, dataY = zip(*c)

le = LabelBinarizer()
dataY = le.fit_transform(dataY)
dataX = numpy.array(dataX)
dataY = numpy.array(dataY)
model = Sequential()
model.add(Dense(64, input_shape=(len(dataX[0]),), activation='relu'))
model.add(Dense(32, activation='relu'))
Character = data['Character'].values.tolist()
Dialogue = data['Dialogue'].values.tolist()

for i in range(len(Dialogue)):
    if Character[i] in character_dialog:
        character_dialog[Character[i]].append(Dialogue[i])
    else:
        character_dialog[Character[i]] = []
        character_dialog[Character[i]].append(Dialogue[i])

for key, value in character_dialog.items():
    random.shuffle(value)
    value = value[:5000]
    for i in range(len(value)):
        if len(value[i]) > 10:
            dataX.append(tokenizer_name.texts_to_matrix(list(prepare_word(value[i], n=100))))
            dataY.append(key)

c = list(zip(dataX, dataY))
random.shuffle(c)
dataX, dataY = zip(*c)
le = LabelBinarizer()
dataY = le.fit_transform(dataY)
dataX = numpy.array(dataX)
dataY = numpy.array(dataY)
print(dataX.shape)
model = Sequential()
model.add(Conv1D(512, 3, activation='relu', input_shape=(len(dataX[0]), len(dataX[0][0]))))
model.add(Conv1D(512, 3, activation='relu'))
model.add(MaxPooling1D(3))
model.add(Conv1D(256, 3, activation='relu'))
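
The Conv1D stack above is cut off before any classification head; a plausible way to finish and train it (assumed, including the GlobalMaxPooling1D import, and assuming LabelBinarizer produced more than two columns) would be:

model.add(GlobalMaxPooling1D())
model.add(Dense(dataY.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(dataX, dataY, epochs=5, batch_size=32, validation_split=0.1)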
Example #12
# Create tokenizer
num_words_keep = 1000
tokenizer = Tokenizer(num_words=num_words_keep, filters='', lower=False, split=' ',
                      char_level=False, oov_token=None)

# Fit tokenizer on training data
x_train = train.iloc[:,0]

tokenizer.fit_on_texts(texts=x_train)
modes = ['binary', 'count', 'tfidf', 'freq']

# Training data
y_train = train.iloc[:,1]

x_train = tokenizer.texts_to_matrix(x_train, mode=modes[1])
y_train = utils.to_categorical(y_train, num_classes=2)

# Validation data
x_validate = validate.iloc[:,0]
x_validate = tokenizer.texts_to_matrix(x_validate, mode=modes[1])

y_validate = validate.iloc[:,1]
y_validate = utils.to_categorical(y_validate, num_classes=2)

# Test data
x_test = test.iloc[:,0]
x_test = tokenizer.texts_to_matrix(x_test, mode=modes[1])

y_test = test.iloc[:,1]
Example #13
twenty_train = fetch_20newsgroups(subset='all',
                                  shuffle=False,
                                  remove=('headers', 'footers'))
x, y = twenty_train.data, twenty_train.target

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=42,
                                                    shuffle=False)

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)

x_train = tokenizer.texts_to_matrix(x_train, mode='tfidf')
x_test = tokenizer.texts_to_matrix(x_test, mode='tfidf')
nb_classes = np.max(y_train) + 1
y_train = np_utils.to_categorical(y_train, nb_classes)
y_test = np_utils.to_categorical(y_test, nb_classes)
print(nb_classes)

# pre-processing: divide by max and subtract mean
input_dim = x_train.shape[1]

# convert list of labels to binary class matrix
reduce_percent = 0.8
loc = 'results/normal'
save_dir = gen_save_dir(loc)
# part_percentage = round(reduce_percent * len(X_train))
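
The preparation above never builds a model; a minimal MLP sketch for these tf-idf matrices (assumed, with the usual Sequential/Dense/Dropout imports) could look like:

model = Sequential()
model.add(Dense(512, input_shape=(input_dim,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=5, batch_size=32, validation_split=0.1)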