Example #1
# load pretrained GloVe word vectors for preprocessing
filename = './data/glove.6B.300d.txt'
print('Indexing Glove 6B 300D word vectors.')
embeddings_index = {}
with open(filename, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

print('Vectorizing input text')
# vectorize the input text (both negative and positive)
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(x_text)
sequences = tokenizer.texts_to_sequences(x_text)
word_index = tokenizer.word_index
print(len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(y))

# split the data into a training set and a validation set
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    labels,
                                                    test_size=VALIDATION_SPLIT)

print('Preparing embedding matrix')
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
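
# A minimal completion sketch, since the snippet is cut off here: fill an embedding
# matrix from embeddings_index. EMBEDDING_DIM = 300 is an assumption matching the
# glove.6B.300d vectors loaded above, not something stated in the snippet itself.
EMBEDDING_DIM = 300
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= num_words:
        continue  # ignore words beyond the MAX_NUM_WORDS cap
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector  # words missing from GloVe stay all-zeros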
Example #2
    line = line.replace(")", "")
    line = line.replace("/", "")
    line = line.replace("\\", "")
    line = line.replace("&", "")
    line = line.replace("#", "")
    line = re.sub(r'\d', '', line)  # raw string avoids the invalid-escape warning
    line = line.split(' ')
    line = [w for w in line if w not in stop_words]
    line = ' '.join(line)  # rebuild the cleaned sentence as a plain string
    strings.append(line)

#encode text as numbers
tok_Len = 100000  # max number of words for tokenizer
tokenizer = Tokenizer(num_words=tok_Len)
tokenizer.fit_on_texts(strings)
sequences = tokenizer.texts_to_sequences(strings)
term_Index = tokenizer.word_index
print('Number of Terms:', len(term_Index))

sen_Len = 162  # max length of each sentence, including padding
tok_Features = pad_sequences(sequences, padding='post', maxlen=sen_Len - 111)
print('Shape of tokenized features tensor:', tok_Features.shape)

indices = np.arange(tok_Features.shape[0])
np.random.shuffle(indices)
time_series = df['created_at_retweets']
time_series.reset_index(drop=True, inplace=True)
print(type(time_series))
time_series = time_series[indices]
tok_Features = tok_Features[indices]
Example #3
import seaborn as sns

# Loading the cleaned data csv file
path = "../input/cleaned-data-for-nlp-news-classification/cleaned_data.csv"
data = pd.read_csv(path)

# Tokenizing
vocab_size = 10000
embedding_dim = 32
max_length = 150
trunc_type = 'post'
oov_tok = '<OOV>'
padding_post = 'post'

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X,
                  padding=padding_post,
                  maxlen=max_length,
                  truncating=trunc_type)

# Building Model
keras.backend.clear_session()

model = tf.keras.Sequential([
    keras.layers.Embedding(vocab_size,
                           embedding_dim,
                           input_length=X.shape[1],
                           input_shape=[None]),
    keras.layers.Bidirectional(
Example #4
"""Open In Colab: https://colab.research.google.com/github/lmoroney/dlaicourse/blob/master/TensorFlow%20In%20Practice/Course%203%20-%20NLP/Course%203%20-%20Week%201%20-%20Lesson%201.ipynb

##### Copyright 2019 The TensorFlow Authors.
"""

#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tensorflow.keras.preprocessing.text import Tokenizer

sentences = ['i love my dog', 'I, love my cat', 'You love my dog!']

tokenizer = Tokenizer(num_words=100)  # keep only the 100 most frequent words when building sequences
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)
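
# A hedged continuation sketch (the lesson cell above stops at the word index):
# turning the same sentences into padded integer sequences.
from tensorflow.keras.preprocessing.sequence import pad_sequences

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')
print(sequences)
print(padded)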
Example #5
	training_labels.append(l.numpy())

for s, l in test:
	testing_sentences.append(s.numpy().decode('utf8'))
	testing_labels.append(l.numpy())


vocab_size = 10000
oov = 'OOV'
truncate = 'post'
maxlen = 300
embeding_dim = 10
output = []

tokenizer = Tokenizer(oov_token=oov, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
train_seq = tokenizer.texts_to_sequences(training_sentences)
train_pad = pad_sequences(train_seq, maxlen=maxlen, truncating=truncate)
test_seq = tokenizer.texts_to_sequences(testing_sentences)
test_pad = pad_sequences(test_seq, maxlen=maxlen, truncating=truncate)

def check_sentences(token_s):
	return ' '.join([tokenizer.index_word.get(i) for i in token_s])
	
print(np.array(train_seq[0]).reshape(-1,))
print(check_sentences(train_seq[0]))

model = tf.keras.Sequential([
	tf.keras.layers.Embedding(vocab_size, embeding_dim, input_length=maxlen),
	tf.keras.layers.Flatten(),
	tf.keras.layers.Dense(6, activation='relu'),
Example #6
train_dir = r"D:\data\csv_file\amazon_len_renew\amazon_1000_renew.csv"
test_dir = r"D:\data\csv_file\amazon_len_renew\amazon_test.csv"
glove_100_dir = "D:/data/glove.6B/glove.6B.100d.txt"

original_train_df = pd.read_csv(train_dir)
original_test_df = pd.read_csv(test_dir)

original_test_df, original_val_df = train_test_split(original_test_df,
                                                     test_size=0.4,
                                                     random_state=0)

x = original_train_df['review']
y = original_train_df['label']

t = Tokenizer()
t.fit_on_texts(x)

vocab_size = len(t.word_index) + 1
sequences = t.texts_to_sequences(x)


def max_text():
    # track the longest tokenized review across the whole corpus
    max_length = len(sequences[0])
    for i in range(1, len(sequences)):
        if len(sequences[i]) > max_length:
            max_length = len(sequences[i])
    return max_length


text_num = max_text()
maxlen = text_num
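
# A follow-on sketch, assuming the usual next step of padding every review to the
# longest length computed above; the pad_sequences import is presumed to sit in the
# truncated part of this snippet.
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_x = pad_sequences(sequences, maxlen=maxlen, padding='post')
print(padded_x.shape)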
Example #7
def get_training_data(intents_file_path):

    try:
        with open('token_data.pickle', 'rb') as f:
            training_set, training_labels, word_count, max_sequence_len = pickle.load(
                f)

        return training_set, training_labels, word_count, max_sequence_len

    except Exception:  # cached token data missing or unreadable -> rebuild it

        with open(intents_file_path) as f:
            data = json.load(f)

        #Parse the data
        sentences = []
        labels = []
        sentences_y = []

        for intent in data["intents"]:
            for pattern in intent["patterns"]:
                sentences.append(pattern)
                sentences_y.append(
                    intent["tag"]
                )  #So we have a tag associated with the pattern

            if intent["tag"] not in labels:
                labels.append(intent["tag"])

        #Create tokenizer
        tokenizer = Tokenizer(oov_token="<OOV>")
        tokenizer.fit_on_texts(sentences)
        word_count = len(tokenizer.word_index) + 1

        #print(tokenizer.word_index)

        #Tokenize and pad
        sequences = tokenizer.texts_to_sequences(sentences)
        max_sequence_len = max([len(x) for x in sequences])
        padded_sequences = pad_sequences(sequences,
                                         maxlen=max_sequence_len,
                                         truncating='post')

        #Label tokenizer
        label_tokenizer = Tokenizer()
        label_tokenizer.fit_on_texts(labels)

        tok_labels = np.array(label_tokenizer.texts_to_sequences(sentences_y))

        training_labels = np.zeros((padded_sequences.shape[0], len(labels)))

        for i in range(padded_sequences.shape[0]):
            training_labels[i][tok_labels[i] - 1] = 1

        with open('token_data.pickle', 'wb') as f:
            pickle.dump((padded_sequences, training_labels, word_count,
                         max_sequence_len), f)

        with open('raw_data.pickle', 'wb') as f:
            pickle.dump((sentences, labels, sentences_y), f)

        return padded_sequences, training_labels, word_count, max_sequence_len
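
# Hypothetical usage sketch; 'intents.json' is an assumed path, not taken from the
# original project.
training_set, training_labels, word_count, max_sequence_len = get_training_data('intents.json')
print(training_set.shape, training_labels.shape, word_count, max_sequence_len)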
Example #8
    texts_false = f.readlines()
    texts_false[0] = texts_false[0].replace('\ufeff', '')

texts = texts_true + texts_false
count_true = len(texts_true)
count_false = len(texts_false)
total_lines = count_true + count_false
print(count_true, count_false, total_lines)

maxWordsCount = 100000
tokenizer = Tokenizer(num_words=maxWordsCount,
                      filters='!–"—#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r«»',
                      lower=True,
                      split=' ',
                      char_level=False)
tokenizer.fit_on_texts(texts)

max_text_len = 30
data = tokenizer.texts_to_sequences(texts)
data_pad = pad_sequences(data, maxlen=max_text_len)
print(data_pad.shape)

X = data_pad
Y = np.array([[1, 0]] * count_true + [[0, 1]] * count_false)
print(X.shape, Y.shape)

indices = np.random.choice(X.shape[0], size=X.shape[0], replace=False)
X = X[indices]
Y = Y[indices]

with open('dataset/test.txt', 'r', encoding='utf-8') as f:
Example #9
for tagged_sentence in tagged_sentences:  # iterate over the 14,041 tagged sentence samples one at a time
    sentence, tag_info = zip(
        *tagged_sentence)  # unzip each sample: words go to sentence, NER tags go to tag_info
    sentences.append(list(sentence))  # keep only the words of each sample
    ner_tags.append(list(tag_info))  # keep only the NER tags of each sample

print('Maximum sample length : %d' % max(len(l) for l in sentences))
print('Average sample length : %f' % (sum(map(len, sentences)) / len(sentences)))
plt.hist([len(s) for s in sentences], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

max_words = 10000
src_tokenizer = Tokenizer(num_words=max_words)  # cap the vocabulary so it matches vocab_size below
src_tokenizer.fit_on_texts(sentences)

tar_tokenizer = Tokenizer()
tar_tokenizer.fit_on_texts(ner_tags)

vocab_size = max_words
tag_size = len(tar_tokenizer.word_index) + 1
print('Vocabulary size : {}'.format(vocab_size))
print('NER tag set size : {}'.format(tag_size))

X_train = src_tokenizer.texts_to_sequences(sentences)
y_train = tar_tokenizer.texts_to_sequences(ner_tags)

index_to_word = src_tokenizer.index_word
index_to_ner = tar_tokenizer.index_word
Example #10
                                inplace=True)
        df[colname] = df[colname].str.lower()  #convert to lower case
    return df


df_train = prepro('training')
df_train['Class'].value_counts().plot(kind="bar", rot=0)

df_train['TEXT'] = df_train['TEXT'].apply(lambda x: ' '.join([
    lemmatizer.lemmatize(word) for word in set(x.split())
    if word not in estopwords
]))

MAX_VOCABS = 5000
tokenizer = Tokenizer(num_words=MAX_VOCABS)
tokenizer.fit_on_texts(pd.concat([df_train['TEXT']]))
x_train = tokenizer.texts_to_sequences(df_train['TEXT'])

MAX_LEN = max([len(i) for i in x_train])
vocab_size = MAX_VOCABS + 1
x_train = pad_sequences(x_train,
                        padding='post',
                        maxlen=MAX_LEN,
                        value=vocab_size)

# convert integers to dummy variables (i.e. one hot encoded)
y_train = pd.get_dummies(df_train['Class']).values
dummy_columns = pd.get_dummies(df_train['Class']).columns
dummy_columns = dummy_columns.tolist()
x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                  y_train,
Example #11
class BugModelClient:

    oov_token = '<OOV>'
    vocab_size = None
    embedding_dim = 50
    training_portion = 0.8
    max_length = 100
    num_epochs = 8
    dropout = 0.2

    class_weight = {0 : 1 , 1 : 2}

    data_path = 'datasets/training_dataset_pairs.csv'

    tokenizer_path = 'models/tokenizer.pickle'

    custom_glove_path = 'datasets/custom_glove_50d.txt'

    data = None

    training_size = None

    word_index = None

    tokenizer = None

    embedding_matrix = None

    bug_model = BugModel()

    def init_data(self, data_count):
        self.data = pd.read_csv(self.data_path, sep=',')
        self.data = self.data[:data_count]
        print(len(self.data.index))
        self.data['clean_description_1'] = self.clean_descriptions(self.data['description_1'])
        self.data['clean_description_2'] = self.clean_descriptions(self.data['description_2'])
        self.training_size = int(len(self.data.index) * self.training_portion)

        X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(self.data['clean_description_1'], self.data['clean_description_2'], self.data['duplicates'], test_size=0.2)

        self.tokenizer = Tokenizer(oov_token=self.oov_token)
        self.tokenizer.fit_on_texts(X1_train)
        self.tokenizer.fit_on_texts(X2_train)
        self.word_index = self.tokenizer.word_index
        print(len(self.word_index))
        self.vocab_size = len(self.word_index) + 1

        X1_train = np.array(text_to_padded(X1_train, self.tokenizer, self.max_length))
        X1_test = np.array(text_to_padded(X1_test, self.tokenizer, self.max_length))
        X2_train = np.array(text_to_padded(X2_train, self.tokenizer, self.max_length))
        X2_test = np.array(text_to_padded(X2_test, self.tokenizer, self.max_length))
        
        self.X1_train = X1_train
        self.X1_test = X1_test
        self.X2_train = X2_train
        self.X2_test = X2_test
        self.y_train = y_train
        self.y_test = y_test

    def prepare_embedding(self):
        embeddings_index = dict()
        f = open(self.custom_glove_path, encoding='utf8')
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()
        print('Loaded %s word vectors.' % len(embeddings_index))
        embeddings_matrix = np.zeros((self.vocab_size, self.embedding_dim))
        for word, i in self.tokenizer.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embeddings_matrix[i] = embedding_vector
        self.embedding_matrix = embeddings_matrix

    def save_tokenizer(self):
        with open(self.tokenizer_path, 'wb') as handle:
            pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def load_tokenizer(self):
        with open(self.tokenizer_path, 'rb') as handle:
            self.tokenizer = pickle.load(handle)
        self.word_index = self.tokenizer.word_index
        self.vocab_size = len(self.word_index) + 1
        print('Loaded tokenizer with %s words.' % self.vocab_size)

    def clean_descriptions(self, descriptions):
        clean_descriptions = descriptions.apply(lambda x: clean_text(x))
        return clean_descriptions

    def train_model(self):
        self.bug_model.construct_model(self.vocab_size, self.embedding_dim, self.max_length, self.dropout, self.embedding_matrix)
        self.bug_model.fit_model([self.X1_train, self.X2_train], self.y_train, [self.X1_test, self.X2_test], self.y_test, self.num_epochs, self.class_weight)

    def plot_graphs(self):
        self.bug_model.plot_graphs()

    def save_model(self):
        self.bug_model.save_model()
        self.save_tokenizer()

    def load_model(self):
        self.bug_model.load_model()
        self.load_tokenizer()

    def predict(self, descriptions1, descriptions2):
        descriptions1 = np.array(text_to_padded(self.clean_descriptions(descriptions1), self.tokenizer, self.max_length))
        descriptions2 = np.array(text_to_padded(self.clean_descriptions(descriptions2), self.tokenizer, self.max_length))
        return self.bug_model.predict([descriptions1, descriptions2])

    def validate_predict_top_k(self, descriptions, labels, master_labels, all_descriptions, all_labels, all_master_labels, k):
        descriptions = np.array(text_to_padded(self.clean_descriptions(descriptions), self.tokenizer, self.max_length))
        all_descriptions = np.array(text_to_padded(self.clean_descriptions(all_descriptions), self.tokenizer, self.max_length))
        print(labels)
        all_predictions = []
        for index, description in enumerate(descriptions):
            print(index)
            description_repeated = np.full((len(all_descriptions), self.max_length), description)
            predictions = self.bug_model.predict([description_repeated, all_descriptions])
            predictions = np.array([prediction[0] for prediction in predictions])
            predictions_top_indices = (-predictions).argsort()
            prediction_summary = []
            top_k_master_labels = []
            for pred_index in predictions_top_indices:
                if len(top_k_master_labels) >= k:
                    break
                if all_master_labels[pred_index] not in top_k_master_labels:
                    top_k_master_labels.append(all_master_labels[pred_index])
                    prediction_summary.append({'case_id': all_labels[pred_index], 'master_id': all_master_labels[pred_index], 'probability': predictions[pred_index]})
            did_predict = master_labels[index] in top_k_master_labels if master_labels[index] != labels[index] else master_labels[index] not in top_k_master_labels
            for n, pred_index in enumerate(predictions_top_indices):
                if all_master_labels[pred_index] == master_labels[index]:
                    print('Correct target for {} with id {} in position {} with probability of {}'.format(labels[index], all_labels[pred_index], n, predictions[pred_index]))
            all_predictions.append({
                'case_id': labels[index],
                'master_id': master_labels[index],
                'predictions': prediction_summary,
                'correct': did_predict
            })
        return {'predictions': all_predictions, 'recall': len([prediction for prediction in all_predictions if prediction['correct'] == True]) / len(all_predictions)}
Example #12
# X_train = train_data['tokenized'].values
#import h5py
loaded_model = load_model('snhs_rnn.h5')
# with h5py.File('snhs_rnn_dr02.h5', mode='r') as f:
#   # instantiate model
#   model_config = f.attrs.get('model_config')
#   print(model_config)

data_excel = pd.read_excel('201126_이노션샘플데이터.xlsx')
print(data_excel.head(5))
#data_excel.columns = ['Text']
#print(data_excel.head(5))
print(len(data_excel['reviews']))
x_save_load = np.load('X_save2.npy', allow_pickle=True)
tokenizer = Tokenizer(vocab_size, oov_token="OOV")
tokenizer.fit_on_texts(x_save_load)

f = open('result.txt', 'w', encoding='utf8')
f2 = open('result3.txt', 'w', encoding='utf8')
f.write('감성(test)\n')
f2.write('감성(test)\n')


def sentiment_predict(new_sentence):

    new_sentence = preprocword(new_sentence)
    #print(new_sentence)
    #print([new_sentence])
    encoded = tokenizer.texts_to_sequences([new_sentence])  # integer encoding
    #print(encoded)
    pad_new = pad_sequences(encoded, maxlen=max_len, truncating='post')  # padding
Example #13
train_y = xdf['label']                                                 #Creating the labels array
sentences = xdf['tweet']

from tensorflow.keras.preprocessing.text import Tokenizer              #To tokenize the text
from tensorflow.keras.preprocessing.sequence import pad_sequences      #To pad uneven length sequences

#Hyper parameters
num_words = 10000
pad_type = 'post'
oov_token = "<OOV>"                                                    #Out of vocabulary token
embedding_dim = 16
max_length = 250


tokenizer = Tokenizer(num_words = num_words, oov_token = oov_token)    #Creating a tokenizer object
tokenizer.fit_on_texts(sentences)                                      #Using the method fit_on_texts() to tokenize the text feature
word_index = tokenizer.word_index                                      #Storing the word index

sequences = tokenizer.texts_to_sequences(sentences)                    #Using the texts_to_sequences() method to convert the tokens into sequences
padded_sequences = pad_sequences(sequences, maxlen = max_length, padding = 'post', truncating = 'post')
                                                                       #Padding the sequences. Padding type is "post".


pdf = pd.DataFrame(padded_sequences)

from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(pdf, train_y, test_size = 0.1, random_state = 10, shuffle = True) 
                                                                       #Splitting the dataset into train_data and test_data
    
Example #14
#
# print('sysevr embedding')
# print(sysevr_emb_dict['const'])

# Tokenize corpus
ast_tokenizer = Tokenizer()

# cg_tokenizer = Tokenizer()
# bcg_tokenizer = Tokenizer()
# fcg_tokenizer = Tokenizer()

# sysevr_tokenizer = Tokenizer()

print("tokenizing asts")
# Fit tokenizers
ast_tokenizer.fit_on_texts(ast_data)

# print("tokenizing cgs")
# bcg_tokenizer.fit_on_texts(back_slices_data)
# fcg_tokenizer.fit_on_texts(forward_slices_data)
#
# print("tokenizing sysevr")
# sysevr_tokenizer.fit_on_texts(sysevr_data)

#################################################
print("creating ast sequence")
ast_sequences = my_preprocessing.ast_sequence(ast_data)

# print("creating cg sequence")
# bcg_sequences = bcg_tokenizer.texts_to_sequences(back_slices_data)
# fcg_sequences = fcg_tokenizer.texts_to_sequences(forward_slices_data)
Example #15
class PreProcessor:
    def __init__(self,
                 sentences,
                 ner_tags,
                 val_sentences,
                 val_ner_tags,
                 oov_token: str = "<OOV>"):
        self._sentences = sentences
        self._ner_tags = ner_tags
        self._val_sentences = val_sentences
        self._val_ner_tags = val_ner_tags

        self._input_sequences = None
        self._label_sequences = None
        self._val_input_sequences = None
        self._val_label_sequences = None

        self._tokenizer = Tokenizer(oov_token=oov_token)
        self._label_tokenizer = Tokenizer()

        self._max_sequence_length = None

    def pre_process_data(self):
        self._pre_process_train_input_sequences()
        self._pre_process_train_label_sequences()

        self._pre_process_validation_input_sequences()
        self._pre_process_validation_label_sequences()

    def _pre_process_train_input_sequences(self):
        self._input_sequences = self._pre_process_input_sequence(
            self._sentences)

    def _pre_process_train_label_sequences(self):
        self._label_sequences = self._pre_process_label_sequences(
            self._ner_tags)

    def _pre_process_validation_input_sequences(self):
        self._val_input_sequences = self._pre_process_input_sequence(
            self._val_sentences, validation=True)

    def _pre_process_validation_label_sequences(self):
        self._val_label_sequences = self._pre_process_label_sequences(
            self._val_ner_tags, validation=True)

    def _pre_process_input_sequence(self,
                                    sentences: list,
                                    validation=False) -> np.ndarray:
        if not validation:
            self._tokenizer.fit_on_texts(sentences)
            self._compute_max_sequence_len(sentences)
        input_sequences = self._tokenizer.texts_to_sequences(sentences)
        padded_input_sequences = pad_sequences(
            input_sequences, padding='post', maxlen=self._max_sequence_length)
        return np.array(padded_input_sequences)

    def _pre_process_label_sequences(self,
                                     ner_tags: list,
                                     validation: bool = False) -> np.ndarray:
        if not validation:
            self._label_tokenizer.fit_on_texts(ner_tags)
        label_sequences = self._label_tokenizer.texts_to_sequences(ner_tags)
        padded_label_sequences = pad_sequences(
            label_sequences, padding='post', maxlen=self._max_sequence_length)
        return np.array(padded_label_sequences)

    def _compute_max_sequence_len(self, input_sequences: list):
        seq_lengths = [len(seq) for seq in input_sequences]
        self._max_sequence_length = max(seq_lengths)

    @property
    def input_sequences(self):
        return self._input_sequences

    @property
    def label_sequences(self):
        return self._label_sequences

    @property
    def val_input_sequences(self):
        return self._val_input_sequences

    @property
    def val_label_sequences(self):
        return self._val_label_sequences

    @property
    def num_unique_word_tokens(self):
        return len(self._tokenizer.word_index) + 1  # +1 for padding token

    @property
    def num_unique_label_tokens(self):
        return len(self._label_tokenizer.word_index)

    @property
    def max_sequence_length(self):
        return self._max_sequence_length

    @property
    def label_index_to_words_dict(self):
        return self._label_tokenizer.index_word
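
# A hedged usage sketch: the token/tag lists below are toy placeholders, not the
# project's real dataset; Tokenizer, pad_sequences and numpy come from the snippet's
# (truncated) imports.
pre = PreProcessor(sentences=[["john", "lives", "in", "london"]],
                   ner_tags=[["B-PER", "O", "O", "B-LOC"]],
                   val_sentences=[["mary", "visited", "paris"]],
                   val_ner_tags=[["B-PER", "O", "B-LOC"]])
pre.pre_process_data()
print(pre.input_sequences.shape, pre.label_sequences.shape)
print(pre.num_unique_word_tokens, pre.num_unique_label_tokens)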
Example #16
File: NAML_sub.py  Project: nnnyt/MIND
def preprocess_news_data(filename):
    print('Preprocessing news...')
    all_texts = []
    category_map = {}
    titles = []
    abstracts = []
    categories = []

    with open(filename, 'r') as f:
        for l in f:
            id, category, subcategory, title, abstract, url, entity = l.strip(
                '\n').split('\t')
            title = title.lower()
            # print(word_tokenize(title))
            abstract = abstract.lower()
            # all_texts.append(word_tokenize(title))
            # all_texts.append(word_tokenize(abstract))
            all_texts.append(title + ". " + abstract)
            # map every subcategory to a number
            if subcategory not in category_map:
                category_map[subcategory] = len(category_map)
            titles.append(title)
            abstracts.append(abstract)
            categories.append(subcategory)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_texts)
    word_index = tokenizer.word_index  # a dict: word_index[word]=index
    print('Found %s unique tokens.' % len(word_index))
    print('Found %s unique categories.' % len(category_map))
    # print(word_index)

    # title
    news_title = np.zeros((len(titles), MAX_TITLE_LENGTH), dtype='int32')
    for i, title in enumerate(titles):
        wordTokens = text_to_word_sequence(title)
        k = 0
        for _, word in enumerate(wordTokens):
            if k < MAX_TITLE_LENGTH:
                news_title[i, k] = word_index[word]
                k = k + 1

    # abstract
    news_abstract = np.zeros((len(abstracts), MAX_ABSTRACT_LENGTH),
                             dtype='int32')
    for i, abstract in enumerate(abstracts):
        wordTokens = text_to_word_sequence(abstract)
        k = 0
        for _, word in enumerate(wordTokens):
            if k < MAX_ABSTRACT_LENGTH:
                news_abstract[i, k] = word_index[word]
                k = k + 1
    # category & subcategory
    news_category = []
    k = 0
    for category in categories:
        news_category.append(category_map[category])
        k += 1
    news_category = to_categorical(np.asarray(news_category))

    return word_index, category_map, news_category, news_abstract, news_title
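
# Hypothetical call sketch: the MIND news file path below is an assumption, and
# MAX_TITLE_LENGTH / MAX_ABSTRACT_LENGTH are expected to be defined elsewhere in NAML_sub.py.
word_index, category_map, news_category, news_abstract, news_title = preprocess_news_data(
    'MINDsmall_train/news.tsv')
print(len(word_index), len(category_map), news_category.shape)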
Example #17
    for row in reader:
        labels.append(row[0])
        sentence = row[1]
        for word in stopwords:
            token = " " + word + " "
            sentence = sentence.replace(token, " ")
            sentence = sentence.replace("  ", " ")
        sentences.append(sentence)


print(len(sentences))
print(sentences[0])


tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(len(word_index))
# Expected output
# 29714




sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')
print(padded[0])
print(padded.shape)


Example #18
class GatedRecurrentUnit(object):
    def __init__(self,
                 max_tokens=5000,
                 embedding_size=8,
                 num_words=10000,
                 model=None,
                 tokenizer=None):
        self.x_train = []
        self.y_train = []
        self.x_train_tokens = []
        self.list_label = []
        self.model = model
        self.num_words = num_words
        self.max_tokens = max_tokens
        self.embedding_size = embedding_size
        self.tokenizer = tokenizer
        self.summary = True
        self.verbose = 1
        self.epoch = 5
        self.validation_split = 0.1

    def one_hot_encoder(self, y):
        self.list_label = list(set(y))
        label = np.zeros([len(y), len(self.list_label)])
        for i in range(len(y)):
            label[i][self.list_label.index(y[i])] = 1
        return label

    def model_gru(self):

        self.model = Sequential()
        self.model.add(
            Embedding(input_dim=self.num_words,
                      output_dim=self.embedding_size,
                      input_length=self.max_tokens,
                      name='Embedding_Layer'))
        self.model.add(GRU(units=4))
        self.model.add(
            Dense(len(self.list_label),
                  activation='softmax',
                  name='Output_layer'))
        self.model.compile(loss='binary_crossentropy',
                           optimizer=Adam(lr=0.001),
                           metrics=['accuracy'])
        if self.summary:
            print(self.model.summary())

        self.model.fit(self.x_train_tokens,
                       self.y_train,
                       epochs=self.epoch,
                       validation_split=self.validation_split,
                       verbose=self.verbose)

    def text_to_seq(self, x):
        temp = self.tokenizer.texts_to_sequences([x])
        return pad_sequences(temp,
                             maxlen=self.max_tokens,
                             padding='pre',
                             truncating='pre')

    def fit(self, x_train, y_train, epoch=5, validation_split=0.1, verbose=1):

        self.x_train = x_train
        self.y_train = y_train

        self.epoch = epoch
        self.validation_split = validation_split
        self.verbose = verbose

        self.tokenizer = Tokenizer(num_words=self.num_words)
        self.tokenizer.fit_on_texts(self.x_train)

        self.x_train_tokens = self.tokenizer.texts_to_sequences(self.x_train)
        self.x_train_tokens = pad_sequences(self.x_train_tokens,
                                            maxlen=self.max_tokens,
                                            padding='pre',
                                            truncating='pre')

        # if type(self.y_train[0]) == str or type(self.y_train[0]) == int:
        self.y_train = np.array(self.one_hot_encoder(y_train))

        self.model_gru()

    def save_model(self, filename='model'):
        model_json = self.model.to_json()
        with open(
                os.path.join(os.getcwd(),
                             'Model/Output_model/{}.json'.format(filename)),
                'w') as json_file:
            json_file.write(model_json)
        self.model.save_weights(
            os.path.join(os.getcwd(),
                         'Model/Output_model/{}.h5'.format(filename)))
        joblib.dump(
            self.tokenizer,
            os.path.join(
                os.getcwd(),
                'Model/Output_model/{}_tokenizer.joblib'.format(filename)))
        joblib.dump(
            self.list_label,
            os.path.join(
                os.getcwd(),
                'Model/Output_model/{}_list_label.joblib'.format(filename)))
        joblib.dump(
            self.max_tokens,
            os.path.join(
                os.getcwd(),
                'Model/Output_model/{}_max_tokens.joblib'.format(filename)))

    def load_model(self, filename='model'):
        json_file = open(
            os.path.join(os.getcwd(),
                         'Model/Output_model/{}.json'.format(filename)), 'r')
        loaded_model_json = json_file.read()
        json_file.close()
        self.model = model_from_json(loaded_model_json)
        self.model.load_weights(
            os.path.join(os.getcwd(),
                         'Model/Output_model/{}.h5'.format(filename)))
        self.tokenizer = joblib.load(
            os.path.join(
                os.getcwd(),
                'Model/Output_model/{}_tokenizer.joblib'.format(filename)))
        self.list_label = joblib.load(
            os.path.join(
                os.getcwd(),
                'Model/Output_model/{}_list_label.joblib'.format(filename)))
        self.max_tokens = joblib.load(
            os.path.join(
                os.getcwd(),
                'Model/Output_model/{}_max_tokens.joblib'.format(filename)))
        self.model.compile(loss='binary_crossentropy',
                           optimizer=Adam(lr=0.001),
                           metrics=['accuracy'])

    def predict(self, x):
        return self.model.predict(x)

    def predict_classes(self, x):
        return self.model.predict_classes(x)

    def score(self, x_test, y_test):
        return self.model.evaluate(x_test, y_test)
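
# A hedged usage sketch with toy data (the texts and labels are placeholders);
# Sequential, Embedding, GRU, Dense, Adam, Tokenizer, pad_sequences and numpy are
# assumed to come from the snippet's truncated imports.
gru = GatedRecurrentUnit(max_tokens=20, num_words=1000)
texts = ['great product', 'terrible service', 'works fine', 'would not buy again']
labels = ['pos', 'neg', 'pos', 'neg']
gru.fit(texts, labels, epoch=1, validation_split=0.5)
print(gru.predict(gru.text_to_seq('really great product')))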
Example #19
def create_tokenizer():
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(MidiParser.vocabulary().keys())
    return tokenizer
Example #20
class Decomp_tokenizer(object):

	def __init__(self):
		self.Tokenizer_args =Tokenizer(num_words=100,split=' ')
		self.Tokenizer_instr =Tokenizer(num_words=100,split='\n')
		self.label_mapping = {}

	def fit_instr(self, data):
		#fdata = random.sample(data, 50)
		self.Tokenizer_instr.fit_on_texts(data)

	def fit_args(self, data):
		data = self.no_newline(data)
		self.Tokenizer_args.fit_on_texts(data)


	def fit_label(self, data):
		mapping = {}
		unique = list(set(data))
		for index, i in enumerate(unique):
			a = [0]*len(unique)
			a[index] = 1
			mapping[i] = a
		print("SAVE THIS PLEASE")
		print(mapping)
		print("_________________________________________")
		self.label_mapping = mapping

	def tokenizeLabels(self, all):
		outs = []
		for i in all:
			outs.append(self.label_mapping.get(i))
		return outs

	def tokenize_labels_to_file(self, filename, data):
		outs = []
		ddict = {}
		for i in data:
			outs.append([self.label_mapping.get(i)])
		ddict["data"] = outs

		with open(filename+".json", "+w") as f:
			f.write(json.dumps(ddict))

	def read_data_from_file(self,filename):
		with open(filename+".json", 'r') as f:
			jdata = json.load(f)
			keys = jdata['data']
		print("[*Reading in {}.json*]".format(filename))
		return keys

	def tokenize_data_to_file(self, filename, data):
		tokens = self.Tokenizer.texts_to_sequences(tqdm(data))
		data = {
		}
		print("writing to dict")
		data["data"] = tokens
		print("writing dataset into file: {}.json".format(filename))
		with open(filename+".json", '+w') as f:
			f.write(json.dumps(data))

	def tokenize_args(self,data):
		data = self.no_newline(data)
		tokens = self.Tokenizer_args.texts_to_sequences(tqdm(data))
		return tokens

	def tokenize_instr(self,data):
		tokens = self.Tokenizer_instr.texts_to_sequences(tqdm(data))
		return tokens

	def no_newline(self,data):
		fin = []
		for i in data:
			fin.append(i.replace("\n", " "))
		return fin

	def __str__(self):
		return str(self.Tokenizer_args.word_index) + "\n" + str(self.Tokenizer_instr.word_index)

	def save_status(self,):
		with open("Tokenizer_args_data.json", '+w') as f:
			f.write(json.dumps(self.Tokenizer_args.to_json()))
		with open("Tokenizer_instr_data.json",'+w') as f:
			f.write(json.dumps(self.Tokenizer_instr.to_json()))
		with open("Label_data.json", '+w') as f:
			f.write(json.dumps(self.label_mapping))

	def recover_status(self):
		with open("Label_data.json", 'r') as f:
			self.label_mapping = json.load(f)
		with open("Tokenizer_args_data.json", 'r') as f:
			self.Tokenizer_args = keras.preprocessing.text.tokenizer_from_json(json.load(f))
		with open("Tokenizer_instr_data.json", 'r') as f:
			self.Tokenizer_instr = keras.preprocessing.text.tokenizer_from_json(json.load(f))
Example #21
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

complaints = dataset["phrase"].values
labels = dataset[["prompt"]].values

X_train, X_test, y_train, y_test = train_test_split(complaints,
                                                    labels,
                                                    test_size=0.20,
                                                    random_state=42)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
dict(list(word_index.items())[0:10])

train_seq = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_seq,
                             maxlen=max_length,
                             padding=padding_type,
                             truncating=trunc_type)

validation_seq = tokenizer.texts_to_sequences(X_test)
validation_padded = pad_sequences(validation_seq,
                                  maxlen=max_length,
                                  padding=padding_type,
                                  truncating=trunc_type)
Example #22
        x.append(text)
        y.append(int(record['is_sarcastic']))
    return x, y


# DO NOT CHANGE THIS CODE OR THE TESTS MAY NOT WORK
vocab_size = 1000
max_length = 120
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
training_size = 20000
sentences = []
labels = []
x, y = fetch_data()
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(x)
x_seq = tokenizer.texts_to_sequences(x)
x_seq = pad_sequences(x_seq,
                      maxlen=max_length,
                      padding='post',
                      truncating='post')

random_sel = random.sample(range(200, len(x_seq)), 2000)
x_test = x_seq[random_sel]
y_test = np.array(y)[random_sel]

model = models.load_model('sarcasm.h5')
print('testing model..')
print(model.evaluate(x_test, y_test))
Example #23
class DDTokenizer:
    def __init__(self, num_words, oov_token='<UNK>'):
        self.tokenizer = Tokenizer(num_words=num_words,
                                   oov_token=oov_token,
                                   filters='!"#$%&*+,-./:;<>?\\^_`{|}~\t\n',
                                   char_level=True,
                                   lower=False)
        self.has_trained = False

        self.pad_type = 'post'
        self.trunc_type = 'post'

        # The encoded data
        self.word_index = {}

    def fit(self, train_data):
        # Get max training sequence length
        print("Training Tokenizer...")
        self.tokenizer.fit_on_texts(train_data)
        self.has_trained = True
        print("Done training...")

        # Get our training data word index
        self.word_index = self.tokenizer.word_index

    def encode(self,
               data,
               use_padding=True,
               padding_size=None,
               normalize=False):
        # Encode training data sentences into sequences
        train_sequences = self.tokenizer.texts_to_sequences(data)

        # Get max training sequence length if there is none passed
        if padding_size is None:
            maxlen = max([len(x) for x in train_sequences])
        else:
            maxlen = padding_size

        if use_padding:
            train_sequences = pad_sequences(train_sequences,
                                            padding=self.pad_type,
                                            truncating=self.trunc_type,
                                            maxlen=maxlen)

        if normalize:
            train_sequences = np.multiply(1 / len(self.tokenizer.word_index),
                                          train_sequences)

        return train_sequences

    def pad(self, data, padding_size=None):
        # Get max training sequence length if there is none passed
        if padding_size is None:
            padding_size = max([len(x) for x in data])

        padded_sequence = pad_sequences(data,
                                        padding=self.pad_type,
                                        truncating=self.trunc_type,
                                        maxlen=padding_size)

        return padded_sequence

    def decode(self, array):
        assert self.has_trained, "Train this tokenizer before decoding a string."
        return self.tokenizer.sequences_to_texts(array)

    def test(self, string):
        encoded = list(self.encode(string)[0])
        decoded = self.decode(self.encode(string))

        print("\nEncoding:")
        print("{original} -> {encoded}".format(original=string[0],
                                               encoded=encoded))
        print("\nDecoding:")
        print("{original} -> {encoded}".format(original=encoded,
                                               encoded=decoded[0].replace(
                                                   " ", "")))

    def get_info(self):
        return self.tokenizer.index_word
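
# A hedged usage sketch with toy strings (character-level encoding as configured above).
dd = DDTokenizer(num_words=100)
dd.fit(['hello world', 'hold the door'])
dd.test(['hello'])
print(dd.get_info())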
Example #24
from tensorflow.keras.utils import to_categorical
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
"""
Module that pads sentences so they all have the same length.
Batch processing in the network requires sentences of a uniform size, hence the padding.
"""
from tensorflow.keras.preprocessing.text import Tokenizer  # converts text to integer ids

text = """경마장에 있는 말이 뛰고 있다\n 그의 말이 법이다\n 가는 말이 고와야 오는 말이 곱다"""

t = Tokenizer()
t.fit_on_texts([text])  # build the corpus, word_to_id and id_to_word mappings from the text

sequences = list()
for line in text.split('\n'):
    encoded = t.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        sequence = encoded[:i + 1]
        sequences.append(sequence)

max_len = max(len(l) for l in sequences)

sequences = np.array(pad_sequences(sequences, maxlen=max_len, padding='pre'))
X = sequences[:, :-1]
y = sequences[:, -1]
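
# A hedged next step, assuming the usual language-model setup: one-hot encode the
# target word ids with the to_categorical imported at the top of this example.
vocab_size = len(t.word_index) + 1
y = to_categorical(y, num_classes=vocab_size)
print(X.shape, y.shape)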
Example #25
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku
import numpy as np
import matplotlib.pyplot as plt

tokenizer = Tokenizer()
data = open('poems.txt', encoding="utf8").read()
corpus = data.lower().split("\n")
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)
# pad sequences
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
# create predictors and label
predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
label = ku.to_categorical(label, num_classes=total_words)

model = Sequential()
Example #26
    return " ".join(tokens)

# preprocess the training data
dataset.text = dataset.text.apply(lambda x: preprocess(x))

"""Split the training data"""

train, test = train_test_split(dataset, test_size=1-TRAIN_SIZE, random_state=42)
print("TRAIN size:", len(train))
print("TEST size:", len(test))

documents = [_text.split() for _text in train.text] #list, 1280000*50

vocab_size = 400000
tk = Tokenizer(num_words=vocab_size)
tk.fit_on_texts(train.text) 
x_train = tk.texts_to_sequences(train.text)
x_test = tk.texts_to_sequences(test.text)

labels = train.target.unique().tolist() #POSITIVE NEUTRAL NEGATIVE
labels.append(NEUTRAL)
print(labels)

encoder = LabelEncoder() # automatically maps the label strings to integers
encoder.fit(train.target.tolist())

y_train = encoder.transform(train.target.tolist())
y_test = encoder.transform(test.target.tolist())

y_train = y_train.reshape(-1,1) # reshape into a single column
y_test = y_test.reshape(-1,1)
Example #27
all = x + xtest
len(all)

y = df.iloc[:,1].values
y.shape


ytest =df.iloc[:,1].values
ytest.shape

ytrain = pd.DataFrame(y)



tokenizer = Tokenizer(num_words=10000, split=' ')
tokenizer.fit_on_texts(all)
Xs = tokenizer.texts_to_sequences(all)
Xs = pad_sequences(Xs,maxlen=20,padding='post',truncating='post')
Ys = pd.get_dummies(ytrain).values





Xtrain = Xs[:16000]
Xtest = Xs[16000:]



Ytrain = Ys
Ytest = pd.get_dummies(ytest).values
Example #28
def train():

    #Pre-processing

    df = pd.read_csv(DATA_PATH)
    df.drop('Unnamed: 0', axis=1, inplace=True)
    df.dropna(axis=0, inplace=True)
    # df.head(10)

    df['Party'] = pd.Categorical(df.Party)
    df['Party'] = pd.get_dummies(df['Party'], drop_first=True)

    X = df['Tweet']
    Y = df['Party']

    x, y = pre_process(X, Y=Y)

    X_train, X_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=420)

    #Creating a tokenizer
    t = Tokenizer(oov_token="UNK")
    t.fit_on_texts(x)

    vocab_size = len(t.word_index) + 1
    print("Vocabulary size: {}".format(vocab_size))
    max_sent_len = len(max(x, key=len).split()) + 1
    print("Maximum sentence length: {}".format(max_sent_len))
    emb_dim = 75
    print("Embedding Dimensions: {}".format(emb_dim))

    padded_X_train = encode_and_pad(X_train, t, max_sent_len)

    x_train, x_val, Y_train, y_val = train_test_split(padded_X_train,
                                                      y_train,
                                                      test_size=0.1,
                                                      random_state=420)

    checkpoint_path = TRAINING_PATH
    cp_callback = ModelCheckpoint(checkpoint_path,
                                  verbose=1,
                                  save_weights_only=True,
                                  period=20)

    model = Sequential([
        Embedding(input_dim=vocab_size,
                  output_dim=emb_dim,
                  input_length=max_sent_len,
                  trainable=True),
        Bidirectional(CuDNNLSTM(64, return_sequences=False)),
        Dropout(0.5),
        Dense(2, activation='softmax')
    ])

    model.save_weights(checkpoint_path.format(epoch=0))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(x_train,
              Y_train,
              epochs=200,
              batch_size=300,
              shuffle=True,
              callbacks=[cp_callback],
              validation_data=(x_val, y_val))
    model.save_weights(WEIGHTS_PATH)

    dic = {
        'Vocab Size': vocab_size,
        'Max Sent Length': max_sent_len,
        'Emb Dim': emb_dim
    }
    meta_df = pd.DataFrame(dic, index=['Model 1'])
    meta_df.to_csv(META_PATH)

    with open(TOKENIZER_PATH, 'wb') as handle:
        pickle.dump(t, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("Training done.")
Example #29
if "all_descriptions_test.zarr" in os.listdir('data') and "embedding_test_128.zarr" in os.listdir('data'):
    with tf.device('/cpu:0'):
        embedding_test = da.from_zarr("data/embedding_test_128.zarr")
        desc_test = da.from_zarr("data/all_descriptions_test.zarr")
else: 
    print("Embedding and Descriptions dask array haven't been saved for testing, please run text_preprocessing.py")
    exit()

allText = []
for desc_array in [desc_train.compute(), desc_test.compute(), desc_validation.compute()]:
    for descs in desc_array:
        for desc in descs:
            allText.append(desc)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(allText)
with open('tokenizer.pickle', 'wb') as handle:
    pkl.dump(tokenizer, handle, protocol=pkl.HIGHEST_PROTOCOL)
print("Saved tokenizer file....")
vocab_size = len(tokenizer.word_index) + 1

max_length = max(len(d.split()) for d in allText)

def count_length(tokenizer, descriptions):
    try:
        with tf.device('/gpu:0'):
            Y = 0
            for i in tqdm(range(len(descriptions))):
                for j in range(len(descriptions[i])):
                    seq = tokenizer.texts_to_sequences([(descriptions[i][j].compute()).tolist()])[0]
                    Y+=len(seq)-1
Example #30
print("size of dictionary: {0}".format(len(embdict)))
del (words)

# In[ ]:

MAX_NB_WORDS = 50000
MAX_SEQUENCE_LENGTH = 250
EMBEDDING_DIM = 300

# In[ ]:

tokenizer = Tokenizer(num_words=MAX_NB_WORDS,
                      filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
                      lower=True)
tokenizer.fit_on_texts(texts_train + texts_test + texts_ev)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# In[ ]:

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))  # +1: Keras word indices start at 1

for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = embdict[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    except KeyError:
        pass  # word not present in the pretrained embedding dictionary
del (embdict)
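
# A hedged sketch of wiring the matrix into a frozen Embedding layer; the Constant
# initializer import and the trainable=False choice are assumptions, not taken from
# the original notebook.
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)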