def test_tokenizer_serde_fitting():
    sample_texts = [
        'There was a time that the pieces fit, but I watched them fall away',
        'Mildewed and smoldering, strangled by our coveting',
        'I\'ve done the math enough to know the dangers of our second guessing'
    ]
    tokenizer = text.Tokenizer(num_words=100)
    tokenizer.fit_on_texts(sample_texts)

    seq_generator = tokenizer.texts_to_sequences_generator(sample_texts)
    sequences = [seq for seq in seq_generator]
    tokenizer.fit_on_sequences(sequences)

    tokenizer_json = tokenizer.to_json()
    recovered = text.tokenizer_from_json(tokenizer_json)

    assert tokenizer.char_level == recovered.char_level
    assert tokenizer.document_count == recovered.document_count
    assert tokenizer.filters == recovered.filters
    assert tokenizer.lower == recovered.lower
    assert tokenizer.num_words == recovered.num_words
    assert tokenizer.oov_token == recovered.oov_token

    assert tokenizer.word_docs == recovered.word_docs
    assert tokenizer.word_counts == recovered.word_counts
    assert tokenizer.word_index == recovered.word_index
    assert tokenizer.index_word == recovered.index_word
    assert tokenizer.index_docs == recovered.index_docs
def get_tokenizer(self):
    '''
    Build a tokenizer from the stop-word-filtered corpus and cache it as JSON,
    or load a previously saved tokenizer from ``self.tokenizer_path``.

    :return: tokenizer
    '''
    if not os.path.exists(self.tokenizer_path):
        self.remove_stop_word_list = self.get_remove_stop_word()
        tokenizer = text.Tokenizer(num_words=self.embedded_matrix_size,
                                   lower=False,
                                   char_level=False)
        tokenizer.fit_on_texts(self.remove_stop_word_list)
        tokenizer_json = tokenizer.to_json()
        with open(self.tokenizer_path, "w") as f:
            f.write(tokenizer_json)
        print("save tokenizer_json success as '{}'".format(
            self.tokenizer_path))
        return tokenizer
    else:
        print("When switching datasets, delete {} manually; a new tokenizer "
              "will be generated automatically on the next run.".format(
                  self.tokenizer_path))
        with open(self.tokenizer_path, "r") as f:
            tokenizer_json = f.read()
        tokenizer = text.tokenizer_from_json(tokenizer_json)
        print("load tokenizer_json success as '{}'".format(
            self.tokenizer_path))
        return tokenizer
def __init__(self,
             train_data_path='data/token_train.tsv',
             max_document_length=100,
             vocabulary_size=5000,
             embedding_size=300,
             dropout_keep_prob=0.5,
             lr=1e-4,
             batch_size=50,
             num_epochs=5,
             dev_size=0.2):
    self.dev_size = dev_size
    self.num_epochs = num_epochs
    self.batch_size = batch_size
    self.lr = lr
    self.dropout_keep_prob = dropout_keep_prob
    self.embedding_size = embedding_size
    self.vocabulary_size = vocabulary_size
    self.max_document_length = max_document_length
    self.train_data_path = train_data_path
    self.pickle_path = os.path.splitext(train_data_path)[0] + '.model.p'

    self._tokenizer = text.Tokenizer(num_words=vocabulary_size,
                                     char_level=False,
                                     filters='')

    raw_x, raw_y = self._aggregate_raw_data_from_dir(self.train_data_path)
    self._tokenizer.fit_on_texts(raw_x)
    self.x_train, self.y_train = self._prepare_from_raw_data(raw_x, raw_y)

    self.model = self._create_model()
def test_tokenizer_unicode():
    sample_texts = [u'ali veli kırk dokuz elli',
                    u'ali veli kırk dokuz elli veli kırk dokuz']
    tokenizer = text.Tokenizer(num_words=5)
    tokenizer.fit_on_texts(sample_texts)

    assert len(tokenizer.word_counts) == 5
def test_tokenizer_oov_flag():
    """Test of Out of Vocabulary (OOV) flag in text.Tokenizer
    """
    x_train = ['This text has only known words']
    x_test = ['This text has some unknown words']  # 2 OOVs: some, unknown

    # Default, without OOV flag
    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    assert len(x_test_seq[0]) == 4  # discards 2 OOVs

    # With OOV feature
    tokenizer = text.Tokenizer(oov_token='<unk>')
    tokenizer.fit_on_texts(x_train)
    x_test_seq = tokenizer.texts_to_sequences(x_test)
    assert len(x_test_seq[0]) == 6  # OOVs marked in place
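# A minimal sketch (not one of the original tests, assuming the
# keras_preprocessing Tokenizer API): when oov_token is set it is added to
# word_index, so every unknown word in a new text maps to that single index.
def oov_index_sketch():
    x_train = ['This text has only known words']
    x_test = ['This text has some unknown words']
    tokenizer = text.Tokenizer(oov_token='<unk>')
    tokenizer.fit_on_texts(x_train)
    oov_index = tokenizer.word_index['<unk>']
    seq = tokenizer.texts_to_sequences(x_test)[0]
    # 'some' and 'unknown' were never seen during fitting, so exactly two
    # positions in the sequence hold oov_index
    assert seq.count(oov_index) == 2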
def test_tokenizer_serde_no_fitting():
    tokenizer = text.Tokenizer(num_words=100)

    tokenizer_json = tokenizer.to_json()
    recovered = text.tokenizer_from_json(tokenizer_json)

    assert tokenizer.get_config() == recovered.get_config()

    assert tokenizer.word_docs == recovered.word_docs
    assert tokenizer.word_counts == recovered.word_counts
    assert tokenizer.word_index == recovered.word_index
    assert tokenizer.index_word == recovered.index_word
    assert tokenizer.index_docs == recovered.index_docs
def test_tokenizer_lower_flag():
    """Tests for `lower` flag in text.Tokenizer
    """
    # word level tokenizer with sentences as texts
    word_tokenizer = text.Tokenizer(lower=True)
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dog and Cat living Together.']
    word_tokenizer.fit_on_texts(texts)
    expected_word_counts = OrderedDict([('the', 4), ('cat', 2), ('sat', 2),
                                        ('on', 2), ('mat', 1), ('dog', 2),
                                        ('log', 1), ('and', 1), ('living', 1),
                                        ('together', 1)])
    assert word_tokenizer.word_counts == expected_word_counts

    # word level tokenizer with word_sequences as texts
    word_tokenizer = text.Tokenizer(lower=True)
    word_sequences = [['The', 'cat', 'is', 'sitting'],
                      ['The', 'dog', 'is', 'standing']]
    word_tokenizer.fit_on_texts(word_sequences)
    expected_word_counts = OrderedDict([('the', 2), ('cat', 1), ('is', 2),
                                        ('sitting', 1), ('dog', 1),
                                        ('standing', 1)])
    assert word_tokenizer.word_counts == expected_word_counts

    # char level tokenizer with sentences as texts
    char_tokenizer = text.Tokenizer(lower=True, char_level=True)
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dog and Cat living Together.']
    char_tokenizer.fit_on_texts(texts)
    expected_word_counts = OrderedDict([('t', 11), ('h', 5), ('e', 6),
                                        (' ', 14), ('c', 2), ('a', 6),
                                        ('s', 2), ('o', 6), ('n', 4),
                                        ('m', 1), ('.', 3), ('d', 3),
                                        ('g', 5), ('l', 2), ('i', 2),
                                        ('v', 1), ('r', 1)])
    assert char_tokenizer.word_counts == expected_word_counts
def test_sequential_fit():
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dogs and cats living together.']
    word_sequences = [['The', 'cat', 'is', 'sitting'],
                      ['The', 'dog', 'is', 'standing']]

    tokenizer = text.Tokenizer()
    tokenizer.fit_on_texts(texts)
    tokenizer.fit_on_texts(word_sequences)

    assert tokenizer.document_count == 5

    tokenizer.texts_to_matrix(texts)
    tokenizer.texts_to_matrix(word_sequences)
def test_tokenizer():
    sample_texts = ['The cat sat on the mat.',
                    'The dog sat on the log.',
                    'Dogs and cats living together.']
    tokenizer = text.Tokenizer(num_words=10)
    tokenizer.fit_on_texts(sample_texts)

    sequences = []
    for seq in tokenizer.texts_to_sequences_generator(sample_texts):
        sequences.append(seq)
    assert np.max(np.max(sequences)) < 10
    assert np.min(np.min(sequences)) == 1

    tokenizer.fit_on_sequences(sequences)

    for mode in ['binary', 'count', 'tfidf', 'freq']:
        tokenizer.texts_to_matrix(sample_texts, mode)
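# An illustrative sketch (not one of the original tests): for every mode,
# texts_to_matrix is expected to return one row per input text and num_words
# columns, so with three texts and num_words=10 the shape should be (3, 10).
def texts_to_matrix_shape_sketch():
    sample_texts = ['The cat sat on the mat.',
                    'The dog sat on the log.',
                    'Dogs and cats living together.']
    tokenizer = text.Tokenizer(num_words=10)
    tokenizer.fit_on_texts(sample_texts)
    matrix = tokenizer.texts_to_matrix(sample_texts, mode='binary')
    assert matrix.shape == (3, 10)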
def get_word2vec(dataframe):
    # Build padded index sequences ("word vectors") from the title column.
    # Segment each title with jieba
    wordList = dataframe["title"].apply(lambda x: list(jieba.cut(x)))
    #words_dict = []
    texts = []
    stoplist = []  # stop-word list (empty here)
    # Drop stop words
    for words in wordList:
        line = [word for word in words if word not in stoplist]
        #words_dict.extend([word for word in line])
        texts.append(line)
    # Length of the longest segmented title
    maxlen = 0
    for line in texts:
        if maxlen < len(line):
            maxlen = len(line)
    max_words = 50000
    # Use the Keras Tokenizer to map words to integer indices,
    # then pad the sequences to equal length
    tokenizer = text.Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)
    data_w = tokenizer.texts_to_sequences(texts)
    word2vec = sequence.pad_sequences(data_w, maxlen=maxlen)
    return word2vec
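# Note (assumption: keras_preprocessing defaults): pad_sequences uses
# padding='pre', so shorter titles are left-padded with zeros up to maxlen,
# e.g. sequence.pad_sequences([[3, 7], [5]], maxlen=3) gives
# [[0, 3, 7], [0, 0, 5]].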
def __init__(self):
    """Constructor for the selector."""
    logging.info('Initializing tokenizer..')
    words, embedding_matrix = self._build_embedding_matrix()
    self.tokenizer = text.Tokenizer(num_words=len(words), lower=False)
    # Tokenizer treats each item in a nested list as a token.
    self.tokenizer.fit_on_texts([[word] for word in words])
    # Prepend a row of zeros to the embedding matrix; it will be used for
    # out-of-vocabulary words.
    embedding_matrix = np.concatenate(
        [np.zeros((1, embedding_matrix.shape[1])), embedding_matrix])

    assert len(words) == len(self.tokenizer.word_index), (
        'embeddings_matrix and tokenizer.word_index do not have the same size:'
        ' {} and {}, respectively'.format(len(words),
                                          len(self.tokenizer.word_index)))
    assert all([
        self.tokenizer.word_index[word] == i + 1
        for i, word in enumerate(words)
    ]), ('embeddings_matrix and tokenizer.word_index are not aligned.')

    self.model = self._build_model(embedding_matrix)
def load_dataset(test_sen=None):
    EMBEDDING_FILE = config.EMBEDDING_300
    df = pd.read_csv(config.DATA_PATH)
    X = df["content"].values
    Y = df["label"].values
    x_train, x_test, y_train, y_test = train_test_split(X, Y,
                                                        test_size=0.3,
                                                        random_state=109)

    # Data preprocessing: filter punctuation (including Bengali danda and
    # quotes) and Bengali digits when tokenizing
    print(X[0])
    puncList = ["।", "”", "“", "’"]
    x = "".join(puncList)
    filterString = x + '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n০১২৩৪৫৬৭৮৯'
    tokenizer = text.Tokenizer(
        num_words=50000,
        filters=filterString,
        lower=False,
    )
    tokenizer.fit_on_texts(x_train)
    train_idx = tokenizer.texts_to_sequences(x_train)
    test_idx = tokenizer.texts_to_sequences(x_test)
    word_index = tokenizer.word_index

    # Build the embedding matrix from the pretrained vectors
    embeddings_index = {}
    for i, line in enumerate(open(EMBEDDING_FILE, encoding="utf-8")):
        val = line.split()
        embeddings_index[val[0]] = np.asarray(val[1:], dtype='float32')
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # Pad/truncate every sequence to a fixed length of 32 tokens
    x_train = sequence.pad_sequences(train_idx, maxlen=32,
                                     padding='post', truncating='post')
    x_test = sequence.pad_sequences(test_idx, maxlen=32,
                                    padding='post', truncating='post')

    # Carve a dev set (10% of the test split) out of the test set
    test_size = len(x_test)
    dev_size = int(test_size * 0.1)
    x_dev = x_test[:dev_size]
    x_test = x_test[dev_size:]
    y_dev = y_test[:dev_size]
    y_test = y_test[dev_size:]

    # Wrap the splits in PyTorch DataLoaders
    x_train = torch.tensor(x_train, dtype=torch.long)
    y_train = torch.tensor(y_train, dtype=torch.float32)
    train = TensorDataset(x_train, y_train)
    train_iter = DataLoader(train, batch_size=32)

    x_test = torch.tensor(x_test, dtype=torch.long)
    y_test = torch.tensor(y_test, dtype=torch.float32)
    test = TensorDataset(x_test, y_test)
    test_iter = DataLoader(test, batch_size=32)

    x_dev = torch.tensor(x_dev, dtype=torch.long)
    y_dev = torch.tensor(y_dev, dtype=torch.float32)
    valid = TensorDataset(x_dev, y_dev)
    valid_iter = DataLoader(valid, batch_size=32)

    word_embeddings = embedding_matrix
    vocab_size = 50000

    return vocab_size, word_embeddings, train_iter, valid_iter, test_iter
# Data Loading
train = pd.read_csv('../Data/MELD.Raw/train_sent_emo.csv', dtype=str)
validation = pd.read_csv('../Data/MELD.Raw/dev_sent_emo.csv', dtype=str)
test = pd.read_csv('../Data/MELD.Raw/test_sent_emo.csv', dtype=str)

# Create mapping to identify audio files
train["ID"] = 'dia' + train["Dialogue_ID"] + '_utt' + train[
    "Utterance_ID"] + '.jpg'
validation["ID"] = 'dia' + validation["Dialogue_ID"] + '_utt' + validation[
    "Utterance_ID"] + '.jpg'
test["ID"] = 'dia' + test["Dialogue_ID"] + '_utt' + test[
    "Utterance_ID"] + '.jpg'

# Text Features
tokenizer = kt.Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train['Utterance'])
vocab_size = len(tokenizer.word_index) + 1

train_tokens = tokenizer.texts_to_sequences(train['Utterance'])
text_features = pd.DataFrame(ks.pad_sequences(train_tokens, maxlen=200))
validation_tokens = tokenizer.texts_to_sequences(validation['Utterance'])
validation_features = pd.DataFrame(
    ks.pad_sequences(validation_tokens, maxlen=200))


# Data Pipeline
def train_generator(features, batch):
    train_generator = ki.ImageDataGenerator(rescale=1. / 255.)
def trainer(dict_csv='test.csv'):
    data = pd.read_csv(dict_csv, engine='python')
    train_size = int(len(data) * .7)

    # 70/30 train/test split; keep the full corpus for prediction
    train_posts = data['documents'][:train_size]
    train_tags = data['tags'][:train_size]
    test_posts = data['documents'][train_size:]
    test_tags = data['tags'][train_size:]
    posts = data['documents']
    dlp_data = {'filename': [], 'tags': []}

    vocab_size = 10000
    tokenize = text.Tokenizer(num_words=vocab_size)
    tokenize.fit_on_texts(train_posts)

    # save tokenizer
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenize, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Saving tokenizer with name tokenizer.pickle")

    x_train = tokenize.texts_to_matrix(train_posts)
    x_test = tokenize.texts_to_matrix(test_posts)
    x_post = tokenize.texts_to_matrix(posts)

    encoder = preprocessing.LabelBinarizer()
    encoder.fit(train_tags)
    y_train = encoder.transform(train_tags)
    y_test = encoder.transform(test_tags)
    text_labels = encoder.classes_
    num_labels = len(np.unique(y_train))
    batch_size = 1024

    model = Sequential()
    # Hidden layers, added to improve accuracy
    model.add(Dense(512, input_shape=(vocab_size,)))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(128))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(512))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(128))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(64))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dense(num_labels))
    model.add(BatchNormalization())
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=256,
                        verbose=1,
                        validation_split=0.1)
    score = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=1)

    # Save model architecture and weights
    model_json = model.to_json()
    with open("model.json", "w") as json_file:
        json_file.write(model_json)
    print("\n Saved model architecture to disk with name model.json ")
    model.save_weights("model.h5")
    print("\n Saved model weights to disk with name model.h5")
    print("Training done")

    # Predict a tag for every document in the corpus
    pred = model.predict(np.array(x_post))
    pred = pred > 0.5
    for i in range(0, len(posts)):
        print('Document name: %s, is %s' % (data['filename'][i],
                                            text_labels[np.argmax(pred[i])]))
        dlp_data['filename'].append(data['filename'][i])
        dlp_data['tags'].append(text_labels[np.argmax(pred[i])])
    df = pd.DataFrame(dlp_data, columns=['filename', 'tags'])
    df.to_csv('dlp.csv', encoding="utf-8")
    print('Saved predictions to dlp.csv')

    # Reload the saved model and re-evaluate as a sanity check
    json_file = open('model.json', 'r')
    loaded_json_model = json_file.read()
    json_file.close()
    loaded_model = model_from_json(loaded_json_model)
    loaded_model.load_weights("model.h5")
    print("Loaded model from disk")
    loaded_model.compile(loss='binary_crossentropy',
                         optimizer='adam',
                         metrics=['accuracy'])
    score = loaded_model.evaluate(x_test, y_test, verbose=1)
    print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1] * 100))
def train_lstm(positive_tweets, negative_tweets):
    """
    Return an LSTM model fitted on positive_tweets and negative_tweets
    (renamed from LSTM to avoid shadowing the Keras LSTM layer used below).

    Keyword arguments:
    positive_tweets -- the file (.csv) that contains the positive tweets
    negative_tweets -- the file (.csv) that contains the negative tweets
    """
    pos_df = pd.read_csv(positive_tweets, index_col=0)
    neg_df = pd.read_csv(negative_tweets, index_col=0)
    train = pd.concat([pos_df, neg_df])

    # Randomize order
    train = train.sample(frac=1, random_state=1)
    train = train.dropna()

    # The maximum number of words to be used. (most frequent)
    MAX_NB_WORDS = 450000
    # Max number of words in each tweet.
    MAX_SEQUENCE_LENGTH = 50
    EMBEDDING_DIM = 300

    # First pass: fit on everything to collect word frequencies
    tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(train.tweets.values)
    word_index = tokenizer.word_index

    # Number of words that appear at least 5 times
    words = len([
        k for k in tokenizer.word_index.keys()
        if tokenizer.word_counts[k] > 4
    ])

    # Second pass: restrict the vocabulary to those frequent words
    MAX_NB_WORDS = words
    MAX_SEQUENCE_LENGTH = 50
    EMBEDDING_DIM = 300
    tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(train.tweets.values)
    word_index = tokenizer.word_index

    # Create sequences of indices, padded to a fixed length
    X = tokenizer.texts_to_sequences(train.tweets.values)
    X = sequence.pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
    Y = train.label
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                        test_size=0.20,
                                                        random_state=1)

    # Build model
    batch_size = 8192
    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
    model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # Keep the best model seen during training
    es = EarlyStopping(monitor='val_acc', mode='max', verbose=1, patience=5)
    mc = ModelCheckpoint('best_model.h5', monitor='val_acc', mode='max',
                         verbose=1, save_best_only=True)

    # Training
    model.fit(X_train, Y_train,
              batch_size=batch_size,
              epochs=250,
              validation_data=(X_test, Y_test),
              callbacks=[es, mc])
    saved_model = load_model('best_model.h5')
    return saved_model