def read_copus_generator(self, file_name, batch_size=64):
    """Read the corpus file, write word-count metadata, and return the vectorized matrix."""
    logger.info("Begin reading corpus {0}".format(file_name))
    data = []
    index = 0
    with open(file_name, 'r') as fread:
        for line in fread:
            data.append(line)
            index += 1
            if index % 100000 == 0:
                logger.info("The program has processed {0} lines".format(index))
    logger.info("Read end")
    tokenizer = Tokenizer(nb_words=30000)
    tokenizer.fit_on_texts(data)
    logger.info("word num: {0}".format(len(tokenizer.word_counts)))
    sorted_word_counts = sorted(tokenizer.word_counts.items(),
                                key=operator.itemgetter(1), reverse=True)
    # save the word counts to the meta file
    with open(file_name.replace("train.", "meta."), "w") as fwrite:
        for word, cnt in sorted_word_counts:
            fwrite.write(word + ":" + str(cnt) + "\n")
    vectorize_data = tokenizer.texts_to_matrix(data)
    return vectorize_data
class Featurizer:
    max_words = None
    tokenizer = None

    def __init__(self, max_words=1000):
        self.max_words = max_words
        self.tokenizer = Tokenizer(num_words=max_words)

    def fit_transform(self, data):
        texts = [l['text'] for l in data]
        self.tokenizer.fit_on_texts(texts)
        # drop words whose rank exceeds the max_words limit
        self.tokenizer.word_index = {k: v for k, v in self.tokenizer.word_index.items()
                                     if v <= self.max_words}
        return self.transform(data)

    def transform(self, data):
        texts = [l['text'] for l in data]
        return self.tokenizer.texts_to_matrix(texts, mode='binary')

    def transform_inv(self, m):
        index = {v: k for k, v in self.tokenizer.word_index.items()}  # word indexed by id
        return [[index.get(i) for i in np.nonzero(line)[0] if i in index] for line in m]

    def save(self, filepath):
        with open(filepath + '_word_index.json', 'w') as f:
            f.write(json.dumps(self.tokenizer.word_index))

    @classmethod
    def load(cls, filepath):
        with open(filepath + '_word_index.json', 'r') as f:
            word_index = json.load(f)
        c = cls(max_words=len(word_index))
        c.tokenizer.word_index = word_index
        return c
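# A minimal usage sketch for the Featurizer above (not from the original source).
# The sample records and file prefix are hypothetical; it assumes the class's own
# dependencies (keras Tokenizer, numpy as np, json) are already in scope.
records = [{'text': 'the cat sat on the mat'},
           {'text': 'the dog ate my homework'}]

feat = Featurizer(max_words=100)
X = feat.fit_transform(records)          # binary document-term matrix, shape (2, max_words)
print(X.shape)
feat.save('featurizer')                  # writes featurizer_word_index.json
restored = Featurizer.load('featurizer')
print(restored.transform_inv(X))         # recover the words present in each row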
def test_sequential_fit():
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dogs and cats living together.']
    word_sequences = [
        ['The', 'cat', 'is', 'sitting'],
        ['The', 'dog', 'is', 'standing'],
    ]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    tokenizer.fit_on_texts(word_sequences)
    assert tokenizer.document_count == 5
    tokenizer.texts_to_matrix(texts)
    tokenizer.texts_to_matrix(word_sequences)
def transform(self, dataset=None):
    """Transform data into vectors and matrices."""
    clean = lambda words: [str(word) for word in words if type(word) is not float]
    x_unlabel = clean(dataset.unlabel)
    x_train = clean(dataset.train.X)
    x_test = clean(dataset.test.X)
    y_train = dataset.train.y
    y_test = dataset.test.y

    tokenizer = Tokenizer(nb_words=self.max_words)
    tokenizer.fit_on_texts(x_unlabel)
    # save the list of words in the vocabulary
    self.vocabulary = tokenizer.word_counts

    X_unlabel = tokenizer.texts_to_matrix(x_unlabel, mode=self.mode)
    X_unlabel = pad_sequences(X_unlabel, maxlen=self.max_len, dtype='float64')
    X_train = tokenizer.texts_to_matrix(x_train, mode=self.mode)
    X_train = pad_sequences(X_train, maxlen=self.max_len, dtype='float64')
    X_test = tokenizer.texts_to_matrix(x_test, mode=self.mode)
    X_test = pad_sequences(X_test, maxlen=self.max_len, dtype='float64')

    y_train = np.asarray(y_train, dtype='int32')
    y_test = np.asarray(y_test, dtype='int32')
    Y_train = np_utils.to_categorical(y_train, self.classes)
    Y_test = np_utils.to_categorical(y_test, self.classes)

    return Dataset(
        X_unlabel,
        Data(X_train, Y_train, y_train),
        Data(X_test, Y_test, y_test),
    )
def test_tokenizer():
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dogs and cats living together.']
    tokenizer = Tokenizer(num_words=10)
    tokenizer.fit_on_texts(texts)

    sequences = []
    for seq in tokenizer.texts_to_sequences_generator(texts):
        sequences.append(seq)
    assert np.max(np.max(sequences)) < 10
    assert np.min(np.min(sequences)) == 1

    tokenizer.fit_on_sequences(sequences)

    for mode in ['binary', 'count', 'tfidf', 'freq']:
        matrix = tokenizer.texts_to_matrix(texts, mode)
# let's take 80% of the data for training and the remaining 20% for test
train_size = int(len(data) * .8)

train_posts = data['news'][:train_size]
train_tags = data['category'][:train_size]
train_files_names = data['filename'][:train_size]

test_posts = data['news'][train_size:]
test_tags = data['category'][train_size:]
test_files_names = data['filename'][train_size:]

# define Tokenizer with vocab size
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_posts)

x_train = tokenizer.texts_to_matrix(train_posts, mode='tfidf')
x_test = tokenizer.texts_to_matrix(test_posts, mode='tfidf')

encoder = LabelBinarizer()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

model = Sequential()
model.add(Dense(512, input_shape=(vocab_size,)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(num_labels))
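# The model above stops at the final Dense layer. A plausible completion under the usual
# multi-class text-classification setup (softmax output, categorical cross-entropy) -- a
# sketch, not the original author's code; batch size and epoch count are arbitrary choices.
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
history = model.fit(x_train, y_train,
                    batch_size=32,
                    epochs=5,
                    validation_split=0.1)
score = model.evaluate(x_test, y_test, batch_size=32)
print('Test accuracy:', score[1])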
tokenizer.fit_on_texts(Y + X)

print("size X:", len(X))
print("size Y:", len(Y))

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
print(len(X_train), 'train sentences')
print(len(X_test), 'test sentences')
print(len(Y_train), 'train classes')
print(len(Y_test), 'test classes')

c = Counter(Y_train)
print(c.items())

X_train = tokenizer.texts_to_matrix(X_train, mode='binary')
X_test = tokenizer.texts_to_matrix(X_test, mode='binary')

# the class labels are themselves tokenized; keep only the first index of each
Y_train = tokenizer.texts_to_sequences(Y_train)
Y_test = tokenizer.texts_to_sequences(Y_test)
Y_train_new = []
Y_test_new = []
for y in Y_train:
    Y_train_new.append(y[0])
for y in Y_test:
    Y_test_new.append(y[0])
Y_train = Y_train_new
Y_test = Y_test_new
from keras.preprocessing.text import hashing_trick
##################################################################
## 1. text_to_word_sequence, one_hot, hashing_trick
texts = ['some thing to eat', 'some thing to drink']
print(text_to_word_sequence(texts[0]))  # ['some', 'thing', 'to', 'eat']; simple whitespace split
print(one_hot(texts[0], 10))  # e.g. [5, 7, 5, 7]; 10 caps the hashed indices at 10
print(one_hot(texts[1], 10))  # e.g. [5, 7, 5, 5]; it hashes internally, so for a fixed (text, n) the same string always maps to the same index
# one_hot is a wrapper around `hashing_trick` using `hash` as the hashing function;
# uniqueness of the word-to-index mapping is not guaranteed.
##################################################################
## 2. Tokenizer: indices are assigned by frequency rank, starting from 1
# keras.preprocessing.text.Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n', lower=True, split=" ", char_level=False)
# Tokenizer vectorizes texts, or turns them into sequences (lists of word indices in the dictionary, counted from 1).
# num_words: None or int, the maximum number of words to keep; if set, the tokenizer keeps only the num_words most frequent words in the data
# char_level: if True, every character is treated as a token
texts = ['some thing to eat', 'some thing to drink']
tmp_tokenizer = Tokenizer(num_words=None)  # num_words: None or int; words outside the num_words most frequent are dropped
tmp_tokenizer.fit_on_texts(texts)
# do NOT call tmp_tokenizer.fit_on_texts(texts[0]) and fit_on_texts(texts[1]) separately:
# a bare string is iterated character by character, so counts would be per letter
# attributes
print(tmp_tokenizer.word_counts)  # OrderedDict([('some', 2), ('thing', 2), ('to', 2), ('eat', 1), ('drink', 1)]); occurrences seen during fitting
print(tmp_tokenizer.word_docs)  # {'thing': 2, 'eat': 1, 'to': 2, 'some': 2, 'drink': 1}; number of documents each word appeared in
print(tmp_tokenizer.word_index)  # {'some': 1, 'thing': 2, 'to': 3, 'eat': 4, 'drink': 5}; rank / index of each word
print(len(tmp_tokenizer.word_index))  # 5; dictionary size
print(tmp_tokenizer.index_docs)  # {2: 2, 4: 1, 3: 2, 1: 2, 5: 1}; word_docs keyed by word_index
print(tmp_tokenizer.document_count)  # 2; number of documents seen during fitting
# methods
print(tmp_tokenizer.texts_to_sequences(texts))  # [[1, 2, 3, 4], [1, 2, 3, 5]]; word indices
print(tmp_tokenizer.texts_to_matrix(texts))  # matrix form = one-hot; records only whether each dictionary index occurs
# [[ 0.  1.  1.  1.  1.  0.]
#  [ 0.  1.  1.  1.  0.  1.]]
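# hashing_trick is imported above but never called; a minimal sketch of its use (not in the
# original). Passing hash_function='md5' gives reproducible indices, unlike the default
# built-in hash, which Python 3 salts per process.
print(hashing_trick(texts[0], n=10, hash_function='md5'))  # word indices hashed into [1, 10)
print(hashing_trick(texts[1], n=10, hash_function='md5'))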
def get_word_features(emails, verbose=True, nb_words=5000, skip_top=0, maxlen=None, as_matrix=True,
                      matrix_type='count', label_cutoff=0.01, max_n=1):
    (totalWordsCount, fromCount, domainCount, labels) = getEmailStats(emails)
    if verbose:
        print('Creating email dataset with labels %s' % str(labels))
        print('Label word breakdown:')
        total = 0
        for label in labels:
            count = sum(totalWordsCount[label].values())
            total += count
            print('\t%s:%d' % (label, count))
        print('Total word count: %d' % total)

    labelCounts = {label: 0 for label in labels}
    for email in emails:
        labelCounts[email.label] += 1
    cutoff = int(len(emails) * label_cutoff)
    removed = 0
    for label in labels[:]:
        if labelCounts[label] < cutoff or label == 'Important' or label == 'Unread' or label == 'Sent':
            removed += 1
            labels.remove(label)
    labelNums = {labels[i]: i for i in range(len(labels))}
    if verbose:
        print('Found %d labels below count threshold of %d' % (removed, cutoff))
    if verbose:
        print('Creating email dataset with labels %s' % str(labels))
        print('Label email count breakdown:')
        total = 0
        for label in labels:
            print('\t%s:%d' % (label, labelCounts[label]))
        print('Total emails: %d' % sum([labelCounts[label] for label in labels]))

    texts = []
    emailLabels = []
    for email in emails:
        if email.label not in labels:
            continue
        text = email.sender + " " + str(email.subject)
        text += email.fromDomain
        text += email.content
        texts.append(text.replace('\n', '').replace('\r', ''))
        emailLabels.append(labelNums[email.label])
    emailLabels = np.array(emailLabels)

    if max_n == 1 or not as_matrix:
        tokenizer = Tokenizer(nb_words)
        tokenizer.fit_on_texts(texts)
        reverse_word_index = {tokenizer.word_index[word]: word for word in tokenizer.word_index}
        word_list = [reverse_word_index[i + 1] for i in range(nb_words)]
        if as_matrix:
            feature_matrix = tokenizer.texts_to_matrix(texts, mode=matrix_type)
            return feature_matrix, emailLabels, word_list, labels
        else:
            sequences = tokenizer.texts_to_sequences(texts)
            return sequences, emailLabels, word_list, labels
    else:
        if matrix_type == 'tfidf':
            vectorizer = TfidfVectorizer(ngram_range=(1, max_n), max_features=nb_words)
        else:
            vectorizer = CountVectorizer(ngram_range=(1, max_n), max_features=nb_words,
                                         binary=matrix_type == 'binary')
        feature_matrix = vectorizer.fit_transform(texts)
        word_list = vectorizer.get_feature_names()
        return feature_matrix, emailLabels, word_list, labels
      '==========================================================================================')
for i in range(len(tweets)):
    tweet_str = ' '.join(tweets[i])
    docs.append(tweet_str)
    docs_all.extend(tweet_str)  # note: extending with a string adds individual characters

vocab = len(set(docs_all))
print('vocabulary_size:', vocab)

tokenizer = Tokenizer(num_words=vocab)
tokenizer.fit_on_texts(docs)
X = tokenizer.texts_to_matrix(docs, mode='count')  # other modes: 'binary', 'freq', 'tfidf'
print('tweet matrix shape:', X.shape)
# print('bag of words by counting:', X[0])

## concatenation ##########################################################
emb_size = 128
vec_list = defaultdict()
vec_float = []
embfile = open("./data/vfest_128.harp", 'r')
for line in embfile:
    a = line.strip('\n').split(' ')
                   'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast',
                   'talk.politics.misc', 'talk.religion.misc'])

# load our saved model
model = load_model('my_model.h5')

# load tokenizer
tokenizer = Tokenizer()
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

test_files = ["C:\\DL\\20news-bydate\\20news-bydate-test\\comp.graphics\\38758",
              "C:\\DL\\20news-bydate\\20news-bydate-test\\misc.forsale\\76115",
              "C:\\DL\\20news-bydate\\20news-bydate-test\\soc.religion.christian\\21329"]

x_data = []
for t_f in test_files:
    t_f_data = Path(t_f).read_text()
    x_data.append(t_f_data)

x_data_series = pd.Series(x_data)
x_tokenized = tokenizer.texts_to_matrix(x_data_series, mode='tfidf')

i = 0
for x_t in x_tokenized:
    prediction = model.predict(np.array([x_t]))
    predicted_label = labels[np.argmax(prediction[0])]
    print("File ->", test_files[i], "Predicted label: " + predicted_label)
    i += 1
def main():
    ### read training and testing data
    (Y_data, X_data, tag_list) = read_data(train_path, True)
    (_, X_test, _) = read_data(test_path, False)
    all_corpus = X_data + X_test
    print('Find %d articles.' % (len(all_corpus)))

    ### tokenizer for all data
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_corpus)
    word_index = tokenizer.word_index
    file = open('tokenizer.obj', 'wb')
    pickle.dump(tokenizer, file)

    ### convert word sequences to index sequences
    print('Convert to index sequences.')
    train_sequences = tokenizer.texts_to_sequences(X_data)
    test_sequences = tokenizer.texts_to_sequences(X_test)
    train_bag = tokenizer.texts_to_matrix(X_data, mode='count')[:, :10000]
    test_bag = tokenizer.texts_to_matrix(X_test, mode='count')[:, :10000]

    '''
    ### padding to equal length
    print('Padding sequences.')
    train_sequences = pad_sequences(train_sequences)
    max_article_length = train_sequences.shape[1]
    test_sequences = pad_sequences(test_sequences, maxlen=max_article_length)
    '''

    ###
    train_tag = to_multi_categorical(Y_data, tag_list)

    ### split data into training set and validation set
    # (X_train, Y_train), (X_val, Y_val) = split_data(train_bag, train_tag, split_ratio)
    X_train = train_bag
    Y_train = train_tag
    print(X_train.shape)
    print(Y_train.shape)

    '''
    ### get embedding matrix from glove
    print('Get embedding dict from glove.')
    embedding_dict = get_embedding_dict('glove.6B.%dd.txt' % embedding_dim)
    print('Found %s word vectors.' % len(embedding_dict))
    num_words = len(word_index) + 1
    print('Create embedding matrix.')
    embedding_matrix = get_embedding_matrix(word_index, embedding_dict, num_words, embedding_dim)
    '''

    ### build model
    print('Building model.')
    model = Sequential()
    '''
    model.add(Embedding(num_words,
                        embedding_dim,
                        weights=[embedding_matrix],
                        input_length=max_article_length,
                        trainable=False))
    '''
    # model.add(Flatten(input_shape=(max_article_length, embedding_dim)))
    model.add(Dense(128, input_shape=(X_train.shape[1],), activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(38, activation='sigmoid'))
    model.summary()

    adam = Adam(lr=0.001, decay=1e-6, clipvalue=1.)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=[f1_score])

    earlystopping = EarlyStopping(monitor='val_f1_score', patience=10, verbose=1, mode='max')
    checkpoint = ModelCheckpoint(filepath='best.hdf5',
                                 verbose=1,
                                 save_best_only=True,
                                 save_weights_only=False,
                                 monitor='val_f1_score',
                                 mode='max')
    hist = model.fit(X_train, Y_train,
                     validation_split=split_ratio,
                     epochs=nb_epoch,
                     batch_size=batch_size,
                     callbacks=[earlystopping, checkpoint])

    best_model = load_model('best.hdf5', custom_objects={'f1_score': f1_score})
    Y_pred = best_model.predict(test_bag)
    thresh = 0.4
    with open(output_path, 'w') as output:
        print('"id","tags"', file=output)
        Y_pred_thresh = (Y_pred > thresh).astype('int')
        for index, labels in enumerate(Y_pred_thresh):
            labels = [tag_list[i] for i, value in enumerate(labels) if value == 1]
            labels_original = ' '.join(labels)
            print('"%d","%s"' % (index, labels_original), file=output)
    print('prediction written.')
        train_labels.append(0)
    elif int(row[lbl_y]) >= 1 and imp >= split_trn and imp < split_trn + split_tst:
        # test set for imparity, 133 samples
        imp += 1
        test_texts.append(row['texto'].encode('utf-8').lower())
        test_labels.append(1)
    elif int(row[lbl_y]) == 0 and n_imp >= split_trn and n_imp < split_trn + split_tst:
        # test set for non-imparity, 133 samples
        n_imp += 1
        test_texts.append(row['texto'].encode('utf-8').lower())
        test_labels.append(0)

tokenizer = Tokenizer(nb_words=max_features,
                      filters=keras.preprocessing.text.base_filter(),
                      lower=True, split=" ")
tokenizer.fit_on_texts(train_texts)
train_sequences = sequence.pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=maxlen)
test_sequences = sequence.pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=maxlen)
train_matrix = tokenizer.texts_to_matrix(train_texts)
test_matrix = tokenizer.texts_to_matrix(test_texts)

embedding_weights = np.zeros((max_features, embeddings_dim))
affective_weights = np.zeros((max_features, 3))
for word, index in tokenizer.word_index.items():
    try:
        if not affective.has_key(word):
            affective[word] = np.array(model.predict(np.array(embedding[word]).reshape(1, -1))[0])
    except:
        affective[word] = np.array([5.0, 5.0, 5.0])
    if index < max_features:
        try:
            embedding_weights[index, :] = embeddings[word]
            affective_weights[index, :] = affective[word]
        except:
            embedding_weights[index, :] = np.random.rand(1, embeddings_dim)
            affective_weights[index, :] = [5.0, 5.0, 5.0]
# remove capital letters and punctuation from both datasets
traindata['Phrase'] = traindata['Phrase'].apply(lambda x: x.lower())
traindata['Phrase'] = traindata['Phrase'].apply(lambda x: re.sub('[^a-zA-Z0-9\s]', '', x))
testdata['Phrase'] = testdata['Phrase'].apply(lambda x: x.lower())
testdata['Phrase'] = testdata['Phrase'].apply(lambda x: re.sub('[^a-zA-Z0-9\s]', '', x))

train_sentences = traindata['Phrase'].values
train_labels = traindata['Sentiment'].values

# tokenize and pad train sentences
train_tokenizer = Tokenizer()
train_tokenizer.fit_on_texts(train_sentences)
train_sentences = train_tokenizer.texts_to_matrix(train_sentences)
train_sentences = pad_sequences(train_sentences, maxlen=300)
vocab_size = len(train_tokenizer.word_index) + 1

# use label encoder to turn train sentiment labels into categorical data
le = preprocessing.LabelEncoder()
train_labels = le.fit_transform(train_labels)
train_labels = to_categorical(train_labels)

# tokenize and pad test sentences
test_sentences = testdata["Phrase"]
test_tokenizer = Tokenizer()
test_tokenizer.fit_on_texts(test_sentences)
test_sentences = test_tokenizer.texts_to_matrix(test_sentences)
test_sentences = pad_sequences(test_sentences, maxlen=300)
"../..")) # load the vocabulary vocab_filename = project_path + '/ml_model/vocab.txt' vocab = load_doc(vocab_filename) vocab = vocab.split() vocab = set(vocab) # load all training reviews positive_lines = process_docs('../data/txt_sentoken/pos', vocab, True) negative_lines = process_docs('../data/txt_sentoken/neg', vocab, True) # create the tokenizer tokenizer = Tokenizer() # fit the tokenizer on the documents docs = negative_lines + positive_lines tokenizer.fit_on_texts(docs) # encode training data set Xtrain = tokenizer.texts_to_matrix(docs, mode='freq') ytrain = array([0 for _ in range(900)] + [1 for _ in range(900)]) # load all test reviews positive_lines = process_docs('../data/txt_sentoken/pos', vocab, False) negative_lines = process_docs('../data/txt_sentoken/neg', vocab, False) docs = negative_lines + positive_lines # encode training data set Xtest = tokenizer.texts_to_matrix(docs, mode='freq') ytest = array([0 for _ in range(100)] + [1 for _ in range(100)]) n_words = Xtest.shape[1] # load json and create model model_json_path = project_path + '/ml_model/model.json'
                                                     genres,
                                                     test_size=0.3,
                                                     random_state=42)
test_y_eval = np.copy(test_y)  # copy of test array for use in model evaluation

# Build embeddings
print('\n[>>> Building word embeddings and class encodings...]')
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras import utils

max_words = 5000  # vocab limit
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_lyrics)  # word index lookup for vocab
train_x = tokenizer.texts_to_matrix(train_lyrics)
test_x = tokenizer.texts_to_matrix(test_lyrics)

# One-hot encode classes
encoder = LabelEncoder()
encoder.fit(train_y)
train_y = encoder.transform(train_y)
test_y = encoder.transform(test_y)
class_labels = list(np.unique(df0.genre))
num_classes = len(class_labels) + 1  # 0 reserved for index
train_y = utils.to_categorical(train_y, num_classes)
test_y = utils.to_categorical(test_y, num_classes)

print('\n=== Word Embeddings & Class Encodings Complete ===')
print('--- Runtime =', timer(new_time, time.time()), '---')
new_time = time.time()
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# We create a tokenizer, configured to only take
# into account the top-1000 most common words
tokenizer = Tokenizer(num_words=1000)
# This builds the word index
tokenizer.fit_on_texts(samples)

# This turns strings into lists of integer indices.
sequences = tokenizer.texts_to_sequences(samples)

# You could also directly get the one-hot binary representations.
# Note that vectorization modes other than one-hot encoding are supported!
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

# This is how you can recover the word index that was computed
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# We will store our words as vectors of size 1000.
# Note that if you have close to 1000 words (or more)
# you will start seeing many hash collisions, which
# will decrease the accuracy of this encoding method.
dimensionality = 1000
max_length = 10

results = np.zeros((len(samples), max_length, dimensionality))
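# The excerpt stops right after allocating `results`. A plausible completion of the
# hashing-trick loop it sets up (a sketch, not necessarily the original code):
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        # hash each word into a slot in [0, dimensionality); collisions are possible,
        # and the built-in hash is salted per process in Python 3
        index = abs(hash(word)) % dimensionality
        results[i, j, index] = 1.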
# print(input_name)
while True:
    # Get whether to pull or not
    done = firebase.get('/response', 'done')
    print(done)
    # User has finished entering commands; get the new update, feed it into the network, and post
    if done:
        # Get the three query fields
        topic_json = firebase.get('/response', 'topic')
        topic = parse_json(topic_json)
        category_arr = text_to_word_sequence(topic)
        category = tok.texts_to_matrix(category_arr, mode='count')

        language_json = firebase.get('/response', 'language')
        language = parse_json(language_json)
        tech_arr = text_to_word_sequence(language)
        tech = tok2.texts_to_matrix(tech_arr, mode='count')

        platform_json = firebase.get('/response', 'platform')
        platform = parse_json(platform_json)
        tprogram_arr = text_to_word_sequence(platform)
        tprogram = tok3.texts_to_matrix(tprogram_arr, mode='count')

        # Get predictor
        input_pred = np.zeros((1, 9))
        input_type_index = 0
    return train, test


data = pd.read_csv("data/bbc-text.csv")
train_x, test_x = split_test_train(data["text"])
train_y, test_y = split_test_train(data["category"])
print("Training set size: {0}, Test set size: {1}".format(len(train_x), len(test_x)))

preprocess_start = time.time()

# Preprocess X
max_words = 1000
t = Tokenizer(num_words=max_words)
t.fit_on_texts(train_x)
train_one_hot_x = t.texts_to_matrix(train_x)  # , mode='count')
test_one_hot_x = t.texts_to_matrix(test_x)  # , mode='count')

# Preprocess Y
values = list(set(train_y))
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(train_y)
y_encoded = y_encoded.reshape(len(y_encoded), 1)
onehot_encoder = OneHotEncoder(sparse=False)
train_y_onehot = onehot_encoder.fit_transform(y_encoded)

y_encoded = label_encoder.fit_transform(test_y)
y_encoded = y_encoded.reshape(len(y_encoded), 1)
onehot_encoder = OneHotEncoder(sparse=False)
test_y_onehot = onehot_encoder.fit_transform(y_encoded)
def main():
    ### read training and testing data
    (Y_data, X_data, tag_list) = read_data(train_path, True)
    (_, X_test, _) = read_data(test_path, False)
    all_corpus = X_data + X_test
    print('Find %d articles.' % (len(all_corpus)))

    ### tokenizer for all data
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_corpus)
    word_index = tokenizer.word_index
    pickle.dump(tokenizer, open('tk', 'wb'))

    ### convert word sequences to tf-idf matrices
    print('Convert to index sequences.')
    train_matrix = tokenizer.texts_to_matrix(X_data, mode='tfidf')
    test_matrix = tokenizer.texts_to_matrix(X_test, mode='tfidf')

    ### padding to equal length
    # print('Padding sequences.')
    # train_sequences = pad_sequences(train_sequences)
    # max_article_length = train_sequences.shape[1]
    # test_sequences = pad_sequences(test_sequences, maxlen=max_article_length)

    ###
    train_tag = to_multi_categorical(Y_data, tag_list)

    ### split data into training set and validation set
    (X_train, Y_train), (X_val, Y_val) = split_data(train_matrix, train_tag, split_ratio)
    # X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
    print(Y_train.shape)
    print(X_train.shape)

    ### get embedding matrix from glove
    # print('Get embedding dict from glove.')
    # embedding_dict = get_embedding_dict('glove/glove.6B.%dd.txt' % embedding_dim)
    # print('Found %s word vectors.' % len(embedding_dict))
    # num_words = len(word_index) + 1
    # print('Create embedding matrix.')
    # embedding_matrix = get_embedding_matrix(word_index, embedding_dict, num_words, embedding_dim)

    ### build model
    print('Building model.')
    for x in range(20):
        model = Sequential()
        print(x)
        model.add(Dense(512, activation='elu', input_dim=40587))
        model.add(Dropout(0.5))
        model.add(Dense(512, activation='tanh'))
        model.add(Dropout(0.5))
        model.add(Dense(512, activation='elu'))
        model.add(Dropout(0.5))
        model.add(Dense(512, activation='elu'))
        model.add(Dropout(0.5))
        # model.add(Dense(512, activation='elu'))
        # model.add(Dropout(0.5))
        # model.add(Dense(128, activation='elu'))
        # model.add(Dropout(0.5))
        model.add(Dense(38, activation='sigmoid'))
        model.summary()

        adam = Adam(lr=0.001, decay=1e-6, clipvalue=0.5)
        tmp = str(x) + '.hdf5'
        model.compile(loss='categorical_crossentropy',
                      optimizer=adam,
                      metrics=[f1_score])

        earlystopping = EarlyStopping(monitor='val_f1_score', patience=10, verbose=1, mode='max')
        checkpoint = ModelCheckpoint(filepath=tmp,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=False,
                                     monitor='val_f1_score',
                                     mode='max')
        hist = model.fit(X_train, Y_train,
                         validation_data=(X_val, Y_val),
                         epochs=1000,
                         batch_size=batch_size,
                         callbacks=[earlystopping, checkpoint])

        Y_pred = model.predict(test_matrix)
        thresh = 0.4
        with open(output_path, 'w') as output:
            print('"id","tags"', file=output)
            Y_pred_thresh = (Y_pred > thresh).astype('int')
            for index, labels in enumerate(Y_pred_thresh):
                labels = [tag_list[i] for i, value in enumerate(labels) if value == 1]
                labels_original = ' '.join(labels)
                print('"%d","%s"' % (index, labels_original), file=output)
'''
list_x = pad_sequences(list_x, maxlen=seqlen, truncating='pre')
pre_seq = pad_sequences(pre_seq, maxlen=seqlen, truncating='pre')
'''
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(pre_seq + list_x + nllist)
word_index = tokenizer.word_index
#
seq_test = tokenizer.texts_to_sequences(pre_seq)
print('seq_size=', len(seq_test))
seq_train = tokenizer.texts_to_sequences(list_x)
data1 = pad_sequences(seq_train, maxlen=31, truncating='pre')
test1 = pad_sequences(seq_test, maxlen=31, truncating='pre')

data = []
pre_data = []
data.append(tokenizer.texts_to_matrix(pre_seq, mode='binary'))
for i in range(4):
    print(trainlist[i])
    print(data[0])
    print(labels[i])
for i in range(10):
    print('test=', test1[i])
pre_data = tokenizer.sequences_to_matrix(test1, mode='binary')

# ===============================================
# ======= build model ==========
labels = to_categorical(np.asarray(list_y))
for i in range(4):
    print(trainlist[i])
    print(data[i])
# In[ ]:

# Listing 6.3 Using Keras for word-level one-hot encoding
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)
oneHotResults = tokenizer.texts_to_matrix(samples, mode='binary')
wordIndex = tokenizer.word_index
print('Found %s unique tokens.' % len(wordIndex))

# For data where the number of unique tokens is extremely large, we can use the *one-hot hashing
# trick*, which hashes words into vectors of fixed size rather than assigning an index to each word.
#
# To avoid having multiple words assigned to the same hash (called *hash collisions*), the
# dimensionality of the hashing space should be much larger than the total number of unique tokens.

# In[ ]:

# Listing 6.4 Word-level one-hot encoding with hashing trick
samples = ['The cat sat on the mat.', 'The dog ate my homework.']
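# To make the collision point above concrete, a small illustration (my own sketch, not part of
# Listing 6.4): count how many of the words in `samples` collide for two hashing-space sizes.
words = set(w.lower().strip('.') for s in samples for w in s.split())
for dim in (10, 1000):
    slots = [abs(hash(w)) % dim for w in words]
    collisions = len(slots) - len(set(slots))
    print('dimensionality=%d -> %d collisions among %d words' % (dim, collisions, len(words)))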
# using Keras for word-level one-hot encoding
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
word_index = tokenizer.word_index

print(len(word_index))
print(sequences)
print(one_hot_results)
from keras.layers import Embedding
import matplotlib.pyplot as plt
import numpy as np
from keras.callbacks import TensorBoard
from time import time

df = pd.read_csv('imdb_master.csv', encoding='latin-1')
print(df.head())
sentences = df['review'].values
y = df['label'].values

# tokenizing data
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(sentences)  # getting the vocabulary of the data
sentences = tokenizer.texts_to_matrix(sentences)

le = preprocessing.LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

# Number of features
# print(input_dim)
model = Sequential()
# model.add(layers.Dense(300, input_dim=500, activation='relu'))
model.add(layers.Dense(100, activation='sigmoid'))
model.compile(loss='sparse_categorical_crossentropy',
TextData = np.asarray(DM.get_arr(dataJS, TextTypes))
AnswerData = target

X = TextData
Y = DM.to_one_hot(AnswerData)

indices = DM.mixedIndex(X)
X = X[indices]
Y = Y[indices]

tokenizer = Tokenizer(num_words=3000)
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
one_hot_results = tokenizer.texts_to_matrix(X, mode="binary")
X = np.array(one_hot_results)
X = np.asarray(X).astype('int')

model = models.Sequential()
model.add(layers.Dense(100, activation="relu", input_shape=(X.shape[1],)))
model.add(layers.Dropout(0.15))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.1))
# model.add(layers.Dense(16, activation="relu"))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Dense(3, activation='softmax'))