def _load_data(self, train_set, test_set):
    datasets = {'train': train_set, 'test': test_set}
    # vectorize
    # add offset of 2 for PAD and OOV
    self._tokens_vocab.add_vocab_offset(2)
    self._chars_vocab.add_vocab_offset(2)
    self._tags_vocab.add_vocab_offset(1)
    vec_data = {}
    for f in datasets.keys():
        vec_data[f] = self._prepare_vectors(datasets[f])
    for f in datasets.keys():
        tokens, words, intents, tags = vec_data[f]
        x = pad_sequences(tokens, maxlen=self.sentence_len)
        _w = []
        for s in words:
            _s = pad_sequences(s, maxlen=self.word_len)
            sentence = np.asarray(_s)[-self.sentence_len:]
            if sentence.shape[0] < self.sentence_len:
                sentence = np.vstack((np.zeros((self.sentence_len - sentence.shape[0], self.word_len)),
                                      sentence))
            _w.append(sentence)
        w = np.asarray(_w)
        _y = pad_sequences(tags, maxlen=self.sentence_len)
        y = one_hot_sentence(_y, self.label_vocab_size)
        i = one_hot(intents, self.intent_size)
        self.vecs[f] = [x, w, i, y]
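# A minimal standalone sketch of the character-level padding above, assuming toy
# dimensions; `sentence_len`, `word_len`, and the shape of `words` mirror the
# method but are illustrative only.
import numpy as np
from keras.preprocessing.sequence import pad_sequences

sentence_len, word_len = 4, 5
words = [[1, 2], [3, 4, 5]]                     # one sentence: two words as char-id lists
chars = pad_sequences(words, maxlen=word_len)   # pad each word to word_len chars
chars = np.asarray(chars)[-sentence_len:]       # keep at most sentence_len words
if chars.shape[0] < sentence_len:               # pre-pad missing word rows with zeros
    chars = np.vstack((np.zeros((sentence_len - chars.shape[0], word_len)), chars))
print(chars.shape)  # (4, 5)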
def lstm(trainData, trainMark, testData, embedding_dim, embedding_matrix, maxlen, output_len):
    # Pad the data so every sequence has the same length.
    # pad_sequences returns a numpy array; longer sequences are truncated and shorter
    # ones are zero-padded (index 0 maps to the zero vector below, so this is safe).
    trainData = list(sequence.pad_sequences(trainData, maxlen=maxlen, dtype='float64'))
    testData = list(sequence.pad_sequences(testData, maxlen=maxlen, dtype='float64'))
    # Build the LSTM model.
    model = Sequential()  # a linear stack of layers; build it from a list of layers or via .add()
    # model.add(Dense(256, input_shape=(train_total_vova_len,)))  # fully connected input layer
    model.add(Embedding(len(embedding_matrix), embedding_dim, weights=[embedding_matrix],
                        mask_zero=False, input_length=maxlen))
    # The Embedding input layer maps high-dimensional one-hot indices to dense embeddings:
    # the first argument is the vocabulary size (max index + 1), the second the embedding dimension.
    model.add(LSTM(256))  # the core LSTM layer; its input dimension is the Embedding output dimension
    model.add(Dropout(0.5))  # randomly drop connections during updates to reduce overfitting
    model.add(Dense(output_len))  # fully connected output layer of dimension output_len
    model.add(Activation('softmax'))  # softmax activation on the output
    # Compile the model with categorical cross-entropy (log loss) and the SGD optimizer.
    model.compile(loss='categorical_crossentropy', optimizer='sgd')
    # Run the model; the arrays must be padded first, otherwise ragged lengths raise an error.
    X = np.array(list(trainData))  # inputs
    print("X:", X)
    Y = np.array(list(trainMark))  # labels
    print("Y:", Y)
    # batch_size: number of samples per gradient-descent batch
    # nb_epoch: number of epochs, i.e. full passes over the training data
    model.fit(X, Y, batch_size=200, nb_epoch=10)
    # X and Y are numpy arrays for a single input, or lists of numpy arrays for multiple inputs
    # Predict on the test data.
    A = np.array(list(testData))  # inputs
    print("A:", A)
    classes = model.predict(A)  # the predictions
    return classes
def pad_graph(gr, s0pad=s0pad, s1pad=s1pad):
    """ pad sequences in the graph """
    gr['si0'] = pad_sequences(gr['si0'], maxlen=s0pad, truncating='pre', padding='post')
    gr['si1'] = pad_sequences(gr['si1'], maxlen=s1pad, truncating='pre', padding='post')
    gr['f0'] = pad_3d_sequence(gr['f0'], maxlen=s0pad, nd=nlp.flagsdim)
    gr['f1'] = pad_3d_sequence(gr['f1'], maxlen=s1pad, nd=nlp.flagsdim)
    gr['score'] = np.array(gr['score'])
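# `pad_3d_sequence` is not defined in this snippet. A plausible minimal sketch,
# assuming it mirrors the si0/si1 treatment (truncate 'pre', pad 'post') for a
# list of (timesteps, nd) flag matrices:
import numpy as np

def pad_3d_sequence_sketch(seqs, maxlen, nd):
    out = np.zeros((len(seqs), maxlen, nd))
    for i, s in enumerate(seqs):
        s = np.asarray(s)[-maxlen:]   # truncate from the front ('pre')
        out[i, :s.shape[0], :] = s    # zero-pad at the end ('post')
    return out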
def bidirectional_lstm(X_train, y_train, X_test, y_test):
    X_train = sequence.pad_sequences(X_train, maxlen=max_len)
    X_test = sequence.pad_sequences(X_test, maxlen=max_len)
    lstm = LSTM(output_dim=64)
    gru = GRU(output_dim=64)  # the original example used 128; we halve it because the outputs are concatenated
    brnn = Bidirectional(forward=lstm, backward=gru)
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)
    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=max_len))
    model.add(brnn)  # try using another Bidirectional RNN inside the Bidirectional RNN. Inception meets callback hell.
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', class_mode="binary")
    # model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    print("Train...")
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=4,
              validation_data=(X_test, y_test), show_accuracy=True)
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, show_accuracy=True)
    print('Test score:', score)
    print('Test accuracy:', acc)
    pred_labels = model.predict_classes(X_test)
    # print(pred_labels)
    accuracy = accuracy_score(y_test, pred_labels)
    precision, recall, f1, supp = precision_recall_fscore_support(y_test, pred_labels, average='weighted')
    print(precision, recall, f1, supp)
    return accuracy, precision, recall, f1
def load_data(data_source):
    assert data_source in ["keras_data_set", "local_dir"], "Unknown data source"
    if data_source == "keras_data_set":
        (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words, start_char=None,
                                                              oov_char=None, index_from=None)
        x_train = sequence.pad_sequences(x_train, maxlen=sequence_length, padding="post", truncating="post")
        x_test = sequence.pad_sequences(x_test, maxlen=sequence_length, padding="post", truncating="post")
        vocabulary = imdb.get_word_index()
        vocabulary_inv = dict((v, k) for k, v in vocabulary.items())
        vocabulary_inv[0] = "<PAD/>"
    else:
        x, y, vocabulary, vocabulary_inv_list = data_helpers.load_data()
        vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
        y = y.argmax(axis=1)
        # Shuffle data
        shuffle_indices = np.random.permutation(np.arange(len(y)))
        x = x[shuffle_indices]
        y = y[shuffle_indices]
        train_len = int(len(x) * 0.9)
        x_train = x[:train_len]
        y_train = y[:train_len]
        x_test = x[train_len:]
        y_test = y[train_len:]
    return x_train, y_train, x_test, y_test, vocabulary_inv
def imdb_lstm():
    max_features = 20000
    maxlen = 80  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')
    print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features, 128, dropout=0.2))
    model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    # try using different optimizers and different optimizer configs
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print('Train...')
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=15,
              validation_data=(X_test, y_test))
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
    print('Test score:', score)
    print('Test accuracy:', acc)
def test_pad_sequences_vector():
    a = [[[1, 1]],
         [[2, 1], [2, 2]],
         [[3, 1], [3, 2], [3, 3]]]

    # test padding
    b = pad_sequences(a, maxlen=3, padding='pre')
    assert_allclose(b, [[[0, 0], [0, 0], [1, 1]],
                        [[0, 0], [2, 1], [2, 2]],
                        [[3, 1], [3, 2], [3, 3]]])
    b = pad_sequences(a, maxlen=3, padding='post')
    assert_allclose(b, [[[1, 1], [0, 0], [0, 0]],
                        [[2, 1], [2, 2], [0, 0]],
                        [[3, 1], [3, 2], [3, 3]]])

    # test truncating
    b = pad_sequences(a, maxlen=2, truncating='pre')
    assert_allclose(b, [[[0, 0], [1, 1]],
                        [[2, 1], [2, 2]],
                        [[3, 2], [3, 3]]])
    b = pad_sequences(a, maxlen=2, truncating='post')
    assert_allclose(b, [[[0, 0], [1, 1]],
                        [[2, 1], [2, 2]],
                        [[3, 1], [3, 2]]])

    # test value
    b = pad_sequences(a, maxlen=3, value=1)
    assert_allclose(b, [[[1, 1], [1, 1], [1, 1]],
                        [[1, 1], [2, 1], [2, 2]],
                        [[3, 1], [3, 2], [3, 3]]])
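# A quick check of the related default behavior, assuming keras is installed:
# with maxlen omitted, pad_sequences pre-pads every sequence to the longest one.
from keras.preprocessing.sequence import pad_sequences

print(pad_sequences([[1], [1, 2], [1, 2, 3]]))
# [[0 0 1]
#  [0 1 2]
#  [1 2 3]]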
def _fit_internal(self, X_train, y_train):
    if self.X_train is None:
        self.X_train = sequence.pad_sequences(X_train, maxlen=self.padding_size)
    else:
        self.X_train = numpy.vstack((self.X_train,
                                     sequence.pad_sequences(X_train, maxlen=self.padding_size)))
    self.y_train = numpy.append(self.y_train, y_train)
    print(self.X_train.shape, self.y_train.shape)
def prepare_lstm_data(train, test, filter_fn=None):
    def prepare_numeric(data):
        X = []
        y = []
        names = []
        for card in data:
            X.append(np.concatenate((card.types, card.colors,
                                     [card.power, card.toughness, card.loyalty])))
            y.append(card.cost)
            names.append(card.name)
        return np.asarray(X), np.asarray(y), np.asarray(names)

    if filter_fn:
        train = list(filter(filter_fn, train))
        test = list(filter(filter_fn, test))
    X_train_text = [card.tokens for card in train]
    X_test_text = [card.tokens for card in test]
    X_train_text = sequence.pad_sequences(np.asarray(X_train_text), MAX_LEN)
    X_test_text = sequence.pad_sequences(X_test_text, MAX_LEN)
    X_train_numeric, y_train, _ = prepare_numeric(train)
    X_test_numeric, y_test, y_test_names = prepare_numeric(test)
    # Combine text + numeric data
    X_train = [np.asarray(a) for a in [X_train_text, X_train_numeric]]
    X_test = [np.asarray(a) for a in [X_test_text, X_test_numeric]]
    return X_train, np.asarray(y_train), X_test, np.asarray(y_test), y_test_names
def process_format_model_in(in_out_pairs, max_len, batch_size, pad='pre', cut='pre'):
    """
    Format the input/output pairs to match the model's expected input.
    :param in_out_pairs: [(s1, s2, label), (word id list, list, str), ...]
    :param max_len: length of the longest (tokenized) sequence
    :param batch_size:
    :param pad:
    :param cut:
    :return: ({'source1': S1, 'source2': S2}, y)
        S1.shape == S2.shape: 2d numpy array
        y.shape == (in_out_pairs len, vocab_size+1)
    """
    S1 = []
    S2 = []
    y = []
    for in_out_pair in in_out_pairs:
        S1.append(in_out_pair[0])
        S2.append(in_out_pair[1])
        y.append(int(in_out_pair[2]))
    # lists of lists => 2d numpy array
    S1 = pad_sequences(S1, maxlen=max_len, padding=pad, truncating=cut)
    S2 = pad_sequences(S2, maxlen=max_len, padding=pad, truncating=cut)
    # binary classification problem
    y = np.asarray(y, dtype=np.int16).reshape(batch_size, 1)
    return {'source1': S1, 'source2': S2}, y
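# A toy invocation of process_format_model_in, assuming two word-id sequence
# pairs with string labels as the docstring describes (imports as above):
pairs = [([1, 2, 3], [4, 5], '1'),
         ([6], [7, 8, 9], '0')]
inputs, y = process_format_model_in(pairs, max_len=4, batch_size=2)
print(inputs['source1'].shape, inputs['source2'].shape, y.shape)  # (2, 4) (2, 4) (2, 1)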
def imdb_test():
    # set parameters:
    max_features = 5000  # vocabulary size
    maxlen = 200  # pad/truncate reviews to this length
    batch_size = 16
    nb_epoch = 10

    print('Loading data...')
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)
    print("Pad sequences (samples x time)")
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    nb_classes = 2
    y_train = np_utils.to_categorical(y_train, nb_classes)
    y_test = np_utils.to_categorical(y_test, nb_classes)

    model = imdb_cnn()
    plot(model, to_file='./images/imdb_model.png')
    # try using different optimizers and different optimizer configs
    # model.compile(loss='binary_crossentropy', optimizer='adagrad', class_mode="binary")
    model.compile(loss='categorical_crossentropy', optimizer='adagrad')

    print("Train...")
    early_stopping = EarlyStopping(monitor='val_loss', patience=5)
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              validation_data=(X_test, y_test), show_accuracy=True, callbacks=[early_stopping])
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, show_accuracy=True)
    print('Test score:', score)
    print('Test accuracy:', acc)
def run_keras_cnn_example():
    # set parameters:
    max_features = 5000
    maxlen = 100
    batch_size = 32
    embedding_dims = 100
    nb_filter = 250
    filter_length = 3
    hidden_dims = 250
    nb_epoch = 2

    print('Loading data...')
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')

    print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    print('Build model...')
    model = Sequential()

    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
    model.add(Dropout(0.25))

    # we add a Convolution1D, which will learn nb_filter
    # word group filters of size filter_length:
    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length,
                            border_mode='valid',
                            activation='tanh',
                            subsample_length=1))
    # we use standard max pooling (halving the output of the previous layer):
    model.add(MaxPooling1D(pool_length=2))

    # We flatten the output of the conv layer,
    # so that we can add a vanilla dense layer:
    model.add(Flatten())

    # We add a vanilla hidden layer:
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.25))
    model.add(Activation('tanh'))

    # We project onto a single unit output layer, and squash it with a sigmoid:
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='rmsprop', class_mode='binary')
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              show_accuracy=True, validation_data=(X_test, y_test))
def build(self, vocabulary, q_length, a_length):
    # list comprehensions instead of map(), so pad_sequences gets real lists under Python 3
    self.xq_data = [[vocabulary[x] for x in terms] for terms in self.xq_data]
    self.xa_data = [[vocabulary[x] for x in terms] for terms in self.xa_data]
    self.xq_np = sequence.pad_sequences(self.xq_data, maxlen=q_length)
    self.xa_np = sequence.pad_sequences(self.xa_data, maxlen=a_length)
    self.y_np = np.array(self.labels)
    self.built = True
def main():
    top_words = 5000  # Keep only the most frequent 5000 words in the dataset.
    (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=top_words)
    # Keras requires the same length for every sequence (0 means no information).
    max_review_length = 500
    X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
    embedding_length = 32
    input_seq = Input(shape=(max_review_length,))
    a = Embedding(top_words, embedding_length, input_length=max_review_length)(input_seq)
    b, state_h, state_c = LSTM(100, return_state=True, return_sequences=True)(a)
    c = AttentionLayerV2(attention_depth=4)(b)
    d = Dropout(0.5)(c)
    e = Dense(1, activation='sigmoid')(d)
    model = Model(inputs=[input_seq], outputs=[e])
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    model.summary()
    # print(model.predict(np.ones((10, 500))))
    model.fit(X_train, y_train, epochs=5, batch_size=64)
    # Final evaluation of the model
    scores = model.evaluate(X_test, y_test, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1] * 100))
    model.save_weights('model_weights.h5')
def generate_training_batches():
    index = 0  # the state of index persists between yields because this is a generator
    while True:
        remaining = len(train_input_doc) - index
        input_slice = []
        target_slice = []
        if remaining >= batch_size:
            input_doc_slice = train_input_doc[index:(index + batch_size)]
            input_query_slice = train_input_query[index:(index + batch_size)]
            target_slice = train_target_word[index:(index + batch_size)]
            index += batch_size
        else:
            input_doc_slice = train_input_doc[index:]
            input_doc_slice += train_input_doc[:(batch_size - remaining)]
            input_query_slice = train_input_query[index:]
            input_query_slice += train_input_query[:(batch_size - remaining)]
            target_slice = train_target_word[index:]
            target_slice += train_target_word[:(batch_size - remaining)]
            index = batch_size - remaining
        x_train_doc = sequence.pad_sequences(input_doc_slice, maxlen=maxdoclen)
        x_train_query = sequence.pad_sequences(input_query_slice, maxlen=maxquerylen)
        x_train = np.concatenate((x_train_doc, x_train_query), axis=1)
        y_train = np.zeros((batch_size, vocab_size))
        y_train[np.arange(batch_size), np.array(target_slice)] = 1
        yield {'input': x_train, 'output': y_train}
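# How such a generator would typically be consumed - a sketch, assuming the
# old Keras Graph-style API implied by the yielded {'input': ..., 'output': ...}
# dicts; `model` and the epoch settings here are assumptions, not part of the
# original snippet:
model.fit_generator(generate_training_batches(),
                    samples_per_epoch=len(train_input_doc),
                    nb_epoch=5)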
def lstm_model(X_train, y_train, X_test, y_test):
    X_train = sequence.pad_sequences(X_train, maxlen=max_len, padding='post')
    X_test = sequence.pad_sequences(X_test, maxlen=max_len, padding='post')
    print(X_train.shape, y_train.shape)
    print(X_test.shape, y_test.shape)
    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=max_len))
    model.add(LSTM(128))  # try using a GRU instead, for fun
    model.add(Dropout(0.5))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', class_mode="binary")
    print("Train...")
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=4,
              validation_data=(X_test, y_test), show_accuracy=True)
    # evaluate returns (loss, accuracy), in that order
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, show_accuracy=True)
    print('Test score:', score)
    print('Test accuracy:', acc)
    pred_labels = model.predict_classes(X_test)
    # print(pred_labels)
    accuracy = accuracy_score(y_test, pred_labels)
    precision, recall, f1, supp = precision_recall_fscore_support(y_test, pred_labels, average='weighted')
    print(precision, recall, f1, supp)
    return accuracy, precision, recall, f1
def evaluate_recurrent_model(dataset, num_classes):
    (X_train, Y_train), (X_test, Y_test) = dataset
    max_features = 20000
    maxlen = 125  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')
    print("Pad sequences (samples x time) with maxlen %d" % maxlen)
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features, 128, input_length=maxlen))
    model.add(GRU(512))  # try using an LSTM instead, for fun
    model.add(Dropout(0.5))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    # try using different optimizers and different optimizer configs
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    print("Train...")
    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=15,
              validation_data=(X_test, Y_test), show_accuracy=True)
    score, acc = model.evaluate(X_test, Y_test, batch_size=batch_size, show_accuracy=True)
    if verbose:
        print('Test score:', score)
        print('Test accuracy:', acc)
    return acc
def generate_test_batches():
    index = 0
    while True:
        remaining = len(test_input_doc) - index
        input_slice = []
        target_slice = []
        if remaining >= batch_size:
            input_doc_slice = test_input_doc[index:(index + batch_size)]
            input_query_slice = test_input_query[index:(index + batch_size)]
            target_slice = test_target_word[index:(index + batch_size)]
            index += batch_size
        else:
            input_doc_slice = test_input_doc[index:]
            input_doc_slice += test_input_doc[:(batch_size - remaining)]
            input_query_slice = test_input_query[index:]
            input_query_slice += test_input_query[:(batch_size - remaining)]
            target_slice = test_target_word[index:]
            target_slice += test_target_word[:(batch_size - remaining)]
            index = batch_size - remaining
        x_test_doc = sequence.pad_sequences(input_doc_slice, maxlen=maxdoclen)
        x_test_query = sequence.pad_sequences(input_query_slice, maxlen=maxquerylen)
        x_test = np.concatenate((x_test_doc, x_test_query), axis=1)
        y_test = np.zeros((batch_size, vocab_size))
        y_test[np.arange(batch_size), np.array(target_slice)] = 1
        yield {'input': x_test, 'output': y_test}
def test_dan_original():
    max_features = 20000
    maxlen = 100  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32

    print("Loading data...")
    (X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features, test_split=0.2)
    print(len(X_train), "train sequences")
    print(len(X_test), "test sequences")

    print("Pad sequences (samples x time)")
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)

    model = dan_original(max_features)
    # try using different optimizers and different optimizer configs
    model.compile(loss="binary_crossentropy", optimizer="adagrad", class_mode="binary")

    print("Train...")
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=3,
              validation_data=(X_test, y_test), show_accuracy=True)
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size, show_accuracy=True)
    print("Test score:", score)
    print("Test accuracy:", acc)
def evaluate_conv_model(dataset, num_classes, maxlen=125, embedding_dims=250, max_features=5000,
                        nb_filter=300, filter_length=3, num_hidden=250, dropout=0.25,
                        verbose=True, pool_length=2, with_lstm=False):
    (X_train, Y_train), (X_test, Y_test) = dataset
    batch_size = 32
    nb_epoch = 5
    if verbose:
        print('Loading data...')
        print(len(X_train), 'train sequences')
        print(len(X_test), 'test sequences')
        print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    if verbose:
        print('X_train shape:', X_train.shape)
        print('X_test shape:', X_test.shape)
        print('Build model...')
    model = Sequential()
    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
    model.add(Dropout(dropout))
    # we add a Convolution1D, which will learn nb_filter
    # word group filters of size filter_length:
    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length,
                            border_mode='valid',
                            activation='relu',
                            subsample_length=1))
    if pool_length:
        # we use standard max pooling (halving the output of the previous layer):
        model.add(MaxPooling1D(pool_length=2))
    if with_lstm:
        model.add(LSTM(125))
    else:
        # We flatten the output of the conv layer,
        # so that we can add a vanilla dense layer:
        model.add(Flatten())
        # We add a vanilla hidden layer:
        model.add(Dense(num_hidden))
        model.add(Activation('relu'))
        model.add(Dropout(dropout))
    # We project onto num_classes output units and squash them with a softmax:
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              show_accuracy=True, validation_split=0.1)
    score = model.evaluate(X_test, Y_test, batch_size=batch_size,
                           verbose=1 if verbose else 0, show_accuracy=True)
    if verbose:
        print('Test score:', score[0])
        print('Test accuracy:', score[1])
    predictions = model.predict_classes(X_test, verbose=1 if verbose else 0)
    return predictions, score[1]
def seq_driver(vocab, postIndexs, commentIndexs):
    vocab = ['0'] + vocab + ['UNK', 'END']
    # index 0 is the padding feature; will this feature be a problem in the last fully connected layer?
    max_features = len(vocab)
    # for i in range(max_features):
    #     print(i, vocab[i])
    maxPostLen = max(map(len, (x for x in postIndexs)))
    maxCommentLen = max(map(len, (x for x in commentIndexs)))
    maxlen = max(maxPostLen, maxCommentLen)
    X = pad_sequences(postIndexs, maxlen, 'int32', 'post', 'post')
    Y = pad_sequences(commentIndexs, maxlen, 'int32', 'post', 'post')
    # Y = pad_sequences(commentIndexs, maxlen)
    # print('after padding')
    # batch_test(X, Y, max_features, maxlen)

    def to_one_hot(id):
        zeros = [0] * max_features
        zeros[id] = 1
        return zeros

    xs = np.asarray(X)
    Y = [[to_one_hot(i) for i in seq] for seq in Y]
    ys = np.asarray(Y)
    print('maxfeature, maxlen: ', max_features, maxlen)
    print("XS Shape: ", xs.shape)
    print("YS Shape: ", ys.shape)
    seq2seq(xs, ys, max_features, maxlen, vocab)
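# The nested to_one_hot loop above can be vectorized. A minimal numpy sketch,
# assuming Y is the (batch, maxlen) int array produced by pad_sequences and
# every id is < max_features:
import numpy as np

Y = np.array([[1, 2, 0], [3, 0, 0]])
max_features = 4
ys = np.eye(max_features)[Y]  # one-hot over the last axis
print(ys.shape)  # (2, 3, 4)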
def TrainModel_Data(X, Y):
    X_train = sequence.pad_sequences(np.array(X), maxlen=maxlen)
    X_test = sequence.pad_sequences(np.array(X[:100]), maxlen=maxlen)
    y_train = np.array(Y)
    y_test = np.array(Y[:100])
    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features, embedding_size, input_length=maxlen))
    model.add(Dropout(0.25))
    model.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length,
                            border_mode='valid',
                            activation='relu',
                            subsample_length=nb_classes))
    model.add(MaxPooling1D(pool_length=pool_length))
    model.add(LSTM(lstm_output_size))
    model.add(Dense(nb_classes))
    model.add(Activation('sigmoid'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    print('Train...')
    model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch,
              validation_data=(X_test, y_test))
    score = model.evaluate(X_test, y_test)
    print('Test score:', score)
    SaveModel(model)
    return model
def smartpadding(X, word_idx, max_sentence_length):
    newX = []
    for i, story in enumerate(X):
        sentences = splitter([word_idx["."], word_idx["?"]], list(story))
        sentences = filter(lambda a: a != [], sentences)
        new_sentence = []
        # print(i)
        # print(story)
        # print(sentences)
        for sentence in sentences:
            if sentence == []:
                continue
            sentence = np.array(sentence)
            # print(sentence[-1])
            s = sentence[sentence > 0]
            if max_sentence_length < len(s):
                print("Sentence exceeds max_sentence_length:", len(s), max_sentence_length)
            s = pad_sequences([s], maxlen=max_sentence_length, padding="pre")[0]
            # print(s.shape)
            # print(s)
            new_sentence.extend(s)
        newX.append(new_sentence)
    print("maxlen", max(map(len, newX)))
    newX = pad_sequences(newX, maxlen=max(map(len, newX)), padding="pre")
    print(newX.shape)
    return newX
def train():
    # load the dataset but only keep the top n words, zero the rest
    (X_train, Y_train), (X_test, Y_test) = imdb.load_data(nb_words=top_words)
    # truncate and pad input sequences
    X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
    X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)
    # create the model
    embedding_vector_length = 32
    model = Sequential()
    model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
    model.add(Dropout(0.2))
    model.add(Convolution1D(nb_filter=32, filter_length=3, border_mode='same', activation='relu'))
    model.add(MaxPooling1D(pool_length=2))
    model.add(LSTM(100))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    model.fit(X_train, Y_train, validation_data=(X_test, Y_test), nb_epoch=2, batch_size=64)
    # Final evaluation of the model
    scores = model.evaluate(X_test, Y_test, verbose=0)
    print("Accuracy: %.2f%%" % (scores[1] * 100))
    model.save("imdb_%0.2f.pkl" % scores[1])
def vectorize_data(filenames, maxlen=100, max_charlen=20, output_label_size=6,
                   output_label_dict=None, output_type="boundary", return_chars=False):
    assert output_label_dict is not None, "The output label dictionary should be specified before vectorizing data"
    X = []
    X_char = []
    Y = []
    for i, filename in enumerate(filenames):
        for docid, doc in pp.get_documents(filename):
            for seq in pp.get_sequences(doc):
                x = []
                x_char = []
                y = []
                for token in seq:
                    x.append(1 + token.word_index)  # Add 1 to include token for padding
                    if return_chars:
                        x_char.append((1 + np.array(token.char_seq)).tolist())  # Add 1 to include token for padding
                    if output_type == "category":
                        y_idx = 1 + output_label_dict.get(token.c_label, -1)  # Add 1 to include token for padding
                    else:
                        y_idx = 1 + output_label_dict.get(token.b_label, -1)  # Add 1 to include token for padding
                    y.append(y_idx)
                X.append(x)
                if return_chars:
                    padded_sequence = pad_sequences([[] for k in range(maxlen - len(x_char))],
                                                    maxlen=max_charlen).tolist() + \
                                      pad_sequences(x_char[:maxlen], maxlen=max_charlen).tolist()
                    X_char.append(padded_sequence)
                Y.append(y)
    X = pad_sequences(X, maxlen=maxlen)
    Y = pad_sequences(Y, maxlen=maxlen)
    X = np.array(X)
    Y = vtu.to_onehot(Y, output_label_size)
    if return_chars:
        return X, Y, np.array(X_char)
    return X, Y
def extract_representation(maxlen, settings, model=None, seed=107, test_split=0.2):
    # load data
    save_block, save_sep, yes_dir, no_dir = get_file_paths(train=True)
    (X, y) = ISTapps.load_ISTapps(maxlen, separate=True, save_file=save_block,
                                  yes_directory=yes_dir, no_directory=no_dir, seed=seed)
    X = np.asarray(X)
    y = np.asarray(y)
    split_point = int(len(X) * (1 - test_split))
    X_train_prime = X[:split_point]
    y_train_prime = y[:split_point]
    X_test_prime = X[split_point:]
    y_test_prime = y[split_point:]
    # convert train set into a huge block of sequences and shuffle again
    (X_train, y_train) = ISTapps.extract_from_apps(X_train_prime, y_train_prime, maxlen, seed, shuffle=True)
    # convert model for LSTM success comparison
    (X_test, y_test) = ISTapps.extract_from_apps(X_test_prime, y_test_prime, maxlen, seed, shuffle=True)
    if not model:
        model = tune_model(X_train, y_train, X_test, y_test, settings)
    # return training data as a shuffled sentence block and separated by app
    X_train_block = model.predict(X_train)
    X_train_rep = [model.predict(sequence.pad_sequences(app, maxlen)) for app in X_train_prime]
    X_test_rep = [model.predict(sequence.pad_sequences(app, maxlen)) for app in X_test_prime]
    assert len(X_train_rep) == len(y_train_prime)
    assert len(X_test_rep) == len(y_test_prime)
    return (X_train_block, y_train), (X_train_rep, y_train_prime), (X_test_rep, y_test_prime), model
def vectorize(self, slist, emb, spad=60):
    """ build an spad-ed matrix of word indices from a list of token sequences;
    returns an (si, sj) tuple of indices in vocab and emb respectively """
    silist = []
    sjlist = []
    for s in slist:
        si = []
        sj = []
        for t in s:
            if self.icase:
                t = t.lower()
            if t in self.word_idx:
                si.append(self.word_idx[t])
                sj.append(0)
            elif emb is not None and t in emb.w:
                si.append(0)
                sj.append(emb.w[t])
            else:
                si.append(1)  # OOV
                sj.append(0)
        silist.append(si)
        sjlist.append(sj)
    if spad is not None:
        return (pad_sequences(silist, maxlen=spad, truncating='post', padding='post'),
                pad_sequences(sjlist, maxlen=spad, truncating='post', padding='post'))
    else:
        return (silist, sjlist)
def fitmodel(model, X_train, X_test, y_train, y_test, epoch=30):
    X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
    score = model.evaluate(X_test, y_test, batch_size=32)
    print("score before fit: ", score)
    model.fit(X_train, y_train, batch_size=32, nb_epoch=epoch)
    score = model.evaluate(X_test, y_test, batch_size=32)
    print("score after fit: ", score)
def preprocessingCaption(_cap, wordtoidx):
    _cap = [[wordtoidx[word] for word in cap.lower().split(' ')[:-1] if word in wordtoidx]
            for cap in _cap]
    max_steps = max(len(c) for c in _cap)  # number of tokens in the longest caption, e.g. 79
    _cap = Sequence.pad_sequences(_cap, maxlen=max_steps + 1, padding='post')  # ndarray
    _cap = Sequence.pad_sequences(_cap, maxlen=max_steps + 2, padding='pre')
    return _cap, max_steps
def pad_sentences(question1_word_sequences, question2_word_sequences, is_duplicate, maxlen):
    q1_data = pad_sequences(question1_word_sequences, maxlen=maxlen)
    q2_data = pad_sequences(question2_word_sequences, maxlen=maxlen)
    labels = np.array(is_duplicate, dtype=int)
    print('Shape of question1 data tensor:', q1_data.shape)
    print('Shape of question2 data tensor:', q2_data.shape)
    print('Shape of label tensor:', labels.shape)
    return q1_data, q2_data, labels
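# A toy call of pad_sentences, assuming two tokenized question pairs and
# binary duplicate labels:
q1 = [[1, 2, 3], [4, 5]]
q2 = [[6, 7], [8, 9, 10, 11]]
q1_data, q2_data, labels = pad_sentences(q1, q2, [1, 0], maxlen=5)
# prints shapes (2, 5), (2, 5) and (2,)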
def _process(self, X_temp, indexes):
    data_ids = self.tokenizer.texts_to_sequences(X_temp)
    max_length = self.max_length
    batch_x = sequence.pad_sequences(data_ids, maxlen=max_length, padding='post')
    batch_y = self.labels[indexes]
    return batch_x, batch_y
                       for seq in hindi_seqlist]

with open(os.path.join(base_dir, "model", model_name + "_hindi2index.pickle"), mode="wb") as file:
    pickle.dump(hindi2index, file)
with open(os.path.join(base_dir, "model", model_name + "_eng2index.pickle"), mode="wb") as file:
    pickle.dump(eng2index, file)

# max sequence length
max_len = 40

# sequence padding
eng_padded_seq = pad_sequences(maxlen=max_len, sequences=encoded_eng_seqlist,
                               padding="post", value=eng2index[" "])
hindi_padded_seq = pad_sequences(maxlen=max_len, sequences=encoded_hindi_seqlist,
                                 padding="post", value=hindi2index[" "])

# one-hot encoding of the hindi sequences
y = [to_categorical(seq, num_classes=len(hindi2index)) for seq in hindi_padded_seq]

eng_train, eng_test, y_train, y_test = train_test_split(eng_padded_seq, y, test_size=0.05)
word2vec_model.save('my_word2vec_model_256_false.model')'''

x_train = []
for comment in table_x:
    s = comment[0].split(',', 1)[1]
    s_cut = jieba.cut(s)
    temp = []
    for word in s_cut:
        if word in my_word2vec_model.wv.vocab:
            temp.append(my_word2vec_model[word])
        # else: temp.append(my_word2vec_model["oov"])
    x_train.append(temp)
x_train = pad_sequences(x_train, maxlen=48, dtype='int32', padding='post',
                        truncating='post', value=my_word2vec_model[" "])

########################## building model #################################################
model = Sequential()
model.add(
    LSTM(256,
         return_sequences=True,
         input_length=48,
         input_dim=256,
         dropout=0.5,
         recurrent_dropout=0.5,
         kernel_initializer='he_normal'))
model.add(
    LSTM(256,
         return_sequences=False,
encoded_labels = t.texts_to_sequences(y)
test_encoded_docs = t.texts_to_sequences(test_x)
test_encoded_labels = t.texts_to_sequences(test_y)
word_index = t.word_index
index_word = {v: k for k, v in word_index.items()}

def decode_sequence(seq):
    decoded = ""
    for s in seq:
        if not s == 0:
            decoded = decoded + index_word[int(s)] + " "
    return decoded.strip()

# pad documents to a max length of max_len words (max_len should be the maximum sequence length)
padded_docs = pad_sequences(encoded_docs, maxlen=max_len, padding='post')
padded_labels = pad_sequences(encoded_labels, maxlen=max_len, padding='post')
test_padded_docs = pad_sequences(test_encoded_docs, maxlen=max_len, padding='post')
test_padded_labels = pad_sequences(test_encoded_labels, maxlen=max_len, padding='post')

x = padded_docs
y = padded_labels
test_x = test_padded_docs
test_y = test_padded_labels

def eval_batch(x_train, y_train, x_test, y_test, classifier, components, no_clusters, dimensionality):
    cluster_finder = cluster.KMeans(n_clusters=no_clusters)
    if classifier == 'mbk':
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

# Removing numbers
train['text'] = train['text'].apply(lambda x: remove_numbers(x))
test['text'] = test['text'].apply(lambda x: remove_numbers(x))

## Tokenize the sentences
tokenizer = Tokenizer(num_words=MAX_WORD_TO_USE)
tokenizer.fit_on_texts(list(train['text']))
train_X = tokenizer.texts_to_sequences(train['text'])
test_X = tokenizer.texts_to_sequences(test['text'])

## Pad the sentences
train_X = pad_sequences(train_X, maxlen=MAX_LEN)
test_X = pad_sequences(test_X, maxlen=MAX_LEN)

# Converting the target to one-hot format
train_y = pd.get_dummies(train['label']).values
test_y = pd.get_dummies(test['label']).values

# words_dict is a dictionary like this:
# words_dict = {'the': 5, 'among': 20, 'interest': 578}
# words_dict maps words to their corresponding indices.
words_dict = tokenizer.word_index

# Present working directory
working_dir = os.getcwd()
EMBEDDING_FILE = '../glove.6B.{}d.txt'.format(EMBED_SIZE)
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)
onehotencoder = OneHotEncoder(sparse=False)
y_train = onehotencoder.fit_transform(y_train.reshape(-1, 1))
y_test = onehotencoder.fit_transform(y_test.reshape(-1, 1))

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(dataDF['text'])
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
vocab_size = len(tokenizer.word_index) + 1

maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

embedding_dim = 300
embedding_matrix = create_embedding_matrix(glove_model, tokenizer.word_index, embedding_dim)

model = Sequential()
model.add(
    Embedding(vocab_size,
              embedding_dim,
              weights=[embedding_matrix],
              input_length=maxlen,
              trainable=True))
model.add(layers.GlobalMaxPool1D())
token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
indice_token = {token_indice[k]: k for k in token_indice}

# max_features is the highest integer that can appear in the dataset.
max_features = np.max(list(indice_token.keys())) + 1

# Augmenting X_train and X_test with n-gram features
X_train = add_ngram(X_train, token_indice, ngram_range)
X_test = add_ngram(X_test, token_indice, ngram_range)
print('Average train sequence length: {}'.format(np.mean(list(map(len, X_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, X_test)), dtype=int)))

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features, embedding_dims, input_length=maxlen))

# we add a GlobalAveragePooling1D, which will average the embeddings
# of all words in the document
model.add(GlobalAveragePooling1D())
                     seed=1234, iter=25)
w2v_model.save('w2v_model.pkl')

tokenizer = Tokenizer(num_words=len(word_set))
tokenizer.fit_on_texts(corpus)
train_q1 = tokenizer.texts_to_sequences(train_q1)
train_q2 = tokenizer.texts_to_sequences(train_q2)
test_q1 = tokenizer.texts_to_sequences(test_q1)
test_q2 = tokenizer.texts_to_sequences(test_q2)
dev_q1 = tokenizer.texts_to_sequences(dev_q1)
dev_q2 = tokenizer.texts_to_sequences(dev_q2)

train_pad_q1 = pad_sequences(train_q1, maxlen=MAX_SEQUENCE_LENGTH)
train_pad_q2 = pad_sequences(train_q2, maxlen=MAX_SEQUENCE_LENGTH)
test_pad_q1 = pad_sequences(test_q1, maxlen=MAX_SEQUENCE_LENGTH)
test_pad_q2 = pad_sequences(test_q2, maxlen=MAX_SEQUENCE_LENGTH)
dev_pad_q1 = pad_sequences(dev_q1, maxlen=MAX_SEQUENCE_LENGTH)
dev_pad_q2 = pad_sequences(dev_q2, maxlen=MAX_SEQUENCE_LENGTH)

embedding_matrix = np.zeros([len(tokenizer.word_index) + 1, EMB_DIM])
for word, idx in tokenizer.word_index.items():
    try:
        embedding_matrix[idx, :] = w2v_model.wv[word]
    except KeyError:
        # word missing from the word2vec vocabulary; its row stays zero
        print('no embedding for', word)
labels = np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1])

# Tokenization
token = Tokenizer()
token.fit_on_texts(docs)
print(token.word_index)
# {'너무': 1, '참': 2, '재밌어요': 3, '최고에요': 4, '잘': 5, '만든': 6, '영화에요': 7, '추천하고': 8, '싶은': 9, '영화': 10, '입니다': 11,
#  '한번': 12, '더': 13, '보고': 14, '싶네요': 15, '글쎄요': 16, '별로에요': 17,
#  '생각보다': 18, '지루해요': 19, '연기가': 20, '어색해요': 21, '재미없어요': 22, '재미없다': 23, '재밌네요': 24}
# Frequent words get the lower indices.

x = token.texts_to_sequences(docs)
print(x)
# [[1, 3], [4], [2, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15], [16], [17], [18, 19], [20, 21], [22], [1, 23], [2, 24]]

pad_x = pad_sequences(x, padding='pre', value=0)
print(pad_x)

word_size = len(token.word_index) + 1
print(word_size)

model = Sequential()
model.add(Embedding(25, 10, input_length=4))
model.add(Conv1D(10, 2))
model.add(Conv1D(10, 2))
model.add(MaxPool1D())
# model.add(Embedding(word_size, 10, input_length=4))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.summary()
def train_LSTM_Cross_Domain(tweets_train, tweets_test, vocab, MAX_SEQUENCE_LENGTH):
    a, p, r, f1 = 0., 0., 0., 0.
    a1, p1, r1, f11 = 0., 0., 0., 0.
    pn, rn, fn = 0., 0., 0.
    sentence_len = MAX_SEQUENCE_LENGTH
    batch_size = 128
    X_train, y_train = gen_sequence(tweets_train, vocab, 'categorical')
    X_test, y_test = gen_sequence(tweets_test, vocab, 'binary')
    X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)
    X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)
    y_train = np.array(y_train)
    y_train = y_train.reshape((len(y_train), 1))
    X_temp = np.hstack((X_train, y_train))
    model = lstm_model(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
    if INITIALIZE_WEIGHTS_WITH == "glove":
        weights = get_embedding_weights(vocab)
        model.layers[0].set_weights([weights])
    elif INITIALIZE_WEIGHTS_WITH == "random":
        shuffle_weights(model)
    else:
        print("ERROR!")
        return
    for epoch in range(EPOCHS):
        for X_batch in batch_gen(X_temp, BATCH_SIZE):
            x = X_batch[:, :sentence_len]
            y_temp = X_batch[:, sentence_len]
            try:
                y_temp = np_utils.to_categorical(y_temp, num_classes=3)
            except Exception as e:
                print(e)
            # print(x.shape, y_temp.shape)
            loss, acc = model.train_on_batch(x, y_temp, class_weight=None)
            # print(loss, acc)
    temp = model.predict_on_batch(X_test)
    y_pred_aux = np.argmax(temp, axis=1)
    y_pred = []
    for i in y_pred_aux:
        if i == 2:
            y_pred.append(1)
        else:
            y_pred.append(i)
    # print(classification_report(y_test, y_pred))
    # print(precision_recall_fscore_support(y_test, y_pred))
    wordEmb = model.layers[0].get_weights()[0]
    word2vec_model = create_model(wordEmb, vocab)
    tweets_train = select_tweets_whose_embedding_exists(tweets_train, word2vec_model)
    tweets_test = select_tweets_whose_embedding_exists(tweets_test, word2vec_model)
    X_train, y_train = gen_data(tweets_train, word2vec_model, 'categorical')
    X_test, y_test = gen_data(tweets_test, word2vec_model, 'binary')
    precision, recall, f1_score, acc, p_weighted, p_macro, r_weighted, r1_macro, f1_weighted, f11_macro = \
        gradient_boosting_classifier(X_train, y_train, X_test, y_test, 'cross')
    a += acc
    p += p_weighted
    p1 += p_macro
    r += r_weighted
    r1 += r1_macro
    f1 += f1_weighted
    f11 += f11_macro
    pn += precision
    rn += recall
    fn += f1_score
    print_scores(p, p1, r, r1, f1, f11, pn, rn, fn, 1)
i = 0
y_valid = np.zeros((len(label_valid), max(label_valid) + 1))
for x in label_valid:
    y_valid[i][x] = 1
    i = i + 1

t = Tokenizer()
t.fit_on_texts(input_train)
vocab_size = len(t.word_index) + 1

# integer encode the documents
encoded_docs = t.texts_to_sequences(input_train)
# print(encoded_docs)

# pad documents to the maximum document length
max_length = max(len_finder)
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# print(padded_docs)

# load the whole embedding into memory
embeddings_index = dict()
f = open("G:\\NLP\\Dataset\\GloVe\\glove.6B.100d.txt", encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
# print('Loaded %s word vectors.' % len(embeddings_index))

# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
# Create a tokenizer
#==============================================================================
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, lower=True)
tokenizer.fit_on_texts(docs)
sequences = tokenizer.texts_to_sequences(docs)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# convert text to sequences of tokens and pad them to ensure equal-length vectors
x = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

#==============================================================================
# Training, testing and validation
#==============================================================================
seed = 1000
x_train, x_test, y_train, y_test = train_test_split(x, dummy_y, train_size=0.7, random_state=seed)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, train_size=0.7, random_state=seed)
T = list(itertools.chain(*T))

# Generate a dictionary of valid characters
valid_chars = {x: idx + 1 for idx, x in enumerate(set(''.join(X + T)))}

max_features = len(valid_chars) + 1
maxlen = np.max([len(x) for x in X])
print(maxlen)

# Convert characters to ints and pad
X1 = [[valid_chars[y] for y in x] for x in X]
T1 = [[valid_chars[y] for y in x] for x in T]
X_train = sequence.pad_sequences(X1, maxlen=maxlen)
X_test = sequence.pad_sequences(T1, maxlen=maxlen)
y_train = np.array(trainlabel)
y_test = np.array(testlabel)

hidden_dims = 128
nb_filter = 128
filter_length = 2
embedding_vector_length = 128
pool_length = 2
lstm_output_size = 70

model = Sequential()
model.add(Embedding(max_features, embedding_vector_length, input_length=maxlen))
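# A standalone sketch of the character-vocabulary step above, assuming two toy
# domain strings; index 0 stays reserved for padding:
X = ['abc.com', 'xy.org']
valid_chars = {c: idx + 1 for idx, c in enumerate(set(''.join(X)))}
X1 = [[valid_chars[ch] for ch in s] for s in X]  # ragged int lists, ready for pad_sequences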
# 1-b set test and train
train_x = train['x']
train_y = train['y']
test_x = test['x']
test_y = test['y']

# 2-1 Tokenize the data
from keras.preprocessing.text import Tokenizer
token = Tokenizer(7000)
token.fit_on_texts(train_x)
x_train_seq = token.texts_to_sequences(train_x)
x_test_seq = token.texts_to_sequences(test_x)

# 2-2 set max length of data
x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
x_test = sequence.pad_sequences(x_test_seq, maxlen=100)

#%% build the model
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN
import matplotlib.pyplot as plt

modelRNN = Sequential()  # build the model
# The Embedding layer converts each list of word indices into a list of dense vectors.
modelRNN.add(Embedding(output_dim=4,       # map each word index to a 4-dimensional vector
                       input_dim=7000,     # input dimension: the 7000-word dictionary built above
                       input_length=100))  # after padding/truncating, every sequence is 100 indices long
def train(self, n_epochs=10):
    if not hasattr(self, 'model'):
        self.create_model()
        self.compile_model()
        self.serialize_class_data()
        self.serialize_model()
    validation_split = 0.1
    split_at = int(len(self.X_train) * (1. - validation_split))
    x, val_x = self.X_train[:split_at], self.X_train[split_at:]
    y, val_y = self.Y_train[:split_at], self.Y_train[split_at:]
    training_loss_history = []
    validation_loss_history = []
    for epoch in range(n_epochs):
        print('Epoch', epoch)
        training_loss = []
        end = int(float(len(x)) / self.batch_size)
        progbar = Progbar(end)
        for i in range(0, len(x), self.batch_size):
            inp = sequence.pad_sequences(x[i:i + self.batch_size], maxlen=self.maxlen)
            out = y[i:i + self.batch_size]
            loss = self.model.train_on_batch(inp, out)
            training_loss.append(loss)
            j = int(float(i) / self.batch_size)
            if j % 16 == 0:
                progbar.update(j)
        progbar.update(end)
        # test on validation set
        validation_loss = []
        print()
        print('Evaluating on validation set:')
        end = int(float(len(val_x)) / self.batch_size)
        progbar = Progbar(end)
        for i in range(0, len(val_x), self.batch_size):
            inp = sequence.pad_sequences(val_x[i:i + self.batch_size], maxlen=self.maxlen)
            out = val_y[i:i + self.batch_size]
            output = self.model.test_on_batch(inp, out)
            validation_loss.append(output)
            j = int(float(i) / self.batch_size)
            if j % 16 == 0:
                progbar.update(j)
        progbar.update(end)
        training_loss_history.append(np.mean(training_loss))
        validation_loss_history.append(np.mean(validation_loss))
        filename = op.join(self.serialization_dir, 'weights_epoch%d.h5' % epoch)
        self.model.save_weights(filename, overwrite=True)
        print()
        print('Mean training loss: %5.3f; mean validation loss: %5.3f\n'
              % (training_loss_history[-1], validation_loss_history[-1]))
        if (len(validation_loss_history) > 1
                and validation_loss_history[-1] >= validation_loss_history[-2]):
            break
    self.training_history = (list(map(float, training_loss_history)),
                             list(map(float, validation_loss_history)))
def prepare_text(text):
    text_clean = process_texts(text)
    text_word_sequences = tokenizer.texts_to_sequences(text_clean)
    input_text = pad_sequences(text_word_sequences, maxlen=config.MAX_SEQUENCE_LENGTH, padding='post')
    return input_text
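# A hypothetical call of prepare_text, assuming `tokenizer` and `config` are
# initialized elsewhere as the function implies:
batch = prepare_text(["an example sentence to classify"])
print(batch.shape)  # (1, config.MAX_SEQUENCE_LENGTH)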
           str(iteration + 1)):
    iteration += 1
    print("\n\n\n\nMaking neural network for iteration:", iteration)

    # Making training and testing data
    X_Train = [Features[x] for x in train_index]
    X_Test = [Features[x] for x in test_index]
    radicalTrain = [Radical[x] for x in train_index]
    radicalTest = [Radical[x] for x in test_index]
    tokenisedTrain = tokenizer.texts_to_sequences(X_Train)
    tokenisedTest = tokenizer.texts_to_sequences(X_Test)
    max_review_length = 180
    X_Train = sequence.pad_sequences(tokenisedTrain, maxlen=max_review_length, padding='post')
    X_Test = sequence.pad_sequences(tokenisedTest, maxlen=max_review_length, padding='post')

    # Radical
    radicalModel = Sequential()
    radicalModel.add(
        Embedding(vocabSize,
                  100,
                  input_length=max_review_length,
                  weights=[embedding_matrix],
                  trainable=False))
    radicalModel.add(GRU(100, dropout=0.2, recurrent_dropout=0.2))
    radicalModel.add(Dense(1, activation='sigmoid'))
print("批判性: ", score) if __name__ == '__main__': comment_judgements = readJudgementsFromFile() comments, judgements = get_comment_and_judgement(comment_judgements) word_dataset = build_word_dataset(comment_judgements) word_index_dict = build_up_word_index_dict(word_dataset) comments = [comment_to_indices(comment, word_index_dict) for comment in comments] judgements = np.array(judge_to_one_hot(judgements)) # -----Preparing the training and testing data----- trainAmount = int(len(comments) * 0.6) data = pad_sequences(comments, maxlen=max_seq_len, dtype='float32') random_mask = np.arange(len(data)) np.random.shuffle(random_mask) data = data[random_mask] judgements = judgements[random_mask] train_data = data[:trainAmount] train_labels = judgements[:trainAmount] test_data = data[trainAmount:] test_labels = judgements[trainAmount:] validation_data = test_data[:200] validation_labels = test_labels[:200] print("Train Data's shape: ", train_data.shape) print("Train Labels' shape: ", train_labels.shape)
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)
    if args['debug']:
        df_train = df_train.iloc[:30000]
        texts_train = df_train['question_text']
    else:
        logger.info('Preprocess text')
        texts_train = preprocess_text(df_train, return_df=False)
    seq_train, tokenizer = tokenize_texts(texts_train, logger)

    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH)
    label_train = df_train['target'].values.reshape(-1, 1)

    embed_types = [0, 1, 2]
    logger.info('Start multiprocess nlp feature extraction and embedding matrices loading')
    with mp.Pool(processes=2) as p:
        results = p.map(parallel_apply, [
            (extract_nlp_features, (df_train,)),
            (load_multiple_embeddings, (tokenizer.word_index, embed_types, args['debug']))
        ])
    df_train_extracted = results[0]
    embedding_matrices = results[1]
    embedding_matrix = np.concatenate(
        [np.array([embedding_matrices[i] for i in [0, 1, 2]]).mean(0)] +
        [embedding_matrices[j] for j in [1]],
        axis=1
    )

    nlp_columns = ['total_length', 'n_capitals', 'n_words', 'n_puncts', 'n_?', 'n_!', 'n_you']
    for col in nlp_columns:
        scaler = StandardScaler()
        df_train_extracted[col] = scaler.fit_transform(
            df_train_extracted[col].values.astype(np.float32).reshape(-1, 1)).reshape(-1, )
    x_nlp = [df_train_extracted[col].values.reshape(-1, 1) for col in nlp_columns]
    nlp_size = len(x_nlp)

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    batch_size = args['batch_size'] * len(device_ids)
    trigger = TRIGGER
    if args['debug']:
        epochs = 3
        n_splits = 2
    else:
        epochs = EPOCHS
        n_splits = KFOLD

    logger.info('Start training and evaluation loop')
    model_specs = [
        {'nlp_layer_types': ({'activation': 'relu', 'dim': 16, 'dropout': 0.2},
                             {'activation': 'relu', 'dim': 16, 'dropout': 0.2}),
         'rnn_layer_types': ({'type': 'lstm', 'dim': 64, 'num_layers': 1, 'dropout': 0.0},
                             {'type': 'gru', 'dim': 64, 'num_layers': 1, 'dropout': 0.0}),
         'upper_layer_types': ({'dim': 64, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3}),
         'gamma': 2.0, 'alpha': 0.75, 'combined': False, 'weight': 1.0},
        {'nlp_layer_types': ({'activation': 'relu', 'dim': 16, 'dropout': 0.2},
                             {'activation': 'relu', 'dim': 16, 'dropout': 0.2}),
         'rnn_layer_types': ({'type': 'lstm', 'dim': 64, 'num_layers': 1, 'dropout': 0.0},
                             {'type': 'gru', 'dim': 64, 'num_layers': 1, 'dropout': 0.0}),
         'upper_layer_types': ({'dim': 64, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3}),
         'gamma': 2.0, 'alpha': 0.50, 'combined': False, 'weight': 1.0},
        {'nlp_layer_types': ({'activation': 'relu', 'dim': 16, 'dropout': 0.2},
                             {'activation': 'relu', 'dim': 16, 'dropout': 0.2}),
         'rnn_layer_types': ({'type': 'lstm', 'dim': 64, 'num_layers': 1, 'dropout': 0.0},
                             {'type': 'gru', 'dim': 64, 'num_layers': 1, 'dropout': 0.0}),
         'upper_layer_types': ({'dim': 64, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3}),
         'gamma': 2.0, 'alpha': 0.75, 'combined': True, 'weight': 1.0},
        {'nlp_layer_types': ({'activation': 'relu', 'dim': 16, 'dropout': 0.2},
                             {'activation': 'relu', 'dim': 16, 'dropout': 0.2}),
         'rnn_layer_types': ({'type': 'lstm', 'dim': 64, 'num_layers': 1, 'dropout': 0.0},
                             {'type': 'gru', 'dim': 64, 'num_layers': 1, 'dropout': 0.0}),
         'upper_layer_types': ({'dim': 64, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3}),
         'gamma': 2.0, 'alpha': 0.75, 'combined': True, 'weight': 5.0},
        {'nlp_layer_types': ({'activation': 'relu', 'dim': 16, 'dropout': 0.2},
                             {'activation': 'relu', 'dim': 16, 'dropout': 0.2}),
         'rnn_layer_types': ({'type': 'lstm', 'dim': 64, 'num_layers': 1, 'dropout': 0.0},
                             {'type': 'gru', 'dim': 64, 'num_layers': 1, 'dropout': 0.0}),
         'upper_layer_types': ({'dim': 64, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3}),
         'gamma': 2.0, 'alpha': 0.50, 'combined': True, 'weight': 5.0},
        {'nlp_layer_types': ({'activation': 'relu', 'dim': 16, 'dropout': 0.2},
                             {'activation': 'relu', 'dim': 16, 'dropout': 0.2}),
         'rnn_layer_types': ({'type': 'lstm', 'dim': 64, 'num_layers': 1, 'dropout': 0.0},
                             {'type': 'gru', 'dim': 64, 'num_layers': 1, 'dropout': 0.0}),
         'upper_layer_types': ({'dim': 64, 'dropout': 0.5},
                               {'dim': 64, 'dropout': 0.3}),
         'gamma': 2.0, 'alpha': 0.75, 'combined': True, 'weight': 3.0},
    ]

    model_name_base = 'NLPFeaturesDeepRNN'
    for spec_id, spec in enumerate(model_specs):
        model_name = model_name_base + f'_specId={spec_id}'
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

        oof_mv_preds = np.zeros(len(seq_train))
        oof_preds_proba = np.zeros(len(seq_train))
        oof_opt_preds = np.zeros(len(seq_train))
        oof_reopt_preds = np.zeros(len(seq_train))
        results_list = []

        for fold, (index_train, index_valid) in enumerate(skf.split(label_train, label_train)):
            logger.info(f'Fold {fold + 1} / {KFOLD} - create dataloader and build model')
            x_train = {
                'text': seq_train[index_train].astype(int),
                'nlp': [x[index_train] for x in x_nlp]
            }
            x_valid = {
                'text': seq_train[index_valid].astype(int),
                'nlp': [x[index_valid] for x in x_nlp]
            }
            y_train, y_valid = label_train[index_train].astype(np.float32), label_train[index_valid].astype(np.float32)

            model = NLPFeaturesDeepRNN(embedding_matrix, PADDING_LENGTH, nlp_size, embed_drop=0.2, mask=True,
                                       nlp_layer_types=spec['nlp_layer_types'],
                                       rnn_layer_types=spec['rnn_layer_types'],
                                       upper_layer_types=spec['upper_layer_types'])

            steps_per_epoch = seq_train[index_train].shape[0] // batch_size
            scheduler_trigger_steps = steps_per_epoch * trigger
            step_size = steps_per_epoch * (epochs - trigger) // NUM_SNAPSHOTS

            if spec['combined']:
                criterion_type = 'bce_focal'
            else:
                criterion_type = 'focal'

            config = {
                'epochs': epochs,
                'batch_size': batch_size,
                'output_device': output_device,
                'criterion_type': criterion_type,
                'criteria_weights': [1.0, spec['weight']],
                'criterion_gamma': spec['gamma'],
                'criterion_alpha': spec['alpha'],
                'optimizer': 'adam',
                'optimizer_lr': 0.003,
                'num_snapshots': NUM_SNAPSHOTS,
                'scheduler_type': 'cyclic',
                'base_lr': 0.0005,
                'max_lr': 0.003,
                'step_size': step_size,
                'scheduler_mode': 'triangular',
                'scheduler_gamma': 0.9,
                'scheduler_trigger_steps': scheduler_trigger_steps,
                'sampler_type': 'normal',
                'seed': SEED
            }

            trainer = Trainer(model, logger, config)
            eval_results = trainer.train_and_eval_fold(x_train, y_train, x_valid, y_valid, fold)
            fold_results = calculate_fold_metrics(eval_results, label_train[index_valid].reshape(-1,))
            results_list.append(fold_results)

            message = f'Fold {fold + 1} / {KFOLD} has been done.\n'
            message += f'Majority Voting - F1: {fold_results["oof_mv_f1"]}, '
            message += f'Precision: {fold_results["oof_mv_precision"]}, Recall: {fold_results["oof_mv_recall"]}\n'
            message += f'Optimized - F1: {fold_results["oof_opt_f1"]}, '
            message += f'Precision: {fold_results["oof_opt_precision"]}, Recall: {fold_results["oof_opt_recall"]}\n'
            message += f'Re-optimized - F1: {fold_results["oof_reopt_f1"]}, '
            message += f'Precision: {fold_results["oof_reopt_precision"]}, Recall: {fold_results["oof_reopt_recall"]}\n'
            message += f'Focal Loss: {fold_results["oof_focal_loss"]}, '
            message += f'Optimized Threshold: {fold_results["oof_opt_threshold"]}, '
            message += f'Re-optimized Threshold: {fold_results["oof_reopt_threshold"]}, '
            logger.post(message)

            eval_results_addition = {
                'date': datetime.now(),
                'script_name': SCRIPT_NAME,
                'spec_id': spec_id,
                'model_name': model_name,
                'fold_id': fold
            }
            for res in eval_results:
                res.update(eval_results_addition)
                # post_to_snapshot_metrics_table(data=res, project_id=BQ_PROJECT_ID, dataset_name=BQ_DATASET)

            fold_results_addition = {
                'date': datetime.now(),
                'script_name': SCRIPT_NAME,
                'spec_id': spec_id,
                'model_name': model_name,
                'fold_id': fold
            }
            fold_results.update(fold_results_addition)
            post_to_fold_metrics_table(fold_results, project_id=BQ_PROJECT_ID, dataset_name=BQ_DATASET)

            oof_mv_preds[index_valid] = fold_results['oof_mv_preds']
            oof_opt_preds[index_valid] = fold_results['oof_opt_preds']
            oof_reopt_preds[index_valid] = fold_results['oof_reopt_preds']
            oof_preds_proba[index_valid] = fold_results['oof_preds_proba']

        results = calculate_total_metrics(results_list)
        results_addition = {
            'date': datetime.now(),
            'script_name': SCRIPT_NAME,
            'spec_id': spec_id,
            'model_name': model_name
        }
        results.update(results_addition)

        if args['save_preds']:
            save_path = DATA_DIR.joinpath(f'predictions/{SCRIPT_NAME + "_" + model_name + ".pkl"}')
            predictions = {
                'proba': oof_preds_proba,
                'mv': oof_mv_preds,
                'opt': oof_opt_preds,
                'reopt': oof_reopt_preds
            }
            joblib.dump(predictions, str(save_path))

        post_to_total_metrics_table(results, project_id=BQ_PROJECT_ID, dataset_name=BQ_DATASET)
        logger.post(f'Spec ID: {spec_id}\nModel Spec: {spec}')

        message = 'KFold training and evaluation has been done.\n'
        message += f'Majority Voting - F1: avg = {results["mv_f1_avg"]}, std = {results["mv_f1_std"]}, '
        message += f'Precision: {results["mv_precision_avg"]}, Recall: {results["mv_recall_avg"]}\n'
        message += f'Optimized - F1: avg = {results["opt_f1_avg"]}, std = {results["opt_f1_std"]}, '
        message += f'Precision: {results["opt_precision_avg"]}, Recall: {results["opt_recall_avg"]}\n'
        message += f'Re-optimized - F1: avg = {results["reopt_f1_avg"]}, std = {results["reopt_f1_std"]}, '
        message += f'Precision: {results["reopt_precision_avg"]}, Recall: {results["reopt_recall_avg"]}\n'
        mv_thresholds = ", ".join([str(th) for th in results["mv_thresholds_avg"]])
        message += f'Focal Loss: {results["focal_loss_avg"]}, '
        message += f'Optimized Threshold: {results["opt_threshold_avg"]}, '
        message += f'Re-optimized Threshold: {results["reopt_threshold_avg"]}\n'
        message += f'Majority Voting Thresholds: {mv_thresholds}'
        logger.post(message)
                  left_word4, phonetic_input]
    all_outputs = [outputs, out1, out2, out3, out4, out5, out6]
    model = Model(input=all_inputs, output=all_outputs)
    opt = Adam()
    return model


X_vocab_len = 90
X_max_len = 18
n1, n2, n3, n4, n5, n7, _ = pickle.load(open('pickle-dumps/n', 'rb'))

# print("Zero padding .. ")
X_wrds_inds = pad_sequences(X_wrds_inds, maxlen=X_max_len, dtype='int32', padding='post')
X_left1 = pad_sequences(X_left1, maxlen=X_max_len, dtype='int32', padding='post')
X_left2 = pad_sequences(X_left2, maxlen=X_max_len, dtype='int32', padding='post')
X_left3 = pad_sequences(X_left3, maxlen=X_max_len, dtype='int32', padding='post')
X_left4 = pad_sequences(X_left4, maxlen=X_max_len,
                        dtype='int32', padding='post')  # call was truncated; completed to match the lines above
df.drop(df.columns[1], axis=1, inplace=True)
df.info()

X = df.v2
Y = df.v1
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1, 1)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.05)

max_words = 10000
max_len = 128
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)


def RNN():
    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(max_words, 50, input_length=max_len)(inputs)
    layer = GRU(64)(layer)
    layer = Dense(256, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model
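# A minimal sketch of how RNN() could be compiled and trained on the padded
# sequences above; the optimizer, batch size, and epoch count here are
# illustrative assumptions, not taken from the original script.
model = RNN()
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.fit(sequences_matrix, Y_train, batch_size=128, epochs=10,
          validation_split=0.2)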
import pickle

with open(os.path.join('data', 'save', "sequences.txt"), "wb") as _fp:
    pickle.dump(sequences, _fp)

# Load encoded_sentences
with open(os.path.join('data', 'save', "sequences.txt"), "rb") as _fp:
    sequences = pickle.load(_fp)
gc.collect()

# Padding
from keras.preprocessing.sequence import pad_sequences

max_length = max([len(s) for s in sequences])
X = pad_sequences(sequences, maxlen=max_length, padding='post')
np.save(os.path.join('data', 'save', "X.npy"), X)
X = np.load(os.path.join('data', 'save', "X.npy"))

# Creating X with word embeddings
unique_words = len(word_index)
total_words = unique_words + 1
skipped_words = 0
embedding_dim = 100
embedding_matrix = np.zeros((total_words, embedding_dim))
for word, index in tokenizer.word_index.items():
    try:
        embedding_vector = model[word]
        embedding_matrix[index] = embedding_vector
    except KeyError:
        # word absent from the pretrained model; its row stays all-zeros
        # (the except branch was missing in the original and is assumed here)
        skipped_words += 1
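# A hedged sketch of how the embedding_matrix built above is typically consumed:
# a frozen Keras Embedding layer initialized with the pretrained vectors. The
# layer configuration is an assumption, not part of the original snippet.
from keras.layers import Embedding

embedding_layer = Embedding(total_words, embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)  # keep the pretrained vectors fixed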
def prepare_data(filepath, num_data_points=40000, vocab_size=4000, max_length=500):
    train_set_proportion = 0.9
    train_size = int(num_data_points * train_set_proportion)
    print("Preparing Data...")

    current_file = open(filepath, "rb")
    x = current_file.read()
    current_file.close()
    x = x.decode("utf-8")
    x = x.splitlines()
    random.shuffle(x)
    x = x[:num_data_points]

    labels = []
    reviews = []
    reTokenizer = RegexpTokenizer(r'\w+')
    for i in x:
        separated = i.split(" ", 1)
        labels.append(separated[0])
        reviews.append(separated[1])
    for i in range(len(labels)):
        labels[i] = int(labels[i] == '__label__1')

    all_words = []
    for i in range(len(reviews)):
        tokens = reTokenizer.tokenize(reviews[i])
        reviews[i] = []
        for word in tokens:
            word = word.lower()
            all_words.append(word)
            reviews[i].append(word)

    vocab_pickle_location = os.path.join(vocab_directory, "all_words.pkl")
    if not os.path.isdir(vocab_directory):
        print("Error: vocab_directory doesn't exist!")
    else:
        # replace the locally collected words with the precomputed
        # (word, count) list so vocabulary ids stay stable across runs
        all_words = pickle.load(open(vocab_pickle_location, 'rb'))
    all_words = all_words[:vocab_size]
    word2int = {all_words[i][0]: i + 1 for i in range(vocab_size)}
    # int2word = {x: y for y, x in word2int.items()}
    # dict_as_list = list(word2int)

    def review2intlist(rev_text):
        int_list = []
        for word in rev_text:
            if word in word2int.keys():
                int_list.append(word2int[word])
        return int_list

    X = []
    for i in range(len(reviews)):
        X.append(review2intlist(reviews[i]))
    X = sequence.pad_sequences(X, maxlen=max_length)

    LSTM_inputs = np.zeros(shape=(max_length, num_data_points), dtype=np.float32)
    for i in range(num_data_points):
        LSTM_inputs[:, i] = X[i]
    LSTM_inputs = LSTM_inputs.T
    LSTM_outputs = np.zeros(shape=num_data_points)
    for i in range(num_data_points):
        LSTM_outputs[i] = labels[i]

    x_train, y_train = LSTM_inputs[:train_size], LSTM_outputs[:train_size]
    x_test, y_test = LSTM_inputs[train_size:], LSTM_outputs[train_size:]
    half_test_size = int(len(y_test) / 2)
    x_valid = x_test[:half_test_size]
    y_valid = y_test[:half_test_size]
    x_test = x_test[half_test_size:]
    y_test = y_test[half_test_size:]
    print("Finished preparing data...")
    return x_train, y_train, x_test, y_test, x_valid, y_valid
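# Illustrative call (the file path is hypothetical, and the global
# vocab_directory must point at the precomputed vocabulary pickle):
# prepare_data returns six arrays split 90/5/5 into train, test, and validation.
x_train, y_train, x_test, y_test, x_valid, y_valid = prepare_data(
    'data/reviews.txt', num_data_points=40000, vocab_size=4000, max_length=500)
print(x_train.shape)  # (36000, 500)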
    test_texts_2.append(text_to_wordlist(values[2]))
    test_ids.append(values[0])
print('Found %s texts in test.csv' % len(test_texts_1))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)

########################################
## generate leaky features
########################################
train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)
def pad(self, data, maxlen=None):
    # parameter was originally named 'len', which shadows the builtin
    from keras.preprocessing.sequence import pad_sequences
    return pad_sequences(data, maxlen=maxlen, padding='post', truncating='post', value=0)
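# Quick standalone demonstration of the post-padding / post-truncating behavior
# the helper above relies on (same keras call, no class context needed):
from keras.preprocessing.sequence import pad_sequences
print(pad_sequences([[1, 2, 3], [4]], maxlen=2, padding='post', truncating='post'))
# [[1 2]
#  [4 0]]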
def y_label_change(k, maxlen, sen_label, sen):
    sen = sen[0:k]
    tokenizer = Tokenizer(num_words=None)
    tokenizer.fit_on_texts(sen)  # fit the tokenizer on the raw sentences
    word_sequence = tokenizer.texts_to_sequences(sen)  # texts -> index sequences
    # pad at the end; by default pad_sequences pads at the front
    train_data = pad_sequences(word_sequence, maxlen=maxlen, padding="post")
    word_index = tokenizer.word_index  # word-to-index mapping
    # word_index['PAD']=0
    # word_index['UNK']=1
    print(train_data.shape)
    print("word", word_index)

    # model = KeyedVectors.load_word2vec_format('./model/text.model.bin', binary=True)
    model = gensim.models.Word2Vec.load('./model/ner.model')
    embedding_matrix = np.zeros((len(word_index) + 1, 100))
    for word, i in word_index.items():
        if word in model:
            embedding_matrix[i] = np.asarray(model[word])
        # words not found in the embedding model keep their all-zero rows
    print("t", embedding_matrix.shape)

    tag = ['O', 'B-TIM', 'I-TIM', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-COM', 'I-COM',
           'B-PRO', 'I-PRO', 'B-JOB', 'I-JOB', 'B-PER', 'I-PER']
    # map the 2-D matrix of string labels to integer class ids
    # (replaces the original 15-branch if/elif chain with a dict lookup)
    tag2idx = {t: idx for idx, t in enumerate(tag)}
    Y = sen_label[0:k]
    for a in range(len(Y)):
        for b in range(len(Y[a])):
            if Y[a][b] in tag2idx:
                Y[a][b] = tag2idx[Y[a][b]]
    print(Y)
    Y = pad_sequences(Y, maxlen=maxlen, padding="post")
    # print("labelsall", np.array(labels_all).shape)
    num_class = len(set(tag))
    print(num_class)
    Y = np.expand_dims(Y, 2)
    return Y, tag, embedding_matrix, word_index, num_class, train_data
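# A minimal sketch (assumed architecture, not from the original script) of a
# per-token tagger that consumes y_label_change's outputs: train_data as input,
# Y as sparse per-timestep targets, and embedding_matrix as frozen weights.
# The k/maxlen arguments and training hyperparameters below are illustrative.
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense

Y, tag, embedding_matrix, word_index, num_class, train_data = y_label_change(
    k=1000, maxlen=100, sen_label=sen_label, sen=sen)

ner_model = Sequential()
ner_model.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix],
                        input_length=100, trainable=False))
ner_model.add(Bidirectional(LSTM(64, return_sequences=True)))
ner_model.add(TimeDistributed(Dense(num_class, activation='softmax')))
ner_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
ner_model.fit(train_data, Y, batch_size=32, epochs=5)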
job_detail_pd['Job_Description_key_word'] = job_detail_pd.Job_Description.apply(key_word_extract)

# -------------------------- Build the vocabulary -------------------------------
# build a 2000-word vocabulary
token = Tokenizer(num_words=2000)
# words are ranked by frequency; the 2000 most frequent enter the vocabulary
token.fit_on_texts(job_detail_pd['Job_Description_key_word'])

# use the tokenizer to turn text into lists of integer ids
Job_Description_Seq = token.texts_to_sequences(job_detail_pd['Job_Description_key_word'])

# truncate or pad every sequence to length 50 before the embedding layer
Job_Description_Seq_Padding = sequence.pad_sequences(Job_Description_Seq, maxlen=50)
x_train = Job_Description_Seq_Padding
y_train = job_detail_pd['label'].tolist()  # convert to a plain list

# ------------------ model class ------------------
from keras.layers import Input
from keras.models import Model

inputs = Input(shape=(50, ))


class JobModel(keras.Model):
    def __init__(self):
        super(JobModel, self).__init__()
        self.embedding = Embedding(output_dim=32, input_dim=2000)
        self.conv1 = Conv1D(256, 3, activation='relu')
                           y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
    tonnetz_std = np.std(librosa.feature.tonnetz(y=librosa.effects.harmonic(X),
                                                 sr=sample_rate).T, axis=0)
    return (mfcc_mean, chroma_mean, mel_mean, contrast_mean, tonnetz_mean,
            mfcc_std, chroma_std, mel_std, contrast_std, tonnetz_std)


for fn in files:
    print("Process...", fn)
    try:
        print('process..', fn)
        feature_lld = extract_lld(fn)
        feature_hfs = extract_hfs(fn)
    except Exception as e:
        print('cannot open', fn)
        traceback.print_exc()
        sys.exit(3)
    lld_features = np.hstack(feature_lld)
    hfs_features = np.hstack(feature_hfs)
    feat_lld.append(lld_features)
    feat_hfs.append(hfs_features)

# feat_np = np.array(feat)
feat_lld = np.array(feat_lld)
feat_lld = sequence.pad_sequences(feat_lld, dtype='float64')
np.save('../data/song_librosa.npy', feat_lld)
np.save('../data/song_librosa_hfs.npy', feat_hfs)
def test_tokenize(tokenizer, sents, MAX_SEQUENCE_LENGTH=500):
    sequences = tokenizer.texts_to_sequences(sents)
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return text
def encode_docs(tokenizer, max_length, docs):
    # integer encode
    encoded = tokenizer.texts_to_sequences(docs)
    # pad sequences
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded
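# Usage sketch: the tokenizer must already be fit on the training corpus, and
# max_length should match what the model was trained with (both assumed here).
Xtest = encode_docs(tokenizer, max_length, ['an example document to score'])
print(Xtest.shape)  # (1, max_length)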