def do_rnn(x_train, x_test, y_train, y_test):
    global n_words
    # Data preprocessing
    # Sequence padding
    print("GET n_words embedding %d" % n_words)
    #x_train = pad_sequences(x_train, maxlen=100, value=0.)
    #x_test = pad_sequences(x_test, maxlen=100, value=0.)
    # Converting labels to binary vectors
    y_train = to_categorical(y_train, nb_classes=2)
    y_test = to_categorical(y_test, nb_classes=2)

    # Network building
    net = tflearn.input_data(shape=[None, 100, n_words])
    net = tflearn.lstm(net, 10, return_seq=True)
    net = tflearn.lstm(net, 10)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.1,
                             name="output", loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=3)
    model.fit(x_train, y_train, validation_set=(x_test, y_test),
              show_metric=True, batch_size=32, run_id="maidou")

def do_rnn(x, y):
    global max_document_length
    print("RNN")
    trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.4, random_state=0)
    y_test = testY

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=0.1,
              show_metric=True, batch_size=10, run_id="webshell", n_epoch=5)

    y_predict_list = model.predict(testX)
    y_predict = []
    for i in y_predict_list:
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    do_metrics(y_test, y_predict)

def do_rnn(trainX, testX, trainY, testY):
    global n_words
    # Data preprocessing
    # Sequence padding
    print("GET n_words embedding %d" % n_words)
    trainX = pad_sequences(trainX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    testX = pad_sequences(testX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, MAX_DOCUMENT_LENGTH])
    net = tflearn.embedding(net, input_dim=n_words, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=3)
    model.fit(trainX, trainY, validation_set=(testX, testY),
              show_metric=True, batch_size=32, run_id="maidou")

def do_rnn(trainX, testX, trainY, testY):
    max_document_length = 64
    y_test = testY
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=64)
    net = tflearn.lstm(net, 64, dropout=0.1)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0, tensorboard_dir="dga_log")
    model.fit(trainX, trainY, validation_set=(testX, testY),
              show_metric=True, batch_size=10, run_id="dga", n_epoch=1)

    y_predict_list = model.predict(testX)
    #print(y_predict_list)
    y_predict = []
    for i in y_predict_list:
        print(i[0])
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print(classification_report(y_test, y_predict))
    print(metrics.confusion_matrix(y_test, y_predict))

def do_cnn_doc2vec(trainX, testX, trainY, testY):
    global max_features
    print("CNN and doc2vec")
    #trainX = pad_sequences(trainX, maxlen=max_features, value=0.)
    #testX = pad_sequences(testX, maxlen=max_features, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None, max_features], name='input')
    network = tflearn.embedding(network, input_dim=1000000, output_dim=128,
                                validate_indices=False)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')

    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              n_epoch=5, shuffle=True, validation_set=(testX, testY),
              show_metric=True, batch_size=100, run_id="review")

def do_cnn(trainX, trainY, testX, testY):
    global n_words
    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    testX = pad_sequences(testX, maxlen=MAX_DOCUMENT_LENGTH, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None, MAX_DOCUMENT_LENGTH], name='input')
    network = tflearn.embedding(network, input_dim=n_words + 1, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.5)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')

    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              n_epoch=20, shuffle=True, validation_set=(testX, testY),
              show_metric=True, batch_size=32)

def do_cnn(x, y):
    global max_document_length
    print("CNN and tf")
    trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.4, random_state=0)
    y_test = testY

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None, max_document_length], name='input')
    network = tflearn.embedding(network, input_dim=1000000, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')

    model = tflearn.DNN(network, tensorboard_verbose=0)
    #if not os.path.exists(pkl_file):
    # Training
    model.fit(trainX, trainY,
              n_epoch=5, shuffle=True, validation_set=0.1,
              show_metric=True, batch_size=100, run_id="webshell")
    #    model.save(pkl_file)
    #else:
    #    model.load(pkl_file)

    y_predict_list = model.predict(testX)
    #y_predict = list(model.predict(testX, as_iterable=True))
    y_predict = []
    for i in y_predict_list:
        print(i[0])
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print('y_predict_list:')
    print(y_predict_list)
    print('y_predict:')
    print(y_predict)
    #print(y_test)

    do_metrics(y_test, y_predict)

def process_form_data(filename):
    data = h5py.File(filename, 'r')
    output = h5py.File('forms_out.h5', 'w')
    test_image = output.create_dataset('test_image', (330, 3, 256, 256), dtype=np.uint8)
    train_image = output.create_dataset('train_image', (770, 3, 256, 256), dtype=np.uint8)
    test_label = output.create_dataset('test_label', (330, 11), dtype=np.int8)
    train_label = output.create_dataset('train_label', (770, 11), dtype=np.int8)

    image, labels = shuffle(data['image'], data['form'])
    onehot_labels = to_categorical(labels, 11)

    count = {}
    train_count = 0
    test_count = 0
    for i, l in enumerate(labels):
        if l not in count:
            count[l] = 0
        if count[l] > 29:
            train_image[train_count] = image[i]
            train_label[train_count] = onehot_labels[i]
            train_count += 1
        else:
            test_image[test_count] = image[i]
            test_label[test_count] = onehot_labels[i]
            test_count += 1
        count[l] += 1
    output.close()

def do_rnn(trainX, testX, trainY, testY):
    global max_sequences_len
    global max_sys_call
    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=max_sequences_len, value=0.)
    testX = pad_sequences(testX, maxlen=max_sequences_len, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY_old = testY
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    print("GET max_sequences_len embedding %d" % max_sequences_len)
    print("GET max_sys_call embedding %d" % max_sys_call)
    net = tflearn.input_data([None, max_sequences_len])
    net = tflearn.embedding(net, input_dim=max_sys_call + 1, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.3)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.1,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=3)
    model.fit(trainX, trainY, validation_set=(testX, testY),
              show_metric=True, batch_size=32, run_id="maidou")

    y_predict_list = model.predict(testX)
    #print(y_predict_list)
    y_predict = []
    for i in y_predict_list:
        #print(i[0])
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)
    #y_predict = to_categorical(y_predict, nb_classes=2)

    print(classification_report(testY_old, y_predict))
    print(metrics.confusion_matrix(testY_old, y_predict))

def get_error(model, f, t, label):
    # Predict in chunks of 500 samples to keep memory usage bounded
    pred_probs = [model.predict(f[i * 500:min((i + 1) * 500, len(f))])
                  for i in range(int(len(f) / 500) + 1)]
    y = [val for sublist in pred_probs for val in list(sublist)]
    # y = model.predict(f)
    yy = np.argmax(y, axis=1)
    acc = accuracy_score(t, to_categorical(yy, 6))
    get_statistics(y, yy, t, label)
    return 1 - acc

def do_cnn_word2vec_2d(trainX, testX, trainY, testY):
    global max_features
    global max_document_length
    print("CNN and word2vec2d")
    y_test = testY
    #trainX = pad_sequences(trainX, maxlen=max_features, value=0.)
    #testX = pad_sequences(testX, maxlen=max_features, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None, max_document_length, max_features, 1], name='input')
    network = conv_2d(network, 32, 3, activation='relu', regularizer="L2")
    network = max_pool_2d(network, 2)
    network = local_response_normalization(network)
    network = conv_2d(network, 64, 3, activation='relu', regularizer="L2")
    network = max_pool_2d(network, 2)
    network = local_response_normalization(network)
    network = fully_connected(network, 128, activation='tanh')
    network = dropout(network, 0.8)
    network = fully_connected(network, 256, activation='tanh')
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.01,
                         loss='categorical_crossentropy', name='target')

    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              n_epoch=5, shuffle=True, validation_set=(testX, testY),
              show_metric=True, run_id="sms")

    y_predict_list = model.predict(testX)
    print(y_predict_list)
    y_predict = []
    for i in y_predict_list:
        print(i[0])
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print(classification_report(y_test, y_predict))
    print(metrics.confusion_matrix(y_test, y_predict))

def get_data(filename, num_frames, num_classes, input_length):
    """Get the data from our saved predictions or pooled features."""
    # Local vars.
    X = []
    y = []
    temp_list = deque()
    classes = get_classes()

    # Open and get the features.
    with open(filename, 'rb') as fin:
        frames = pickle.load(fin)
        print(f"Frames {len(frames)}")
        print(f"Frame type: {frames[0]}")
        for frame in frames:
            features = frame[0]
            actual = frame[1]

            # Add to the queue.
            if len(temp_list) == num_frames - 1:
                temp_list.append(features)
                flat = list(temp_list)
                X.append(np.array(flat))
                y.append(classes.index(actual))  # Convert our labels into integers.
                temp_list.popleft()
            else:
                temp_list.append(features)
                continue

    print("Total dataset size: %d" % len(X))

    # Numpy.
    X = np.array(X)
    y = np.array(y)
    print(f"X {X.shape}")
    print(f"y {y.shape}")

    # Reshape.
    X = X.reshape(-1, num_frames, input_length)

    # One-hot encoded categoricals.
    y = to_categorical(y, num_classes)
    print(f"X {X.shape}")
    print(f"y {y.shape}")

    # Split into train and test.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    return X_train, X_test, y_train, y_test

def prep_train_test(n, dev_pct):
    # Return shuffled train/dev index arrays; the first int(n * dev_pct)
    # permuted indices form the dev split.
    np.random.seed(87)
    shuffle_indices = np.random.permutation(np.arange(n))
    split = int(n * dev_pct)
    return shuffle_indices[split:], shuffle_indices[:split]

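# Usage sketch for prep_train_test(n, dev_pct) above — the numbers are
# illustrative, not from the original source; assumes numpy is imported as np:
train_idx, dev_idx = prep_train_test(1000, 0.1)
assert len(train_idx) == 900 and len(dev_idx) == 100  # 10% of indices held out
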
def prepare_data(data):
    # Acquiring the tokenizer and tokenizing strings related to the action made.
    # TODO: this probably can be reworked into a dialog choice using TKinter,
    # but I didn't find such a need in my case.
    if os.path.isfile('tokenizer.pickle'):
        with open('tokenizer.pickle', 'rb') as handle:
            tokenizer = pickle.load(handle)
    # if a tokenizer isn't found, a new one is created
    else:
        tokenizer = tfds.features.text.Tokenizer()
        # saving tokenizer for backup
        with open('tokenizer.pickle', 'wb') as handle:
            pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    vocabulary_set = set()
    # creating an additional copy of data to resolve problems with IndexError exceptions
    data_copy = []

    # tokenizing actions from the data sets and creating a vocabulary set
    for i in data:
        some_tokens = tokenizer.tokenize(i[1][1])
        vocabulary_set.update(some_tokens)
    with open('vocabulary.pickle', 'wb') as handle:
        pickle.dump(vocabulary_set, handle, protocol=pickle.HIGHEST_PROTOCOL)
    encoder = prepare_encoder(vocabulary_set)  # saving

    for i in data:
        # i is in format [image array, [(x_position, y_position), action]] before this processing.
        # Preparing a new set of processed data: getting the previous screen array, encoding the
        # string, and turning screen position data from an (x, y) tuple into an [x, y] array.
        screen = i[0]
        # token is a 1-element list with an int as the tokenized string value.
        token = encoder.encode(i[1][1])[0]
        screen_position_tuple = i[1][0]
        x_value = screen_position_tuple[0]
        y_value = screen_position_tuple[1]
        # creating a row of preprocessed data and appending the new array
        new_data = (screen, [x_value, y_value, token])
        data_copy.append(new_data)

    a = []
    for i in data_copy:
        a.append(i[1][2])
    # keep the one-hot result (the original discarded the return value)
    a = data_utils.to_categorical(a, encoder.vocab_size)
    index = 0
    for i in data_copy:
        i[1][2] = a[index]
        index += 1
    return data_copy

def prep_train_test(pos_x, neg_x, dev_ratio):
    """
    Build the train/test split.
    :param pos_x:
    :param neg_x:
    :param dev_ratio: fraction of samples held out for testing
    :return:
    """
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(pos_x)))
    pos_x_shuffled = pos_x[shuffle_indices]
    dev_idx = -1 * int(dev_ratio * float(len(pos_x)))
    pos_train = pos_x_shuffled[:dev_idx]
    pos_test = pos_x_shuffled[dev_idx:]

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(neg_x)))
    neg_x_shuffled = neg_x[shuffle_indices]
    dev_idx = -1 * int(dev_ratio * float(len(neg_x)))
    neg_train = neg_x_shuffled[:dev_idx]
    neg_test = neg_x_shuffled[dev_idx:]

    x_train = np.array(list(pos_train) + list(neg_train))
    y_train = len(pos_train) * [1] + len(neg_train) * [0]
    x_test = np.array(list(pos_test) + list(neg_test))
    y_test = len(pos_test) * [1] + len(neg_test) * [0]

    y_train = to_categorical(y_train, nb_classes=2)
    y_test = to_categorical(y_test, nb_classes=2)

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(x_train)))
    x_train = x_train[shuffle_indices]
    y_train = y_train[shuffle_indices]

    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(x_test)))
    x_test = x_test[shuffle_indices]
    y_test = y_test[shuffle_indices]

    print("Train Mal/Ben split: {}/{}".format(len(pos_train), len(neg_train)))
    print("Test Mal/Ben split: {}/{}".format(len(pos_test), len(neg_test)))
    print("Train/Test split: {}/{}".format(len(y_train), len(y_test)))
    print("Train/Test split: {}/{}".format(len(x_train), len(x_test)))

    return x_train, y_train, x_test, y_test

def getData_imdb():
    from tflearn.datasets import imdb
    train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                    valid_portion=0.1)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    print(trainX.shape)
    print(trainY)
    testX = pad_sequences(testX, maxlen=100, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)
    return trainX, testX, trainY, testY

def get_data(input_data_dump, num_frames_per_video, labels, ifTrain):
    """Get the data from our saved predictions or pooled features."""
    # Local vars.
    X = []
    y = []
    temp_list = deque()

    # Open and get the features.
    with open(input_data_dump, 'rb') as fin:
        frames = pickle.load(fin)
        for i, frame in enumerate(frames):
            features = frame[0]
            actual = frame[1].lower()
            # frameCount = frame[2]

            # Convert our labels into binary.
            actual = labels[actual]

            # Add to the queue.
            if len(temp_list) == num_frames_per_video - 1:
                temp_list.append(features)
                flat = list(temp_list)
                X.append(np.array(flat))
                y.append(actual)
                temp_list.clear()
            else:
                temp_list.append(features)
                continue

    print("Class Name\tNumeric Label")
    for key in labels:
        print("%s\t\t%d" % (key, labels[key]))

    print('DEBUG X ', len(X))
    #print('tem', temp_list[0].shape)
    print(' Y ', len(y))

    # Numpy.
    X = np.array(X)
    y = np.array(y)
    print("Dataset shape: ", X.shape)

    # One-hot encoded categoricals.
    y = to_categorical(y, len(labels))

    # Split into train and test.
    if ifTrain:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        return X_train, X_test, y_train, y_test
    else:
        return X, y

def bi_lstm(trainX, trainY, testX, testY):
    trainX = pad_sequences(trainX, maxlen=200, value=0.)
    testX = pad_sequences(testX, maxlen=200, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data(shape=[None, 200])
    net = tflearn.embedding(net, input_dim=20000, output_dim=128)
    net = tflearn.bidirectional_rnn(net, BasicLSTMCell(128), BasicLSTMCell(128))
    net = tflearn.dropout(net, 0.5)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=2)
    model.fit(trainX, trainY, validation_set=0.1, show_metric=True,
              batch_size=64, run_id="rnn-bilstm")

def load_data_and_labels(args, input_file, word2idx: dict):
    """
    Load research data from files, padding sentences and generate one-hot labels.

    Args:
        args: The arguments.
        input_file: The research record.
        word2idx: The word2idx dict.
    Returns:
        The dict <Data> (includes the record tokenindex and record labels)
    Raises:
        IOError: If word2vec model file doesn't exist
    """
    if not input_file.endswith('.json'):
        raise IOError("[Error] The research record is not a json file. "
                      "Please preprocess the research record into the json file.")

    def _token_to_index(x: list):
        result = []
        for item in x:
            if item not in word2idx:
                result.append(word2idx['_UNK'])
            else:
                word_idx = word2idx[item]
                result.append(word_idx)
        return result

    Data = dict()
    with open(input_file) as fin:
        Data['f_id'] = []
        Data['b_id'] = []
        Data['f_content_index'] = []
        Data['b_content_index'] = []
        Data['labels'] = []

        for eachline in fin:
            record = json.loads(eachline)
            f_id = record['front_testid']
            b_id = record['behind_testid']
            f_content = record['front_features']
            b_content = record['behind_features']
            labels = record['label']

            Data['f_id'].append(f_id)
            Data['b_id'].append(b_id)
            Data['f_content_index'].append(_token_to_index(f_content))
            Data['b_content_index'].append(_token_to_index(b_content))
            Data['labels'].append(labels)

    Data['f_pad_seqs'] = pad_sequences(Data['f_content_index'], maxlen=args.pad_seq_len, value=0.)
    Data['b_pad_seqs'] = pad_sequences(Data['b_content_index'], maxlen=args.pad_seq_len, value=0.)
    Data['onehot_labels'] = to_categorical(Data['labels'], nb_classes=2)
    return Data

def buildTrainingData(self):
    print("Building train data for SentClassificationModel[{}]...".format(self.model_name))
    # initialize all keys required to browse data
    raw_data_key = 'raw'
    data_key = 'data'
    sent_class = 'class'
    conv_key = 'conv_ind'

    # read training data
    avg_words, avg_sents, conv = cu.processTaggedChat(self.train_data_file)
    trainX = []
    trainY = []
    for i, cdata in enumerate(conv):
        #if i >= 5:
        #    break
        for sdata in cdata:
            trainX.append(sdata[data_key])
            trainY.append(sdata[sent_class])
            #print("trainX[{}]****labels[{}]".format(sdata[data_key], sdata[sent_class]))
    print("Training data of [{}] sentences and [{}] labels loaded for classification..."
          .format(len(trainX), len(trainY)))

    self.vocab = nu.Vocab(trainX, self.config)  # build X vocab dict & required data
    self.labels = nu.Vocab(trainY, self.config, label=True)  # build Y vocab dict & required data
    self.labels.setUNK('UNK1')  # Explicitly set label for unknown classification

    # Create encoding for training data
    self.encodedXdata = self.vocab.getCodedData()
    self.encodedYdata = self.labels.getCodedData()
    print("Coded X {} data: {}".format(len(self.encodedXdata), self.vocab.getData()[:10]))
    print("Coded X code: {}".format(self.encodedXdata[:10]))
    print("Coded Y size {} unique {} data: {}".format(
        len(self.encodedYdata), len(set(self.encodedYdata)), self.labels.getData()[:10]))
    print("Coded Y code: {}".format(self.encodedYdata[:10]))

    # pad sequences with zeros
    self.encodedXdata = pad_sequences(self.encodedXdata, maxlen=self.config.sent_size, value=0)
    self.no_classes = len(set(self.encodedYdata))  # no of target classes
    self.Y = to_categorical(self.encodedYdata, nb_classes=self.no_classes)  # Y as required by tflearn

    # release unwanted variables
    trainX = None
    trainY = None

def run_on_imdb():
    # IMDB Dataset loading
    train, test, _ = imdb.load_data(path=imdb_dataset_path, n_words=10000,
                                    valid_portion=0.1)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    testX = pad_sequences(testX, maxlen=100, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, 100])
    net = tflearn.embedding(net, input_dim=10000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    if check_file_exist(imdb_model_path):
        model.load(imdb_model_path)
    model.fit(trainX, trainY, validation_set=(testX, testY),
              show_metric=True, batch_size=32)
    if save_model:
        print("Saving model as 'imdb_model.tfl'")
        model.save(imdb_model_path)
    return 0

def do_rnn(x, y):
    global max_document_length
    print("RNN")
    trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.4, random_state=0)
    y_test = testY

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=0.1,
              show_metric=True, batch_size=10, run_id="webshell", n_epoch=5)

    y_predict_list = model.predict(testX)
    y_predict = []
    for i in y_predict_list:
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    do_metrics(y_test, y_predict)

def load_sst(glove_data):
    # Get the phrases and their indices
    # print("Getting Phrase Dictionary...")
    # _, word_dict = get_phrases_dict('stanfordSentimentTreebank/dictionary.txt')
    if glove_data is None:
        word_dict = increment_word_dict()
    else:
        print("Getting glove word indices...")
        word_dict = glove_word_indices(glove_data)

    # Convert the phrases to ints with word indices so they can be processed by Neural Network
    print("Converting to Ints...")
    int_phrases = phrases2ints(word_dict, 'stanfordSentimentTreebank/datasetSentences.txt')

    # Convert indices to sentiment values
    print("Converting Indices to Sentiment Values...")
    phrase_sentiments = indices_to_sentiment(int_phrases, 'stanfordSentimentTreebank/sentiment_labels.txt')

    # Split into train, test, and dev groups
    print("Splitting into train, test, and dev groups...")
    train, test, val = split_database(phrase_sentiments, 'stanfordSentimentTreebank/datasetSplit.txt')

    # Unzip input sequences and sentiment labels
    trainX, trainY = unzip_examples(train)
    valX, valY = unzip_examples(val)
    testX, testY = unzip_examples(test)

    # Sequence padding
    print("Padding Sequences...")
    trainX = pad_sequences(trainX, maxlen=200, value=0.)
    valX = pad_sequences(valX, maxlen=200, value=0.)
    testX = pad_sequences(testX, maxlen=200, value=0.)

    # Converting labels to binary vectors
    print("Converting labels to binary vectors...")
    trainY = to_categorical(trainY, nb_classes=2)
    valY = to_categorical(valY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    return Dataset(trainX, trainY, valX, valY, testX, testY)

def evaluate(sess, dataset, model, step, max_dev_itr=100, verbose=True, mode='val'):
    results_dir = model.val_results_dir if mode == 'val' \
        else model.test_results_dir
    samples_path = os.path.join(results_dir, '{}_samples_{}.txt'.format(mode, step))
    history_path = os.path.join(results_dir, '{}_history.txt'.format(mode))

    avg_val_loss, avg_acc = 0.0, 0.0
    print("Running Evaluation {}:".format(mode))
    tflearn.is_training(False, session=sess)

    # This is needed to reset the local variables initialized by
    # TF for calculating streaming Pearson Correlation and MSE
    all_dev_text, all_dev_pred, all_dev_gt = [], [], []
    dev_itr = 0
    while (dev_itr < max_dev_itr and max_dev_itr != 0) \
            or mode in ['test', 'train']:
        val_batch = dataset.next_batch(FLAGS.batch_size, pad=model.args["sequence_length"],
                                       one_hot=False, raw=False)
        cat_targets = [to_categorical(n, len(dataset.vocab_w2i[2]))
                       for n in val_batch.ner]
        loss, pred, acc = model.evaluate_step(sess, val_batch.sentences,
                                              val_batch.ner, cat_targets)
        avg_val_loss += loss
        avg_acc += acc
        all_dev_text += id2seq(val_batch.sentences, dataset.vocab_i2w[0])
        all_dev_pred += onehot2seq(pred, dataset.vocab_i2w[2])
        all_dev_gt += onehot2seq(cat_targets, dataset.vocab_i2w[2])
        dev_itr += 1

        if mode == 'test' and dataset.epochs_completed == 1:
            break
        if mode == 'train' and dataset.epochs_completed == 1:
            break

    result_set = (all_dev_text, all_dev_pred, all_dev_gt)
    avg_loss = avg_val_loss / dev_itr
    avg_acc = avg_acc / dev_itr
    if verbose:
        print("{}:\t Loss: {}\t Acc: {}".format(mode, avg_loss, avg_acc))

    with open(samples_path, 'w') as sf, open(history_path, 'a') as hf:
        for x1, pred, gt in zip(all_dev_text, all_dev_pred, all_dev_gt):
            sf.write('{}\t{}\t{}\n'.format(x1, pred, gt))
        hf.write('STEP:{}\tTIME:{}\tacc:{}\tLoss\t{}\n'.format(
            step, datetime.datetime.now().isoformat(), avg_acc, avg_loss))

    tflearn.is_training(True, session=sess)
    return avg_loss, avg_acc, result_set

def convert(self, X, y=None):
    """Pad and index X, make y categorical."""
    X = np.array(list(self.processor.transform(X)))
    X = pad_sequences(X, maxlen=self.max_len, value=0.)
    if y is not None:
        y = np.asarray(y)
        y = to_categorical(y, nb_classes=self.num_classes)
        return X, y
    else:
        return X

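# Usage sketch for convert() above (illustrative): `converter` is a
# hypothetical instance of the owning class, whose `processor` attribute is
# assumed to be a fitted tflearn VocabularyProcessor, with `max_len` and
# `num_classes` set as in the method:
#   X, Y = converter.convert(["good movie", "bad movie"], y=[1, 0])
#   X_only = converter.convert(["unlabeled text"])
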
def trainEmbedding():
    X_train, y_train, X_test, y_test = loadInput(RUMOR_TF_INPUTPICKLE)
    y_train = to_categorical(y_train, nb_classes=2)
    y_test = to_categorical(y_test, nb_classes=2)
    print('X_train: ', X_train)
    print('X_test: ', X_test)

    model = build_model()
    sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    adagrad = optimizers.Adagrad(lr=0.01, epsilon=1e-08, decay=0.0)
    adadelta = optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=1e-8, decay=0.)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model_checkpoint = ModelCheckpoint(MODEL_PATH_ADAGRAD, monitor='val_loss', verbose=1,
                                       save_best_only=True, save_weights_only=True,
                                       mode='auto', period=1)
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20,
              batch_size=64, callbacks=[tensor_board, model_checkpoint])
    model.evaluate(X_test, y_test)

def do_rnn(trainX, testX, trainY, testY):
    max_document_length = 64
    y_test = testY
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=64)
    net = tflearn.lstm(net, 64, dropout=0.1)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0, tensorboard_dir="dga_log")
    model.fit(trainX, trainY, validation_set=(testX, testY),
              show_metric=True, batch_size=10, run_id="dga", n_epoch=1)

    y_predict_list = model.predict(testX)
    #print(y_predict_list)
    y_predict = []
    for i in y_predict_list:
        print(i[0])
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print(classification_report(y_test, y_predict))
    print(metrics.confusion_matrix(y_test, y_predict))

    return y_predict, y_test

def GetData():
    max_features = 20000
    maxlen = 80  # cut texts after this number of words (among top max_features most common words)
    batch_size = 32

    print('Loading data...\n')
    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
    print(len(x_train), 'train sequences\n')
    print(len(x_test), 'test sequences\n')

    print('Pad sequences (samples x time)\n')
    x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
    y_train = to_categorical(y_train, 2)
    y_test = to_categorical(y_test, 2)
    print('x_train shape:\t', x_train.shape)
    print('x_test shape:\t', x_test.shape)

    return max_features, x_train, x_test, y_train, y_test, batch_size, maxlen

def _train_model(self, save_path):
    '''
    :param save_path: Path to save the model to
    :return: None
    '''
    tf.reset_default_graph()
    train, test = imdb.load_data(num_words=10000, index_from=3)
    train_x, train_y = train
    test_x, test_y = test

    train_x = pad_sequences(train_x, maxlen=100, value=0.)
    test_x = pad_sequences(test_x, maxlen=100, value=0.)

    train_y = to_categorical(train_y, nb_classes=2)
    test_y = to_categorical(test_y, nb_classes=2)

    # Training
    self.model.fit(train_x, train_y, n_epoch=5,
                   validation_set=(test_x, test_y), show_metric=True, batch_size=32)
    self.model.save(save_path)

def predict_type(self, bookdir):
    # Import dataset
    X, Y = image_preloader(bookdir, image_shape=(128, 128), mode='file',
                           categorical_labels=False, normalize=False)
    Y = to_categorical(Y, 3)
    print("-- Runbook Import Complete.")

    # Predict label
    prediction = self.model.predict(X)
    return prediction

def trainArtToPrimaryTypeModel(artPath, jsonPath, testProp, numEpochs=50):
    '''
    Trains a convolutional network to categorize card art by primary type
    Inputs:
        artPath: path to card art
        jsonPath: path to card data json file
        testProp: proportion of samples to be used for test/validation
        numEpochs: number of epochs to train for (50)
    '''
    (X, Y), (X_Test, Y_Test), numCategories = turnPicsToSimpleInputs(artPath, jsonPath, testProp=testProp)
    X, Y = shuffle(X, Y)
    Y = to_categorical(Y, numCategories)
    Y_Test = to_categorical(Y_Test, numCategories)

    # Train model as classifier
    model = artToMainTypeModel(numCategories)
    model.fit(X, Y, n_epoch=numEpochs, shuffle=True,
              validation_set=(X_Test, Y_Test),
              show_metric=True, batch_size=100, run_id='mtg_classifier')

def MNISTRNN():
    train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000, valid_portion=0.1)
    X_train, Y_train = train
    X_test, Y_test = test
    X_train = pad_sequences(X_train, maxlen=100, value=0.)
    X_test = pad_sequences(X_test, maxlen=100, value=0.)
    Y_train = to_categorical(Y_train, nb_classes=2)
    Y_test = to_categorical(Y_test, nb_classes=2)

    # LSTM
    RNN = tflearn.input_data([None, 100])
    RNN = tflearn.embedding(RNN, input_dim=10000, output_dim=128)
    RNN = tflearn.lstm(RNN, 128, dropout=0.8)
    RNN = tflearn.fully_connected(RNN, 2, activation='softmax')
    RNN = tflearn.regression(RNN, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # train
    model = tflearn.DNN(RNN, tensorboard_verbose=0,
                        tensorboard_dir='MINST_tflearn_board_RNN/')
    model.fit(X_train, Y_train, validation_set=(X_test, Y_test),
              show_metric=True, batch_size=32)

def do_cnn_word2vec_2d_345(trainX, testX, trainY, testY):
    global max_features
    global max_document_length
    print("CNN and word2vec_2d_345")
    y_test = testY

    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network
    network = input_data(shape=[None, max_document_length, max_features, 1], name='input')
    network = tflearn.embedding(network, input_dim=1, output_dim=128, validate_indices=False)
    branch1 = conv_2d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_2d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_2d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool_2d(network)
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')

    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              n_epoch=5, shuffle=True, validation_set=(testX, testY),
              show_metric=True, batch_size=100, run_id="sms")

    y_predict_list = model.predict(testX)
    print(y_predict_list)
    y_predict = []
    for i in y_predict_list:
        print(i[0])
        if i[0] > 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print(classification_report(y_test, y_predict))
    print(metrics.confusion_matrix(y_test, y_predict))

def do_rnn_wordbag(trainX, testX, trainY, testY):
    y_test = testY
    #trainX = pad_sequences(trainX, maxlen=100, value=0.)
    #testX = pad_sequences(testX, maxlen=100, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, 100])
    net = tflearn.embedding(net, input_dim=1000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.1)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.005,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=0.1, show_metric=True,
              batch_size=1, run_id="uba", n_epoch=10)

    y_predict_list = model.predict(testX)
    #print(y_predict_list)
    y_predict = []
    for i in y_predict_list:
        #print(i[0])
        if i[0] >= 0.5:
            y_predict.append(0)
        else:
            y_predict.append(1)

    print(classification_report(y_test, y_predict))
    print(metrics.confusion_matrix(y_test, y_predict))
    print(y_train)
    print("true")
    print(y_test)
    print("pre")
    print(y_predict)

def main():
    (X_train, y_train), (X_test, y_test), _ = imdb.load_data()
    X_train = np.array(pad_sequences(X_train, maxlen=100))
    X_test = np.array(pad_sequences(X_test, maxlen=100))
    vocab_size = X_train.max() + 1
    print('vocab size: {}'.format(vocab_size))
    y_train = to_categorical(np.array(y_train), 2)
    y_test = np.array(y_test)
    cnn = Discriminator(vocab_size, 100, 100, [2, 3], 50, 2)
    cnn.train(X_train, y_train, 5)

def create_datasets(file_path, vocab_size=30000, val_fraction=0.0):
    # IMDB Dataset loading
    train, test, _ = imdb.load_data(path=file_path, n_words=vocab_size,
                                    valid_portion=val_fraction, sort_by_len=False)
    trainX, trainY = train
    testX, testY = test

    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=FLAGS.max_len, value=0.)
    testX = pad_sequences(testX, maxlen=FLAGS.max_len, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    train_dataset = DataSet(trainX, trainY)
    return train_dataset

def load_data(self, ds):
    _ds = None
    if ds['name'] == 'mnist':
        from tflearn.datasets import mnist as _ds
        self._X, self._Y, self._test_X, self._test_Y = _ds.load_data(
            one_hot=ds.get('one_hot', False))
    if ds['name'] == 'cifar10':
        from tflearn.datasets import cifar10 as _ds
        (self._X, self._Y), (self._test_X, self._test_Y) = _ds.load_data(
            one_hot=ds.get('one_hot', False))
    from tflearn.data_utils import shuffle, to_categorical
    del _ds  # discard

    if 'reshape' in ds:
        self.reshape(ds['reshape'])
    if ds.get('shuffle', False):
        self._X, self._Y = shuffle(self._X, self._Y)
    if ds.get('to_categorical', False):
        self._Y = to_categorical(self._Y, None)
        self._test_Y = to_categorical(self._test_Y, None)
    return self

def pad_data(data, max_seq_len):
    """
    Padding each sentence of research data according to the max sentence length.
    Returns the padded data and data labels.
    :param data: The research data
    :param max_seq_len: The max sentence length of research data
    :returns: The padded data and data labels
    """
    data_front = pad_sequences(data.front_tokenindex, maxlen=max_seq_len, value=0.)
    data_behind = pad_sequences(data.behind_tokenindex, maxlen=max_seq_len, value=0.)
    labels = to_categorical(data.labels, nb_classes=2)
    return data_front, data_behind, labels

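# Usage sketch for pad_data() above. `ResearchData` is a hypothetical stand-in
# exposing the three attributes pad_data() reads; assumes tflearn's
# pad_sequences/to_categorical are in scope as in the function:
from collections import namedtuple

ResearchData = namedtuple('ResearchData', ['front_tokenindex', 'behind_tokenindex', 'labels'])
sample = ResearchData(front_tokenindex=[[3, 7, 1]], behind_tokenindex=[[2, 5]], labels=[1])
front, behind, onehot = pad_data(sample, max_seq_len=10)
# front.shape == (1, 10); onehot[0] is the one-hot vector for label 1
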
def lstm(trainX, trainY, testX, testY):
    # Data preprocessing
    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=100, value=0.)
    testX = pad_sequences(testX, maxlen=100, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, 100])
    net = tflearn.embedding(net, input_dim=10000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY),
              show_metric=True, batch_size=32, run_id="rnn-lstm")

def extract_data(filename):
    """Extract the images into a 4D tensor [image index, y, x, channels]."""
    print('Extracting', filename)
    # get data from h5py
    file = h5py.File(filename, 'r')
    train_data = file['train_data'].value
    train_label = file['train_label']
    test_data = file['test_data'].value
    test_label = file['test_label']
    train_label = np.int64(train_label)
    test_label = np.int64(test_label)
    train_num = train_data.shape[0]
    test_num = test_data.shape[0]

    # scale train data to [0, 1] and move the channel axis forward
    max, min = train_data.max(), train_data.min()
    train_data_new = (train_data - min) / (max - min)
    train_data_out = np.zeros([train_data.shape[0], train_data.shape[3],
                               train_data.shape[1], train_data.shape[2], 1])
    for i in range(train_data.shape[3]):
        train_data_out[:, i, :, :, :] = train_data_new[:, :, :, i]

    # same normalization for the test data
    max, min = test_data.max(), test_data.min()
    test_data_new = (test_data - min) / (max - min)
    test_data_out = np.zeros([test_data.shape[0], test_data.shape[3],
                              test_data.shape[1], test_data.shape[2], 1])
    for i in range(test_data.shape[3]):
        test_data_out[:, i, :, :, :] = test_data_new[:, :, :, i]

    train_data_out, train_label = shuffle(train_data_out, train_label)
    train_label = to_categorical(train_label, 20)
    test_label = to_categorical(test_label, 20)
    return train_data_out, train_label, test_data_out, test_label

def train_and_save_model():
    # Run this if pkl files already exist in directory pickle_files
    trainX, testX = dp.convert_reviews()
    trainY, testY = dp.get_sentiment_arrays()

    # AVG REVIEW LENGTH: 165.3178
    # REMOVE THIS JUNK
    print('trainX ' + str(trainX[0]))
    print('trainX ' + str(len(trainX[0])))
    print('trainY ' + str(trainY[0]))
    print('trainY ' + str(type(trainY[0])))

    # Sequence padding
    trainX = pad_sequences(trainX, maxlen=200, value=0.)
    testX = pad_sequences(testX, maxlen=200, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    deep_net = tfl.input_data([None, 200])
    deep_net = tfl.embedding(deep_net, input_dim=10000, output_dim=128)
    deep_net = tfl.lstm(deep_net, 128, dropout=0.8)
    deep_net = tfl.fully_connected(deep_net, 2, activation='softmax')
    deep_net = tfl.regression(deep_net, optimizer='adam', learning_rate=0.001,
                              loss='categorical_crossentropy')

    # Training 1ST RUN
    model = tfl.DNN(deep_net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY),
              show_metric=True, batch_size=32, n_epoch=20)
    model.save('./saved_models/model1.tfl')

def train(maxlen=100, embedding_dim=128):
    # Main train/predict routine
    start = time.time()
    l_trainX, r_trainX, ret_labels, l_topredictX, r_topredictX = do.load_data_bi_word2vec(
        maxlen=maxlen, words_keep=50000, validation_portion=0.,
        embedding_dim=embedding_dim, ma="A")
    trainY = to_categorical(ret_labels, nb_classes=3)
    del ret_labels

    lnet = tflearn.input_data([None, maxlen, embedding_dim])
    rnet = tflearn.input_data([None, maxlen, embedding_dim])
    lnet = tflearn.gru(lnet, embedding_dim, dropout=0.8, return_seq=False, dynamic=True)
    rnet = tflearn.gru(rnet, embedding_dim, dropout=0.8, return_seq=False, dynamic=True)
    net = tflearn.layers.merge_outputs([lnet, rnet])
    net = tflearn.fully_connected(net, 3, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit([l_trainX, r_trainX], trainY, validation_set=0.1,
              show_metric=True, batch_size=32)
    model.save('MODELS/E_W2V_GRU_TC{}_{}.dy'.format(embedding_dim, maxlen))
    # model.load('MODELS/E_W2V_GRU_TC{}_{}.dy'.format(embedding_dim, maxlen))
    del l_trainX
    del r_trainX
    del trainY

    idx2cla = {0: 'neu', 1: 'pos', 2: 'neg'}
    filename = "Result/result_{}.csv".format(datetime.datetime.now().strftime("%Y%m%d%H%M"))
    prefix = list(open('Result/A_AFTER_NRP_200', 'r').readlines())
    f = open(filename, 'w')
    f.write('SentenceId,View,Opinion\n')

    # predict in fixed-size chunks to bound memory usage
    a = [0, 5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 55000]
    b = [5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 55000, 65000]
    ANS = []
    for i in range(12):
        ans = model.predict([l_topredictX[a[i]:b[i]], r_topredictX[a[i]:b[i]]])
        ANS.extend([s for s in ans])
        print("ANS.LENGTH: {}".format(len(ans)))

    for i, r in enumerate(ANS):
        f.write(prefix[i].strip())
        idx = int(np.argmax(r))
        f.write(idx2cla[idx])
        k = ""
        for l in r:
            k += ',{:.4f}'.format(l)
        f.write(k)
        f.write('\n')
    f.close()

    end = time.time()
    print("TIME COST: {}".format(end - start))
    outf = vote_by_score(filename)
    add(outf)

def __prepareData(document, labels, vocabulary):
    cv = CountVectorizer(vocabulary=vocabulary)
    le = LabelEncoder()
    x = cv.fit_transform(document).toarray()
    y_vector = le.fit_transform(labels)
    classes = le.classes_
    num_classes = len(classes)
    y = to_categorical(y_vector, nb_classes=num_classes)
    return x, y, classes

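# Usage sketch for __prepareData() above, with illustrative values (assumes
# sklearn's CountVectorizer/LabelEncoder and tflearn's to_categorical are
# imported, as the function requires):
docs = ["spam spam ham", "ham eggs"]
doc_labels = ["spam", "ham"]
vocab = ["spam", "ham", "eggs"]
x, y, classes = __prepareData(docs, doc_labels, vocab)
# x is a (2, 3) term-count matrix, y a (2, 2) one-hot matrix, and
# classes == ['ham', 'spam'] (LabelEncoder sorts labels alphabetically)
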
def do_rnn_wordbag(trainX, testX, trainY, testY):
    global max_document_length
    print("RNN and wordbag")
    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY),
              show_metric=True, batch_size=10, run_id="review", n_epoch=5)

def do_cnn(trainX, testX, trainY, testY):
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=4)
    testY = to_categorical(testY, nb_classes=4)

    # Building convolutional network
    network = input_data(shape=[None, 32, 32, 1], name='input')
    network = conv_2d(network, 16, 3, activation='relu', regularizer="L2")
    network = max_pool_2d(network, 2)
    network = local_response_normalization(network)
    network = conv_2d(network, 16, 3, activation='relu', regularizer="L2")
    network = max_pool_2d(network, 2)
    network = local_response_normalization(network)
    network = fully_connected(network, 16, activation='tanh')
    network = dropout(network, 0.1)
    network = fully_connected(network, 16, activation='tanh')
    network = dropout(network, 0.1)
    network = fully_connected(network, 4, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.01,
                         loss='categorical_crossentropy', name='target')

    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY, n_epoch=10, validation_set=(testX, testY),
              show_metric=True, run_id="malware")

def load_train_data():
    train_dict = sio.loadmat(train_location)
    X = np.asarray(train_dict['X'])
    X_train = []
    for i in range(X.shape[3]):
        X_train.append(X[:, :, :, i])
    X_train = np.asarray(X_train)

    Y_train = train_dict['y']
    # remap label 10 to 0
    for i in range(len(Y_train)):
        if Y_train[i] % 10 == 0:
            Y_train[i] = 0
    Y_train = to_categorical(Y_train, 10)
    return (X_train, Y_train)

def load_test_data():
    test_dict = sio.loadmat(test_location)
    X = np.asarray(test_dict['X'])
    X_test = []
    for i in range(X.shape[3]):
        X_test.append(X[:, :, :, i])
    X_test = np.asarray(X_test)

    Y_test = test_dict['y']
    # remap label 10 to 0
    for i in range(len(Y_test)):
        if Y_test[i] % 10 == 0:
            Y_test[i] = 0
    Y_test = to_categorical(Y_test, 10)
    return (X_test, Y_test)

def generate_image_sets_for_single_digit(nb_sample=SAMPLE_SIZE, single_digit_index=0):
    captcha = ImageCaptcha()

    labels = []
    images = []
    for i in range(0, nb_sample):
        digits = 0
        last_digit = INVALID_DIGIT
        for j in range(0, DIGIT_COUNT):
            digit = last_digit
            while digit == last_digit:
                digit = random.randint(0, 9)
            last_digit = digit
            digits = digits * 10 + digit
        digits_as_str = DIGIT_FORMAT_STR % digits
        labels.append(digits_as_str)
        images.append(captcha.generate_image(digits_as_str))

    digit_labels = list()
    for digit_index in range(0, DIGIT_COUNT):
        digit_labels.append(np.empty(nb_sample, dtype="int8"))

    shape = (nb_sample, IMAGE_STD_HEIGHT, IMAGE_STD_WIDTH, RGB_COLOR_COUNT)
    digit_image_data = np.empty(shape, dtype="float32")
    for index in range(0, nb_sample):
        img = images[index].resize((IMAGE_STD_WIDTH, IMAGE_STD_HEIGHT), PIL.Image.LANCZOS)
        img_arr = np.asarray(img, dtype="float32") / 255.0
        digit_image_data[index, :, :, :] = img_arr
        for digit_index in range(0, DIGIT_COUNT):
            digit_labels[digit_index][index] = labels[index][digit_index]

    x = digit_image_data
    y = to_categorical(digit_labels[single_digit_index], CLASS_COUNT)
    return x, y

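# Usage sketch (illustrative; assumes the module-level constants used above —
# SAMPLE_SIZE, DIGIT_COUNT, IMAGE_STD_HEIGHT/WIDTH, RGB_COLOR_COUNT,
# CLASS_COUNT, INVALID_DIGIT, DIGIT_FORMAT_STR — are defined elsewhere):
#   x, y = generate_image_sets_for_single_digit(nb_sample=32, single_digit_index=0)
#   # x: (32, IMAGE_STD_HEIGHT, IMAGE_STD_WIDTH, RGB_COLOR_COUNT) floats in [0, 1]
#   # y: (32, CLASS_COUNT) one-hot labels for the first digit of each captcha
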
def load_dataset(x_count, y_count):
    print('[+] Loading data')
    X = []
    Y = []
    places = set()
    data = np.load('grid/data-{0}-{1}.npy'.format(x_count, y_count))
    for row in data:
        x = list(map(float, row[1:5]))
        time = row[4]
        x.extend([
            (time // 60) % 24 + 1,     # Hour
            (time // 1440) % 7 + 1,    # Day
            (time // 43200) % 12 + 1,  # Month
            (time // 525600) + 1       # Year
        ])
        X.append(x)
        Y.append(row[5])
        places.add(row[5])
    places = list(places)
    Y = [places.index(y) for y in Y]
    Y = to_categorical(Y, len(places))
    print('[+] All data loaded')
    return X, Y

def generate_image_sets_for_multi_digits(nb_sample=SAMPLE_SIZE):
    captcha = ImageCaptcha()

    labels = []
    images = []
    for i in range(0, nb_sample):
        digits = 0
        last_digit = INVALID_DIGIT
        for j in range(0, DIGIT_COUNT):
            digit = last_digit
            while digit == last_digit:
                digit = random.randint(0, 9)
            last_digit = digit
            digits = digits * 10 + digit
        digits_as_str = DIGIT_FORMAT_STR % digits
        labels.append(digits_as_str)
        images.append(captcha.generate_image(digits_as_str))

    digit_labels = np.empty((nb_sample, DIGIT_COUNT), dtype="int8")

    shape = (nb_sample, IMAGE_STD_HEIGHT, IMAGE_STD_WIDTH, RGB_COLOR_COUNT)
    digit_image_data = np.empty(shape, dtype="float32")
    for index in range(0, nb_sample):
        img = images[index].resize((IMAGE_STD_WIDTH, IMAGE_STD_HEIGHT), PIL.Image.LANCZOS)
        img_arr = np.asarray(img, dtype="float32") / 255.0
        digit_image_data[index, :, :, :] = img_arr
        for digit_index in range(0, DIGIT_COUNT):
            digit_labels[index][digit_index] = labels[index][digit_index]

    x, y_as_num = digit_image_data, np.rollaxis(digit_labels, 1)
    y = {(OUT_PUT_NAME_FORMAT % i): to_categorical(y_as_num[i], CLASS_COUNT)
         for i in range(0, DIGIT_COUNT)}
    # y = [to_categorical(y_as_num[i], CLASS_COUNT) for i in range(0, DIGIT_COUNT)]

    return x, y

""" from __future__ import division, print_function, absolute_import import tflearn from tflearn.data_utils import shuffle, to_categorical from tflearn.layers.core import input_data, dropout, fully_connected from tflearn.layers.conv import conv_2d, max_pool_2d from tflearn.layers.estimator import regression from tflearn.data_preprocessing import ImagePreprocessing from tflearn.data_augmentation import ImageAugmentation # Data loading and preprocessing from tflearn.datasets import cifar10 (X, Y), (X_test, Y_test) = cifar10.load_data() X, Y = shuffle(X, Y) Y = to_categorical(Y) Y_test = to_categorical(Y_test) # Real-time data preprocessing img_prep = ImagePreprocessing() img_prep.add_featurewise_zero_center() img_prep.add_featurewise_stdnorm() # Real-time data augmentation img_aug = ImageAugmentation() img_aug.add_random_flip_leftright() img_aug.add_random_rotation(max_angle=25.) # Convolutional network building network = input_data(shape=[None, 32, 32, 3], data_preprocessing=img_prep,
        v = values.split('/')
        data[v[3]] = {}
        data[v[3]]['vector'] = process(values)
        data[v[3]]['class'] = line.split()[1]
    return data

testset = load_data(val)
trainset = load_data(train)

import pandas as pd
import numpy as np

test = pd.DataFrame(testset)
trainset = pd.DataFrame(trainset)

from tflearn.data_utils import shuffle, to_categorical
trainY = to_categorical(np.array(trainset.loc['class']), nb_classes=5)
testY = to_categorical(np.array(test.loc['class']), nb_classes=5)

import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.normalization import local_response_normalization
from tflearn.layers.estimator import regression

network = input_data(shape=[None, 32, 32, 3], name='input')
network = conv_2d(network, 32, 3, activation='relu', regularizer="L2")
network = max_pool_2d(network, 3)
network = local_response_normalization(network)
network = conv_2d(network, 64, 3, activation='relu', regularizer="L2")
network = max_pool_2d(network, 3)

# (leading imports completed so the snippet runs standalone)
import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.embedding_ops import embedding
from tflearn.layers.recurrent import bidirectional_rnn, BasicLSTMCell
from tflearn.layers.estimator import regression

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=200, value=0.)
testX = pad_sequences(testX, maxlen=200, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Network building
net = input_data(shape=[None, 200])
net = embedding(net, input_dim=20000, output_dim=128)
net = bidirectional_rnn(net, BasicLSTMCell(128), BasicLSTMCell(128))
net = dropout(net, 0.5)
net = fully_connected(net, 2, activation='softmax')
net = regression(net, optimizer='adam', loss='categorical_crossentropy')

# Training
model = tflearn.DNN(net, clip_gradients=0., tensorboard_verbose=2)
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=64)

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading (downloads automatically)
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)
trainX, trainY = train  # a list of 22500 elements; each looks like [17, 25, 10, 406, 26, 14, 556, 61, 62, 323, 4] — word indices into the vocabulary
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)  # zero-pad every element of the list to length 100
testX = pad_sequences(testX, maxlen=100, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)  # turns label 0 into [1, 0] and label 1 into [0, 1]
testY = to_categorical(testY, nb_classes=2)

# Network building
net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=0.8)
net = tflearn.fully_connected(net, 2, activation='softmax')
net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy')

# Training
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=(testX, testY),
          show_metric=True, batch_size=32)

nb_classes = 10
neurons = 4000
epochs = 200

# the data, shuffled and split between train and test sets
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(60000, 784)
X_test = X_test.reshape(10000, 784)
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
X_train /= 255
X_test /= 255

# convert class vectors to binary class matrices
Y_train = to_categorical(y_train, nb_classes)
Y_test = to_categorical(y_test, nb_classes)

X_train, X_val = X_train[:-10000], X_train[-10000:]
Y_train, Y_val = Y_train[:-10000], Y_train[-10000:]

print(X_train.shape[0], 'train samples')
print(X_val.shape[0], 'validation samples')
print(X_test.shape[0], 'test samples')
print(Y_train.shape, Y_test.shape)

# TFLearn Network
network = input_data(shape=[None, 784], name='input')

""" from __future__ import division, print_function, absolute_import import tflearn import tflearn.data_utils as du # Data loading from tflearn.datasets import cifar10 (X, Y), (testX, testY) = cifar10.load_data() # Data pre-processing X, mean = du.featurewise_zero_center(X) X, std = du.featurewise_std_normalization(X) testX = du.featurewise_zero_center(testX, mean) testX = du.featurewise_std_normalization(testX, std) Y = du.to_categorical(Y, 10) testY = du.to_categorical(testY, 10) # Building Residual Network net = tflearn.input_data(shape=[None, 32, 32, 3]) net = tflearn.conv_2d(net, 32, 3) net = tflearn.batch_normalization(net) net = tflearn.activation(net, 'relu') net = tflearn.shallow_residual_block(net, 4, 32, regularizer='L2') net = tflearn.shallow_residual_block(net, 1, 32, downsample=True, regularizer='L2') net = tflearn.shallow_residual_block(net, 4, 64, regularizer='L2') net = tflearn.shallow_residual_block(net, 1, 64, downsample=True, regularizer='L2') net = tflearn.shallow_residual_block(net, 5, 128, regularizer='L2') net = tflearn.global_avg_pool(net)
from __future__ import division, print_function, absolute_import

import tflearn
from tflearn.data_utils import shuffle, to_categorical
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.estimator import regression
from tflearn.data_preprocessing import ImagePreprocessing
from tflearn.data_augmentation import ImageAugmentation

# Data loading and pre-processing
from tflearn.datasets import cifar10
(X, Y), (X_test, Y_test) = cifar10.load_data()
X, Y = shuffle(X, Y)
Y = to_categorical(Y, 10)
Y_test = to_categorical(Y_test, 10)

# Data preprocessing
img_prep = ImagePreprocessing()
img_prep.add_featurewise_zero_center()
img_prep.add_featurewise_stdnorm()

# Data augmentation
img_aug = ImageAugmentation()
img_aug.add_random_flip_leftright()
img_aug.add_random_rotation()

# Building the CNN
network = input_data(shape=[None, 32, 32, 3],
                     data_preprocessing=img_prep,
                     data_augmentation=img_aug,
                     name='first_layer')
network = max_pool_2d(network, 2)  # Max pooling layer

# (leading imports completed so the snippet runs standalone)
import tensorflow as tf
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, global_max_pool
from tflearn.layers.merge_ops import merge
from tflearn.layers.estimator import regression
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# IMDB Dataset loading
train, test, _ = imdb.load_data(path='imdb.pkl', n_words=10000,
                                valid_portion=0.1)
trainX, trainY = train
testX, testY = test

# Data preprocessing
# Sequence padding
trainX = pad_sequences(trainX, maxlen=100, value=0.)
testX = pad_sequences(testX, maxlen=100, value=0.)
# Converting labels to binary vectors
trainY = to_categorical(trainY)
testY = to_categorical(testY)

# Building convolutional network
network = input_data(shape=[None, 100], name='input')
network = tflearn.embedding(network, input_dim=10000, output_dim=128)
branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
network = merge([branch1, branch2, branch3], mode='concat', axis=1)
network = tf.expand_dims(network, 2)
network = global_max_pool(network)
network = dropout(network, 0.5)
network = fully_connected(network, 2, activation='softmax')
network = regression(network, optimizer='adam', learning_rate=0.001,
                     loss='categorical_crossentropy', name='target')

print('Read content')

def load_content(file_name):
    with open(file_name) as f:
        return f.read()

X = []
for i in range(MAX_FILE_ID):
    file_name = data_dir + '/' + str(i + 1)
    if os.path.isfile(file_name):
        X.append(load_content(file_name))

X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=0.2, random_state=2017)
Y_train = to_categorical(Y_train, nb_classes=len(qualities))
Y_test = to_categorical(Y_test, nb_classes=len(qualities))

### Process vocabulary
print('Process vocabulary')

vocab_processor = tflearn.data_utils.VocabularyProcessor(max_document_length=model_size, min_frequency=0)
X_train = np.array(list(vocab_processor.fit_transform(X_train)))
# use transform (not fit_transform) on the test split so the vocabulary
# learned from the training data is reused
X_test = np.array(list(vocab_processor.transform(X_test)))

X_train = pad_sequences(X_train, maxlen=model_size, value=0.)
X_test = pad_sequences(X_test, maxlen=model_size, value=0.)

n_words = len(vocab_processor.vocabulary_)
print('Total words: %d' % n_words)