def input_data():
    df = import_data()  # import dataframe
    divide = 3584  # index at which to split the data into train and test
    df.drop('dateandtime', inplace=True, axis=1)
    # Convert object type to float type
    df.NGP = pandas.to_numeric(pandas.Series(df.NGP), errors='coerce')
    df.EGT = pandas.to_numeric(pandas.Series(df.EGT), errors='coerce')
    df.WF = pandas.to_numeric(pandas.Series(df.WF), errors='coerce')
    df.dropna(inplace=True)
    # Split the data into train and test without shuffling
    train_X = numpy.array(df.values[:divide, 0:3])
    train_Y_p = numpy.array(df.values[:divide, 3:])
    test_X = numpy.array(df.values[divide:, 0:3])
    test_Y_p = numpy.array(df.values[divide:, 3:])
    # Prepare one-hot encoding
    train_Y_p = train_Y_p.astype('int')  # Convert to int type (train data)
    test_Y_p = test_Y_p.astype('int')  # Convert to int type (test data)
    trf = train_Y_p.ravel()  # Must be 1-dimensional for one-hot encoding
    tref = test_Y_p.ravel()  # Must be 1-dimensional for one-hot encoding
    traf_Y = one_hot(trf, num_labels=3)  # num_labels must match the number of classes (0, 1, 2)
    tres_Y = one_hot(tref, num_labels=3)  # num_labels must match the number of classes (0, 1, 2)
    train_Y_en = numpy.array(traf_Y)  # one-hot numpy array (train data)
    test_Y_en = numpy.array(tres_Y)  # one-hot numpy array (test data)
    print(train_Y_en)
    # The one-hot layout:
    # _______________________________
    # |__good__|_anomaly_|_anomaly1_|
    # |____1___|____0____|_____0____|
    # |____0___|____1____|_____0____|
    # |____0___|____0____|_____1____|
    return train_X, test_X, train_Y_en, test_Y_en
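For reference, a minimal sketch (assuming mlxtend's one_hot, which the snippet appears to use) of the encoding the table above describes: each class index becomes a row with a single 1.

from mlxtend.preprocessing import one_hot
import numpy as np

# Classes 0 (good), 1 (anomaly), 2 (anomaly1) map to the rows of the table above
print(one_hot(np.array([0, 1, 2]), num_labels=3))
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]]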
def load_npy_data(metadata_filename, npy_dataroot, speaker_id):
    # print("Loading data...", end="")
    files = get_file_list(metadata_filename, npy_dataroot, speaker_id)
    # print("Done!")
    # print("File length:{}".format(len(files)))
    random_files = randomize_file(files)
    for each in random_files:
        wav = np.squeeze(np.load(each[0]))
        local_condition = np.load(each[1])
        if hparams.triphone:
            # One-hot encode the previous, current, and next phone separately
            pre_phone = one_hot(local_condition[0], num_labels=hparams.lc_initial_channels)
            cur_phone = one_hot(local_condition[1], num_labels=hparams.lc_initial_channels)
            nxt_phone = one_hot(local_condition[2], num_labels=hparams.lc_initial_channels)
            local_condition = np.concatenate((pre_phone, cur_phone, nxt_phone), axis=1)
        else:
            local_condition = one_hot(local_condition, num_labels=hparams.lc_initial_channels)
        local_condition = lc_averaging(local_condition)
        global_condition = each[2]
        yield wav, local_condition, global_condition
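A toy shape check of the triphone branch above; C and the phone indices are made-up stand-ins for hparams.lc_initial_channels and real alignment data:

import numpy as np
from mlxtend.preprocessing import one_hot

C = 4  # stand-in for hparams.lc_initial_channels
pre, cur, nxt = [0, 1], [1, 2], [2, 3]  # toy phone indices for two frames
lc = np.concatenate((one_hot(pre, num_labels=C),
                     one_hot(cur, num_labels=C),
                     one_hot(nxt, num_labels=C)), axis=1)
print(lc.shape)  # (2, 12): three C-wide one-hot blocks per frame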
def next_batch_train(self, batch_size):
    start_humain = self._index_in_epoch_humain
    start_robot = self._index_in_epoch_robot
    self._index_in_epoch_humain += int(batch_size / 2)
    self._index_in_epoch_robot += int(batch_size / 2)
    if self._index_in_epoch_humain > self._num_examples_humain:
        # Shuffle the data
        perm = [i for i in range(self._num_examples_humain)]
        numpy.random.shuffle(perm)
        self._images_humain = self._images_humain[perm]
        # Start next epoch
        start_humain = 0
        self._index_in_epoch_humain = int(batch_size / 2)
        assert batch_size <= self._num_examples_humain
    if self._index_in_epoch_robot > self._num_examples_robot:
        # Shuffle the data
        perm = [i for i in range(self._num_examples_robot)]
        numpy.random.shuffle(perm)
        self._images_robot = self._images_robot[perm]
        # Start next epoch
        start_robot = 0
        self._index_in_epoch_robot = int(batch_size / 2)
        assert batch_size <= self._num_examples_robot
    end_humain = self._index_in_epoch_humain
    end_robot = self._index_in_epoch_robot
    train_data = numpy.concatenate((self._images_humain[start_humain:end_humain],
                                    self._images_robot[start_robot:end_robot]),
                                   axis=0)
    # Label human samples 1 and robot samples 0, then one-hot encode
    label_data_tem1 = [1 for i in range(end_humain - start_humain)]
    label_data_tem2 = [0 for i in range(end_robot - start_robot)]
    label_data_tem1.extend(label_data_tem2)
    label_data = one_hot(label_data_tem1)
    # Shuffle so human and robot samples are interleaved within the batch
    perm_data = [i for i in range(len(train_data))]
    numpy.random.shuffle(perm_data)
    train_data = train_data[perm_data]
    label_data = label_data[perm_data]
    return train_data, label_data
def list_images(directory):
    """
    Get all the images and labels in directory/label/*.png
    """
    labels = os.listdir(directory)
    # Sort the labels so that training and test get them in the same order
    labels.sort()
    files_and_labels = []
    for label in labels:
        for f in os.listdir(os.path.join(directory, label)):
            files_and_labels.append((os.path.join(directory, label, f), label))
    filenames, labels = zip(*files_and_labels)
    filenames = list(filenames)
    labels = list(labels)
    unique_labels = list(set(labels))
    label_to_int = {}
    for i, label in enumerate(unique_labels):
        label_to_int[label] = i
    labels = [label_to_int[l] for l in labels]
    labels = one_hot(labels, num_labels=n_classes, dtype=np.float32)
    return filenames, labels
def test_autoguessing():
    y = np.array([0, 4, 0, 4])
    expect = np.array([[1., 0., 0., 0., 0.],
                       [0., 0., 0., 0., 1.],
                       [1., 0., 0., 0., 0.],
                       [0., 0., 0., 0., 1.]], dtype='float')
    out = one_hot(y)
    np.testing.assert_array_equal(expect, out)
def multiplet_to_ohe(multiplet):
    multiplet_map = {'S': 0, 'D': 1, 'T': 2, 'Q': 3}
    assert multiplet in multiplet_map.keys(), \
        "Multiplet %s not recognized" % multiplet
    num_labels = len(multiplet_map.keys())
    val = multiplet_map[multiplet]
    return np.squeeze(
        one_hot(np.array([val]), dtype='int', num_labels=num_labels))
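A quick usage check: the 'D' multiplet maps to index 1 of 4, so the squeezed one-hot row is:

print(multiplet_to_ohe('D'))  # [0 1 0 0]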
def test_list():
    y = [0, 1, 2, 3, 4, 2]
    expect = np.array([[1., 0., 0., 0., 0.],
                       [0., 1., 0., 0., 0.],
                       [0., 0., 1., 0., 0.],
                       [0., 0., 0., 1., 0.],
                       [0., 0., 0., 0., 1.],
                       [0., 0., 1., 0., 0.]], dtype='float')
    out = one_hot(y)
    np.testing.assert_array_equal(expect, out)
def make_one_hot(labels):
    unique_classes = np.unique(labels)
    mapper = {c: i for i, c in enumerate(unique_classes)}
    new_labels = [mapper[x] for x in labels]
    new_labels_oh = one_hot(new_labels)
    return mapper, new_labels_oh
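A small usage sketch for make_one_hot; note that np.unique sorts the classes, so the mapping is alphabetical regardless of input order:

mapper, encoded = make_one_hot(['dog', 'cat', 'dog'])
# mapper assigns 'cat' -> 0 and 'dog' -> 1 (np.unique sorts the classes)
print(encoded)
# [[0. 1.]
#  [1. 0.]
#  [0. 1.]]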
def one_hot_encoding(self, question_code, answer_code):
    question = self.find_question_from_question_list(question_code)
    answers = question["answers"]
    dictMap = {}
    index = 0
    for answer in answers:
        dictMap[answer["code"]] = index
        index += 1
    one_hot_result = one_hot([dictMap[answer_code]],
                             num_labels=len(question["answers"]))
    return one_hot_result[0]
def load_imdb2(folder):
    x_text = list()
    y_text = list()
    # Read positive (label 1) and negative (label 0) reviews from the test and train splits
    for subdir, label in [('/test/pos', 1), ('/test/neg', 0),
                          ('/train/pos', 1), ('/train/neg', 0)]:
        for file in os.listdir(folder + subdir):
            with open(folder + subdir + '/' + file, 'r') as review_file:
                x_text.append(clean_str(review_file.readline()))
                y_text.append(label)
    x_text = [x[:100] for x in x_text]
    Y = one_hot(y_text, dtype='int')
    # Build vocabulary
    max_document_length = max([len(x) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    X = np.array(list(vocab_processor.fit_transform(x_text)))
    X = sequence.pad_sequences(X, maxlen=max_document_length, padding='post')
    # Integer division so the slice indices are valid in Python 3
    X_TRAIN = X[:len(X) * 9 // 10]
    Y_TRAIN = Y[:len(Y) * 9 // 10]
    X_DEV = X[len(X) * 9 // 10:len(X) * 95 // 100]
    Y_DEV = Y[len(Y) * 9 // 10:len(Y) * 95 // 100]
    X_TEST = X[len(X) * 95 // 100:]
    Y_TEST = Y[len(Y) * 95 // 100:]
    return (X_TRAIN, Y_TRAIN), (X_DEV, Y_DEV), (X_TEST, Y_TEST), \
        len(vocab_processor.vocabulary_) + 1, max_document_length
def one_hot_encoding(labels):
    """Do one-hot encoding for all categories."""
    label_counts = Counter(labels)
    label_num = len(label_counts)
    label_dict = {k: i for i, k in enumerate(label_counts.keys())}
    reverse_label_dict = {i: k for i, k in enumerate(label_counts.keys())}
    int_labels = list(map(lambda x: label_dict[x], labels))
    one_hot_labels = one_hot(int_labels)
    return one_hot_labels, int_labels
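Unlike the np.unique-based helper above, this mapping follows first-occurrence order (Counter preserves insertion order on Python 3.7+); a quick check:

one_hot_labels, int_labels = one_hot_encoding(['b', 'a', 'b'])
print(int_labels)      # [0, 1, 0]  ('b' was seen first, so it gets index 0)
print(one_hot_labels)
# [[1. 0.]
#  [0. 1.]
#  [1. 0.]]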
def build_one_hot(train_data, test_data, vocab):
    """
    One-Hot Representation Builder

    Converts the integer tokenized representation into a one-hot representation.

    :param train_data: Training data set
    :type train_data: pd.DataFrame
    :param test_data: Test data set
    :type test_data: pd.DataFrame
    :param vocab: Full vocabulary for the training and test sets
    :type vocab: dict
    """
    for df in [train_data, test_data]:
        # one_hot = enc.transform(train_data[const.COL_TWEET_TRANSFORM])
        df[const.COL_ONE_HOT] = df[const.COL_TWEET_INT_TRANSFORM].apply(
            lambda x: one_hot(x, num_labels=len(vocab)))
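A hypothetical usage sketch for build_one_hot; the column names and vocabulary below are stand-ins for the project's const values, not its real configuration:

import pandas as pd
from mlxtend.preprocessing import one_hot

vocab = {'a': 0, 'b': 1, 'c': 2}  # assumed toy vocabulary
# 'tweet_int' stands in for const.COL_TWEET_INT_TRANSFORM
train = pd.DataFrame({'tweet_int': [[0, 2], [1, 1]]})
train['one_hot'] = train['tweet_int'].apply(lambda x: one_hot(x, num_labels=len(vocab)))
print(train['one_hot'][0])
# [[1. 0. 0.]
#  [0. 0. 1.]]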
def load_trec(dev, test):
    categories = {
        "ABBR": 0,
        "ENTY": 1,
        "DESC": 2,
        "HUM": 3,
        "LOC": 4,
        "NUM": 5
    }
    x_train = list()
    y_train = list()
    for line in [line.split(" ", 1) for line in open(dev).readlines()]:
        i = line[0].split(":")
        y_train.append(categories[i[0]])
        x_train.append(clean_str(line[1]))
    for line in [line.split(" ", 1) for line in open(test).readlines()]:
        i = line[0].split(":")
        y_train.append(categories[i[0]])
        x_train.append(clean_str(line[1]))
    # Generate labels
    X = x_train
    Y = one_hot(y_train, dtype='int')
    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in X])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    X = np.array(list(vocab_processor.fit_transform(X)))
    X = sequence.pad_sequences(X, maxlen=max_document_length, padding='post')
    # Integer division so the slice indices are valid in Python 3
    X_TRAIN = X[:len(X) * 9 // 10]
    Y_TRAIN = Y[:len(Y) * 9 // 10]
    X_DEV = X[len(X) * 9 // 10:len(X) * 95 // 100]
    Y_DEV = Y[len(Y) * 9 // 10:len(Y) * 95 // 100]
    X_TEST = X[len(X) * 95 // 100:]
    Y_TEST = Y[len(Y) * 95 // 100:]
    return (X_TRAIN, Y_TRAIN), (X_DEV, Y_DEV), (X_TEST, Y_TEST), \
        len(vocab_processor.vocabulary_) + 1, max_document_length
def load_npy_data(metadata_filename, npy_dataroot, speaker_id):
    # print("Loading data...", end="")
    files = get_file_list(metadata_filename, npy_dataroot, speaker_id)
    # print("Done!")
    # print("File length:{}".format(len(files)))
    random_files = randomize_file(files)
    for each in random_files:
        wav = np.squeeze(np.load(each[0]))
        wav = np.transpose(wav)
        local_condition = one_hot(np.load(os.path.normpath(each[1])),
                                  num_labels=hparams.lc_initial_channels)
        local_condition = lc_averaging(local_condition)
        global_condition = each[2]
        yield wav, local_condition, global_condition
def backward(self, input, label):
    local_biases = [np.zeros(b.shape) for b in self.biases]
    local_weights = [np.zeros(w.shape) for w in self.weights]
    input = input.reshape(len(input), 1)
    # Forward pass
    activations, pre_activations = self.forward(input)
    # Error at the last layer: activation minus the one-hot target
    delta = activations[-1] - one_hot([label], 10).reshape(10, 1)
    local_biases[-1] = delta
    local_weights[-1] = np.dot(delta, activations[-2].transpose())
    for l in range(2, self.total_layers):
        z = pre_activations[-l]
        sp = self.sigmoid_derivative(z)
        delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
        local_biases[-l] = delta
        local_weights[-l] = np.dot(delta, activations[-l - 1].transpose())
    return (local_biases, local_weights)
def custom_alg(self, question_code, answer_code):
    question = self.find_question_from_question_list(question_code)
    # Bucket the last two digits of the answer code into one of five bins:
    # 1 -> 0, 2-3 -> 1, 4-7 -> 2, 10-19 -> 3, 20-39 -> 4; anything else is an error
    value = 0
    code_num = int(answer_code[-2:])
    if code_num // 1 == 1:
        value = 0
    elif code_num // 2 == 1:
        value = 1
    elif code_num // 4 == 1:
        value = 2
    elif code_num // 10 == 1:
        value = 3
    elif code_num // 20 == 1:
        value = 4
    else:
        print("error")
    one_hot_result = one_hot([value], num_labels=len(question["answers"]))
    return one_hot_result[0]
def load_sst2(train, dev, test):
    x_train = list()
    y_train = list()
    for line in [line.split(",", 1) for line in open(train).readlines()]:
        y_train.append(int(line[0]) - 1)
        x_train.append(clean_str(line[1]))
    for line in [line.split(",", 1) for line in open(dev).readlines()]:
        y_train.append(int(line[0]) - 1)
        x_train.append(clean_str(line[1]))
    for line in [line.split(",", 1) for line in open(test).readlines()]:
        y_train.append(int(line[0]) - 1)
        x_train.append(clean_str(line[1]))
    # Generate labels
    X = x_train
    Y = one_hot(y_train, dtype='int')
    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in X])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    X = np.array(list(vocab_processor.fit_transform(X)))
    X = sequence.pad_sequences(X, maxlen=max_document_length, padding='post')
    # Integer division so the slice indices are valid in Python 3
    X_TRAIN = X[:len(X) * 9 // 10]
    Y_TRAIN = Y[:len(Y) * 9 // 10]
    X_DEV = X[len(X) * 9 // 10:len(X) * 95 // 100]
    Y_DEV = Y[len(Y) * 9 // 10:len(Y) * 95 // 100]
    X_TEST = X[len(X) * 95 // 100:]
    Y_TEST = Y[len(Y) * 95 // 100:]
    return (X_TRAIN, Y_TRAIN), (X_DEV, Y_DEV), (X_TEST, Y_TEST), \
        len(vocab_processor.vocabulary_) + 1, max_document_length
def preprocess_dataset(ds_name):
    # Import the dataset and drop rows containing inf or missing values
    dataset = pd.read_csv(ds_name)
    dataset = dataset[dataset.replace([np.inf, -np.inf], np.nan).notnull().all(axis=1)]
    X = dataset.iloc[:, :-1].values
    y = dataset.iloc[:, -1].values
    # attack types: ['BENIGN' 'DoS GoldenEye' 'DoS Hulk' 'DoS Slowhttptest' 'DoS slowloris' 'Heartbleed']
    # X = np.copy(X[:, :])
    Y = np.copy(y)
    labelencoder_y = LabelEncoder()
    # final Y for splitting
    Y = labelencoder_y.fit_transform(Y)
    Y = one_hot(Y)
    # Feature scaling
    sc = MinMaxScaler(feature_range=(0, 1))
    # final X for splitting
    X = X.astype(float)
    X[X == -np.inf] = 0
    # Clamp values that would overflow float64
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            if float(X[i, j]) >= 1.7976931348623157e+308:
                X[i, j] = 1.7976931348623157e+308
    # NaN never compares equal to anything, so use np.isnan rather than X == nan
    X[np.isnan(X)] = 1
    X = sc.fit_transform(X)
    return X, Y
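The LabelEncoder-plus-one_hot pattern used here amounts to a dense one-hot encoder for string class labels; a minimal sketch, assuming mlxtend is installed:

from sklearn.preprocessing import LabelEncoder
from mlxtend.preprocessing import one_hot

y = ['BENIGN', 'DoS Hulk', 'BENIGN']
print(one_hot(LabelEncoder().fit_transform(y)))
# [[1. 0.]
#  [0. 1.]
#  [1. 0.]]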
for question in questions:
    if question["code"] == v:
        answer_length = len(question["answers"])
        answers = question["answers"]
        dictMap = {}
        index = 0
        for answer in answers:
            dictMap[answer["code"]] = index
            index += 1
        data_array = list(map(str, cols))
        stringToInt = []
        for data in data_array:
            stringToInt.append(dictMap[data])
        one_hot_result = one_hot(stringToInt, num_labels=answer_length)
        if len(processed_data) == 0:
            processed_data = one_hot_result
        else:
            processed_data = np.append(processed_data, one_hot_result, axis=1)
        num += answer_length
        print("code:", v, "num:", answer_length)
pass

if code == '00112':  # Multiple-choice category allowing multiple selections: sum algorithm
    # Apply a different algorithm to each question
    for question in questions:
        if question["code"] == v:
def test_list_morelabels():
    y = [0, 1]
    expect = np.array([[1., 0., 0.],
                       [0., 1., 0.]], dtype='float')
    out = one_hot(y, num_labels=3)
    np.testing.assert_array_equal(expect, out)
def test_multidim_list():
    y = [[0, 1, 2, 3, 4, 2]]
    with pytest.raises(AttributeError):
        one_hot(y)
def main():
    # Placeholders; targets must be float for softmax_cross_entropy_with_logits
    images = tf.placeholder(tf.float32, [None, 28, 28])
    targets = tf.placeholder(tf.float32, [None, 10])
    keep_prob = tf.placeholder(tf.float32)

    # Weights
    W_conv1 = weight_variable([3, 3, 1, 16])
    b_conv1 = bias_variable([16])
    W_conv2 = weight_variable([3, 3, 16, 32])
    b_conv2 = bias_variable([32])
    hidden_units = (7 * 7 * 32 + 10) // 2
    W_hidden = weight_variable([7 * 7 * 32, hidden_units])
    b_hidden = bias_variable([hidden_units])
    W_output = weight_variable([hidden_units, 10])
    b_output = bias_variable([10])
    weights = [
        W_conv1, b_conv1,
        W_conv2, b_conv2,
        W_hidden, b_hidden,
        W_output, b_output,
    ]

    # Forward
    x = tf.reshape(images, [-1, 28, 28, 1])
    x = max_pool(tf.nn.relu(conv2d(x, W_conv1) + b_conv1))
    x = max_pool(tf.nn.relu(conv2d(x, W_conv2) + b_conv2))
    x = tf.reshape(x, [-1, 7 * 7 * 32])
    x = tf.nn.dropout(x, keep_prob)
    x = tf.nn.relu(tf.matmul(x, W_hidden) + b_hidden)
    x = tf.nn.dropout(x, keep_prob)
    outputs = tf.matmul(x, W_output) + b_output

    # Loss
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=outputs, labels=targets))
    optimizer = tf.train.AdamOptimizer(1e-3).minimize(loss)

    # Accuracy
    correct = tf.equal(tf.argmax(outputs, 1), tf.argmax(targets, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    with tf.Session() as sess:
        batch_size = 64

        # Training
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(weights, max_to_keep=1)
        X, y = load_train_data()
        y = one_hot(y)
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)
        best_val_acc = -1
        patience_count = 0
        for epoch in range(1, 1001):
            X_train, y_train = shuffle(X_train, y_train)
            X_batches = np.array_split(X_train, X_train.shape[0] // batch_size)
            y_batches = np.array_split(y_train, y_train.shape[0] // batch_size)
            loss_sum = acc_sum = 0.0
            for X_batch, y_batch in zip(X_batches, y_batches):
                loss_batch, acc_batch, _ = sess.run(
                    [loss, accuracy, optimizer],
                    feed_dict={images: X_batch, targets: y_batch, keep_prob: 0.5})
                loss_sum += loss_batch * X_batch.shape[0]
                acc_sum += acc_batch * X_batch.shape[0]
            # Average over the training examples actually seen
            acc = acc_sum / X_train.shape[0]
            X_batches = np.array_split(X_val, X_val.shape[0] // batch_size)
            y_batches = np.array_split(y_val, y_val.shape[0] // batch_size)
            acc_sum = 0.0
            for X_batch, y_batch in zip(X_batches, y_batches):
                acc_batch = sess.run(
                    accuracy,
                    feed_dict={images: X_batch, targets: y_batch, keep_prob: 1.0})
                acc_sum += acc_batch * X_batch.shape[0]
            val_acc = acc_sum / X_val.shape[0]
            patience_count += 1
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                patience_count = 0
                saver.save(sess, 'tensorflow_convnet')
            msg = 'Epoch {:04d} - loss: {:.6g} - acc: {:.6g} - val_acc: {:.6g}'
            print(msg.format(epoch, loss_sum / X_train.shape[0], acc, val_acc))
            if patience_count > 3:
                break

        # Prediction
        saver.restore(sess, 'tensorflow_convnet')
        X = load_test_data()
        X_batches = np.array_split(X, X.shape[0] // batch_size)
        labels = []
        for X_batch in X_batches:
            y = sess.run(outputs, feed_dict={images: X_batch, keep_prob: 1.0})
            labels.extend(np.argmax(y, 1))
        save_predictions(np.array(labels), 'tensorflow_convnet.csv')
def test_oneclass():
    np.testing.assert_array_equal(one_hot([0]),
                                  np.array([[0.]], dtype='float'))
def crossentropy_derivative(self, output_activations, y):
    # dC/da = -y/a for cross-entropy loss with a one-hot target y
    act_output = one_hot([y], 10)
    return ((-np.divide(act_output,
                        output_activations.reshape(1, 10))).reshape(10, 1))
def onehot_vectorize(matrix, num):
    result = []
    for vector in matrix:
        a = one_hot(vector.tolist(), dtype='int', num_labels=num)
        result.append(a)
    return np.array(result)
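A quick shape check for onehot_vectorize: each row of the input matrix becomes its own (row_length, num) one-hot block:

import numpy as np
out = onehot_vectorize(np.array([[0, 1], [1, 0]]), 2)
print(out.shape)  # (2, 2, 2)
print(out[0])
# [[1 0]
#  [0 1]]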
def ohe_label(label):
    encoded_label = encode_label(label)
    label = one_hot(encoded_label, dtype='int', num_labels=VOCAB_SIZE)
    return label
with open(file, 'r') as f:
    data = f.read()

# Clean the data from the csv
csdata = data.split('\n')
cdata = []
for line in csdata[1:-1]:
    cdata.append(list(map(int, line.split(','))))

# Load into numpy arrays
npdata = np.array(cdata[:])
del file, cdata, csdata, data

# Separate the labels from the features
Y = npdata[:, 0]
X = npdata[:, 1:]

# Arrange as per the vector notation (one column per example)
X_train = X.T
Y_train = one_hot(Y.reshape(Y.shape[0], )).T

# Repeat for the test set
file = 'fashion-mnist_test.csv'
with open(file, 'r') as f:
    data = f.read()

# Clean the data from the csv
csdata = data.split('\n')
cdata = []
for line in csdata[1:-1]:
    cdata.append(list(map(int, line.split(','))))

# Load into numpy arrays
npdata = np.array(cdata[:])
del file, cdata, csdata, data

# Separate the labels from the features
Y = npdata[:, 0]
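A minimal sketch of the shapes this transposed convention produces, assuming the 10 Fashion-MNIST classes:

import numpy as np
from mlxtend.preprocessing import one_hot

Y = np.array([3, 0, 9])               # (m,) integer labels
Y_train = one_hot(Y, num_labels=10).T
print(Y_train.shape)                  # (10, m): one column per example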
def test_multidim_array():
    y = np.array([[0], [1], [2], [3], [4], [2]])
    with pytest.raises(AttributeError):
        one_hot(y)
Protocol_encoder = LabelEncoder()
X[:, 2] = Protocol_encoder.fit_transform(X[:, 2])
Flag_encoder = LabelEncoder()
X[:, 3] = Flag_encoder.fit_transform(X[:, 3])
onehotencoder = OneHotEncoder(categorical_features=[1, 2, 3])
X = onehotencoder.fit_transform(X).toarray()

output_encoder = LabelEncoder()
y = output_encoder.fit_transform(y)
# onehotencoder2 = OneHotEncoder()
y = one_hot(y)  # mlxtend's one_hot encodes the 1-D label array

# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# config.gpu_options.per_process_gpu_memory_fraction = 0.5
server = tf.train.Server(cluster, job_name=FLAGS.job_name,
                         task_index=FLAGS.task_index, config=config)

# Load data
train_images, train_labels = mnist_reader.load_mnist('data/fashion', kind='train')
test_images, test_labels = mnist_reader.load_mnist('data/fashion', kind='t10k')

# Data preprocessing
num_classes = 10
x_train = train_images.reshape([-1, 28, 28, 1])
x_test = test_images.reshape([-1, 28, 28, 1])[:1000]
y_train = one_hot(train_labels)
y_test = one_hot(test_labels)[:1000]

# Network parameters
n_input = 784  # Fashion-MNIST data input (img shape: 28*28)
n_classes = 10  # Fashion-MNIST total classes (0-9)
image_size = 28
channel_size = 1
n_samples = x_train.shape[0]
batch_size = 128
epochs = 2
num_iterations = n_samples // batch_size
test_step = 100
learning_rate = 0.01
from mlxtend.preprocessing import one_hot
import numpy as np

# defaults
y = np.array([0, 1, 2, 1, 2])
print(one_hot(y))

# python lists
y = [0, 1, 2, 1, 2]
print(one_hot(y))

# integer arrays
y = [0, 1, 2, 1, 2]
print(one_hot(y, dtype='int'))

# arbitrary numbers of class labels
y = [0, 1, 2, 1, 2]
print(one_hot(y, num_labels=10))
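For reference, the defaults produce a (5, 3) float matrix with one row per sample; dtype='int' gives the same matrix with integer entries, and num_labels pads the columns:

out = one_hot([0, 1, 2, 1, 2], num_labels=10)
print(out.shape)  # (5, 10): one row per sample, padded to 10 class columns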