def shuffle_data(X, y, no_lable_vs_lable):
    X, y = shuffle(X, y, random_state=0)
    # balance labels by subsampling:
    y_dict = defaultdict(list)
    for i, y_i in enumerate(y):
        y_dict[y_i[0]].append(i)
    # subsample
    X_sub = []
    y_sub = []
    y_set = set(y_dict)
    y_dict_len = [len(y_dict[y_set_i]) for y_set_i in sorted(list(y_set))]
    # use true division so the ratio is not truncated to an int
    quotient = float(y_dict_len[0]) / sum(y_dict_len)
    print 'length cutting'
    print str(len(X))
    # generalize over multiple classes:
    if quotient > no_lable_vs_lable:
        # decrease 0 class labels:
        newLen = int(2 * y_dict_len[1] * no_lable_vs_lable)
        id_new = y_dict['0'][:newLen] + [y_dict[id] for id in y_set if id not in ['0']][0]
        X_sub = [X[id] for id in id_new]
        y_sub = [y[id] for id in id_new]
        print(str(newLen), 'new 0 class length: ', str(len(id_new)))
    else:
        newLen = int(y_dict_len[0] * (1 - no_lable_vs_lable))
        id_new = y_dict['1'][:newLen] + [y_dict[id] for id in y_set if id not in ['0']][0]
        X_sub = [X[id] for id in id_new]
        y_sub = [y[id] for id in id_new]
        print(str(newLen), 'new 1 class length')
    X, y = shuffle(X_sub, y_sub, random_state=0)
    print str(len(X_sub))
    print '--------------'
    return X, y
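# For reference: a minimal, self-contained sketch of the same idea as shuffle_data
# (subsample the majority class to a target ratio, then reshuffle). The function
# name, the 0/1 integer label encoding, and max_neg_ratio are assumptions for
# illustration only, not taken from the snippet above.
import numpy as np
from sklearn.utils import shuffle

def balance_binary(X, y, max_neg_ratio=0.5, seed=0):
    neg_idx = np.flatnonzero(y == 0)
    pos_idx = np.flatnonzero(y == 1)
    # keep at most max_neg_ratio of the final set as negatives
    keep_neg = int(len(pos_idx) * max_neg_ratio / (1 - max_neg_ratio))
    rng = np.random.RandomState(seed)
    neg_idx = rng.choice(neg_idx, size=min(keep_neg, len(neg_idx)), replace=False)
    idx = np.concatenate([neg_idx, pos_idx])
    # shuffle returns new, aligned copies; it does not shuffle in place
    return shuffle(X[idx], y[idx], random_state=seed)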
def frames2batch(k=12, batch_size=1024, is_calib=False):
    pos = util.get_files(rootdir='F:\\train_data\\pos\\')
    neg = util.get_files(rootdir='F:\\train_data\\neg\\')
    pos = shuffle(pos)
    neg = shuffle(neg)
    total = pos + neg
    total = shuffle(total)
    batch = []
    c = 0
    bpath = 'F:\\train_data\\batch\\'
    for item_path in total:
        frame = fr.get_frame(item_path)
        frame_r = fr.resize_frame(frame, (k, k))
        if frame_r is None:
            continue
        vec = fr.frame_to_vect(frame_r)
        label = 1 if item_path.split('\\')[-1].find('pos') > 0 else 0
        print(item_path, label)
        batch.append((vec, label))
        if len(batch) > 0 and len(batch) % batch_size == 0:
            batch = sp.array(batch)
            sp.savez(bpath + str(c) + '_' + str(k) + ('_' if not is_calib else '_calib-') + 'net', batch)
            batch = []
            c += 1
    # flush the remaining partial batch (the original re-checked divisibility here,
    # which silently dropped the leftover samples)
    if len(batch) > 0:
        batch = sp.array(batch)
        sp.savez(bpath + str(c) + '_' + str(k) + ('_' if not is_calib else '_calib') + '-net', batch)
        batch = []
        c += 1
def process_data():
    global num_classes, num_train, num_test
    X_train, Y_train = load_data('Train')
    X_test, Y_test = load_data('Test')
    X_train = X_train.astype(np.float64)
    X_test = X_test.astype(np.float64)
    num_train = X_train.shape[0]
    num_test = X_test.shape[0]
    mean_image = np.mean(X_train, axis=0)
    X_train -= mean_image
    X_test -= mean_image
    X_train = X_train.reshape(-1, 1, img_dim, img_dim)
    Y_train -= 1
    X_train, Y_train = shuffle(X_train, Y_train)
    X_test = X_test.reshape(-1, 1, img_dim, img_dim)
    Y_test -= 1
    X_test, Y_test = shuffle(X_test, Y_test)
    print 'Training X shape :- ', X_train.shape
    print 'Training Y shape :- ', Y_train.shape
    print 'Testing X shape :- ', X_test.shape
    print 'Testing Y shape :- ', Y_test.shape
    return X_train, Y_train, X_test, Y_test
def main():
    train, test, word2idx = get_ptb_data()

    for t in train:
        add_idx_to_tree(t, 0)
    train = [tree2list(t, -1, True) for t in train]
    train = [t for t in train if t[3][-1] >= 0]  # for filtering binary labels

    for t in test:
        add_idx_to_tree(t, 0)
    test = [tree2list(t, -1, True) for t in test]
    test = [t for t in test if t[3][-1] >= 0]  # for filtering binary labels

    train = shuffle(train)
    train = train[:1000]
    # n_pos = sum(t[3][-1] for t in train)
    # print "n_pos train:", n_pos
    test = shuffle(test)
    test = test[:100]
    # n_pos = sum(t[3][-1] for t in test)
    # print "n_pos test:", n_pos

    V = len(word2idx)
    print "vocab size:", V
    D = 80
    K = 5

    model = RecursiveNN(V, D, K)
    model.fit(train, epochs=3, activation=T.nnet.relu)
    print "train accuracy:", model.score(train)
    print "test accuracy:", model.score(test)
    print "train f1:", model.f1_score(train)
    print "test f1:", model.f1_score(test)
def splitIntoTrainingAndValidation(A, B): data1 = shuffle(sourceSets[A]) # Note this is a random shuffle, that's data2 = shuffle(sourceSets[B]) # why we need many iterations freqM = np.minimum(freqs[A], freqs[B]) freq1tr = np.round(freqM * 0.8) # Randomly selected 80% for the training set, freq1va = freqM - freq1tr # and the remaining 20% for the validation set freq2tr = np.copy(freq1tr) freq2va = np.copy(freq1va) trainingSetSize = int(sum(freq1tr)) # 1/2 size actually validatnSetSize = int(sum(freq1va)) testSet1size = len(data1) - trainingSetSize - validatnSetSize testSet2size = len(data2) - trainingSetSize - validatnSetSize X = np.zeros((trainingSetSize*2, numFeatures)) Xv = np.zeros((validatnSetSize*2, numFeatures)) Xt = np.zeros((testSet1size+testSet2size, numFeatures)) y = np.ravel([([0]*trainingSetSize) + ([1]*trainingSetSize)]) yv = np.ravel([([0]*validatnSetSize) + ([1]*validatnSetSize)]) yt = np.ravel([([0]*testSet1size) + ([1]*testSet2size)]) trnIdx = vldIdx = tstIdx = 0 for item in data1: year = item[0] if freq1tr[year] > 0: X[trnIdx], trnIdx, freq1tr[year] = item[1:], trnIdx+1, freq1tr[year]-1 elif freq1va[year] > 0: Xv[vldIdx], vldIdx, freq1va[year] = item[1:], vldIdx+1, freq1va[year]-1 else: Xt[tstIdx], tstIdx = item[1:], tstIdx+1 assert trnIdx==trainingSetSize and vldIdx==validatnSetSize and tstIdx==testSet1size for item in data2: year = item[0] if freq2tr[year] > 0: X[trnIdx], trnIdx, freq2tr[year] = item[1:], trnIdx+1, freq2tr[year]-1 elif freq2va[year] > 0: Xv[vldIdx], vldIdx, freq2va[year] = item[1:], vldIdx+1, freq2va[year]-1 else: Xt[tstIdx], tstIdx = item[1:], tstIdx+1 assert trnIdx==trainingSetSize*2 and vldIdx==validatnSetSize*2 and tstIdx==testSet1size+testSet2size X, y = shuffle(X, y) # Just in case... perhaps no reason to shuffle again here? fs = SelectKBest(f_classif, k = numFeatures) # TODO: try other feature selection methods? fs.fit(np.concatenate((X, Xv)), np.concatenate((y, yv))) return X, y, Xv, yv, Xt, yt, testSet1size, testSet2size, fs.scores_
def generate_training_data(image_paths, angles, batch_size=128, validation_flag=False):
    '''
    Generator for the model training data: loads, processes, and distorts images,
    then yields them to the model. If 'validation_flag' is true the image is not
    distorted. Also flips images with turning angle magnitudes greater than 0.33,
    to give more weight to them and mitigate bias toward low and zero turning angles.
    '''
    image_paths, angles = shuffle(image_paths, angles)
    X, y = ([], [])
    while True:
        for i in range(len(angles)):
            img = cv2.imread(image_paths[i])
            angle = angles[i]
            img = preprocess_image(img)
            if not validation_flag:
                img, angle = random_distort(img, angle)
            X.append(img)
            y.append(angle)
            if len(X) == batch_size:
                yield (np.array(X), np.array(y))
                X, y = ([], [])
                image_paths, angles = shuffle(image_paths, angles)
            # flip horizontally and invert steer angle, if magnitude is > 0.33
            if abs(angle) > 0.33:
                img = cv2.flip(img, 1)
                angle *= -1
                X.append(img)
                y.append(angle)
                if len(X) == batch_size:
                    yield (np.array(X), np.array(y))
                    X, y = ([], [])
                    image_paths, angles = shuffle(image_paths, angles)
def generate_feature(in_file, dump=False, single_only=False, min_count=0):
    f = open(in_file, 'r')
    f.readline()
    training_data, tags = [], []
    total_features = {}
    for line in f.readlines():
        tokens = line.replace('\n', '').split(',')
        fs = [s for s in tokens[1:] if s.isdigit()]
        # ignore invalid data
        if len(fs) != 10:
            continue
        tags.append(tokens[0])
        features = get_feature_array(fs, single_only)
        update_total_features(total_features, features)
        training_data.append(features)
    training_data = transform_to_matrix(total_features, training_data)
    training_data = cut_off(training_data, min_count)
    # sklearn.utils.shuffle returns shuffled copies and does not work in place,
    # so the result has to be reassigned to keep data and tags aligned
    training_data, tags = shuffle(training_data, tags)
    tags = np.array(tags)
    if dump:
        np.savetxt('preprocessing/dumpX.txt', training_data, fmt='%d', delimiter=',')
        np.savetxt('preprocessing/dumpY.txt', tags[np.newaxis].T, fmt='%s', delimiter=',')
    return total_features, training_data, np.array(tags)
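# Note on the reassignment above: sklearn.utils.shuffle returns shuffled copies
# rather than shuffling in place, so the call only has an effect if the result
# is assigned back. Tiny illustration (array contents are made up):
import numpy as np
from sklearn.utils import shuffle

X = np.arange(6).reshape(3, 2)
y = np.array(['a', 'b', 'c'])
shuffle(X, y)                          # no effect: the returned copies are discarded
X, y = shuffle(X, y, random_state=0)   # correct: rows of X and y stay aligned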
def getTrainTestData(): data = pickle.load(open('./data/60_unnormalized.p', "rb")) raw_meta = [] raw_data = [] for k,v in data.iteritems(): for i in range(len(v)): _d = v[i] previous = [[0]*LOCATION_ID_MAX,[0]*LOCATION_ID_MAX] if i==0: # previous date date_time = datetime.datetime.strptime(k, '%Y-%m-%d') previous_day = date_time - datetime.timedelta(1) str_previous_day = previous_day.strftime('%Y-%m-%d') if str_previous_day in data: previous[0]=data[str_previous_day][-2] previous[1]=data[str_previous_day][-1] elif i==1: # previous date date_time = datetime.datetime.strptime(k, '%Y-%m-%d') previous_day = date_time - datetime.timedelta(1) str_previous_day = previous_day.strftime('%Y-%m-%d') previous[1]=v[i-1] if str_previous_day in data: previous[0]=data[str_previous_day][-1] else: previous[0]=v[i-2] previous[1]=v[i-1] raw_meta.append({"date":k,"interval":i,"previous":previous}) raw_data.append(_d) num = len(raw_data) train_meta_data = raw_meta[0:int(0.6*num)] valid_meta_data = raw_meta[int(0.6*num):int(0.8*num)] test_meta_data = raw_meta[int(0.8*num):] train_y = raw_data[0:int(0.6*num)] valid_y = raw_data[int(0.6*num):int(0.8*num)] test_y = raw_data[int(0.8*num):] train_X = getFeatures(train_meta_data) valid_X = getFeatures(valid_meta_data) test_X = getFeatures(test_meta_data) train_X = np.array(train_X, dtype=np.float32) valid_X = np.array(valid_X, dtype=np.float32) test_X = np.array(test_X, dtype=np.float32) train_y = np.array(train_y, dtype=np.float32) valid_y = np.array(valid_y, dtype=np.float32) test_y = np.array(test_y, dtype=np.float32) train_X, train_y = shuffle(train_X, train_y, random_state=0) valid_X, valid_y = shuffle(valid_X, valid_y, random_state=1) test_X, test_y = shuffle(test_X, test_y, random_state=2) return train_X, train_y, valid_X, valid_y, test_X, test_y
def splitIntoTrainingValidation(A, B): # TODO: 3rd parameter: the desired value of (validatSet1size + validatSet2size) data1 = shuffle(sourceSets[A]) # Note this is a random shuffle, that's data2 = shuffle(sourceSets[B]) # why we need many iterations freq1 = np.minimum(freqs[A], freqs[B]) if sum(freq1) > maxTrainSetSz: freq1 = np.round(freq1 * (maxTrainSetSz * 1.0 / sum(freq1))) trainingSetSize = int(sum(freq1)) # Half size actually. Approximately <= maxTrainSetSz validatSet1size = len(data1) - trainingSetSize validatSet2size = len(data2) - trainingSetSize X = np.zeros((trainingSetSize*2, numFeatures)) Xv = np.zeros((validatSet1size+validatSet2size, numFeatures)) y = np.ravel([([0]*trainingSetSize) + ([1]*trainingSetSize)]) yv = np.ravel([([0]*validatSet1size) + ([1]*validatSet2size)]) freq2 = np.copy(freq1) trnIdx = valIdx = 0 for item in data1: year = item[0] if freq1[year] > 0: freq1[year]-=1 X[trnIdx] = item[1:] trnIdx+=1 else: Xv[valIdx] = item[1:] valIdx += 1 assert trnIdx==trainingSetSize and valIdx==validatSet1size for item in data2: year = item[0] if freq2[year] > 0: freq2[year]-=1 X[trnIdx] = item[1:] trnIdx+=1 else: Xv[valIdx] = item[1:] valIdx += 1 assert trnIdx==trainingSetSize*2 and valIdx==validatSet1size+validatSet2size return X, y, Xv, yv, validatSet1size, validatSet2size
def main():
    train, test, word2idx = get_ptb_data()

    for t in train:
        add_idx_to_tree(t, 0)
    train = [tree2list(t, -1, is_binary=True) for t in train]
    train = [t for t in train if t[3][-1] >= 0]  # for filtering binary labels

    for t in test:
        add_idx_to_tree(t, 0)
    test = [tree2list(t, -1, is_binary=True) for t in test]
    test = [t for t in test if t[3][-1] >= 0]  # for filtering binary labels

    train = shuffle(train)
    train = train[:1000]
    test = shuffle(test)
    test = test[:500]

    V = len(word2idx)
    print "vocab size:", V
    D = 80
    K = 5

    model = RecursiveNN(V, D, K)
    model.fit(train, reg=0, activation=T.nnet.relu)
    print "train accuracy:", model.score(train)
    print "test accuracy:", model.score(test)
def compute_distances_and_pairs(self, pdb_file, nr_contacts=None, nr_noncontacts=None): #distance and contacts self.features['pair']['Cbdist'] = pdb.distance_map(pdb_file, self.L) #mask positions that have too many gaps gap_freq = 1 - (self.Ni / self.neff) highly_gapped_pos = np.where(gap_freq > self.max_gap_percentage)[0] self.features['pair']['Cbdist'][:,highly_gapped_pos] = np.nan self.features['pair']['Cbdist'][highly_gapped_pos, :] = np.nan #if there are unresolved residues, there will be nan in the distance_map with np.errstate(invalid='ignore'): self.features['pair']['contact'] = (self.features['pair']['Cbdist'] <= self.contact_threshold) * 1 self.features['pair']['nocontact'] = (self.features['pair']['Cbdist'] > self.non_contact_threshold) * 1 indices_contact = np.where(np.triu(self.features['pair']['contact'], k=self.seq_separation)) indices_contact = tuple(shuffle(indices_contact[0],indices_contact[1], random_state=0)) if nr_contacts: indices_contact = indices_contact[0][:nr_contacts], indices_contact[1][:nr_contacts] indices_nocontact = np.where(np.triu(self.features['pair']['nocontact'], k=self.seq_separation)) indices_nocontact = tuple(shuffle(indices_nocontact[0],indices_nocontact[1], random_state=0)) if nr_noncontacts: indices_nocontact = indices_nocontact[0][:nr_noncontacts], indices_nocontact[1][:nr_noncontacts] #update indices of i<j for only relevant pairs self.ij_ind_upper = np.array(list(indices_contact[0]) + list(indices_nocontact[0])), np.array(list(indices_contact[1]) + list(indices_nocontact[1]))
def cluster(m, n_colors=32):
    from sklearn.utils import shuffle
    from sklearn.cluster import KMeans
    from sklearn.metrics import pairwise_distances_argmin

    def recreate_image(codebook, labels, w, h):
        """Recreate the (compressed) image from the code book & labels"""
        d = codebook.shape[1]
        image = np.zeros((w, h, d))
        label_idx = 0
        for i in range(w):
            for j in range(h):
                image[i][j] = codebook[labels[label_idx]]
                label_idx += 1
        return image

    # Load Image and transform to a 2D numpy array.
    w, h, d = original_shape = tuple(m.shape)
    image_array = np.reshape(m, (w * h, d))
    image_array_sample = shuffle(image_array, random_state=0)[:1000]
    kmeans = KMeans(n_clusters=n_colors).fit(image_array_sample)

    codebook_random = shuffle(image_array, random_state=0)[:n_colors + 1]
    labels_random = pairwise_distances_argmin(codebook_random, image_array, axis=0)
    return recreate_image(codebook_random, labels_random, w, h)
def get_aa_cross_val(L, X, Y, AA, tsize=None, rstate=-1):
    """Get test data from dataset"""
    test_position = []
    aa_y = np.zeros(Y.shape)
    for i in xrange(len(Y)):
        if L[i][-1] == AA:
            aa_y[i] = 1
            test_position.append(i)
    if tsize:
        t_len = int(tsize * len(Y))
        # positions that are 0 without being the one for AA
        zero_pos = np.where(np.logical_and(Y == 0, aa_y == 0))[0]
        clen = t_len - len(test_position)
        if clen > 0:
            random_zero_pos = np.random.choice(zero_pos, clen, replace=False)
            test_position.extend(random_zero_pos)
    test_position = np.random.permutation(test_position)
    mask = np.ones(Y.shape, dtype=bool)
    mask[test_position] = False
    train_position = np.array(range(len(mask)))[mask]
    if rstate > 0:
        return shuffle(train_position, random_state=rstate), shuffle(test_position, random_state=rstate)
    # in this case, suppose we want only the train and test index
    else:
        return train_position, test_position
def load_whale_data(train_file, test_file, nb_classes=447):
    print("loading whale data")

    # normalize train data
    print("--> loading training data")
    train_data = read_csv(train_file)
    X_train = train_data[:, 1:]
    X_train = X_train.astype(np.float32)
    X_train = X_train / 255
    y_train = np.vstack(train_data[:, 0])
    y_train = y_train.astype(np.uint16)
    X_train, y_train = shuffle(X_train, y_train, random_state=42)
    X_train = X_train.reshape(-1, 1, 96, 96)
    Y_train = np_utils.to_categorical(y_train, 447)
    print("--> training data loaded")

    # normalize test data
    print("--> loading test data")
    test_data = read_csv(test_file)
    X_test = test_data[:, 1:]
    X_test = X_test.astype(np.float32)
    X_test = X_test / 255
    y_test = np.vstack(test_data[:, 0])
    y_test = y_test.astype(np.uint16)
    X_test, y_test = shuffle(X_test, y_test, random_state=42)
    X_test = X_test.reshape(-1, 1, 96, 96)
    Y_test = np_utils.to_categorical(y_test, 447)
    print("--> test data loaded")

    return (X_train, Y_train, X_test, Y_test)
def main(is_binary=True):
    train, test, word2idx = get_ptb_data()

    for t in train:
        add_idx_to_tree(t, 0)
    train = [tree2list(t, -1, is_binary) for t in train]
    if is_binary:
        train = [t for t in train if t[3][-1] >= 0]  # for filtering binary labels

    for t in test:
        add_idx_to_tree(t, 0)
    test = [tree2list(t, -1, is_binary) for t in test]
    if is_binary:
        test = [t for t in test if t[3][-1] >= 0]  # for filtering binary labels

    train = shuffle(train)
    train = train[:5000]
    # n_pos = sum(t[3][-1] for t in train)
    # print "n_pos train:", n_pos
    test = shuffle(test)
    test = test[:1000]
    # n_pos = sum(t[3][-1] for t in test)
    # print "n_pos test:", n_pos

    V = len(word2idx)
    print "vocab size:", V
    D = 20
    K = 2 if is_binary else 5

    model = RecursiveNN(V, D, K)
    model.fit(train)
    print "train accuracy:", model.score(train)
    print "test accuracy:", model.score(test)
    print "train f1:", model.f1_score(train)
    print "test f1:", model.f1_score(test)
def import_images(): #IMPLEMENT TIMER CUTOFF FR+OR IF FEAT EXT TAKES TOO LONG d_feats = {'orb': []} c_feats = {'orb': []} (cat_paths, dog_paths) = get_filenames(TRAINING_FOLDER) cat_train_pts = [] dog_train_pts = [] for image_fn in shuffle(dog_paths, n_samples = 400, random_state=0): odesc_pts = extract_desc_pts(image_fn) try: for pt in odesc_pts: d_feats['orb'].append(pt) except TypeError: print image_fn continue for image_fn in shuffle(cat_paths, n_samples = 400, random_state=0): odesc_pts = extract_desc_pts(image_fn) try: for pt in odesc_pts: c_feats['orb'].append(pt) except TypeError: print image_fn continue cat_k_means = KMeans(n_jobs=-1, n_clusters=200) cat_k_means.fit(c_feats['orb']) print 'dog calc' dog_k_means = KMeans(n_jobs=-1, n_clusters=200) dog_k_means.fit(d_feats['orb']) print 'saving....' with open('/home/max/CVD/d_o200c200s400.pickle', 'wb') as handle: pickle.dump(dog_k_means.cluster_centers_, handle) with open('/home/max/CVD/c_o200c200s400.pickle', 'wb') as handle: pickle.dump(cat_k_means.cluster_centers_, handle) return '\n\n\n DONE '
def generator3(samples, batch_size=32):
    num_samples = len(samples)
    while 1:  # Loop forever so the generator never terminates
        # sklearn's shuffle is not in-place, so the result must be reassigned
        samples = shuffle(samples)
        for offset in range(0, num_samples, batch_size):
            batch_samples = samples[offset:offset + batch_size]

            car_images = []
            steering_angles = []
            for batch_sample in batch_samples:
                img_center = cv2.imread(path + batch_sample[0].split('\\')[-1])
                img_left = cv2.imread(path + batch_sample[1].split('\\')[-1])
                img_right = cv2.imread(path + batch_sample[2].split('\\')[-1])
                correction = 0.3  # this is a parameter to tune
                steering_center = float(batch_sample[3])
                steering_left = steering_center + correction
                steering_right = steering_center - correction

                # add images and angles to data set
                car_images.extend([img_center, img_left, img_right])
                steering_angles.extend([steering_center, steering_left, steering_right])

            # trim image to only see section with road
            X_train = np.array(car_images)
            y_train = np.array(steering_angles)
            yield shuffle(X_train, y_train)
def _subsample_data(self, X, Y, n=10000):
    if Y is not None:
        X, Y = shuffle(X, Y)
        return X[:n], Y[:n]
    else:
        X = shuffle(X)
        return X[:n]
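# Aside: the same subsample can be taken in one call with shuffle's n_samples
# argument, which draws rows without replacement. A small sketch with made-up
# arrays, assuming nothing about the class _subsample_data belongs to:
import numpy as np
from sklearn.utils import shuffle

X = np.random.rand(50000, 10)
Y = np.random.randint(0, 2, size=50000)
# equivalent to shuffling and then slicing the first 10000 rows
X_sub, Y_sub = shuffle(X, Y, n_samples=10000, random_state=0)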
def run_kmeans(inFile, n_colors):
    china = cv2.imread(inFile)
    china = np.array(china, dtype=np.float64) / 255
    w, h, d = original_shape = tuple(china.shape)
    assert d == 3
    image_array = np.reshape(china, (w * h, d))

    print("\tFitting model on a small sub-sample of the data")
    t0 = time()
    image_array_sample = shuffle(image_array, random_state=0)[:1000]
    # current scikit-learn uses n_clusters rather than the old k argument
    kmeans = KMeans(n_clusters=n_colors, random_state=0).fit(image_array_sample)
    print("\tdone in %0.3fs." % (time() - t0))

    # Get labels for all points
    print("\tPredicting color indices on the full image (k-means)")
    t0 = time()
    labels = kmeans.predict(image_array)
    print("\tdone in %0.3fs." % (time() - t0))

    codebook_random = shuffle(image_array, random_state=0)[:n_colors + 1]
    print("\tPredicting color indices on the full image (random)")
    t0 = time()
    dist = euclidean_distances(codebook_random, image_array, squared=True)
    labels_random = dist.argmin(axis=0)
    print("\tdone in %0.3fs." % (time() - t0))

    img_kmeans = recreate_image(kmeans.cluster_centers_, labels, w, h)
    img_random = recreate_image(codebook_random, labels_random, w, h)
    return china, img_kmeans, img_random
def getMNIST():
    # data shape: train (50000, 784), test (10000, 784)
    # already scaled from 0..1 and converted to float32
    datadir = '../large_files/'
    if not os.path.exists(datadir):
        datadir = ''

    input_file = "%smnist.pkl.gz" % datadir
    if not os.path.exists(input_file):
        url = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
        with open(input_file, "wb") as out:
            f = urllib2.urlopen(url)
            out.write(f.read())
            out.flush()

    with gzip.open(input_file) as f:
        train, valid, test = cPickle.load(f)

    Xtrain, Ytrain = train
    Xvalid, Yvalid = valid
    Xtest, Ytest = test

    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Xtest, Ytest = shuffle(Xtest, Ytest)
    # try to take a smaller sample
    Xtrain = Xtrain[0:30000]
    Ytrain = Ytrain[0:30000]
    Xtest = Xtest[0:1000]
    Ytest = Ytest[0:1000]
    # build the indicator matrices after shuffling/slicing so they stay aligned with Y
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)
    return Xtrain.reshape(len(Xtrain), 1, 28, 28), Ytrain, Ytrain_ind, Xtest.reshape(len(Xtest), 1, 28, 28), Ytest, Ytest_ind
def load_data(self, shuffled=True):
    samples = load_diabetes()
    if shuffled:
        self.X = shuffle(samples.data, random_state=self.SEED)
        self.y = shuffle(samples.target, random_state=self.SEED)
    else:
        self.X, self.y = samples.data, samples.target
    self.n_features = len(self.X[0])
def test_shuffle_on_ndim_equals_three():
    def to_tuple(A):  # to make the inner arrays hashable
        return tuple(tuple(tuple(C) for C in B) for B in A)

    A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])  # A.shape = (2,2,2)
    S = set(to_tuple(A))
    shuffle(A)  # shouldn't raise a ValueError for dim = 3
    assert_equal(set(to_tuple(A)), S)
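# The test above relies on shuffle accepting arrays with more than two dimensions:
# only the first axis is permuted and a copy is returned. A quick check on a toy
# array (not part of the original test):
import numpy as np
from sklearn.utils import shuffle

A = np.arange(12).reshape(3, 2, 2)
B = shuffle(A, random_state=0)
# the 2x2 blocks are reordered along axis 0 but each block keeps its contents
assert {tuple(b.ravel()) for b in B} == {tuple(a.ravel()) for a in A}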
def load_binary_data(self, shuffled=True):
    samples = load_breast_cancer()
    if shuffled:
        self.X = shuffle(samples.data, random_state=self.SEED)
        self.y = shuffle(samples.target, random_state=self.SEED)
    else:
        self.X, self.y = samples.data, samples.target
    self.n_features = len(self.X[0])
def player_status_train_test(player_statuses): """Make a train-test split""" # usak test games chosen by calling: # np.random.RandomState(0).choice(good_games, 20) # and taking the first 6 games that are not incomplete (see wiki) test_games = ['ns000078', 'ns000081', 'cavalry', 'showcase01', 'malafide', 'nexxice'] test_games = ['usak-{}'.format(k) for k in test_games] # add more test games # 50 test games from usdp such that they have at least 500 talk entries test_games += [u'usdp-anzac2011_potts', u'usdp-aloha2', u'usdp-service13', u'usdp-vole_003', u'usdp-service14', u'usdp-owlsopen2011_1a', u'usdp-owlsopen10_3f', u'usdp-echo7', u'usdp-owls_256', u'usdp-owls_246', u'usdp-timgroup1', u'usdp-owlsopen2011_2f', u'usdp-agitar10', u'usdp-owls_242', u'usdp-vole_001', u'usdp-tango', u'usdp-leoxiii', u'usdp-owlsopen2011_2g', u'usdp-vole_025', u'usdp-vole_006', u'usdp-310', u'usdp-owlsopen2011_1c', u'usdp-skullhouse11', u'usdp-vole_004', u'usdp-chess_match', u'usdp-anzac2011_claw', u'usdp-service16', u'usdp-wetterling', u'usdp-owlsopen2011_3c', u'usdp-inthedark1', u'usdp-owls_261', u'usdp-owlsopen2011_3g', u'usdp-ltb2', u'usdp-owlsopen10_3h', u'usdp-vanilla1', u'usdp-owlsopen2011_1g', u'usdp-vole_002', u'usdp-warzones1', u'usdp-vole_012', u'usdp-benjgame', u'usdp-owlsopen2011_3e', u'usdp-power_struggle7', u'usdp-owlsopen2011_3h', u'usdp-owlsopen2011_1d', u'usdp-vole_008', u'usdp-owlsopen2011_2h', u'usdp-spartan01', u'usdp-rainier', u'usdp-owls_252', u'usdp-owls_245'] # filter out short instances print("Before filtering: n_instances=", len(player_statuses)) THRESHOLD = 5 # at least 5 sent and 5 received messages player_statuses = [p for p in player_statuses if sum(msg['direction'] == 'from' for msg in p['talk']) >= THRESHOLD and sum(msg['direction'] == 'to' for msg in p['talk']) >= THRESHOLD] print("After filtering: n_instances=", len(player_statuses)) train_statuses = [_clean(p) for p in player_statuses if p['game'] not in test_games] test_statuses = [_clean(p) for p in player_statuses if p['game'] in test_games] print("Train: {}, test: {}".format(len(train_statuses), len(test_statuses))) print("Test label distribution: ", Counter(row['status'] for row in test_statuses)) train_statuses = np.array(train_statuses) test_statuses = np.array(test_statuses) train_statuses = shuffle(train_statuses, random_state=0) test_statuses = shuffle(test_statuses, random_state=0) y_train = np.array([p['status'] for p in train_statuses]) y_test = np.array([p['status'] for p in test_statuses]) return train_statuses, y_train, test_statuses, y_test
def main(is_binary=True): train, test, word2idx = get_ptb_data() for t in train: add_idx_to_tree(t, 0) train = [tree2list(t, -1, is_binary) for t in train] if is_binary: train = [t for t in train if t[3][-1] >= 0] # for filtering binary labels for t in test: add_idx_to_tree(t, 0) test = [tree2list(t, -1, is_binary) for t in test] if is_binary: test = [t for t in test if t[3][-1] >= 0] # for filtering binary labels # check imbalance # pos = 0 # neg = 0 # mid = 0 # label_counts = np.zeros(5) # for t in train + test: # words, left_child, right_child, labels = t # # for l in labels: # # if l == 0: # # neg += 1 # # elif l == 1: # # pos += 1 # # else: # # mid += 1 # for l in labels: # label_counts[l] += 1 # # print("pos / total:", float(pos) / (pos + neg + mid)) # # print("mid / total:", float(mid) / (pos + neg + mid)) # # print("neg / total:", float(neg) / (pos + neg + mid)) # print("label proportions:", label_counts / label_counts.sum()) # exit() train = shuffle(train) # train = train[:5000] # n_pos = sum(t[3][-1] for t in train) # print("n_pos train:", n_pos) test = shuffle(test) smalltest = test[:1000] # n_pos = sum(t[3][-1] for t in test) # print("n_pos test:", n_pos) V = len(word2idx) print("vocab size:", V) D = 20 K = 2 if is_binary else 5 model = RecursiveNN(V, D, K) model.fit(train, smalltest, epochs=20, train_inner_nodes=True) print("train accuracy:", model.score(train)) print("test accuracy:", model.score(test)) print("train f1:", model.f1_score(train)) print("test f1:", model.f1_score(test))
def build_classification(with_preprocessor=False):
    """Basic array for testing when using a preprocessor"""
    X, y = shuffle(*make_blobs(random_state=SEED), random_state=SEED)
    indices = shuffle(np.arange(X.shape[0]), random_state=SEED).astype(int)
    if with_preprocessor:
        return Dataset(indices, y[indices], X, indices)
    else:
        return Dataset(X[indices], y[indices], None, X[indices])
def build_regression(with_preprocessor=False):
    """Basic array for testing when using a preprocessor"""
    X, y = shuffle(*make_regression(n_samples=100, n_features=5, random_state=SEED),
                   random_state=SEED)
    indices = shuffle(np.arange(X.shape[0]), random_state=SEED).astype(int)
    if with_preprocessor:
        return Dataset(indices, y[indices], X, indices)
    else:
        return Dataset(X[indices], y[indices], None, X[indices])
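# Both builders above shuffle the (X, y) pair returned by the dataset generator
# jointly, then shuffle an index array with the same seed. A stripped-down version
# of that pattern (Dataset and SEED come from the surrounding test module, so a
# plain assertion is used here instead):
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.utils import shuffle

SEED = 42
X, y = shuffle(*make_blobs(random_state=SEED), random_state=SEED)
indices = shuffle(np.arange(X.shape[0]), random_state=SEED).astype(int)
# X[indices] and y[indices] still form a consistent (sample, label) pairing
assert X[indices].shape[0] == y[indices].shape[0]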
def get_data(): """ Get data ready to learn with. Returns ------- dict """ simple = False if simple: # Load the simple, but similar digits dataset from sklearn.datasets import load_digits from sklearn.utils import shuffle digits = load_digits() x = [np.array(el).flatten() for el in digits.images] y = digits.target # Scale data to [-1, 1] - This is of mayor importance!!! # In this case, I know the range and thus I can (and should) scale # manually. However, this might not always be the case. # Then try sklearn.preprocessing.MinMaxScaler or # sklearn.preprocessing.StandardScaler x = x/255.0*2 - 1 x, y = shuffle(x, y, random_state=0) from sklearn.cross_validation import train_test_split x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42) data = {'train': {'X': x_train, 'y': y_train}, 'test': {'X': x_test, 'y': y_test}} else: # Load the original dataset from sklearn.datasets import fetch_mldata from sklearn.utils import shuffle mnist = fetch_mldata('MNIST original') x = mnist.data y = mnist.target # Scale data to [-1, 1] - This is of mayor importance!!! x = x/255.0*2 - 1 x, y = shuffle(x, y, random_state=0) from sklearn.cross_validation import train_test_split x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42) data = {'train': {'X': x_train, 'y': y_train}, 'test': {'X': x_test, 'y': y_test}} return data
def read_all(files):
    x_files = map(lambda x: get_x_name(config.train_spat_folder, x), files)
    y_files = map(lambda x: get_y_name(config.train_spat_folder, x), files)
    XX_train, Y_train = cast_dataset(read_data((x_files, y_files)))
    XX_train = XX_train.swapaxes(4, 2)
    XX_train, Y_train = shuffle(XX_train, Y_train, random_state=42)
    XX_test, Y_test = shuffle(XX_train, Y_train, random_state=84, n_samples=500)
    return XX_train, Y_train, XX_test, Y_test
def sq_dict_learning(row_data, mask, D_0 = None, n_filters = 20, eta = 0.001, sparsity = 10, n_epochs = 4, EV_SCORE = True): ''' k: Number of dictionary items n_theta: Number of orientated realization of the filter ''' #Shuffle the data data = shuffle(row_data).T m, n = data.shape effective_dim = mask.sum() dummy_dim = mask.shape[0]*mask.shape[1] dim_ratio = float(dummy_dim)/effective_dim if D_0 is None: D_base = 1-2*np.random.rand(m,n_filters) D_base -= np.expand_dims(np.mean(D_base, axis=0), 0)*dim_ratio D_base /= np.linalg.norm(D_base,axis=0) D_t = D_base else: D_t = D_0 losses = [] for epoch in range(n_epochs): for t in range(n): x_t = data[:,t] # Sparse Coding idx_t, alphas_t = omp(D_t, x_t, sparsity) # Dictionary Update ##Rotation update d_t = D_t[:,idx_t] eta_prime = eta*m y_t = np.dot(d_t,alphas_t) y_t /= np.linalg.norm(y_t,axis=0) lmbd = np.sqrt(1-(np.dot(y_t, x_t))**2) half_S = np.dot(np.expand_dims(x_t,1), np.expand_dims(y_t,0)) S = half_S - half_S.T update = np.identity(m) + np.sin(2 * eta_prime * lmbd)/lmbd * S + (1 - np.cos(2 * eta_prime * lmbd))/lmbd**2 * np.dot(S,S) D_t[:,idx_t] = np.dot(update, d_t) D_t -= np.expand_dims(np.mean(D_t, axis=0), 0)*dim_ratio D_t /= np.expand_dims(np.linalg.norm(D_t, axis=0), axis=0) if EV_SCORE and (t%500 == 0): loss = score_dict(data, D_t, sparsity ) losses.append(loss) data = shuffle(data.T).T return D_t, losses
tf.abs(tf.subtract(y_conv, tf.cast(tf.argmax(y_, 1), "float32")))) cross_entropy = -tf.reduce_sum( diss * tf.log(tf.cast(y_conv, dtype=tf.float32) + (1e-7))) + sum_acc + tf.reduce_sum(tf.abs(WB_fc1)) train_step = tf.train.AdamOptimizer(Optrate).minimize(cross_entropy) correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1)) accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) saver = tf.train.Saver() sess.run(tf.global_variables_initializer()) start = time.time() # minibatch実行 for i in range(EPOCHS): X_train, y_train = shuffle(X['train'], y['train']) XL_train, yL_train = shuffle(X['train'], y['train']) # 0かどうかの判定 dummy_0 = np.asarray([1.0, 0.0] * 2) dummy_0 = dummy_0.reshape(2, 2) img_0_dummy = np.asarray(list(rep_0_img) * 2) img_0_dummy = img_0_dummy.reshape(2, 784) distance_labels = [np.sum(x) for x in dummy_0] eval_acc = sess.run(y_conv, feed_dict={ x: X_train, y_: y_train, keep_prob: 1, xL: img_0_dummy,
def fit(self, X, Y, Xvalid, Yvalid, learning_rate=1e-4, mu=0.9, decay=0.9, epochs=8, batch_sz=100, show_fig=False): ''' Takes training data and test data (valid) at once, then trains and validates along the way. Modifying hyperparams of learning_rate, mu, decay, epochs (iterations = N//batch_sz * epochs), batch_sz and whether to display a figure are passed as optional variables. ''' X = X.astype(np.float32) Y = Y.astype(np.int32) Xvalid = Xvalid.astype(np.float32) Yvalid = Yvalid.astype(np.int32) self.rng = RandomStreams() # initialize hidden layers N, D = X.shape K = len(set(Y)) self.hidden_layers = [] M1 = D # first input layer is the number of features in X count = 0 for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) # layer ID is just the number self.hidden_layers.append(h) M1 = M2 # input layer to next layer is this layer. count += 1 # output layer weights (last hidden layer to K output classes) W = np.random.randn(M1, K) * np.sqrt(2.0 / M1) b = np.zeros(K) self.W = theano.shared(W, 'W_logreg') self.b = theano.shared(b, 'b_logreg') # collect params for later use self.params = [self.W, self.b] for h in self.hidden_layers: self.params += h.params # set up theano functions and variables thX = T.matrix('X') thY = T.ivector('Y') pY_train = self.forward_train(thX) # function to calc prob Y given X # this cost is for training cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY])) # gradients wrt each param grads = T.grad(cost, self.params) # for momentum ''' np.zeros_like(array) returns an array(/matrix) of the same shape and type of the given array. Very cool, never seen this before. ''' dparams = [ theano.shared(np.zeros_like(p.get_value())) for p in self.params ] # for rmsprop, initialize cache as 1 cache = [ theano.shared(np.ones_like(p.get_value())) for p in self.params ] ''' Noting for myself that I've never seen this way of using zip to loop through multiple lists/arays with the same indices simultaneously. Makes a lot of sense now, I should see where I can use this to turn loops over indices in my code in to list comprehension that is by ele. ''' # these are the functions for updating the variables of # dparams (momentum) and cache. new_cache = [ decay * c + (1 - decay) * g * g for p, c, g in zip(self.params, cache, grads) ] new_dparams = [ mu * dp - learning_rate * g / T.sqrt(new_c + 1e-10) for p, new_c, dp, g in zip(self.params, new_cache, dparams, grads) ] ''' Using zip to create lists of tuples of the variables themselves, and the fuctions for updating them (cache, momentum params and params), where params are weights (W) and biases (b) for each layer. ''' updates = [(c, new_c) for c, new_c in zip(cache, new_cache)] + [ (dp, new_dp) for dp, new_dp in zip(dparams, new_dparams) ] + [(p, p + new_dp) for p, new_dp in zip(self.params, new_dparams)] train_op = theano.function(inputs=[thX, thY], updates=updates) # for evaluation and prediction, more theano graph set-up with tensors # still no values yet in any of these. Training loop next! pY_predict = self.forward_predict(thX) cost_predict = -T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY])) prediction = self.predict(thX) cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost_predict, prediction]) n_batches = N // batch_sz costs = [] for i in range(epochs): X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)] Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)] # theano function defined above that does all the work. 
# takes the data (like feed_dict in tf). The update calcs were # given to it above as a list for all layers. train_op(Xbatch, Ybatch) if j % 50 == 0: c, p = cost_predict_op(Xvalid, Yvalid) costs.append(c) e = error_rate(Yvalid, p) print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e) if show_fig: plt.plot(costs) plt.show()
import numpy as np from sklearn.model_selection import train_test_split from sklearn.utils import shuffle from tensorflow.contrib.keras.python.keras.models import Sequential from tensorflow.contrib.keras.python.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, SpatialDropout2D from tensorflow.contrib.keras.python.keras.utils import plot_model from tensorflow.contrib.keras.python.keras.preprocessing.image import ImageDataGenerator from tensorflow.contrib.keras.python.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau print("Loading data...") collection = np.load('collection.npy') labels_onehot = np.load('labels_onehot.npy') collection, labels_onehot = shuffle(collection, labels_onehot) x_train_full, x_valid, y_train_full, y_valid = train_test_split(collection, labels_onehot, test_size=0.2) x_train, x_test, y_train, y_test = train_test_split(x_train_full, y_train_full, test_size=0.25) np.save('x_test.npy', x_test) np.save('y_test.npy', y_test) print("Making model...") model = Sequential() model.add( Conv2D(filters=32, kernel_size=(3, 3),
num_of_samples = img_data.shape[0] labels = np.ones((num_of_samples,),dtype='int64') labels[0:202]=0 labels[202:404]=1 labels[404:606]=2 labels[606:]=3 names = ['cats','dogs','horses','humans'] # convert class labels to on-hot encoding Y = np_utils.to_categorical(labels, num_classes) #Shuffle the dataset x,y = shuffle(img_data,Y, random_state=2) # Split the dataset X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2) #%% # Defining the model input_shape=img_data[0].shape model = Sequential() model.add(Conv2D(32, (3,3),border_mode='same',input_shape=input_shape)) model.add(Activation('relu')) model.add(Conv2D(32, (3, 3))) model.add(Activation('relu')) model.add(MaxPooling2D(pool_size=(2, 2))) model.add(Dropout(0.5))
# === Manual Back ==== # sess with tf.Session() as sess: sess.run(tf.global_variables_initializer()) train_cota, train_acca = 0, 0 train_cot, train_acc = [], [] test_cota, test_acca = 0, 0 test_cot, test_acc = [], [] for iter in range(num_epoch): train_batch, train_label = shuffle(train_batch, train_label) for batch_size_index in range(0, len(train_batch), (batch_size // 2)): current_batch = train_batch[batch_size_index:batch_size_index + (batch_size // 2)] current_batch_label = train_label[ batch_size_index:batch_size_index + (batch_size // 2)] # online data augmentation here and standard normalization images_aug = seq.augment_images(current_batch.astype(np.float32)) current_batch = np.vstack( (current_batch, images_aug)).astype(np.float32) current_batch_label = np.vstack( (current_batch_label, current_batch_label)).astype(np.float32) current_batch, current_batch_label = shuffle( current_batch, current_batch_label)
return res for x in testText.split("."): if "----" in x: # for y in m = countFreq("----", x) x.replace("----", "") x = cleanData(x) # if re.search('[a-zA-Z]', x): # print (m) for l in range(int(m / 2)): testData.append(x) # print (testData) data, labels = shuffle(data, labels, random_state=0) count_vect = CountVectorizer(ngram_range=(1, 1), max_df=0.1) #print(count_vect) tfidf_transformer = TfidfTransformer(use_idf=True) X_train_counts = count_vect.fit_transform(data[:1050]) #print(X_train_counts) # X_train_counts2 = count_vect.transform(data[5000:]) testData = count_vect.transform(testData) #print(testData) # print (X_train_counts.shape ,X_train_counts2.shape ) # # X_train_counts = vstack([X_train_counts, X_train_counts2]).toarray() # x1 = X_train_counts.toarray().tolist() # x2 = X_train_counts2.toarray().tolist()
def classifier(model, emb_mean, emb_std, embeddings_index): train = pd.read_csv('./input/TIL_NLP_train1_dataset.csv') test = pd.read_csv('./input/TIL_NLP_unseen_dataset.csv') print('running classifier') max_features = 4248 print(max_features) maxlen = 200 embed_size = 100 train = shuffle(train) X_train = train["word_representation"].fillna("fillna").values y_train = train[[ "outwear", "top", "trousers", "women dresses", "women skirts" ]].values X_test = test["word_representation"].fillna("fillna").values y_test = test[[ "outwear", "top", "trousers", "women dresses", "women skirts" ]].values y_test = y_test.tolist() print('preprocessing start') tokenizer = text.Tokenizer(num_words=max_features) tokenizer.fit_on_texts(list(X_train) + list(X_test)) X_train = tokenizer.texts_to_sequences(X_train) X_test = tokenizer.texts_to_sequences(X_test) x_train = sequence.pad_sequences(X_train, maxlen=maxlen) x_test = sequence.pad_sequences(X_test, maxlen=maxlen) del X_train, X_test, train, test gc.collect() word_index = tokenizer.word_index nb_words = min(max_features, len(word_index)) embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size)) for word, i in word_index.items(): if i >= max_features: continue embedding_vector = embeddings_index.get(word) if embedding_vector is not None: embedding_matrix[i - 1] = embedding_vector print('preprocessing done') # session_conf = tf.ConfigProto(intra_op_parallelism_threads=4, inter_op_parallelism_threads=4) # K.set_session(tf.Session(graph=tf.get_default_graph(), config=session_conf)) #model #wrote out all the blocks instead of looping for simplicity filter_nr = 64 filter_size = 3 max_pool_size = 3 max_pool_strides = 2 dense_nr = 256 spatial_dropout = 0.2 dense_dropout = 0.5 train_embed = False conv_kern_reg = regularizers.l2(0.00001) conv_bias_reg = regularizers.l2(0.00001) comment = Input(shape=(maxlen, )) emb_comment = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=train_embed)(comment) block1 = Bidirectional(LSTM(embed_size))(emb_comment) block1 = Dense(embed_size, activation='linear')(block1) output = Dense(5, activation='sigmoid')(block1) """ emb_comment = SpatialDropout1D(spatial_dropout)(emb_comment) block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(emb_comment) block1 = BatchNormalization()(block1) block1 = PReLU()(block1) block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block1) block1 = BatchNormalization()(block1) block1 = PReLU()(block1) #we pass embedded comment through conv1d with filter size 1 because it needs to have the same shape as block output #if you choose filter_nr = embed_size (300 in this case) you don't have to do this part and can add emb_comment directly to block1_output resize_emb = Conv1D(filter_nr, kernel_size=1, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(emb_comment) resize_emb = PReLU()(resize_emb) block1_output = add([block1, resize_emb]) block1_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block1_output) block2 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block1_output) block2 = BatchNormalization()(block2) block2 = PReLU()(block2) block2 = Conv1D(filter_nr, kernel_size=filter_size, 
padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block2) block2 = BatchNormalization()(block2) block2 = PReLU()(block2) block2_output = add([block2, block1_output]) block2_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block2_output) block3 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block2_output) block3 = BatchNormalization()(block3) block3 = PReLU()(block3) block3 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block3) block3 = BatchNormalization()(block3) block3 = PReLU()(block3) block3_output = add([block3, block2_output]) block3_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block3_output) block4 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block3_output) block4 = BatchNormalization()(block4) block4 = PReLU()(block4) block4 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block4) block4 = BatchNormalization()(block4) block4 = PReLU()(block4) block4_output = add([block4, block3_output]) block4_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block4_output) block5 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block4_output) block5 = BatchNormalization()(block5) block5 = PReLU()(block5) block5 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block5) block5 = BatchNormalization()(block5) block5 = PReLU()(block5) block5_output = add([block5, block4_output]) block5_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block5_output) block6 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block5_output) block6 = BatchNormalization()(block6) block6 = PReLU()(block6) block6 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block6) block6 = BatchNormalization()(block6) block6 = PReLU()(block6) block6_output = add([block6, block5_output]) block6_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block6_output) block7 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block6_output) block7 = BatchNormalization()(block7) block7 = PReLU()(block7) block7 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block7) block7 = BatchNormalization()(block7) block7 = PReLU()(block7) block7_output = add([block7, block6_output]) output = GlobalMaxPooling1D()(block7_output) output = Dense(dense_nr, activation='linear')(output) output = BatchNormalization()(output) output = PReLU()(output) output = Dropout(dense_dropout)(output) output = Dense(5, activation='sigmoid')(output) """ #model = Model(comment, output) # print("Correct model: ", type(model)) 
model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(), metrics=['accuracy']) num_folds = 5 num = 0 kfold = KFold(n_splits=num_folds, shuffle=True) for train, test in kfold.split(x_train, y_train): print("Training Fold number: ", num) batch_size = 128 epochs = 20 lr = callbacks.LearningRateScheduler(schedule) ra_val = RocAucEvaluation(validation_data=(x_train[test], y_train[test]), interval=1) es = EarlyStopping(monitor='val_loss', verbose=1, patience=5, restore_best_weights=True, mode='min') mc = ModelCheckpoint('best_model_rnn.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True) model.fit(x_train[train], y_train[train], batch_size=batch_size, epochs=epochs, validation_data=(x_train[test], y_train[test]), callbacks=[lr, ra_val, es, mc], verbose=1) num += 1 y_pred = model.predict(x_test) y_pred = [[1 if i > 0.5 else 0 for i in r] for r in y_pred] accuracy = sum([y_pred[i] == y_test[i] for i in range(len(y_pred))]) / len(y_pred) * 100 print([y_pred[i] == y_test[i] for i in range(len(y_pred))]) print(accuracy, "%") print(f1(y_pred, y_test)) """ submission = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv') submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred submission.to_csv('dpcnn_test_preds.csv', index=False) """ return model
if os.path.isfile(to_cluster_path): return to_cluster_path elif os.path.isfile(to_local_path): return to_local_path else: print("No valid file path") return "NOPE" print("Load data...") total_data_df_path = '/Dedicated/jmichaelson-wdata/mcrichter/HackUiowa2018/NN_behaviour/total_data_df_reduced_no_0_columns.csv' total_data_df = pd.read_csv(check_file_path(total_data_df_path)) seed = 42 total_data_df_shuffled = shuffle(total_data_df, random_state=seed) X = total_data_df_shuffled.drop(["intercept", "Score"], axis=1) y = total_data_df_shuffled[["Score"]] # Create a minimum and maximum processor object min_max_scaler = preprocessing.MinMaxScaler() # Create an object to transform the data to fit minmax processor y_scaled = min_max_scaler.fit_transform(y) # Run the normalizer on the dataframe y_normalized = pd.DataFrame(y_scaled, columns=['Score_normalized']) (trainX, testX, trainY, testY) = train_test_split(X, y_normalized,
print('Loading no_car features from file...') with open(no_car_features_file, 'rb') as f: no_car_features = pickle.load(f) else: print('Generating no_car features from file...') for name in image_names_no_car: image = cv2.imread(name) features = generate_hog_features(image) no_car_features.append(features) with open(no_car_features_file, 'wb') as f: pickle.dump(no_car_features, f) print('Generated') y = np.hstack((np.ones(len(car_features)), np.zeros(len(no_car_features)))) X = np.vstack((car_features, no_car_features)).astype(np.float64) X, y = shuffle(X, y) #X = X[:2000] #y = y[:2000] print(X.shape) X_scaler = StandardScaler().fit(X) with open('X_scaler.pkl', 'wb') as f: pickle.dump(X_scaler, f) X = X_scaler.transform(X) print([X[0]]) rand_state = np.random.randint(0, 100) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=rand_state)
sample_batched = next(iter(sequential)) image = torch.autograd.Variable(sample_batched['image'].cuda()) depth = torch.autograd.Variable(sample_batched['depth'].cuda(non_blocking=True)) if epoch == 0: writer.add_image('Train.1.Image', vutils.make_grid(image.data, nrow=6, normalize=True), epoch) if epoch == 0: writer.add_image('Train.2.Depth', colorize(vutils.make_grid(depth.data, nrow=6, normalize=False)), epoch) output = DepthNorm( model(image) ) writer.add_image('Train.3.Ours', colorize(vutils.make_grid(output.data, nrow=6, normalize=False)), epoch) writer.add_image('Train.3.Diff', colorize(vutils.make_grid(torch.abs(output-depth).data, nrow=6, normalize=False)), epoch) del image del depth del output traincsv=pd.read_csv('./content/data/diml_outdoor_train.csv') traincsv = traincsv.values.tolist() traincsv = shuffle(traincsv, random_state=2) #display a sample set of image and depth image depth_dataset = DepthDataset(traincsv=traincsv,root_dir='./content/') fig = plt.figure() len(depth_dataset) model = Model().cpu() if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs model = nn.DataParallel(model) #load trained model if needed #model.load_state_dict(torch.load('/workspace/1.pth')) print('Model created.')
loss_operation = tf.reduce_mean(cross_entropy) optimizer = tf.train.AdamOptimizer(learning_rate=rate) training_operation = optimizer.minimize(loss_operation) correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(one_hot_y, 1)) accuracy_operation = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) saver = tf.train.Saver() with tf.Session() as sess: sess.run(tf.global_variables_initializer()) num_examples = len(X_train) print("Training...") print("Validation Accuracy:") for i in range(EPOCHS): X_train, y_train = shuffle(X_train, y_train) for offset in range(0, num_examples, BATCH_SIZE): end = offset + BATCH_SIZE batch_x, batch_y = X_train[offset:end], y_train[offset:end] sess.run(training_operation, feed_dict={ x: batch_x, y: batch_y, keep_prob: 1.0 }) validation_accuracy = evaluate(X_validation, y_validation) print("#{}".format(i + 1), " {:.3f}".format(validation_accuracy)) saver.save(sess, './lenet')
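# The training loop above reshuffles the training set at every epoch so that
# minibatches differ between epochs. The same pattern, independent of TensorFlow,
# as a small sketch (function and array names are assumptions):
import numpy as np
from sklearn.utils import shuffle

def iterate_minibatches(X, y, batch_size, epochs):
    for _ in range(epochs):
        X, y = shuffle(X, y)  # new order every epoch
        for offset in range(0, len(X), batch_size):
            yield X[offset:offset + batch_size], y[offset:offset + batch_size]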
def preprocess():
    global X_train, y_train, X_validation, y_validation, X_test, y_test
    X_train, y_train = shuffle(X_train, y_train)
model.add(Activation('relu')) model.add(Dropout(0.4)) model.add(BatchNormalization()) model.add(Dense(2)) model.add(Activation('sigmoid')) model.summary() model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # In[18]: X_train, Y_train = shuffle(X_train, Y_train) model_checkpoint = ModelCheckpoint('./Alexnet_brat.hdf5', monitor='loss', verbose=1, save_best_only=True) reduce_lr = ReduceLROnPlateau(factor=0.5, patience=3, min_lr=0.000001, verbose=1) callbacks = [reduce_lr, model_checkpoint] #model.load_weights("./Alexnet_brat.hdf5") model.fit(X_train, Y_train, batch_size=32, epochs=200, verbose=1,
'Z3', 'Z4', 'Z5', 'Z6', 'Z7', 'Z8', 'Z9', 'XAVG', 'YAVG', 'ZAVG', 'XPEAK', 'YPEAK', 'ZPEAK', 'XABSOLDEV', 'YABSOLDEV', 'ZABSOLDEV', 'XSTANDDEV', 'YSTANDDEV', 'ZSTANDDEV', 'Resultant', 'Class', 'Time' ] feats = pd.read_csv(args.tx_train, header=None, names=act_headers) AR = pd.read_csv(args.additional_train, header=None, names=act_headers) #replacing Upstair and Downstairs with Stairs AR = AR.replace(to_replace=['Upstairs', 'Downstairs'], value='Stairs') print AR['Class'].unique() ''' clf = RandomForestClassifier(max_depth=2, random_state=0) clf.fit(feats[feats.columns[1:44]], feats[feats.columns[44]]) ''' data = shuffle(feats) no_of_samples = data.shape[0] samples_per_fold = no_of_samples / 10 Features_10_folds = [] Labels_10_folds = [] for i in range(9): data_fold = data[:samples_per_fold] data = data[samples_per_fold:] features = data_fold[data_fold.columns[1:44]] labels = data_fold[data_fold.columns[44]] Features_10_folds.append(features) Labels_10_folds.append(labels) #for last fold all remaining features = data[data.columns[1:44]]
    feat2 = construct_feat(nlist2, model)
    feat3 = construct_feat(nlist3, model)
    try:
        feat = np.concatenate([feat1, feat2, feat3])
    except:
        continue
    X.append(feat[0])
    for j in range(1, len(feat)):
        X[-1] = np.concatenate([X[-1], feat[j]])
    Y.append(int(d[i][1]))
    maxi = max(maxi, len(X[-1]))

for i in range(len(X)):
    X[i] = pad(X[i], maxi)
print("constructed feature vectors")

X = np.array(X)
Y = np.array(Y)
X, Y = shuffle(X, Y)
Xtr = X[:5000]
Ytr = Y[:5000]
Xts = X[5000:5500]
Yts = Y[5000:5500]
print("beginning training")
# naive_bayes(Xtr, Ytr, Xts, Yts)
svm(Xtr, Ytr, Xts, Yts)
# log_regression(Xtr, Ytr, Xts, Yts)
# feed_forward_nn(Xtr, Ytr, Xts, Yts)
def fit(self, X, Y, learning_rate=10e-5, mu=0.9, decay=0.99, epochs=10, batch_sz=100, eps=10e-10, display_cost=False): #learning_rate=10e-7, mu=0.99, decay=0.999, epochs=100, batch_sz=30, l2=0.0, eps=10e-10 learning_rate = np.float32(learning_rate) mu = np.float32(mu) decay = np.float32(decay) eps = np.float32(eps) ''' In Theano we can't actually 'drop' the nodes; that would result in a different computational graph, we are instead to multiply nodes by 1 and 0; for each layer we then need to create a 'mask' - array of 0s and 1s; Theano graph nodes don't have values, so we can't multiply them by numpy array 'mask'; instead we want Theano to generate random values every time it's called; thus we create an instance of RandomStreams object: ''' self.rng = RandomStreams() # first, make a validation set: X, Y = shuffle(X, Y) X = X.astype(np.float32) Y = Y.astype(np.int32) Xvalid, Yvalid = X[-1000:, :], Y[-1000:] X, Y = X[:-1000, :], Y[:-1000] #initialize the hidden layers: N, D = X.shape K = len(set(Y)) self.hidden_layers = [] # the size of the first dimension of the first matrix: M1 = D count = 0 # for the id of the weigts/biases for M2 in self.hidden_layer_sizes: h = HiddenLayer(M1, M2, count) self.hidden_layers.append(h) M1 = M2 # update the first dimension size fir the next iteration count += 1 # for the last weight/bias matrix (vector): W, b = init_weight_and_bias(M1, K) self.W = theano.shared(W, 'W%s' % count) self.b = theano.shared(b, 'b%s' % count) # collect all the parameters we are going to use during Gradient Descent: self.parameters = [self.W, self.b] for h in self.hidden_layers[::-1]: self.parameters += h.params # in order to use Momentum, # we are to keep track of all the changes (dW's and db's): dparams = [ theano.shared(np.zeros_like(p.get_value(), dtype=np.float32)) for p in self.parameters ] # for RMSProp, # we are to keep track of caches (cache_W's and cache_b's) as well: caches = [ theano.shared(np.ones_like(p.get_value(), dtype=np.float32)) for p in self.parameters ] # define theano variables and functions: thX = T.matrix('X') thY = T.ivector('Y') # a vector of integers # since we do dropout, we drop the nodes only on training step, # when evaluating we just scale them; # so we need to define two expressions for the output and cost calculations: pY_train = self.forward_train(thX) pY_predict = self.forward_predict(thX) cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY])) cost_predict = -T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY])) prediction = self.predict(thX) # will do sort of T.argmax(pY, axis=1) cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost_predict, prediction]) # the updates for the train function: updates = [ (cache, decay * cache + (np.float32(1.0) - decay) * T.grad(cost, p)**2) for p, cache in zip(self.parameters, caches) ] + [(dp, mu * dp - learning_rate * T.grad(cost, p) / T.sqrt(cache + eps)) for dp, p, cache in zip(dparams, self.parameters, caches) ] + [(p, p + dp) for p, dp in zip(self.parameters, dparams)] #updates = rmsprop(cost, self.parameters, learning_rate, mu, decay, eps) train_op = theano.function(inputs=[thX, thY], updates=updates) # batch SGD: n_batches = N // batch_sz costs = [] for i in range(epochs): X, Y = shuffle(X, Y) for j in range(n_batches): Xbatch = X[j * batch_sz:(j + 1) * batch_sz, :] Ybatch = Y[j * batch_sz:(j + 1) * batch_sz] train_op(Xbatch, Ybatch) if j % 20 == 0: c, p = cost_predict_op(Xvalid, Yvalid) costs.append(c) e = error_rate(Yvalid, p) print('\ni: %d, j: %d, cost: %.6f, \nerror: %.6f' % (i, 
j, c, e)) if display_cost: plt.plot(costs) plt.show()
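# The `updates` list in the fit() above combines an RMSProp cache with a momentum velocity.
# Below is a minimal NumPy sketch of one such parameter update, written to mirror Theano's
# semantics (all update expressions are evaluated from the old shared-variable values and
# then applied simultaneously). This is illustrative only; `grad` stands for the gradient
# of the cost with respect to p and is not a name from the original code.
import numpy as np

def rmsprop_momentum_step(p, dp, cache, grad, lr=1e-4, mu=0.9, decay=0.99, eps=1e-10):
    """One simultaneous RMSProp-with-momentum update in the spirit of the Theano updates above."""
    new_cache = decay * cache + (1.0 - decay) * grad**2    # running average of squared gradients
    new_dp = mu * dp - lr * grad / np.sqrt(cache + eps)    # velocity step, scaled by the (old) cache
    new_p = p + dp                                         # parameter moves by the (old) velocity
    return new_p, new_dp, new_cache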
optimizer='adam', metrics=['accuracy']) #model.compile(loss='hinge', # optimizer='adadelta', # metrics=['accuracy']) from keras.utils.visualize_util import plot plot( model, to_file= '/Users/km4n6/Box Sync/kiran/NN_project/final_project/plots/model_svm.png', show_shapes=True) from sklearn.utils import shuffle im_shuffled_validation, shuffled_targets_validation = shuffle( im_validation, targets_validation, random_state=0) # out = model.fit(im, targets, validation_data=(im_shuffled_validation, shuffled_targets_validation), nb_epoch=25, verbose=1, initial_epoch=0, batch_size=32, shuffle=True) np.save( '/Users/km4n6/Box Sync/kiran/NN_project/final_project/saved_models/history_acc_loss_svm.npy',
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import pickle

# Load the data from the csv file.
# There are 101 crops in total in this csv file, with N, P, K, pH, temp and climate values.
data = pd.read_csv('fpo/Crop1.csv')

# shuffle returns a random permutation of the rows
data = shuffle(data)

# Get the crop names from the dataframe
y = data.loc[:, 'Crop']

# Label-encode the crop names (strings) into integers that a model can work with
labelEncoded_y = LabelEncoder()

# Apply the transformation
y = labelEncoded_y.fit_transform(y)

# Create a new column for the encoded names
data['crop_num'] = y

# Features used for prediction
X = data[['N', 'P', 'K', 'pH', 'temp', 'climate']]

# Labels
y = data['crop_num']
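# A plausible continuation of the snippet above: split the shuffled crop data, fit the
# KNeighborsClassifier that was imported, and persist it with pickle. The split ratio,
# n_neighbors and the output filename are assumptions, not taken from the original code.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print('test accuracy:', knn.score(X_test, y_test))

# Save the fitted model so it can be reused without retraining
with open('fpo/crop_knn.pkl', 'wb') as f:
    pickle.dump(knn, f)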
# RUNNING CODE
full_dataset, full_labels, lost_features = pull_dataset()
print('Sanity Check')
print('full dataset of shape:', full_dataset.shape)
print('full labels of shape:', full_labels.shape)
print(
    'TOTAL NUMBER OF FACES NOT DETECTED WITH OUR LANDMARKS DETECTOR (IN-BUILT, pre-trained model): {0}'
    .format(len(lost_features)))

# Create the classifier object as an SVM (support vector machine) probabilistic model;
# you can swap this for any other type of classifier.
# classifier = SVC(kernel='linear', probability=True, tol=1e-3)

# Reshuffle the data (for extra randomness)
X_data, Y_data = shuffle(full_dataset, full_labels, random_state=0)
print('X_data of shape:', X_data.shape)
print('Y_data of shape:', Y_data.shape)

# Perform the train and test split (random state set to 1 to ensure the same distribution across different sets).
# The split is case specific, but cross-validation helps us avoid over-fitting, so let's make sure a validation set is ready.
# Since the dataset is not extremely large, the two 0.2 splits below give a roughly 64/16/20 train/validation/test split,
# i.e. around 1000 test examples, somewhat fewer validation examples and about 3000 training examples
# (a 75/10/15 split is to be tested as well).
# In this case we are a little less concerned, since we are evaluating smiles, which are present in every image, unlike glasses.
X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                    Y_data,
                                                    test_size=0.2,
                                                    random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  test_size=0.2,
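# The commented-out classifier above suggests how this pipeline continues. A hedged sketch:
# fit the probabilistic SVC on the training split and check it on the validation and test
# splits. The metric calls and variable usage here are assumptions, not the original code.
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

classifier = SVC(kernel='linear', probability=True, tol=1e-3)
classifier.fit(X_train, y_train)

print('validation accuracy:', accuracy_score(y_val, classifier.predict(X_val)))
print('test accuracy:', accuracy_score(y_test, classifier.predict(X_test)))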
    'AOD_11',
    'AOD_12',
    'AOD_13',
    'AOD_14',
    'AOD_15',
    'AOD_16',
]
for clo in data_get_dummies3.columns:
    independent.append(clo)
for clo2 in data_get_dummies1.columns:
    independent.append(clo2)

# Dependent variable
dependent = ["PM25"]

# Shuffle the data
data = shuffle(data_out)

# Parameter setup
mlp = LinearRegression(fit_intercept=True)
rng = check_random_state(1027)

# Train/test split
x_train = data_train[independent].values
x_test = data_test[independent].values
y_train = data_train[dependent].values.ravel()
y_test = data_test[dependent].values.ravel()

# Time the run
starttime = datetime.datetime.now().second

# Main routine
ensemble = AdaBoostRegressor(base_estimator=mlp,
                             learning_rate=0.01,
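# A sketch of how the AdaBoost ensemble above would typically be fit and scored on the
# held-out split. The n_estimators value, the use of `rng` as random_state and the R^2
# reporting are assumptions, not taken from the original script.
ensemble_example = AdaBoostRegressor(base_estimator=mlp,
                                     learning_rate=0.01,
                                     n_estimators=100,
                                     random_state=rng)
ensemble_example.fit(x_train, y_train)
print('test R^2:', ensemble_example.score(x_test, y_test))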
def UAP_target_pre(x, model, model_used, model_path, save_path, noise_limit=0.2, attack_type=None, target_class=None, batch_size=None, nb_classes=None, channels=None, samples=None, regular=None): # x_train, x_val, y_train, y_val = train_test_split(x, y, shuffle=True, test_size=0.2) x_train, x_val = train_test_split(x, shuffle=True, test_size=0.2) batch_size = min(batch_size, len(x_train)) universal_noise = tf.Variable(np.zeros((x_train[0].shape)), dtype=tf.float32) temp_universal_noise = tf.expand_dims(universal_noise, 0) # print(temp_universal_noise) x_input = Input(shape=(x_train.shape[1], x_train.shape[2], x_train.shape[3])) x = Lambda(lambda xx: xx + tf.clip_by_value(temp_universal_noise, -noise_limit, noise_limit))(x_input) # Model output if model_used == 'EEGNet': prediction = old_models.EEGNet_output(nb_classes=nb_classes, Chans=channels, Samples=samples, x_input=x) elif model_used == 'DeepConvNet': prediction = old_models.DeepConvNet_output(nb_classes=nb_classes, Chans=channels, Samples=samples, x_input=x) elif model_used == 'ShallowConvNet': prediction = old_models.ShallowConvNet_output(nb_classes=nb_classes, Chans=channels, Samples=samples, x_input=x) else: raise Exception('No such model:{}'.format(model_used)) # print(prediction) u_model = Model(inputs=x_input, outputs=prediction) u_model.load_weights(model_path) model.load_weights(model_path) y_train = np.argmax(model.predict(x_train, batch_size=batch_size), axis=1).flatten() y_val = np.argmax(model.predict(x_val, batch_size=batch_size), axis=1).flatten() alpha = tf.placeholder(dtype=tf.float32) al = 100 if regular == 'l1': loss = alpha * (tf.reduce_mean(tf.abs(universal_noise))) al = 5 elif regular == 'l2': loss = alpha * (tf.reduce_mean(tf.square(universal_noise))) al = 100 elif regular == 'l1+l2': loss = alpha * (tf.reduce_mean(10*tf.square(universal_noise) + 0.1*tf.abs(universal_noise))) al = 10 elif regular == None: loss = 0 else: raise Exception('no such loss regularization!') # loss = alpha * (tf.reduce_mean(tf.square(universal_noise) + tf.abs(universal_noise))) # loss = alpha * (tf.reduce_mean(tf.square(universal_noise) + tf.square(universal_noise))) # print(loss) target = tf.placeholder(dtype=tf.int32, shape=[None, ]) if attack_type == 'nontarget': # loss += K.mean(K.sparse_categorical_crossentropy(target, 1-prediction, from_logits=False)) loss += -K.mean(K.sparse_categorical_crossentropy(target, prediction, from_logits=False)) elif attack_type == 'target': loss += K.mean(K.sparse_categorical_crossentropy(target, prediction, from_logits=False)) else: raise Exception('no such attack_type!') start_vars = set(x.name for x in tf.global_variables()) lr_ph = tf.placeholder(shape=[], dtype=tf.float32) optimizer = tf.train.AdamOptimizer(learning_rate=lr_ph) train = optimizer.minimize(loss, var_list=[universal_noise]) end_vars = tf.global_variables() new_vars = [x for x in end_vars if x.name not in start_vars] init = tf.variables_initializer(var_list=[universal_noise] + new_vars) sess = K.get_session() sess.run(init) nb_batch = len(x_train) // batch_size end = False epochs = 500 lr = 1e-3 v = np.zeros((x_train[0].shape)) patience = 0 patience_threshold = 10 idx_list = [m for m in range(len(x_train))] # target if attack_type == 'target': y_true = np.ones(y_val.shape) * target_class stop_condition = 1 acc_best = 0. else: y_true = np.copy(y_val) stop_condition = -1 acc_best = 1. # stop_condition = 1 # fr_best = 0. 
for epoch in range(epochs): idx_list = shuffle(idx_list) for i in range(nb_batch): target_idx = idx_list[i * batch_size:min((i + 1) * batch_size, len(x_train))] x_batch, y_batch = x_train[target_idx], y_train[target_idx] if attack_type == 'target': y_batch = np.ones(y_batch.shape) * target_class _, losses = sess.run( [train, loss], { u_model.inputs[0]: x_batch, alpha: al, lr_ph: lr, target: y_batch, # K.learning_phase(): 0 } ) if (i + epoch * nb_batch) % 100 == 0: # if i % 1 == 0: pred = np.argmax(u_model.predict(x_val), -1) y_pred = pred.squeeze() acc = np.sum(np.where(y_pred == y_true, 1, 0)).astype(np.float64) / len(y_pred) norm = np.mean(np.square(sess.run(universal_noise))) if attack_type == 'target': print('epoch:{}/{}, batch:{}/{}, acc:{}, norm:{}'.format(epoch + 1, epochs, i + 1, nb_batch, acc, norm)) else: raw_pred = np.argmax(model.predict(x_val), -1).squeeze() fooling_rate = np.sum(np.where(y_pred != raw_pred, 1, 0)).astype(np.float64) / len(y_pred) print('epoch:{}/{}, batch:{}/{}, acc:{}, fooling rate:{}, norm:{}, loss:{}'.format(epoch + 1, epochs, i + 1, nb_batch, acc, fooling_rate, norm, losses)) # if acc > threshold_acc and norm > threshold_norm: # a = 5e2 if stop_condition * acc > stop_condition * acc_best: patience = 0 acc_best = acc v = K.eval(universal_noise) if save_path == None: print('update v! but not save.') else: print('best acc:{}, now saving adversarial patch to {}.'.format(acc_best, save_path)) # np.savez(noise_filename, v=un_no) np.savez(save_path, v=v) else: patience += 1 if acc == 1: print('best acc:{}, now saving adversarial patch to {}.'.format(acc_best, save_path)) np.savez(save_path, v=v) if patience == patience_threshold: end = True break if end: break return v
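# Once UAP_target_pre() has saved a universal perturbation, applying it at test time is
# just an add-and-clip. This is an assumed usage sketch (the file name and the `x_test`/
# `model` placeholders in the commented lines are not part of the original attack code).
import numpy as np

def apply_universal_noise(x, noise_path, noise_limit=0.2):
    """Add the saved universal perturbation to every trial, clipped to the noise budget."""
    v = np.load(noise_path)['v']
    v = np.clip(v, -noise_limit, noise_limit)
    return x + v[np.newaxis, ...]   # broadcast the single patch over the whole batch

# x_adv = apply_universal_noise(x_test, 'uap_noise.npz')
# y_adv = np.argmax(model.predict(x_adv), axis=1)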
history = {'val_loss': [], 'val_acc': []}

'''
Train the model
'''
epochs = 50
batch_size = 200

init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

n_batches = N_train // batch_size

for epoch in range(epochs):
    X_, Y_ = shuffle(X_train, Y_train)

    for i in range(n_batches):
        start = i * batch_size
        end = start + batch_size

        sess.run(train_step, feed_dict={
            x: X_[start:end],
            t: Y_[start:end],
            keep_prob: p_keep
        })

    # Evaluate on the validation data
    val_loss = loss.eval(session=sess, feed_dict={
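# The snippet above is cut off inside the validation feed_dict. A hedged sketch of the
# usual epoch-level bookkeeping follows; `accuracy`, `X_validation` and `Y_validation` are
# assumed names, and keep_prob is set to 1.0 because dropout is disabled for evaluation.
val_feed = {x: X_validation, t: Y_validation, keep_prob: 1.0}
val_loss = loss.eval(session=sess, feed_dict=val_feed)
val_acc = accuracy.eval(session=sess, feed_dict=val_feed)
history['val_loss'].append(val_loss)
history['val_acc'].append(val_acc)
print('epoch:', epoch, 'validation loss:', val_loss, 'validation accuracy:', val_acc)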
def getData(dataPath, moreDataPath, trainSize, trainFlag, devFlag, testFlag): ##TODO changed devSize to 1000 for trails for more trainSize data. It was 5000 for default run devSize = 1000 testSize = 1000 numTraces = 1500 ##Number of traces collected per key ## Pre defining the arrays based on sizes of the data x_train = np.zeros((28000 * 256, numTraces)) x_dev = np.zeros((devSize * 256, numTraces)) x_test = np.zeros((testSize * 256, numTraces)) y_train = np.zeros((28000 * 256, 1)) y_dev = np.zeros((devSize * 256, 1)) y_test = np.zeros((testSize * 256, 1)) for index, val in enumerate(range(0, 256)): print("Started data processing for %d key\n" % (val)) trainStr = dataPath + "train_" + str(val) + ".pkl.zip" devStr = dataPath + "dev_" + str(val) + ".pkl.zip" testStr = dataPath + "test_" + str(val) + ".pkl.zip" ##more training data path moreTrainStr = moreDataPath + "train_" + str(val) + ".pkl.zip" ## Checking if the file size is 0, before processing data ## This check is for cross config analysis, where traina nd dev are empty #if (os.stat(trainStr).st_size != 0): if (trainFlag): x_train_inter, y_train_inter = process_inputs(trainStr) ## Trainsize will still be 15000, but we will take data from devSet to trainset x_train[trainSize * index:trainSize * (index) + 15000, :] = x_train_inter y_train[trainSize * index:trainSize * (index) + 15000, 0] = y_train_inter ## Adding 9000 more data x_train_inter_more, y_train_inter_more = process_inputs( moreTrainStr) x_train[trainSize * (index) + 15000:(trainSize * (index) + 15000) + 9000, :] = x_train_inter_more[0:9000, :] y_train[trainSize * (index) + 15000:(trainSize * (index) + 15000) + 9000, 0] = y_train_inter_more.reshape(9000, 1)[0:9000, 0] print("Train= %s\n" % (trainFlag)) else: ## Assigning the array's to 0's ##NOTE: needs to change shape, but since we are always training, I am not changing this x_train[trainSize * index:trainSize * (index + 1), :] = np.zeros( (trainSize, numTraces)) y_train[trainSize * index:trainSize * (index + 1), :] = np.zeros( (trainSize, 1)) print("train= %s\n" % (trainFlag)) #if (os.stat(devStr).st_size != 0): if (devFlag): ## get the data for each sub part x_dev_inter, y_dev_inter = process_inputs(devStr) print("x_dev_inter= %s, y_dev_inter= %s" % (x_dev_inter.shape, y_dev_inter.shape)) x_dev[devSize * index:devSize * (index + 1), :] = x_dev_inter[0:devSize, :] y_dev[devSize * index:devSize * (index + 1), 0] = y_dev_inter.reshape(5000, 1)[0:devSize, 0] print("Dev= %s\n" % (devFlag)) print("x_dev= %s, y_dev= %s" % (x_dev.shape, y_dev.shape)) ## Adding 4000 traces to trainSet here x_train[trainSize * (index) + 15000 + 9000:(trainSize * (index) + 15000) + 13000, :] = x_dev_inter[1000:5000, :] y_train[trainSize * (index) + 15000 + 9000:(trainSize * (index) + 15000) + 13000, 0] = y_dev_inter.reshape(5000, 1)[devSize:5000, 0] print("x_trainSize = %s, y_trainSize= %s" % (x_train.shape, y_train.shape)) else: x_dev[devSize * index:devSize * (index + 1), :] = np.zeros( (devSize, numTraces)) y_dev[devSize * index:devSize * (index + 1), :] = np.zeros( (devSize, 1)) print("dev= %s\n" % (devFlag)) ## Test data is present so check is not performed if (testFlag): x_test_inter, y_test_inter = process_inputs(testStr) x_test[testSize * index:testSize * (index + 1), :] = x_test_inter y_test[testSize * index:testSize * (index + 1), 0] = y_test_inter print("Test= %s\n" % (testFlag)) print("x_test= %s, y_test= %s" % (x_test.shape, y_test.shape)) else: x_test[testSize * index:testSize * (index + 1), :] = np.zeros( (testSize, numTraces)) 
y_test[testSize * index:testSize * (index + 1), :] = np.zeros( (testSize, 1)) print("test= %s\n" % (testFlag)) print("Finished data processing for %d key\n" % (val)) ## Clear variables x_train_inter = None x_dev_inter = None x_test_inter = None y_train_inter = None y_dev_inter = None y_test_inter = None x_train_inter_more = None y_train_inter_more = None print("\nCleared variables\n") ##Not shuffling for debugging, should be removed ## Shuffling ## https://scikit-learn.org/stable/modules/generated/sklearn.utils.shuffle.html print("\nStarted shuffling of data\nx_train[0]= %s\ny_train[0]= %s" % (x_train[0], y_train[0])) print("\nx_train[12000]= %s\ny_train[12000]= %s" % (x_train[12000], y_train[12000])) x_train, y_train = shuffle(x_train, y_train, random_state=0) x_dev, y_dev = shuffle(x_dev, y_dev, random_state=0) x_test, y_test = shuffle(x_test, y_test, random_state=0) print("\nFinished shuffling of data\nx_train[0]= %s\ny_train[0]= %s" % (x_train[0], y_train[0])) print("\nx_train[12000]= %s\ny_train[12000]= %s" % (x_train[12000], y_train[12000])) ##NOTE: Remove: #Mimport pdb; pdb.set_trace() ## One hot assignment n_classes = 256 y_train_oh = np_utils.to_categorical(y_train, n_classes) y_dev_oh = np_utils.to_categorical(y_dev, n_classes) y_test_oh = np_utils.to_categorical(y_test, n_classes) print("\nOne-hot encoded for outputs\n") ## Standardizing train, dev and test x_train_mean = x_train.mean(axis=0) x_train_std = x_train.std(axis=0) x_dev_mean = x_dev.mean(axis=0) x_dev_std = x_dev.mean(axis=0) x_test_mean = x_test.mean(axis=0) x_test_std = x_test.std(axis=0) #M## Concatenating train and dev #Mx_full = np.concatenate((x_train, x_dev), axis=0) #Mx_full_mean = x_full.mean(axis=0) #Mx_full_std = x_full.std(axis=0) ## chunking the normalization process print("Strated normalizing\n") chunkSize = 28000 chunkNum = int(len(x_train) / chunkSize) for chunkIndex in range(chunkNum): print("Train chunkIndx= %s, chunkNum = %s" % (chunkIndex, chunkNum)) if (chunkIndex != chunkNum - 1): x_train[chunkIndex * chunkSize:(chunkIndex + 1) * chunkSize] = (x_train[chunkIndex * chunkSize: (chunkIndex + 1) * chunkSize] - x_train_mean) / x_train_std else: x_train[chunkIndex * chunkSize:] = (x_train[chunkIndex * chunkSize:] - x_train_mean) / x_train_std devChunkSize = 10000 devChunkNum = int(len(x_dev) / devChunkSize) for devChunkIndex in range(devChunkNum): print("Dev chunkIndx= %s, chunkNum = %s" % (devChunkIndex, devChunkNum)) if (devChunkIndex != devChunkNum - 1): x_dev[devChunkIndex * devChunkSize:(devChunkIndex + 1) * devChunkSize] = (x_dev[devChunkIndex * devChunkSize: (devChunkIndex + 1) * devChunkSize] - x_train_mean) / x_train_std else: x_dev[devChunkIndex * devChunkSize:] = (x_dev[devChunkIndex * devChunkSize:] - x_train_mean) / x_train_std ## Need to do the same for test too return (x_train, y_train_oh), (x_dev, y_dev_oh), (x_test, y_test_oh)
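# getData() notes that the test split still needs the same normalization. A sketch of the
# corresponding chunked pass over x_test, reusing the training mean/std; it would sit just
# before the return statement, and the chunk size is an assumption chosen to match the dev loop.
testChunkSize = 10000
testChunkNum = int(len(x_test) / testChunkSize)
for testChunkIndex in range(testChunkNum):
    print("Test chunkIndx= %s, chunkNum = %s" % (testChunkIndex, testChunkNum))
    if (testChunkIndex != testChunkNum - 1):
        x_test[testChunkIndex * testChunkSize:(testChunkIndex + 1) * testChunkSize] = \
            (x_test[testChunkIndex * testChunkSize:(testChunkIndex + 1) * testChunkSize]
             - x_train_mean) / x_train_std
    else:
        x_test[testChunkIndex * testChunkSize:] = \
            (x_test[testChunkIndex * testChunkSize:] - x_train_mean) / x_train_std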
## - 1s - loss: 0.0067 - mean_squared_error: 0.0027 - val_loss: 0.0049 - val_mean_squared_error: 0.0011 ##train ##mean_squared_error: 0.0005 ##test ##mean_squared_error: 0.0011 random.seed(datetime.now()) # load dataset X = np.transpose(np.loadtxt("Xtrain.txt", dtype=float)) Y = np.transpose(np.loadtxt('Ytrain.txt', dtype=float)) m = X.shape[0] print(str(m)) # shuffle dataset (mini batch) X_shuffled, Y_shuffled = shuffle(X, Y) # split dataset m_train = math.floor(m*0.80) print(str(m_train)) X_train = X_shuffled[:m_train,:].reshape(m_train,height,width,3) Y_train = Y_shuffled[:m_train] X_test = X_shuffled[m_train:,:].reshape(m-m_train,height,width,3) Y_test = Y_shuffled[m_train:] # create model model = Sequential() # CONV model.add(Conv2D(8, kernel_size=(5, 5), strides=(1, 1), dilation_rate = (1,1), border_mode='valid', activation='relu', input_shape=inmage_shape)) model.add(MaxPooling2D(pool_size=(2, 1)))
#!/usr/bin/env python
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn import datasets
from sklearn.utils import shuffle
import numpy as np

boston = datasets.load_boston()
X, Y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)

offset = int(X.shape[0] * 0.9)
X_train, Y_train = X[:offset], Y[:offset]
X_test, Y_test = X[offset:], Y[offset:]

regressor = BaggingRegressor(DecisionTreeRegressor(max_depth=8))
regressor.fit(X_train, Y_train)
score = regressor.score(X_test, Y_test)
print score
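# For reference, a single depth-8 tree can be scored the same way to see what bagging adds.
# This comparison is an illustrative addition, not part of the original script.
single_tree = DecisionTreeRegressor(max_depth=8)
single_tree.fit(X_train, Y_train)
print 'single tree R^2:   ', single_tree.score(X_test, Y_test)
print 'bagged ensemble R^2:', score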
def main():
    max_iter = 20
    print_period = 50

    train_X, test_X, train_Y, test_Y = get_normalized_data()
    learning_rate = 0.00004
    reg = 0.01

    train_Y_ind = indicator(train_Y)
    test_Y_ind = indicator(test_Y)

    N, D = train_X.shape
    batch_size = 500
    batch_num = N // batch_size

    M = 300
    K = 10
    W1 = np.random.randn(D, M) / np.sqrt(D)
    b1 = np.zeros(M)
    W2 = np.random.randn(M, K) / np.sqrt(M)
    b2 = np.zeros(K)

    # SAVE INITIAL WEIGHTS AND BIASES
    W1_copy = W1.copy()
    b1_copy = b1.copy()
    W2_copy = W2.copy()
    b2_copy = b2.copy()

    # 1. batch SGD
    loss_batch = []
    error_batch = []
    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j*batch_size:(j*batch_size + batch_size), :]
            y = shuffle_Y[j*batch_size:(j*batch_size + batch_size), :]
            pY, Z = forward(x, W1, W2, b1, b2)

            W2 -= learning_rate * (derivative_w2(Z, y, pY) + reg*W2)
            b2 -= learning_rate * (derivative_b2(y, pY) + reg*b2)
            W1 -= learning_rate * (derivative_w1(x, Z, y, pY, W2) + reg*W1)
            b1 -= learning_rate * (derivative_b1(Z, y, pY, W2) + reg*b1)

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                loss_batch.append(l)
                error_batch.append(e)
                print("cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)

    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    # 2. momentum
    W1 = W1_copy.copy()
    b1 = b1_copy.copy()
    W2 = W2_copy.copy()
    b2 = b2_copy.copy()

    lose_momentum = []
    error_momentum = []
    mu = 0.9
    dW1 = 0
    dW2 = 0
    db1 = 0
    db2 = 0

    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j*batch_size:(j*batch_size + batch_size), :]
            y = shuffle_Y[j*batch_size:(j*batch_size + batch_size), :]
            pY, Z = forward(x, W1, W2, b1, b2)

            gW2 = derivative_w2(Z, y, pY) + reg*W2
            gb2 = derivative_b2(y, pY) + reg*b2
            gW1 = derivative_w1(x, Z, y, pY, W2) + reg*W1
            gb1 = derivative_b1(Z, y, pY, W2) + reg*b1

            # UPDATE VELOCITIES
            dW2 = mu*dW2 - learning_rate*gW2
            db2 = mu*db2 - learning_rate*gb2
            dW1 = mu*dW1 - learning_rate*gW1
            db1 = mu*db1 - learning_rate*gb1

            # UPDATE WEIGHTS
            W2 += dW2
            b2 += db2
            W1 += dW1
            b1 += db1

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                lose_momentum.append(l)
                error_momentum.append(e)
                print("cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)

    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    # 3. Nesterov momentum
    W1 = W1_copy.copy()
    b1 = b1_copy.copy()
    W2 = W2_copy.copy()
    b2 = b2_copy.copy()

    lose_nesterov = []
    error_nesterov = []
    mu = 0.9
    dW1 = 0
    db1 = 0
    dW2 = 0
    db2 = 0

    for i in range(max_iter):
        shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
        for j in range(batch_num):
            x = shuffle_X[j*batch_size:(j*batch_size + batch_size), :]
            y = shuffle_Y[j*batch_size:(j*batch_size + batch_size), :]
            pY, Z = forward(x, W1, W2, b1, b2)

            gW2 = derivative_w2(Z, y, pY) + reg*W2
            gb2 = derivative_b2(y, pY) + reg*b2
            gW1 = derivative_w1(x, Z, y, pY, W2) + reg*W1
            gb1 = derivative_b1(Z, y, pY, W2) + reg*b1

            # update velocities
            dW2 = mu*dW2 - learning_rate*gW2
            db2 = mu*db2 - learning_rate*gb2
            dW1 = mu*dW1 - learning_rate*gW1
            db1 = mu*db1 - learning_rate*gb1

            # update weights (Nesterov-style: momentum plus gradient step from the new velocity)
            W2 += mu*dW2 - learning_rate*gW2
            b2 += mu*db2 - learning_rate*gb2
            W1 += mu*dW1 - learning_rate*gW1
            b1 += mu*db1 - learning_rate*gb1

            if j % print_period == 0:
                p_test, Z_test = forward(test_X, W1, W2, b1, b2)
                l = cost(p_test, test_Y_ind)
                e = error_rate(p_test, test_Y)
                lose_nesterov.append(l)
                error_nesterov.append(e)
                print("cost at iteration i=%d, j=%d: %.6f" % (i, j, l))
                print("error_rate: ", e)

    p_final, z_final = forward(test_X, W1, W2, b1, b2)
    print("final error_rate:", error_rate(p_final, test_Y))

    plt.plot(loss_batch, label="batch")
    plt.plot(lose_momentum, label="momentum")
    plt.plot(lose_nesterov, label="Nesterov")
    plt.legend()
    plt.show()
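# The error-rate lists (error_batch, error_momentum, error_nesterov) are collected above but
# never displayed; a small follow-up plot of them is sketched here as an assumed addition.
plt.plot(error_batch, label="batch")
plt.plot(error_momentum, label="momentum")
plt.plot(error_nesterov, label="Nesterov")
plt.legend()
plt.title("test error rate")
plt.show()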
print("\nConverting groundTruth labels to numpy array...") # this part is for the groundTruth labels with open(gt_path + 'drive.json') as f: data = json.load(f) # convert to numpy array data = np.asarray(data) # extract speed y = data[:, 1] # shuffle print("\nShuffling the data...") X, y = shuffle(X, y, random_state=42) # split into train and test print("\nSplitting X into train and test...") X_train = X[train_mask] X_test = X[test_mask] print("\nWriting X_train as HDF5...") write_hdf5(X_train, out_path + "X_train_50.hdf5") print("\nWriting X_test as HDF5...") write_hdf5(X_test, out_path + "X_test_50.hdf5") # split into train and test print("\nSplitting y into train and test...") y_train = y[train_mask]
def train_test_split_by_part(X, y, pdgid, n_mu=2500, n_el=2500, n_had=2500, n_fake=2500):
    try:
        mu_idx = sample_without_replacement(len(X[abs(pdgid) == 13]), n_mu, random_state=23)
        elec_idx = sample_without_replacement(len(X[abs(pdgid) == 11]), n_el, random_state=23)
        had_idx = sample_without_replacement(len(X[np.logical_and(abs(pdgid) > 37, pdgid != -999)]), n_had,
                                             random_state=23)
        fake_idx = sample_without_replacement(len(X[pdgid == -999]), n_fake, random_state=23)
    except:
        print("Error: Not enough muons/electrons/hadrons/fakes in sample to create training data")
        return [], [], [], [], [], []

    X_train = np.concatenate((X[abs(pdgid)==13][mu_idx], X[abs(pdgid)==11][elec_idx],
                              X[np.logical_and(abs(pdgid)>37, pdgid!=-999)][had_idx], X[pdgid==-999][fake_idx]))
    y_train = np.concatenate((y[abs(pdgid)==13][mu_idx], y[abs(pdgid)==11][elec_idx],
                              y[np.logical_and(abs(pdgid)>37, pdgid!=-999)][had_idx], y[pdgid==-999][fake_idx]))
    pdgid_train = np.concatenate((pdgid[abs(pdgid)==13][mu_idx], pdgid[abs(pdgid)==11][elec_idx],
                                  pdgid[np.logical_and(abs(pdgid)>37, pdgid!=-999)][had_idx],
                                  pdgid[pdgid==-999][fake_idx]))

    X_test = np.concatenate((np.delete(X[abs(pdgid)==13], mu_idx, axis=0), np.delete(X[abs(pdgid)==11], elec_idx, axis=0),
                             np.delete(X[np.logical_and(abs(pdgid)>37, pdgid!=-999)], had_idx, axis=0),
                             np.delete(X[pdgid==-999], fake_idx, axis=0)))
    y_test = np.concatenate((np.delete(y[abs(pdgid)==13], mu_idx, axis=0), np.delete(y[abs(pdgid)==11], elec_idx, axis=0),
                             np.delete(y[np.logical_and(abs(pdgid)>37, pdgid!=-999)], had_idx, axis=0),
                             np.delete(y[pdgid==-999], fake_idx, axis=0)))
    pdgid_test = np.concatenate((np.delete(pdgid[abs(pdgid)==13], mu_idx, axis=0),
                                 np.delete(pdgid[abs(pdgid)==11], elec_idx, axis=0),
                                 np.delete(pdgid[np.logical_and(abs(pdgid)>37, pdgid!=-999)], had_idx, axis=0),
                                 np.delete(pdgid[pdgid==-999], fake_idx, axis=0)))

    # Sanity check: warn if any particle class keeps less than 20% of its members in the test set
    mu_check = (np.sum(abs(pdgid_test) == 13) / np.sum(abs(pdgid) == 13) < .2)
    el_check = (np.sum(abs(pdgid_test) == 11) / np.sum(abs(pdgid) == 11) < .2)
    had_check = (np.sum(np.logical_and(abs(pdgid_test) > 37, pdgid_test != -999)) /
                 np.sum(np.logical_and(abs(pdgid) > 37, pdgid != -999)) < .2)
    fake_check = (np.sum(pdgid_test == -999) / np.sum(pdgid == -999) < .2)
    if mu_check or el_check or had_check or fake_check:
        print("Warning: The test set has less than 20% of muons/electrons/hadrons/fakes")

    X_train, y_train, pdgid_train = shuffle(X_train, y_train, pdgid_train, random_state=23)
    X_test, y_test, pdgid_test = shuffle(X_test, y_test, pdgid_test, random_state=23)

    return X_train, y_train, pdgid_train, X_test, y_test, pdgid_test
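# An assumed usage sketch for the splitter above (X, y and pdgid are expected to be the full
# NumPy arrays from the sample; the per-class counts shown are just the defaults).
X_train, y_train, pdgid_train, X_test, y_test, pdgid_test = train_test_split_by_part(
    X, y, pdgid, n_mu=2500, n_el=2500, n_had=2500, n_fake=2500)
if len(X_train):
    print("train size:", len(X_train), "test size:", len(X_test))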
    def read_csvfile(self, filename):
        self.dataframe = pd.read_csv(filename)
        self.dataframe = shuffle(self.dataframe)
        return self.dataframe
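    # A variant of the method above that also resets the index, since sklearn's shuffle keeps
    # the original row labels (an assumed addition, not part of the original class):
    def read_csvfile_reset(self, filename):
        self.dataframe = shuffle(pd.read_csv(filename)).reset_index(drop=True)
        return self.dataframe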
                  activation='relu',
                  kernel_initializer=TruncatedNormal(stddev=0.1),
                  kernel_regularizer=regularizers.l2(0.01),
                  name="D3")(x)
x = BatchNormalization(axis=1, name="D3_BN")(x)
x = Activation('relu', name='D3_relu')(x)

out = Dense(5, activation='softmax', name="OutPut")(x)
model = Model(inputs=inputs, outputs=out)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

print("Training the network, please wait......")
X_train, y_train = shuffle(X_train, y_train, random_state=0)
startTime = time.clock()
trainLog = model.fit(X_train,
                     y_train,
                     validation_split=0.1,
                     batch_size=64,
                     epochs=10,
                     verbose=1)
endTime = time.clock()
# Note: time.clock() behaves differently on Windows and Linux
print("Network training finished, elapsed %f seconds" % ((float)(endTime - startTime) / 10))

# Plot the model architecture diagram (there is still a small issue here, to be resolved)
plot_model(model, to_file='model.png',
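# Since model.fit() returns its History object as trainLog, the accuracy/loss curves can also
# be plotted once training ends. This is an assumed addition; the history keys 'acc'/'val_acc'
# are the Keras 2.0-2.2 defaults for this compile configuration and may differ by version.
import matplotlib.pyplot as plt

plt.plot(trainLog.history['acc'], label='train acc')
plt.plot(trainLog.history['val_acc'], label='val acc')
plt.plot(trainLog.history['loss'], label='train loss')
plt.plot(trainLog.history['val_loss'], label='val loss')
plt.legend()
plt.savefig('training_curves.png')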