Example #1
def shuffle_data(X,y,no_lable_vs_lable):
	X, y = shuffle(X, y, random_state=0)
	# balance labels by subsampling:
	y_dict = defaultdict(list)
	for i, y_i in enumerate(y):
		y_dict[y_i[0]].append(i)
	# subsample
	X_sub = []
	y_sub = []
	y_set = set(y_dict)
	y_dict_len = [len(y_dict[y_set_i]) for y_set_i in sorted(list(y_set))]
	quotent = float(y_dict_len[0]) / sum(y_dict_len)  # fraction of samples in the '0' class
	print 'length cutting'
	print str(len(X))
	# generalize over multiple classes: 
	if(quotent > no_lable_vs_lable):
		# decrease 0 class labels:
		newLen = int(2*y_dict_len[1]*no_lable_vs_lable)
		id_new = y_dict['0'][:newLen] + [y_dict[id] for id in y_set if not id in ['0']][0]
		X_sub = [X[id] for id in id_new]
		y_sub = [y[id] for id in id_new]
		print(str(newLen), 'new 0 class length: ', str(len(id_new)))
	else:
		newLen = int(y_dict_len[0]*(1-no_lable_vs_lable))
		id_new = y_dict['1'][:newLen] + [y_dict[id] for id in y_set if not id in ['0']][0]
		X_sub = [X[id] for id in id_new]
		y_sub = [y[id] for id in id_new]
		print(str(newLen), 'new 1 class length')
	X, y = shuffle(X_sub, y_sub, random_state=0)
	print str(len(X_sub))
	print '--------------'
	return X,y
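
A minimal, hypothetical call sketch for shuffle_data above; it assumes that from collections import defaultdict and from sklearn.utils import shuffle are already present in the surrounding module, and that labels are single-element lists of the strings '0'/'1', as the y_dict keys imply:

# Hypothetical usage sketch for shuffle_data (X_demo/y_demo are illustrative names)
from collections import defaultdict  # needed by shuffle_data
from sklearn.utils import shuffle    # needed by shuffle_data

X_demo = [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.7, 0.8]]
y_demo = [['0'], ['1'], ['0'], ['0']]
X_bal, y_bal = shuffle_data(X_demo, y_demo, no_lable_vs_lable=0.5)
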
def frames2batch(k = 12,batch_size = 1024, is_calib = False):
     pos = util.get_files(rootdir = 'F:\\train_data\\pos\\')
     neg = util.get_files(rootdir = 'F:\\train_data\\neg\\')
     pos = shuffle(pos)
     neg = shuffle(neg)
     total = pos + neg
     total  = shuffle(total)
     batch = []
     c = 0
     bpath = 'F:\\train_data\\batch\\'
     for item_path in total:
         
         frame = fr.get_frame(item_path)
         frame_r = fr.resize_frame(frame,(k,k))
         if frame_r is None:  # 'is None' avoids ambiguous truth-value errors if frame_r is an array
             continue
         vec = fr.frame_to_vect(frame_r)
         label = 1 if item_path.split('\\')[-1].find('pos') > 0 else 0
         print(item_path,label)
         batch.append((vec,label))
         if len(batch) > 0 and len(batch) % batch_size == 0:
             batch = sp.array(batch)
             sp.savez(bpath + str(c) + '_' + str(k) + ('_' if not is_calib else '_calib-')  + 'net',batch)
             batch = []
             
             c += 1
     if len(batch) > 0:  # save any remaining partial batch
         batch = sp.array(batch)
         sp.savez(bpath + str(c) + '_' + str(k) + ('_' if not is_calib else '_calib')  + '-net',batch)
         batch = []
         c += 1
Example #3
def process_data():
  global num_classes, num_train, num_test

  X_train , Y_train = load_data('Train')
  X_test , Y_test = load_data('Test')
  X_train = X_train.astype(np.float64)
  X_test = X_test.astype(np.float64)
  num_train = X_train.shape[0]
  num_test = X_test.shape[0]

  mean_image = np.mean(X_train,axis=0)
  X_train -= mean_image
  X_test -= mean_image

  X_train = X_train.reshape(-1, 1, img_dim, img_dim)
  Y_train -= 1
  X_train , Y_train = shuffle(X_train, Y_train)

  X_test = X_test.reshape(-1, 1, img_dim, img_dim)
  Y_test -= 1
  X_test , Y_test = shuffle(X_test, Y_test)

  print 'Training X shape :- ', X_train.shape
  print 'Training Y shape :- ', Y_train.shape
  print 'Testing X shape :- ', X_test.shape
  print 'Testing Y shape :- ', Y_test.shape

  return X_train, Y_train, X_test, Y_test
def main():
    train, test, word2idx = get_ptb_data()

    for t in train:
        add_idx_to_tree(t, 0)
    train = [tree2list(t, -1, True) for t in train]
    train = [t for t in train if t[3][-1] >= 0] # for filtering binary labels

    for t in test:
        add_idx_to_tree(t, 0)
    test = [tree2list(t, -1, True) for t in test]
    test = [t for t in test if t[3][-1] >= 0] # for filtering binary labels

    train = shuffle(train)
    train = train[:1000]
    # n_pos = sum(t[3][-1] for t in train)
    # print "n_pos train:", n_pos
    test = shuffle(test)
    test = test[:100]
    # n_pos = sum(t[3][-1] for t in test)
    # print "n_pos test:", n_pos

    V = len(word2idx)
    print "vocab size:", V
    D = 80
    K = 5

    model = RecursiveNN(V, D, K)
    model.fit(train, epochs=3, activation=T.nnet.relu)
    print "train accuracy:", model.score(train)
    print "test accuracy:", model.score(test)
    print "train f1:", model.f1_score(train)
    print "test f1:", model.f1_score(test)
def splitIntoTrainingAndValidation(A, B):
	data1 = shuffle(sourceSets[A])    # Note this is a random shuffle, that's
	data2 = shuffle(sourceSets[B])    #                                   why we need many iterations
	freqM = np.minimum(freqs[A], freqs[B])
	freq1tr = np.round(freqM * 0.8)        # Randomly selected 80% for the training set,
	freq1va = freqM - freq1tr              # and the remaining 20% for the validation set
	freq2tr = np.copy(freq1tr)
	freq2va = np.copy(freq1va)
	trainingSetSize = int(sum(freq1tr))  # 1/2 size actually
	validatnSetSize = int(sum(freq1va))
	testSet1size = len(data1) - trainingSetSize - validatnSetSize
	testSet2size = len(data2) - trainingSetSize - validatnSetSize
	X  = np.zeros((trainingSetSize*2,         numFeatures))
	Xv = np.zeros((validatnSetSize*2,         numFeatures))
	Xt = np.zeros((testSet1size+testSet2size, numFeatures))
	y  = np.ravel([([0]*trainingSetSize) + ([1]*trainingSetSize)])
	yv = np.ravel([([0]*validatnSetSize) + ([1]*validatnSetSize)])
	yt = np.ravel([([0]*testSet1size)    + ([1]*testSet2size)])
	trnIdx = vldIdx = tstIdx = 0
	for item in data1:
		year = item[0]
		if   freq1tr[year] > 0:   X[trnIdx], trnIdx, freq1tr[year]  =  item[1:],  trnIdx+1,  freq1tr[year]-1
		elif freq1va[year] > 0:  Xv[vldIdx], vldIdx, freq1va[year]  =  item[1:],  vldIdx+1,  freq1va[year]-1
		else:                    Xt[tstIdx], tstIdx                 =  item[1:],  tstIdx+1
	assert trnIdx==trainingSetSize   and vldIdx==validatnSetSize   and tstIdx==testSet1size
	for item in data2:
		year = item[0]
		if   freq2tr[year] > 0:   X[trnIdx], trnIdx, freq2tr[year]  =  item[1:],  trnIdx+1,  freq2tr[year]-1
		elif freq2va[year] > 0:  Xv[vldIdx], vldIdx, freq2va[year]  =  item[1:],  vldIdx+1,  freq2va[year]-1
		else:                    Xt[tstIdx], tstIdx                 =  item[1:],  tstIdx+1
	assert trnIdx==trainingSetSize*2 and vldIdx==validatnSetSize*2 and tstIdx==testSet1size+testSet2size
	X, y = shuffle(X, y)   # Just in case... perhaps no reason to shuffle again here?
	fs = SelectKBest(f_classif, k = numFeatures)   # TODO: try other feature selection methods?
	fs.fit(np.concatenate((X, Xv)), np.concatenate((y, yv)))
	return X, y, Xv, yv, Xt, yt, testSet1size, testSet2size, fs.scores_
def generate_training_data(image_paths, angles, batch_size=128, validation_flag=False):
    '''
    Generator for the model's training data: loads, preprocesses, and (optionally) distorts images, then
    yields them to the model in batches. If 'validation_flag' is True the images are not distorted. Images
    with turning-angle magnitudes greater than 0.33 are also flipped (with the angle inverted) to give them
    more weight and mitigate the bias toward low and zero turning angles.
    '''
    image_paths, angles = shuffle(image_paths, angles)
    X,y = ([],[])
    while True:       
        for i in range(len(angles)):
            img = cv2.imread(image_paths[i])
            angle = angles[i]
            img = preprocess_image(img)
            if not validation_flag:
                img, angle = random_distort(img, angle)
            X.append(img)
            y.append(angle)
            if len(X) == batch_size:
                yield (np.array(X), np.array(y))
                X, y = ([],[])
                image_paths, angles = shuffle(image_paths, angles)
            # flip horizontally and invert steer angle, if magnitude is > 0.33
            if abs(angle) > 0.33:
                img = cv2.flip(img, 1)
                angle *= -1
                X.append(img)
                y.append(angle)
                if len(X) == batch_size:
                    yield (np.array(X), np.array(y))
                    X, y = ([],[])
                    image_paths, angles = shuffle(image_paths, angles)
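
A hedged sketch of driving the generator above with Keras; model, image_paths, and angles are assumed to exist already (a compiled Keras model and two parallel arrays), and the step counts are illustrative only:

# Hypothetical wiring of generate_training_data into Keras training
train_gen = generate_training_data(image_paths, angles, batch_size=128)
valid_gen = generate_training_data(image_paths, angles, batch_size=128, validation_flag=True)
model.fit_generator(train_gen,
                    steps_per_epoch=len(angles) // 128,
                    validation_data=valid_gen,
                    validation_steps=len(angles) // (128 * 5),
                    epochs=5)
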
def generate_feature(in_file, dump=False, single_only=False, min_count=0):
  f = open(in_file, 'r')
  f.readline()
  training_data, tags = [], []
  total_features = {}

  for line in f.readlines():
    tokens = line.replace('\n', '').split(',')
    fs = [s for s in tokens[1:] if s.isdigit()]
    # ignore invalid data
    if len(fs) != 10:
      continue
    tags.append(tokens[0])
    features = get_feature_array(fs, single_only)
    update_total_features(total_features, features)
    training_data.append(features)

  training_data = transform_to_matrix(total_features, training_data)
  training_data = cut_off(training_data, min_count)
  training_data, tags = shuffle(training_data, tags)  # sklearn's shuffle returns copies, so keep the result
  tags = np.array(tags)
  if dump:
    np.savetxt('preprocessing/dumpX.txt', training_data, fmt='%d', delimiter=',')
    np.savetxt('preprocessing/dumpY.txt', tags[np.newaxis].T, fmt='%s', delimiter=',')
  return total_features, training_data, np.array(tags)
Example #8
def getTrainTestData():
    data = pickle.load(open('./data/60_unnormalized.p', "rb"))

    raw_meta = []
    raw_data = []
    for k,v in data.iteritems():
        for i in range(len(v)):

            _d = v[i]
            previous = [[0]*LOCATION_ID_MAX,[0]*LOCATION_ID_MAX]
            if i==0:
                # previous date
                date_time = datetime.datetime.strptime(k, '%Y-%m-%d')
                previous_day = date_time - datetime.timedelta(1)
                str_previous_day = previous_day.strftime('%Y-%m-%d')
                if str_previous_day in data:
                    previous[0]=data[str_previous_day][-2]
                    previous[1]=data[str_previous_day][-1]
            elif i==1:
                # previous date
                date_time = datetime.datetime.strptime(k, '%Y-%m-%d')
                previous_day = date_time - datetime.timedelta(1)
                str_previous_day = previous_day.strftime('%Y-%m-%d')
                previous[1]=v[i-1]
                if str_previous_day in data:
                    previous[0]=data[str_previous_day][-1]
            else:
                previous[0]=v[i-2]
                previous[1]=v[i-1]

            raw_meta.append({"date":k,"interval":i,"previous":previous})
            raw_data.append(_d)

    num = len(raw_data)

    train_meta_data = raw_meta[0:int(0.6*num)]
    valid_meta_data = raw_meta[int(0.6*num):int(0.8*num)]
    test_meta_data = raw_meta[int(0.8*num):]

    train_y = raw_data[0:int(0.6*num)]
    valid_y = raw_data[int(0.6*num):int(0.8*num)]
    test_y = raw_data[int(0.8*num):]

    train_X = getFeatures(train_meta_data)
    valid_X = getFeatures(valid_meta_data)
    test_X = getFeatures(test_meta_data)

    train_X = np.array(train_X, dtype=np.float32)
    valid_X = np.array(valid_X, dtype=np.float32)
    test_X = np.array(test_X, dtype=np.float32)

    train_y = np.array(train_y, dtype=np.float32)
    valid_y = np.array(valid_y, dtype=np.float32)
    test_y = np.array(test_y, dtype=np.float32)

    train_X, train_y = shuffle(train_X, train_y, random_state=0)
    valid_X, valid_y = shuffle(valid_X, valid_y, random_state=1)
    test_X, test_y = shuffle(test_X, test_y, random_state=2)

    return train_X, train_y, valid_X, valid_y, test_X, test_y
Example #9
def splitIntoTrainingValidation(A, B):  # TODO: 3rd parameter: the desired value of (validatSet1size + validatSet2size)
	data1 = shuffle(sourceSets[A])    # Note this is a random shuffle, that's
	data2 = shuffle(sourceSets[B])    #                                   why we need many iterations
	freq1 = np.minimum(freqs[A], freqs[B])
	if sum(freq1) > maxTrainSetSz:  freq1 = np.round(freq1 * (maxTrainSetSz * 1.0 / sum(freq1)))
	trainingSetSize = int(sum(freq1))  # Half size actually.  Approximately <= maxTrainSetSz
	validatSet1size = len(data1) - trainingSetSize
	validatSet2size = len(data2) - trainingSetSize
	X  = np.zeros((trainingSetSize*2,               numFeatures))
	Xv = np.zeros((validatSet1size+validatSet2size, numFeatures))
	y  = np.ravel([([0]*trainingSetSize) + ([1]*trainingSetSize)])
	yv = np.ravel([([0]*validatSet1size) + ([1]*validatSet2size)])
	freq2  = np.copy(freq1)
	trnIdx = valIdx = 0
	for item in data1:
		year = item[0]
		if freq1[year] > 0:
			freq1[year] -= 1
			X[trnIdx] = item[1:]
			trnIdx += 1
		else:
			Xv[valIdx] = item[1:]
			valIdx += 1
	assert trnIdx==trainingSetSize and valIdx==validatSet1size
	for item in data2:
		year = item[0]
		if freq2[year] > 0:
			freq2[year] -= 1
			X[trnIdx] = item[1:]
			trnIdx += 1
		else:
			Xv[valIdx] = item[1:]
			valIdx += 1
	assert trnIdx==trainingSetSize*2 and valIdx==validatSet1size+validatSet2size
	return X, y, Xv, yv, validatSet1size, validatSet2size
def main():
    train, test, word2idx = get_ptb_data()

    for t in train:
        add_idx_to_tree(t, 0)
    train = [tree2list(t, -1, is_binary=True) for t in train]
    train = [t for t in train if t[3][-1] >= 0] # for filtering binary labels

    for t in test:
        add_idx_to_tree(t, 0)
    test = [tree2list(t, -1, is_binary=True) for t in test]
    test = [t for t in test if t[3][-1] >= 0] # for filtering binary labels

    train = shuffle(train)
    train = train[:1000]
    test = shuffle(test)
    test = test[:500]

    V = len(word2idx)
    print "vocab size:", V
    D = 80
    K = 5

    model = RecursiveNN(V, D, K)
    model.fit(train, reg=0, activation=T.nnet.relu)
    print "train accuracy:", model.score(train)
    print "test accuracy:", model.score(test)
    def compute_distances_and_pairs(self, pdb_file, nr_contacts=None, nr_noncontacts=None):
        #distance and contacts
        self.features['pair']['Cbdist'] = pdb.distance_map(pdb_file, self.L)

        #mask positions that have too many gaps
        gap_freq = 1 - (self.Ni / self.neff)
        highly_gapped_pos = np.where(gap_freq > self.max_gap_percentage)[0]
        self.features['pair']['Cbdist'][:,highly_gapped_pos] = np.nan
        self.features['pair']['Cbdist'][highly_gapped_pos, :] = np.nan

        #if there are unresolved residues, there will be nan in the distance_map
        with np.errstate(invalid='ignore'):
            self.features['pair']['contact'] = (self.features['pair']['Cbdist'] <= self.contact_threshold) * 1
            self.features['pair']['nocontact'] = (self.features['pair']['Cbdist'] > self.non_contact_threshold) * 1

        indices_contact = np.where(np.triu(self.features['pair']['contact'], k=self.seq_separation))
        indices_contact = tuple(shuffle(indices_contact[0],indices_contact[1], random_state=0))
        if nr_contacts:
            indices_contact = indices_contact[0][:nr_contacts], indices_contact[1][:nr_contacts]

        indices_nocontact = np.where(np.triu(self.features['pair']['nocontact'], k=self.seq_separation))
        indices_nocontact = tuple(shuffle(indices_nocontact[0],indices_nocontact[1], random_state=0))
        if nr_noncontacts:
            indices_nocontact = indices_nocontact[0][:nr_noncontacts], indices_nocontact[1][:nr_noncontacts]


        #update indices of i<j for only relevant pairs
        self.ij_ind_upper = np.array(list(indices_contact[0]) + list(indices_nocontact[0])), np.array(list(indices_contact[1]) + list(indices_nocontact[1]))
def cluster(m, n_colors=32):
    from sklearn.utils import shuffle
    from sklearn.cluster import KMeans
    from sklearn.metrics import pairwise_distances_argmin

    def recreate_image(codebook, labels, w, h):
        """Recreate the (compressed) image from the code book & labels"""
        d = codebook.shape[1]
        image = np.zeros((w, h, d))
        label_idx = 0
        for i in range(w):
            for j in range(h):
                image[i][j] = codebook[labels[label_idx]]
                label_idx += 1
        return image

    # Load Image and transform to a 2D numpy array.
    w, h, d = original_shape = tuple(m.shape)
    image_array = np.reshape(m, (w * h, d))
    image_array_sample = shuffle(image_array, random_state=0)[:1000]
    kmeans = KMeans(n_clusters=n_colors).fit(image_array_sample)

    # assign every pixel to its nearest fitted k-means centre and rebuild the image from that codebook
    labels = pairwise_distances_argmin(kmeans.cluster_centers_, image_array, axis=0)

    return recreate_image(kmeans.cluster_centers_, labels, w, h)
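
A short sketch of calling cluster on an RGB image array; the file name is hypothetical and the image is assumed to load as a (width, height, 3) float array:

# Hypothetical usage sketch for cluster (file path is illustrative)
import numpy as np
import matplotlib.image as mpimg

m = mpimg.imread('photo.png')[:, :, :3].astype(np.float64)
quantized = cluster(m, n_colors=16)  # quantized image rebuilt from a small colour codebook
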
Example #13
def get_aa_cross_val(L, X, Y, AA, tsize=None, rstate=-1):
    """Get test data from dataset"""
    test_position = []
    aa_y = np.zeros(Y.shape)
    for i in xrange(len(Y)):
        if L[i][-1] == AA:
            aa_y[i] = 1
            test_position.append(i)

    if tsize:
        t_len = int(tsize * len(Y))
        # positions that are 0 without being the one for AA
        zero_pos = np.where(np.logical_and(Y == 0, aa_y == 0))[0]
        clen = t_len - len(test_position)
        if clen > 0:
            random_zero_pos = np.random.choice(zero_pos, clen, replace=False)
            test_position.extend(random_zero_pos)

    test_position = np.random.permutation(test_position)
    mask = np.ones(Y.shape, dtype=bool)
    mask[test_position] = False
    train_position = np.array(range(len(mask)))[mask]

    if rstate > 0:
        return shuffle(train_position, random_state=rstate), shuffle(test_position, random_state=rstate)
    # in this case, suppose we want only the train and test index
    else:
        return train_position, test_position
def load_whale_data(train_file, test_file, nb_classes=447):
    print("loading whale data")

    # normalize train data
    print("--> loading training data")
    train_data = read_csv(train_file)
    X_train = train_data[:, 1:]
    X_train = X_train.astype(np.float32)
    X_train = X_train / 255

    y_train = np.vstack(train_data[:, 0])
    y_train = y_train.astype(np.uint16)

    X_train, y_train = shuffle(X_train, y_train, random_state=42)
    X_train = X_train.reshape(-1, 1, 96, 96)
    Y_train = np_utils.to_categorical(y_train, 447)
    print("--> training data loaded")

    # normalize test data
    print("--> loading test data")
    test_data = read_csv(test_file)
    X_test = test_data[:, 1:]
    X_test = X_test.astype(np.float32)
    X_test = X_test / 255

    y_test = np.vstack(test_data[:, 0])
    y_test = y_test.astype(np.uint16)

    X_test, y_test = shuffle(X_test, y_test, random_state=42)
    X_test = X_test.reshape(-1, 1, 96, 96)
    Y_test = np_utils.to_categorical(y_test, 447)
    print("--> test data loaded")

    return (X_train, Y_train, X_test, Y_test)
def main(is_binary=True):
    train, test, word2idx = get_ptb_data()

    for t in train:
        add_idx_to_tree(t, 0)
    train = [tree2list(t, -1, is_binary) for t in train]
    if is_binary:
        train = [t for t in train if t[3][-1] >= 0] # for filtering binary labels

    for t in test:
        add_idx_to_tree(t, 0)
    test = [tree2list(t, -1, is_binary) for t in test]
    if is_binary:
        test = [t for t in test if t[3][-1] >= 0] # for filtering binary labels

    train = shuffle(train)
    train = train[:5000]
    # n_pos = sum(t[3][-1] for t in train)
    # print "n_pos train:", n_pos
    test = shuffle(test)
    test = test[:1000]
    # n_pos = sum(t[3][-1] for t in test)
    # print "n_pos test:", n_pos

    V = len(word2idx)
    print "vocab size:", V
    D = 20
    K = 2 if is_binary else 5

    model = RecursiveNN(V, D, K)
    model.fit(train)
    print "train accuracy:", model.score(train)
    print "test accuracy:", model.score(test)
    print "train f1:", model.f1_score(train)
    print "test f1:", model.f1_score(test)
Example #16
def import_images():
	# IMPLEMENT TIMER CUTOFF FOR WHEN FEATURE EXTRACTION TAKES TOO LONG
	d_feats = {'orb': []}
	c_feats = {'orb': []}
	(cat_paths, dog_paths) = get_filenames(TRAINING_FOLDER)
	cat_train_pts = []
	dog_train_pts = []
	for image_fn in shuffle(dog_paths, n_samples = 400, random_state=0):
		odesc_pts = extract_desc_pts(image_fn)
		try:
			for pt in odesc_pts:
				d_feats['orb'].append(pt)
		except TypeError:
			print image_fn
			continue
	for image_fn in shuffle(cat_paths, n_samples = 400, random_state=0):
		odesc_pts = extract_desc_pts(image_fn)
		try:
			for pt in odesc_pts:
				c_feats['orb'].append(pt)
		except TypeError:
			print image_fn
			continue
	cat_k_means = KMeans(n_jobs=-1, n_clusters=200)
	cat_k_means.fit(c_feats['orb'])
	print 'dog calc'
	dog_k_means = KMeans(n_jobs=-1, n_clusters=200)
	dog_k_means.fit(d_feats['orb'])
	print 'saving....'
	with open('/home/max/CVD/d_o200c200s400.pickle', 'wb') as handle:
		pickle.dump(dog_k_means.cluster_centers_, handle)
	with open('/home/max/CVD/c_o200c200s400.pickle', 'wb') as handle:
		pickle.dump(cat_k_means.cluster_centers_, handle)
	return '\n\n\n DONE   '	
Example #17
def generator3(samples, batch_size=32):
    num_samples = len(samples)
    
    while 1: # Loop forever so the generator never terminates
        samples = shuffle(samples)  # sklearn's shuffle returns a copy rather than shuffling in place
        for offset in range(0, num_samples, batch_size):
            batch_samples = samples[offset:offset+batch_size]

            car_images = []
            steering_angles = []
            for batch_sample in batch_samples:
                img_center = cv2.imread(path+batch_sample[0].split('\\')[-1])
                img_left   = cv2.imread(path+batch_sample[1].split('\\')[-1])
                img_right  = cv2.imread(path+batch_sample[2].split('\\')[-1])
                
                correction = 0.3 # this is a parameter to tune
                steering_center = float(batch_sample[3])
                steering_left   = steering_center + correction
                steering_right  = steering_center - correction
                
                # add images and angles to data set
                car_images.extend([img_center, img_left, img_right])
                steering_angles.extend([steering_center, steering_left, steering_right])
                
            # trim image to only see section with road
            X_train = np.array(car_images)
            y_train = np.array(steering_angles)
            yield shuffle(X_train, y_train)
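
And a brief sketch of how generator3 might be plugged into Keras, assuming each row of samples holds the three camera paths plus the steering angle and model is already compiled (all names illustrative):

# Hypothetical usage sketch for generator3 (train_samples/validation_samples are illustrative names)
train_gen = generator3(train_samples, batch_size=32)
valid_gen = generator3(validation_samples, batch_size=32)
model.fit_generator(train_gen,
                    steps_per_epoch=len(train_samples) // 32,
                    validation_data=valid_gen,
                    validation_steps=len(validation_samples) // 32,
                    epochs=3)
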
 def _subsample_data(self, X, Y, n=10000):
   if Y is not None:
     X, Y = shuffle(X, Y)
     return X[:n], Y[:n]
   else:
     X = shuffle(X)
     return X[:n]
Example #19
def run_kmeans(inFile,  n_colors):
	china = cv2.imread(inFile)
	china = np.array(china, dtype=np.float64) / 255
	w, h, d = original_shape = tuple(china.shape)
	assert d == 3
	image_array = np.reshape(china, (w * h, d))
	
	print("\tFitting model on a small sub-sample of the data")
	t0 = time()
	image_array_sample = shuffle(image_array, random_state=0)[:1000]
	kmeans = KMeans(n_clusters=n_colors, random_state=0).fit(image_array_sample)
	print("\tdone in %0.3fs." % (time() - t0))
	
	# Get labels for all points
	print("\tPredicting color indices on the full image (k-means)")
	t0 = time()
	labels = kmeans.predict(image_array)
	print("\tdone in %0.3fs." % (time() - t0))
	
	codebook_random = shuffle(image_array, random_state=0)[:n_colors + 1]
	print("\tPredicting color indices on the full image (random)")
	t0 = time()
	dist = euclidean_distances(codebook_random, image_array, squared=True)
	labels_random = dist.argmin(axis=0)
	print("\tdone in %0.3fs." % (time() - t0))

	img_kmeans = recreate_image(kmeans.cluster_centers_, labels, w, h)
	img_random = recreate_image(codebook_random, labels_random, w, h)
	return china, img_kmeans, img_random
Example #20
def getMNIST():
    # data shape: train (50000, 784), test (10000, 784)
    # already scaled from 0..1 and converted to float32
    datadir = '../large_files/'
    if not os.path.exists(datadir):
        datadir = ''

    input_file = "%smnist.pkl.gz" % datadir
    if not os.path.exists(input_file):
        url = 'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
        with open(input_file, "wb") as out:
            f = urllib2.urlopen(url)
            out.write(f.read())
            out.flush()

    with gzip.open(input_file) as f:
        train, valid, test = cPickle.load(f)

    Xtrain, Ytrain = train
    Xvalid, Yvalid = valid
    Xtest, Ytest = test

    Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
    Xtest, Ytest = shuffle(Xtest, Ytest)

    # try to take a smaller sample
    Xtrain = Xtrain[0:30000]
    Ytrain = Ytrain[0:30000]
    Xtest = Xtest[0:1000]
    Ytest = Ytest[0:1000]

    # build the indicator matrices after shuffling/subsampling so they stay aligned with the returned labels
    Ytrain_ind = y2indicator(Ytrain)
    Ytest_ind = y2indicator(Ytest)

    return Xtrain.reshape(len(Xtrain), 1, 28, 28), Ytrain, Ytrain_ind, Xtest.reshape(len(Xtest), 1, 28, 28), Ytest, Ytest_ind
Example #21
 def load_data(self, shuffled=True):
     samples = load_diabetes()
     if shuffled:
         self.X = shuffle(samples.data, random_state=self.SEED)
         self.y = shuffle(samples.target, random_state=self.SEED)
     else:
         self.X, self.y = samples.data, samples.target
     self.n_features = len(self.X[0])
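
Shuffling samples.data and samples.target in two separate calls stays aligned here only because both calls reuse the same random_state and the arrays have equal length; a single joint call (minimal, self-contained sketch below) keeps the rows paired by construction:

# Sketch: one shuffle call keeps features and targets paired
from sklearn.datasets import load_diabetes
from sklearn.utils import shuffle

samples = load_diabetes()
X, y = shuffle(samples.data, samples.target, random_state=0)
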
Example #22
def test_shuffle_on_ndim_equals_three():
    def to_tuple(A):    # to make the inner arrays hashable
        return tuple(tuple(tuple(C) for C in B) for B in A)

    A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])  # A.shape = (2,2,2)
    S = set(to_tuple(A))
    shuffle(A)  # shouldn't raise a ValueError for dim = 3
    assert_equal(set(to_tuple(A)), S)
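
Worth noting when reading this test: sklearn.utils.shuffle returns shuffled copies rather than permuting its argument in place, so A itself is left untouched and the set comparison holds; a tiny sketch of that copy semantics:

# Sketch: sklearn.utils.shuffle is not in-place
import numpy as np
from sklearn.utils import shuffle

a = np.arange(5)
b = shuffle(a, random_state=0)
assert (a == np.arange(5)).all()              # original order is unchanged
assert sorted(b.tolist()) == [0, 1, 2, 3, 4]  # result is a permuted copy
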
Example #23
 def load_binary_data(self, shuffled=True):
     samples = load_breast_cancer()
     if shuffled:
         self.X = shuffle(samples.data, random_state=self.SEED)
         self.y = shuffle(samples.target, random_state=self.SEED)
     else:
         self.X, self.y = samples.data, samples.target
     self.n_features = len(self.X[0])
Example #24
def player_status_train_test(player_statuses):
    """Make a train-test split"""

    # usak test games chosen by calling:
    # np.random.RandomState(0).choice(good_games, 20)
    # and taking the first 6 games that are not incomplete (see wiki)

    test_games = ['ns000078', 'ns000081', 'cavalry', 'showcase01', 'malafide',
                  'nexxice']
    test_games = ['usak-{}'.format(k) for k in test_games]


    # add more test games 
    # 50 test games from usdp such that they have at least 500 talk entries

    test_games += [u'usdp-anzac2011_potts', u'usdp-aloha2', u'usdp-service13',
u'usdp-vole_003', u'usdp-service14', u'usdp-owlsopen2011_1a',
u'usdp-owlsopen10_3f', u'usdp-echo7', u'usdp-owls_256', u'usdp-owls_246',
u'usdp-timgroup1', u'usdp-owlsopen2011_2f', u'usdp-agitar10', u'usdp-owls_242',
u'usdp-vole_001', u'usdp-tango', u'usdp-leoxiii', u'usdp-owlsopen2011_2g',
u'usdp-vole_025', u'usdp-vole_006', u'usdp-310', u'usdp-owlsopen2011_1c',
u'usdp-skullhouse11', u'usdp-vole_004', u'usdp-chess_match',
u'usdp-anzac2011_claw', u'usdp-service16', u'usdp-wetterling',
u'usdp-owlsopen2011_3c', u'usdp-inthedark1', u'usdp-owls_261',
u'usdp-owlsopen2011_3g', u'usdp-ltb2', u'usdp-owlsopen10_3h', u'usdp-vanilla1',
u'usdp-owlsopen2011_1g', u'usdp-vole_002', u'usdp-warzones1', u'usdp-vole_012',
u'usdp-benjgame', u'usdp-owlsopen2011_3e', u'usdp-power_struggle7',
u'usdp-owlsopen2011_3h', u'usdp-owlsopen2011_1d', u'usdp-vole_008',
u'usdp-owlsopen2011_2h', u'usdp-spartan01', u'usdp-rainier', u'usdp-owls_252',
u'usdp-owls_245'] 

    # filter out short instances
    print("Before filtering: n_instances=", len(player_statuses))
    THRESHOLD = 5  # at least 5 sent and 5 received messages
    player_statuses = [p for p in player_statuses
                    if sum(msg['direction'] == 'from'
                           for msg in p['talk']) >= THRESHOLD
                    and sum(msg['direction'] == 'to'
                            for msg in p['talk']) >= THRESHOLD]
    print("After filtering: n_instances=", len(player_statuses))
    train_statuses = [_clean(p) for p in player_statuses
                      if p['game'] not in test_games]
    test_statuses = [_clean(p) for p in player_statuses
                     if p['game'] in test_games]
    print("Train: {}, test: {}".format(len(train_statuses), len(test_statuses)))
    print("Test label distribution: ",
          Counter(row['status'] for row in test_statuses))

    train_statuses = np.array(train_statuses)
    test_statuses = np.array(test_statuses)

    train_statuses = shuffle(train_statuses, random_state=0)
    test_statuses = shuffle(test_statuses, random_state=0)

    y_train = np.array([p['status'] for p in train_statuses])
    y_test = np.array([p['status'] for p in test_statuses])

    return train_statuses, y_train, test_statuses, y_test
def main(is_binary=True):
    train, test, word2idx = get_ptb_data()

    for t in train:
        add_idx_to_tree(t, 0)
    train = [tree2list(t, -1, is_binary) for t in train]
    if is_binary:
        train = [t for t in train if t[3][-1] >= 0] # for filtering binary labels

    for t in test:
        add_idx_to_tree(t, 0)
    test = [tree2list(t, -1, is_binary) for t in test]
    if is_binary:
        test = [t for t in test if t[3][-1] >= 0] # for filtering binary labels

    # check imbalance
    # pos = 0
    # neg = 0
    # mid = 0
    # label_counts = np.zeros(5)
    # for t in train + test:
    #     words, left_child, right_child, labels = t
    #     # for l in labels:
    #     #     if l == 0:
    #     #         neg += 1
    #     #     elif l == 1:
    #     #         pos += 1
    #     #     else:
    #     #         mid += 1
    #     for l in labels:
    #         label_counts[l] += 1
    # # print("pos / total:", float(pos) / (pos + neg + mid))
    # # print("mid / total:", float(mid) / (pos + neg + mid))
    # # print("neg / total:", float(neg) / (pos + neg + mid))
    # print("label proportions:", label_counts / label_counts.sum())
    # exit()


    train = shuffle(train)
    # train = train[:5000]
    # n_pos = sum(t[3][-1] for t in train)
    # print("n_pos train:", n_pos)
    test = shuffle(test)
    smalltest = test[:1000]
    # n_pos = sum(t[3][-1] for t in test)
    # print("n_pos test:", n_pos)

    V = len(word2idx)
    print("vocab size:", V)
    D = 20
    K = 2 if is_binary else 5

    model = RecursiveNN(V, D, K)
    model.fit(train, smalltest, epochs=20, train_inner_nodes=True)
    print("train accuracy:", model.score(train))
    print("test accuracy:", model.score(test))
    print("train f1:", model.f1_score(train))
    print("test f1:", model.f1_score(test))
Example #26
def build_classification(with_preprocessor=False):
  """Basic array for testing when using a preprocessor"""
  X, y = shuffle(*make_blobs(random_state=SEED),
                 random_state=SEED)
  indices = shuffle(np.arange(X.shape[0]), random_state=SEED).astype(int)
  if with_preprocessor:
    return Dataset(indices, y[indices], X, indices)
  else:
    return Dataset(X[indices], y[indices], None, X[indices])
Example #27
def build_regression(with_preprocessor=False):
  """Basic array for testing when using a preprocessor"""
  X, y = shuffle(*make_regression(n_samples=100, n_features=5,
                                  random_state=SEED),
                 random_state=SEED)
  indices = shuffle(np.arange(X.shape[0]), random_state=SEED).astype(int)
  if with_preprocessor:
    return Dataset(indices, y[indices], X, indices)
  else:
    return Dataset(X[indices], y[indices], None, X[indices])
Example #28
def get_data():
    """
    Get data ready to learn with.

    Returns
    -------
    dict
    """
    simple = False
    if simple:  # Load the simple, but similar digits dataset
        from sklearn.datasets import load_digits
        from sklearn.utils import shuffle
        digits = load_digits()
        x = [np.array(el).flatten() for el in digits.images]
        y = digits.target

        # Scale data to [-1, 1] - This is of major importance!!!
        # In this case, I know the range and thus I can (and should) scale
        # manually. However, this might not always be the case.
        # Then try sklearn.preprocessing.MinMaxScaler or
        # sklearn.preprocessing.StandardScaler
        x = x/255.0*2 - 1

        x, y = shuffle(x, y, random_state=0)

        from sklearn.cross_validation import train_test_split
        x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                            test_size=0.33,
                                                            random_state=42)
        data = {'train': {'X': x_train,
                          'y': y_train},
                'test': {'X': x_test,
                         'y': y_test}}
    else:  # Load the original dataset
        from sklearn.datasets import fetch_mldata
        from sklearn.utils import shuffle
        mnist = fetch_mldata('MNIST original')

        x = mnist.data
        y = mnist.target

        # Scale data to [-1, 1] - This is of major importance!!!
        x = x/255.0*2 - 1

        x, y = shuffle(x, y, random_state=0)

        from sklearn.cross_validation import train_test_split
        x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                            test_size=0.33,
                                                            random_state=42)
        data = {'train': {'X': x_train,
                          'y': y_train},
                'test': {'X': x_test,
                         'y': y_test}}
    return data
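
A tiny usage sketch for get_data, just unpacking the dict structure described in its docstring:

# Sketch: consuming the dict returned by get_data
data = get_data()
x_train, y_train = data['train']['X'], data['train']['y']
x_test, y_test = data['test']['X'], data['test']['y']
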
Example #29
def read_all(files):
    x_files = map(lambda x: get_x_name(config.train_spat_folder, x), files)
    y_files = map(lambda x: get_y_name(config.train_spat_folder, x), files)

    XX_train, Y_train = cast_dataset(read_data((x_files, y_files)))
    XX_train = XX_train.swapaxes(4, 2)
    XX_train, Y_train = shuffle(XX_train, Y_train, random_state=42)

    XX_test, Y_test = shuffle(XX_train, Y_train, random_state=84, n_samples=500)

    return XX_train, Y_train, XX_test, Y_test
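
The n_samples argument used above both shuffles and draws a random subsample, applying the same permutation to every array passed in; a small self-contained sketch:

# Sketch: shuffle with n_samples returns a paired random subsample
import numpy as np
from sklearn.utils import shuffle

X = np.arange(20).reshape(10, 2)
y = np.arange(10)
X_sub, y_sub = shuffle(X, y, random_state=0, n_samples=3)  # 3 rows, X_sub and y_sub stay aligned
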
Example #30
def sq_dict_learning(row_data, mask, D_0 = None, n_filters = 20,  
    eta = 0.001, sparsity = 10, n_epochs = 4, EV_SCORE = True):
    ''' 
    k: Number of dictionary items
    n_theta: Number of orientated realization of the filter
    '''    
    #Shuffle the data
    data = shuffle(row_data).T
    m, n = data.shape
    effective_dim = mask.sum()
    dummy_dim = mask.shape[0]*mask.shape[1]
    dim_ratio = float(dummy_dim)/effective_dim
    

    if D_0 is None:
        D_base = 1-2*np.random.rand(m,n_filters)
        D_base -= np.expand_dims(np.mean(D_base, axis=0), 0)*dim_ratio
        D_base /= np.linalg.norm(D_base,axis=0)
        D_t = D_base
    else:
        D_t = D_0   

    losses = []
    for epoch in range(n_epochs):
        
        for t in range(n):
            x_t = data[:,t]
               
            # Sparse Coding   
            idx_t, alphas_t = omp(D_t, x_t, sparsity)
            
            
            # Dictionary Update
            ##Rotation update
            d_t = D_t[:,idx_t]
            eta_prime = eta*m
            y_t = np.dot(d_t,alphas_t)
            y_t /= np.linalg.norm(y_t,axis=0)
            lmbd = np.sqrt(1-(np.dot(y_t, x_t))**2)
            half_S = np.dot(np.expand_dims(x_t,1), np.expand_dims(y_t,0))
            S = half_S - half_S.T
            update = np.identity(m) + np.sin(2 * eta_prime * lmbd)/lmbd * S + (1 - np.cos(2 * eta_prime * lmbd))/lmbd**2 * np.dot(S,S)      
            D_t[:,idx_t] = np.dot(update, d_t)
            

            D_t -= np.expand_dims(np.mean(D_t, axis=0), 0)*dim_ratio
            D_t /= np.expand_dims(np.linalg.norm(D_t, axis=0), axis=0)
        
                    
            if EV_SCORE and (t%500 == 0):
                loss = score_dict(data, D_t, sparsity )
                losses.append(loss)
        data = shuffle(data.T).T
    return D_t, losses    
Example #31
    tf.abs(tf.subtract(y_conv, tf.cast(tf.argmax(y_, 1), "float32"))))
cross_entropy = -tf.reduce_sum(
    diss * tf.log(tf.cast(y_conv, dtype=tf.float32) +
                  (1e-7))) + sum_acc + tf.reduce_sum(tf.abs(WB_fc1))

train_step = tf.train.AdamOptimizer(Optrate).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

start = time.time()

# run minibatches
for i in range(EPOCHS):
    X_train, y_train = shuffle(X['train'], y['train'])
    XL_train, yL_train = shuffle(X['train'], y['train'])

    # determine whether the digit is 0
    dummy_0 = np.asarray([1.0, 0.0] * 2)
    dummy_0 = dummy_0.reshape(2, 2)
    img_0_dummy = np.asarray(list(rep_0_img) * 2)
    img_0_dummy = img_0_dummy.reshape(2, 784)
    distance_labels = [np.sum(x) for x in dummy_0]

    eval_acc = sess.run(y_conv,
                        feed_dict={
                            x: X_train,
                            y_: y_train,
                            keep_prob: 1,
                            xL: img_0_dummy,
Example #32
    def fit(self,
            X,
            Y,
            Xvalid,
            Yvalid,
            learning_rate=1e-4,
            mu=0.9,
            decay=0.9,
            epochs=8,
            batch_sz=100,
            show_fig=False):
        '''
        Takes training data and test data (valid) at once, then trains and
        validates along the way. Modifying hyperparams of learning_rate, mu,
        decay, epochs (iterations = N//batch_sz * epochs), batch_sz and whether
        to display a figure are passed as optional variables.
        '''
        X = X.astype(np.float32)
        Y = Y.astype(np.int32)
        Xvalid = Xvalid.astype(np.float32)
        Yvalid = Yvalid.astype(np.int32)

        self.rng = RandomStreams()

        # initialize hidden layers
        N, D = X.shape
        K = len(set(Y))
        self.hidden_layers = []
        M1 = D  # first input layer is the number of features in X
        count = 0
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)  # layer ID is just the number
            self.hidden_layers.append(h)
            M1 = M2  # input layer to next layer is this layer.
            count += 1
        # output layer weights (last hidden layer to K output classes)
        W = np.random.randn(M1, K) * np.sqrt(2.0 / M1)
        b = np.zeros(K)
        self.W = theano.shared(W, 'W_logreg')
        self.b = theano.shared(b, 'b_logreg')

        # collect params for later use
        self.params = [self.W, self.b]
        for h in self.hidden_layers:
            self.params += h.params

        # set up theano functions and variables
        thX = T.matrix('X')
        thY = T.ivector('Y')
        pY_train = self.forward_train(thX)  # function to calc prob Y given X

        # this cost is for training
        cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY]))

        # gradients wrt each param
        grads = T.grad(cost, self.params)

        # for momentum
        '''
        np.zeros_like(array) returns an array(/matrix) of the same shape and
        type of the given array. Very cool, never seen this before.
        '''
        dparams = [
            theano.shared(np.zeros_like(p.get_value())) for p in self.params
        ]

        # for rmsprop, initialize cache as 1
        cache = [
            theano.shared(np.ones_like(p.get_value())) for p in self.params
        ]
        '''
        Noting for myself that I've never seen this way of using zip to loop
        through multiple lists/arrays with the same indices simultaneously.
        Makes a lot of sense now; I should see where I can use this to turn
        loops over indices in my code into element-wise list comprehensions.
        '''
        # these are the functions for updating the variables of
        # dparams (momentum) and cache.
        new_cache = [
            decay * c + (1 - decay) * g * g
            for p, c, g in zip(self.params, cache, grads)
        ]
        new_dparams = [
            mu * dp - learning_rate * g / T.sqrt(new_c + 1e-10)
            for p, new_c, dp, g in zip(self.params, new_cache, dparams, grads)
        ]
        '''
        Using zip to create lists of tuples of the variables themselves, and
        the functions for updating them (cache, momentum params and params),
        where params are weights (W) and biases (b) for each layer.
        '''
        updates = [(c, new_c) for c, new_c in zip(cache, new_cache)] + [
            (dp, new_dp) for dp, new_dp in zip(dparams, new_dparams)
        ] + [(p, p + new_dp) for p, new_dp in zip(self.params, new_dparams)]

        train_op = theano.function(inputs=[thX, thY], updates=updates)

        # for evaluation and prediction, more theano graph set-up with tensors
        # still no values yet in any of these. Training loop next!
        pY_predict = self.forward_predict(thX)
        cost_predict = -T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY]))
        prediction = self.predict(thX)
        cost_predict_op = theano.function(inputs=[thX, thY],
                                          outputs=[cost_predict, prediction])

        n_batches = N // batch_sz
        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
                Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

                # theano function defined above that does all the work.
                # takes the data (like feed_dict in tf). The update calcs were
                # given to it above as a list for all layers.
                train_op(Xbatch, Ybatch)

                if j % 50 == 0:
                    c, p = cost_predict_op(Xvalid, Yvalid)
                    costs.append(c)
                    e = error_rate(Yvalid, p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c,
                          "error rate:", e)

        if show_fig:
            plt.plot(costs)
            plt.show()
Example #33
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.contrib.keras.python.keras.models import Sequential
from tensorflow.contrib.keras.python.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, SpatialDropout2D
from tensorflow.contrib.keras.python.keras.utils import plot_model
from tensorflow.contrib.keras.python.keras.preprocessing.image import ImageDataGenerator
from tensorflow.contrib.keras.python.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

print("Loading data...")

collection = np.load('collection.npy')
labels_onehot = np.load('labels_onehot.npy')
collection, labels_onehot = shuffle(collection, labels_onehot)

x_train_full, x_valid, y_train_full, y_valid = train_test_split(collection,
                                                                labels_onehot,
                                                                test_size=0.2)
x_train, x_test, y_train, y_test = train_test_split(x_train_full,
                                                    y_train_full,
                                                    test_size=0.25)

np.save('x_test.npy', x_test)
np.save('y_test.npy', y_test)

print("Making model...")

model = Sequential()
model.add(
    Conv2D(filters=32,
           kernel_size=(3, 3),
Example #34
num_of_samples = img_data.shape[0]
labels = np.ones((num_of_samples,),dtype='int64')

labels[0:202]=0
labels[202:404]=1
labels[404:606]=2
labels[606:]=3
	  
names = ['cats','dogs','horses','humans']
	  
# convert class labels to one-hot encoding
Y = np_utils.to_categorical(labels, num_classes)

#Shuffle the dataset
x,y = shuffle(img_data,Y, random_state=2)
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

#%%
# Defining the model
input_shape=img_data[0].shape
					
model = Sequential()

model.add(Conv2D(32, (3,3),border_mode='same',input_shape=input_shape))
model.add(Activation('relu'))
model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.5))
Example #35
# === Manual Back ====

# sess
with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())

    train_cota, train_acca = 0, 0
    train_cot, train_acc = [], []

    test_cota, test_acca = 0, 0
    test_cot, test_acc = [], []

    for iter in range(num_epoch):

        train_batch, train_label = shuffle(train_batch, train_label)

        for batch_size_index in range(0, len(train_batch), (batch_size // 2)):
            current_batch = train_batch[batch_size_index:batch_size_index +
                                        (batch_size // 2)]
            current_batch_label = train_label[
                batch_size_index:batch_size_index + (batch_size // 2)]

            # online data augmentation here and standard normalization
            images_aug = seq.augment_images(current_batch.astype(np.float32))
            current_batch = np.vstack(
                (current_batch, images_aug)).astype(np.float32)
            current_batch_label = np.vstack(
                (current_batch_label, current_batch_label)).astype(np.float32)
            current_batch, current_batch_label = shuffle(
                current_batch, current_batch_label)
Example #36
    return res


for x in testText.split("."):
    if "----" in x:
        # for y in
        m = countFreq("----", x)
        x.replace("----", "")
        x = cleanData(x)
        # if re.search('[a-zA-Z]', x):
        # print (m)
        for l in range(int(m / 2)):
            testData.append(x)

# print (testData)
data, labels = shuffle(data, labels, random_state=0)

count_vect = CountVectorizer(ngram_range=(1, 1), max_df=0.1)
#print(count_vect)
tfidf_transformer = TfidfTransformer(use_idf=True)
X_train_counts = count_vect.fit_transform(data[:1050])
#print(X_train_counts)
# X_train_counts2 = count_vect.transform(data[5000:])
testData = count_vect.transform(testData)
#print(testData)

# print (X_train_counts.shape ,X_train_counts2.shape )

# # X_train_counts = vstack([X_train_counts, X_train_counts2]).toarray()
# x1 = X_train_counts.toarray().tolist()
# x2 = X_train_counts2.toarray().tolist()
Example #37
def classifier(model, emb_mean, emb_std, embeddings_index):
    train = pd.read_csv('./input/TIL_NLP_train1_dataset.csv')
    test = pd.read_csv('./input/TIL_NLP_unseen_dataset.csv')
    print('running classifier')

    max_features = 4248
    print(max_features)
    maxlen = 200
    embed_size = 100
    train = shuffle(train)
    X_train = train["word_representation"].fillna("fillna").values
    y_train = train[[
        "outwear", "top", "trousers", "women dresses", "women skirts"
    ]].values
    X_test = test["word_representation"].fillna("fillna").values
    y_test = test[[
        "outwear", "top", "trousers", "women dresses", "women skirts"
    ]].values
    y_test = y_test.tolist()
    print('preprocessing start')
    tokenizer = text.Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(X_train) + list(X_test))
    X_train = tokenizer.texts_to_sequences(X_train)
    X_test = tokenizer.texts_to_sequences(X_test)

    x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
    x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

    del X_train, X_test, train, test
    gc.collect()

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std,
                                        (nb_words, embed_size))

    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i - 1] = embedding_vector

    print('preprocessing done')

    # session_conf = tf.ConfigProto(intra_op_parallelism_threads=4, inter_op_parallelism_threads=4)
    # K.set_session(tf.Session(graph=tf.get_default_graph(), config=session_conf))

    #model
    #wrote out all the blocks instead of looping for simplicity

    filter_nr = 64
    filter_size = 3
    max_pool_size = 3
    max_pool_strides = 2
    dense_nr = 256
    spatial_dropout = 0.2
    dense_dropout = 0.5
    train_embed = False
    conv_kern_reg = regularizers.l2(0.00001)
    conv_bias_reg = regularizers.l2(0.00001)

    comment = Input(shape=(maxlen, ))
    emb_comment = Embedding(max_features,
                            embed_size,
                            weights=[embedding_matrix],
                            trainable=train_embed)(comment)
    block1 = Bidirectional(LSTM(embed_size))(emb_comment)
    block1 = Dense(embed_size, activation='linear')(block1)
    output = Dense(5, activation='sigmoid')(block1)
    """
    emb_comment = SpatialDropout1D(spatial_dropout)(emb_comment)

    block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(emb_comment)
    block1 = BatchNormalization()(block1)
    block1 = PReLU()(block1)
    block1 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block1)
    block1 = BatchNormalization()(block1)
    block1 = PReLU()(block1)

    #we pass embedded comment through conv1d with filter size 1 because it needs to have the same shape as block output
    #if you choose filter_nr = embed_size (300 in this case) you don't have to do this part and can add emb_comment directly to block1_output
    resize_emb = Conv1D(filter_nr, kernel_size=1, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(emb_comment)
    resize_emb = PReLU()(resize_emb)
        
    block1_output = add([block1, resize_emb])
    block1_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block1_output)

    block2 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block1_output)
    block2 = BatchNormalization()(block2)
    block2 = PReLU()(block2)
    block2 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block2)
    block2 = BatchNormalization()(block2)
    block2 = PReLU()(block2)
        
    block2_output = add([block2, block1_output])
    block2_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block2_output)

    block3 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block2_output)
    block3 = BatchNormalization()(block3)
    block3 = PReLU()(block3)
    block3 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block3)
    block3 = BatchNormalization()(block3)
    block3 = PReLU()(block3)
        
    block3_output = add([block3, block2_output])
    block3_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block3_output)

    block4 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block3_output)
    block4 = BatchNormalization()(block4)
    block4 = PReLU()(block4)
    block4 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block4)
    block4 = BatchNormalization()(block4)
    block4 = PReLU()(block4)

    block4_output = add([block4, block3_output])
    block4_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block4_output)

    block5 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block4_output)
    block5 = BatchNormalization()(block5)
    block5 = PReLU()(block5)
    block5 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block5)
    block5 = BatchNormalization()(block5)
    block5 = PReLU()(block5)

    block5_output = add([block5, block4_output])
    block5_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block5_output)

    block6 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block5_output)
    block6 = BatchNormalization()(block6)
    block6 = PReLU()(block6)
    block6 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block6)
    block6 = BatchNormalization()(block6)
    block6 = PReLU()(block6)

    block6_output = add([block6, block5_output])
    block6_output = MaxPooling1D(pool_size=max_pool_size, strides=max_pool_strides)(block6_output)

    block7 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block6_output)
    block7 = BatchNormalization()(block7)
    block7 = PReLU()(block7)
    block7 = Conv1D(filter_nr, kernel_size=filter_size, padding='same', activation='linear', 
                kernel_regularizer=conv_kern_reg, bias_regularizer=conv_bias_reg)(block7)
    block7 = BatchNormalization()(block7)
    block7 = PReLU()(block7)

    block7_output = add([block7, block6_output])
    output = GlobalMaxPooling1D()(block7_output)

    output = Dense(dense_nr, activation='linear')(output)
    output = BatchNormalization()(output)
    output = PReLU()(output)
    output = Dropout(dense_dropout)(output)
    output = Dense(5, activation='sigmoid')(output)
    
    """
    #model = Model(comment, output)
    # print("Correct model: ", type(model))

    model.compile(loss='binary_crossentropy',
                  optimizer=optimizers.Adam(),
                  metrics=['accuracy'])

    num_folds = 5
    num = 0
    kfold = KFold(n_splits=num_folds, shuffle=True)

    for train, test in kfold.split(x_train, y_train):

        print("Training Fold number: ", num)
        batch_size = 128
        epochs = 20
        lr = callbacks.LearningRateScheduler(schedule)
        ra_val = RocAucEvaluation(validation_data=(x_train[test],
                                                   y_train[test]),
                                  interval=1)
        es = EarlyStopping(monitor='val_loss',
                           verbose=1,
                           patience=5,
                           restore_best_weights=True,
                           mode='min')
        mc = ModelCheckpoint('best_model_rnn.h5',
                             monitor='val_loss',
                             mode='min',
                             verbose=1,
                             save_best_only=True)
        model.fit(x_train[train],
                  y_train[train],
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_train[test], y_train[test]),
                  callbacks=[lr, ra_val, es, mc],
                  verbose=1)
        num += 1

        y_pred = model.predict(x_test)
        y_pred = [[1 if i > 0.5 else 0 for i in r] for r in y_pred]

        accuracy = sum([y_pred[i] == y_test[i]
                        for i in range(len(y_pred))]) / len(y_pred) * 100
        print([y_pred[i] == y_test[i] for i in range(len(y_pred))])
        print(accuracy, "%")
        print(f1(y_pred, y_test))
        """
        submission = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv')
        submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
        submission.to_csv('dpcnn_test_preds.csv', index=False)
        """

    return model
        if os.path.isfile(to_cluster_path):
            return to_cluster_path
        elif os.path.isfile(to_local_path):
            return to_local_path
        else:
            print("No valid file path")
            return "NOPE"


print("Load data...")
total_data_df_path = '/Dedicated/jmichaelson-wdata/mcrichter/HackUiowa2018/NN_behaviour/total_data_df_reduced_no_0_columns.csv'
total_data_df = pd.read_csv(check_file_path(total_data_df_path))

seed = 42
total_data_df_shuffled = shuffle(total_data_df, random_state=seed)

X = total_data_df_shuffled.drop(["intercept", "Score"], axis=1)
y = total_data_df_shuffled[["Score"]]

# Create a minimum and maximum processor object
min_max_scaler = preprocessing.MinMaxScaler()

# Create an object to transform the data to fit minmax processor
y_scaled = min_max_scaler.fit_transform(y)

# Run the normalizer on the dataframe
y_normalized = pd.DataFrame(y_scaled, columns=['Score_normalized'])

(trainX, testX, trainY, testY) = train_test_split(X,
                                                  y_normalized,
    print('Loading no_car features from file...')
    with open(no_car_features_file, 'rb') as f:
        no_car_features = pickle.load(f)
else:
    print('Generating no_car features from file...')
    for name in image_names_no_car:
        image = cv2.imread(name)
        features = generate_hog_features(image)
        no_car_features.append(features)
    with open(no_car_features_file, 'wb') as f:
        pickle.dump(no_car_features, f)
        print('Generated')

y = np.hstack((np.ones(len(car_features)), np.zeros(len(no_car_features))))
X = np.vstack((car_features, no_car_features)).astype(np.float64)
X, y = shuffle(X, y)

#X = X[:2000]
#y = y[:2000]
print(X.shape)
X_scaler = StandardScaler().fit(X)
with open('X_scaler.pkl', 'wb') as f:
    pickle.dump(X_scaler, f)
X = X_scaler.transform(X)
print([X[0]])

rand_state = np.random.randint(0, 100)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=rand_state)
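generate_hog_features is called above but not defined in this excerpt. A minimal sketch of what such a helper might look like, assuming skimage.feature.hog and a grayscale conversion (the actual HOG parameters used in the source are unknown):

import cv2
from skimage.feature import hog

def generate_hog_features(image):
    # hypothetical HOG extractor: grayscale the BGR image, then compute a flat HOG descriptor
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return hog(gray,
               orientations=9,
               pixels_per_cell=(8, 8),
               cells_per_block=(2, 2),
               feature_vector=True)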
    sample_batched = next(iter(sequential))
    image = torch.autograd.Variable(sample_batched['image'].cuda())
    depth = torch.autograd.Variable(sample_batched['depth'].cuda(non_blocking=True))
    if epoch == 0: writer.add_image('Train.1.Image', vutils.make_grid(image.data, nrow=6, normalize=True), epoch)
    if epoch == 0: writer.add_image('Train.2.Depth', colorize(vutils.make_grid(depth.data, nrow=6, normalize=False)), epoch)
    output = DepthNorm( model(image) )
    writer.add_image('Train.3.Ours', colorize(vutils.make_grid(output.data, nrow=6, normalize=False)), epoch)
    writer.add_image('Train.3.Diff', colorize(vutils.make_grid(torch.abs(output-depth).data, nrow=6, normalize=False)), epoch)
    del image
    del depth
    del output
    

traincsv=pd.read_csv('./content/data/diml_outdoor_train.csv')
traincsv = traincsv.values.tolist()
traincsv = shuffle(traincsv, random_state=2)

#display a sample set of image and depth image
depth_dataset = DepthDataset(traincsv=traincsv,root_dir='./content/')
fig = plt.figure()
len(depth_dataset)

model = Model().cpu()
if torch.cuda.device_count() > 1:
  print("Let's use", torch.cuda.device_count(), "GPUs!")
  # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
  model = nn.DataParallel(model)
#load trained model if needed
#model.load_state_dict(torch.load('/workspace/1.pth'))
print('Model created.')
Example #41
0
loss_operation = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate=rate)
training_operation = optimizer.minimize(loss_operation)

correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(one_hot_y, 1))
accuracy_operation = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    num_examples = len(X_train)

    print("Training...")
    print("Validation Accuracy:")
    for i in range(EPOCHS):
        X_train, y_train = shuffle(X_train, y_train)
        for offset in range(0, num_examples, BATCH_SIZE):
            end = offset + BATCH_SIZE
            batch_x, batch_y = X_train[offset:end], y_train[offset:end]
            sess.run(training_operation,
                     feed_dict={
                         x: batch_x,
                         y: batch_y,
                         keep_prob: 1.0
                     })

        validation_accuracy = evaluate(X_validation, y_validation)
        print("#{}".format(i + 1), " {:.3f}".format(validation_accuracy))

    saver.save(sess, './lenet')
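evaluate is used in the loop above but not defined in this excerpt. A sketch of the usual batched accuracy helper for this kind of TensorFlow 1.x setup, assuming the accuracy_operation, x, y, keep_prob and BATCH_SIZE defined above:

def evaluate(X_data, y_data):
    # average accuracy_operation over mini-batches of the given set
    num_examples = len(X_data)
    total_accuracy = 0
    sess = tf.get_default_session()
    for offset in range(0, num_examples, BATCH_SIZE):
        batch_x = X_data[offset:offset + BATCH_SIZE]
        batch_y = y_data[offset:offset + BATCH_SIZE]
        accuracy = sess.run(accuracy_operation,
                            feed_dict={x: batch_x, y: batch_y, keep_prob: 1.0})
        total_accuracy += accuracy * len(batch_x)
    return total_accuracy / num_examples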
Example #42
0
def preprocess():
    global X_train, y_train, X_validation, y_validation, X_test, y_test
    X_train, y_train = shuffle(X_train, y_train)
model.add(Activation('relu'))
model.add(Dropout(0.4))
model.add(BatchNormalization())

model.add(Dense(2))
model.add(Activation('sigmoid'))

model.summary()

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# In[18]:

X_train, Y_train = shuffle(X_train, Y_train)
model_checkpoint = ModelCheckpoint('./Alexnet_brat.hdf5',
                                   monitor='loss',
                                   verbose=1,
                                   save_best_only=True)
reduce_lr = ReduceLROnPlateau(factor=0.5,
                              patience=3,
                              min_lr=0.000001,
                              verbose=1)
callbacks = [reduce_lr, model_checkpoint]
#model.load_weights("./Alexnet_brat.hdf5")
model.fit(X_train,
          Y_train,
          batch_size=32,
          epochs=200,
          verbose=1,
Example #44
0
    'Z3', 'Z4', 'Z5', 'Z6', 'Z7', 'Z8', 'Z9', 'XAVG', 'YAVG', 'ZAVG', 'XPEAK',
    'YPEAK', 'ZPEAK', 'XABSOLDEV', 'YABSOLDEV', 'ZABSOLDEV', 'XSTANDDEV',
    'YSTANDDEV', 'ZSTANDDEV', 'Resultant', 'Class', 'Time'
]

feats = pd.read_csv(args.tx_train, header=None, names=act_headers)
AR = pd.read_csv(args.additional_train, header=None, names=act_headers)
#replacing Upstair and Downstairs with Stairs
AR = AR.replace(to_replace=['Upstairs', 'Downstairs'], value='Stairs')
print(AR['Class'].unique())
'''
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(feats[feats.columns[1:44]], feats[feats.columns[44]])

'''
data = shuffle(feats)
no_of_samples = data.shape[0]
samples_per_fold = no_of_samples // 10  # integer division so the slices below stay integers
Features_10_folds = []
Labels_10_folds = []

for i in range(9):
    data_fold = data[:samples_per_fold]
    data = data[samples_per_fold:]
    features = data_fold[data_fold.columns[1:44]]
    labels = data_fold[data_fold.columns[44]]
    Features_10_folds.append(features)
    Labels_10_folds.append(labels)

#for last fold all remaining
features = data[data.columns[1:44]]
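The manual fold loop above (nine equal slices plus a last fold that absorbs the remainder) can also be written with np.array_split. A sketch under the same assumptions, starting from the shuffled data frame before the loop consumes it:

import numpy as np

# indices of 10 nearly equal folds; the last fold absorbs any remainder
fold_indices = np.array_split(np.arange(len(data)), 10)
Features_10_folds = [data.iloc[idx, 1:44] for idx in fold_indices]
Labels_10_folds = [data.iloc[idx, 44] for idx in fold_indices]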
Example #45
0
    feat2 = construct_feat(nlist2, model)
    feat3 = construct_feat(nlist3, model)
    try:
        feat = np.concatenate([feat1, feat2, feat3])
    except:
        continue
    X.append(feat[0])
    for j in range(1, len(feat)):
        X[-1] = np.concatenate([X[-1], feat[j]])
    Y.append(int(d[i][1]))
    maxi = max(maxi, len(X[-1]))

for i in range(len(X)):
    X[i] = pad(X[i], maxi)

print("constructed feature vectors")

X = np.array(X)
Y = np.array(Y)
X, Y = shuffle(X, Y)
Xtr = X[:5000]
Ytr = Y[:5000]
Xts = X[5000:5500]
Yts = Y[5000:5500]

print("brgining training")
# naive_bayes(Xtr, Ytr, Xts, Yts)
svm(Xtr, Ytr, Xts, Yts)
# log_regression(Xtr, Ytr, Xts, Yts)
# feed_forward_nn(Xtr, Ytr, Xts, Yts)
Example #46
0
    def fit(self,
            X,
            Y,
            learning_rate=10e-5,
            mu=0.9,
            decay=0.99,
            epochs=10,
            batch_sz=100,
            eps=10e-10,
            display_cost=False):
        #learning_rate=10e-7, mu=0.99, decay=0.999, epochs=100, batch_sz=30, l2=0.0, eps=10e-10
        learning_rate = np.float32(learning_rate)
        mu = np.float32(mu)
        decay = np.float32(decay)
        eps = np.float32(eps)
        '''
		In Theano we can't actually 'drop' the nodes;
		that would result in a different computational graph.
		Instead, we multiply node outputs by 1s and 0s;
		for each layer we need to create a 'mask' - an array of 0s and 1s.
		Theano graph nodes don't have values, so we can't multiply them by a numpy array 'mask';
		instead we want Theano to generate random values every time it's called,
		so we create an instance of a RandomStreams object:
		'''
        self.rng = RandomStreams()

        # first, make a validation set:
        X, Y = shuffle(X, Y)
        X = X.astype(np.float32)
        Y = Y.astype(np.int32)
        Xvalid, Yvalid = X[-1000:, :], Y[-1000:]
        X, Y = X[:-1000, :], Y[:-1000]

        #initialize the hidden layers:
        N, D = X.shape
        K = len(set(Y))
        self.hidden_layers = []

        # the size of the first dimension of the first matrix:
        M1 = D
        count = 0  # for the id of the weights/biases
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2  # update the first dimension size for the next iteration
            count += 1

        # for the last weight/bias matrix (vector):
        W, b = init_weight_and_bias(M1, K)
        self.W = theano.shared(W, 'W%s' % count)
        self.b = theano.shared(b, 'b%s' % count)

        # collect all the parameters we are going to use during Gradient Descent:
        self.parameters = [self.W, self.b]
        for h in self.hidden_layers[::-1]:
            self.parameters += h.params

        # in order to use Momentum,
        # we are to keep track of all the changes (dW's and db's):
        dparams = [
            theano.shared(np.zeros_like(p.get_value(), dtype=np.float32))
            for p in self.parameters
        ]

        # for RMSProp,
        # we are to keep track of caches (cache_W's and cache_b's) as well:
        caches = [
            theano.shared(np.ones_like(p.get_value(), dtype=np.float32))
            for p in self.parameters
        ]

        # define theano variables and functions:
        thX = T.matrix('X')
        thY = T.ivector('Y')  # a vector of integers

        # since we do dropout, we drop the nodes only on training step,
        # when evaluating we just scale them;
        # so we need to define two expressions for the output and cost calculations:
        pY_train = self.forward_train(thX)
        pY_predict = self.forward_predict(thX)

        cost = -T.mean(T.log(pY_train[T.arange(thY.shape[0]), thY]))
        cost_predict = -T.mean(T.log(pY_predict[T.arange(thY.shape[0]), thY]))

        prediction = self.predict(thX)  # will do sort of T.argmax(pY, axis=1)

        cost_predict_op = theano.function(inputs=[thX, thY],
                                          outputs=[cost_predict, prediction])

        # the updates for the train function:

        updates = [
            (cache, decay * cache +
             (np.float32(1.0) - decay) * T.grad(cost, p)**2)
            for p, cache in zip(self.parameters, caches)
        ] + [(dp,
              mu * dp - learning_rate * T.grad(cost, p) / T.sqrt(cache + eps))
             for dp, p, cache in zip(dparams, self.parameters, caches)
             ] + [(p, p + dp) for p, dp in zip(self.parameters, dparams)]

        #updates = rmsprop(cost, self.parameters, learning_rate, mu, decay, eps)

        train_op = theano.function(inputs=[thX, thY], updates=updates)

        # batch SGD:
        n_batches = N // batch_sz
        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j + 1) * batch_sz, :]
                Ybatch = Y[j * batch_sz:(j + 1) * batch_sz]

                train_op(Xbatch, Ybatch)

                if j % 20 == 0:
                    c, p = cost_predict_op(Xvalid, Yvalid)
                    costs.append(c)
                    e = error_rate(Yvalid, p)
                    print('\ni: %d,  j: %d, cost: %.6f, \nerror: %.6f' %
                          (i, j, c, e))

        if display_cost:
            plt.plot(costs)
            plt.show()
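As the docstring in fit explains, dropout in Theano is implemented by multiplying layer outputs with random 0/1 masks drawn from a RandomStreams object rather than removing nodes from the graph. A small standalone sketch of that idea (the names below are illustrative, not taken from the class above):

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

rng = RandomStreams()
Z = T.matrix('Z')  # output of some hidden layer
p_keep = 0.8

# binomial mask of 0s and 1s, regenerated on every call
mask = rng.binomial(n=1, p=p_keep, size=Z.shape)
Z_train = Z * mask      # training-time output: some nodes are zeroed out
Z_predict = Z * p_keep  # prediction-time output: no dropping, just scaling

dropout_train = theano.function([Z], Z_train)
dropout_predict = theano.function([Z], Z_predict)
print(dropout_train(np.ones((2, 5), dtype=np.float32)))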
              optimizer='adam',
              metrics=['accuracy'])

#model.compile(loss='hinge',
#              optimizer='adadelta',
#              metrics=['accuracy'])

from keras.utils.visualize_util import plot
plot(
    model,
    to_file=
    '/Users/km4n6/Box Sync/kiran/NN_project/final_project/plots/model_svm.png',
    show_shapes=True)

from sklearn.utils import shuffle
im_shuffled_validation, shuffled_targets_validation = shuffle(
    im_validation, targets_validation, random_state=0)

#

out = model.fit(im,
                targets,
                validation_data=(im_shuffled_validation,
                                 shuffled_targets_validation),
                nb_epoch=25,
                verbose=1,
                initial_epoch=0,
                batch_size=32,
                shuffle=True)

np.save(
    '/Users/km4n6/Box Sync/kiran/NN_project/final_project/saved_models/history_acc_loss_svm.npy',
Example #48
0
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import pickle

# # Loading the data from the csv file
# # There are total 101 crops in this csv file with NPK ph temp and climate values
data = pd.read_csv('fpo/Crop1.csv')

# Throws out a random permutation of the data
data = shuffle(data)

# We get the crop names from the dataframe
y = data.loc[:, 'Crop']

# We're using label encoding so that the names of the crops (which are strings) can be converted into numbers that are easily interpreted by a model
labelEncoded_y = LabelEncoder()

# Applying the transformation
y = labelEncoded_y.fit_transform(y)

# Creating a new column for the transformed names
data['crop_num'] = y

# Now we get the features for predictions
X = data[['N', 'P', 'K', 'pH', 'temp', 'climate']]

# The labels
y = data['crop_num']
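Because the crop names are label encoded above, a model trained on crop_num predicts integers. A small sketch of mapping predictions back to crop names with the fitted labelEncoded_y encoder (the predicted values below are hypothetical):

# hypothetical predicted class numbers from some classifier
predicted_crop_nums = [3, 17, 42]

# map the encoded integers back to the original crop name strings
predicted_crop_names = labelEncoded_y.inverse_transform(predicted_crop_nums)
print(predicted_crop_names)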
Example #49
0
# RUNNING CODE

full_dataset, full_labels, lost_features = pull_dataset()

print('Sanity Check')
print('full dataset of shape:', full_dataset.shape)
print('full labels of shape:', full_labels.shape)

print(
    'TOTAL NUMBER OF FACES NOT DETECTED WITH OUR LANDMARKS DETECTOR (IN-BUILT, pre-trained model): {0}'
    .format(len(lost_features)))
# # creating classifier object as an SVM (support vector machine) probabilistic model, you can change this to any other type of classifier
# classifier = SVC(kernel='linear', probability=True, tol=1e-3)

# Reshuffling data (for extra randomness)
X_data, Y_data = shuffle(full_dataset, full_labels, random_state=0)

print('X_data of shape:', X_data.shape)
print('Y_data of shape:', Y_data.shape)

# perform train and test split (random state set to 1 to ensure the same distribution across the different sets)
# this split is obviously case specific! but cross validation allows us to avoid over-fitting, so let's make sure we have a validation set ready.
# Since the dataset is not extremely large, I'll be using a 60/20/20 split, meaning roughly 1000 validation and test examples and 3000 training examples; to be tested: 75/10/15
# in this case we are a little less concerned, since we are evaluating smiles, which are present in every case, unlike glasses
X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                    Y_data,
                                                    test_size=0.2,
                                                    random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  test_size=0.2,
Example #50
0
        'AOD_11',
        'AOD_12',
        'AOD_13',
        'AOD_14',
        'AOD_15',
        'AOD_16',
    ]
    for clo in data_get_dummies3.columns:
        independent.append(clo)
    for clo2 in data_get_dummies1.columns:
        independent.append(clo2)
    # dependent variable
    dependent = ["PM25"]

    # shuffle the data
    data = shuffle(data_out)

    # parameter settings
    mlp = LinearRegression(fit_intercept=True)
    rng = check_random_state(1027)
    # train/test split
    x_train = data_train[independent].values
    x_test = data_test[independent].values
    y_train = data_train[dependent].values.ravel()
    y_test = data_test[dependent].values.ravel()

    # measure elapsed time
    starttime = datetime.datetime.now().second
    # run the model
    ensemble = AdaBoostRegressor(base_estimator=mlp,
                                 learning_rate=0.01,
Example #51
0
File: utils.py Project: JiweiTian/UAP_EEG
def UAP_target_pre(x, model, model_used, model_path, save_path, noise_limit=0.2, attack_type=None, target_class=None,
               batch_size=None, nb_classes=None, channels=None, samples=None, regular=None):
    # x_train, x_val, y_train, y_val = train_test_split(x, y, shuffle=True, test_size=0.2)
    x_train, x_val = train_test_split(x, shuffle=True, test_size=0.2)
    batch_size = min(batch_size, len(x_train))

    universal_noise = tf.Variable(np.zeros((x_train[0].shape)), dtype=tf.float32)
    temp_universal_noise = tf.expand_dims(universal_noise, 0)
    # print(temp_universal_noise)
    x_input = Input(shape=(x_train.shape[1], x_train.shape[2], x_train.shape[3]))
    x = Lambda(lambda xx: xx + tf.clip_by_value(temp_universal_noise, -noise_limit, noise_limit))(x_input)

    # Model output
    if model_used == 'EEGNet':
        prediction = old_models.EEGNet_output(nb_classes=nb_classes, Chans=channels, Samples=samples, x_input=x)
    elif model_used == 'DeepConvNet':
        prediction = old_models.DeepConvNet_output(nb_classes=nb_classes, Chans=channels, Samples=samples, x_input=x)
    elif model_used == 'ShallowConvNet':
        prediction = old_models.ShallowConvNet_output(nb_classes=nb_classes, Chans=channels, Samples=samples, x_input=x)
    else:
        raise Exception('No such model:{}'.format(model_used))

    # print(prediction)
    u_model = Model(inputs=x_input, outputs=prediction)
    u_model.load_weights(model_path)
    model.load_weights(model_path)

    y_train = np.argmax(model.predict(x_train, batch_size=batch_size), axis=1).flatten()
    y_val = np.argmax(model.predict(x_val, batch_size=batch_size), axis=1).flatten()


    alpha = tf.placeholder(dtype=tf.float32)
    al = 100
    if regular == 'l1':
        loss = alpha * (tf.reduce_mean(tf.abs(universal_noise)))
        al = 5
    elif regular == 'l2':
        loss = alpha * (tf.reduce_mean(tf.square(universal_noise)))
        al = 100
    elif regular == 'l1+l2':
        loss = alpha * (tf.reduce_mean(10*tf.square(universal_noise) + 0.1*tf.abs(universal_noise)))
        al = 10
    elif regular == None:
        loss = 0
    else:
        raise Exception('no such loss regularization!')
    # loss = alpha * (tf.reduce_mean(tf.square(universal_noise) + tf.abs(universal_noise)))
    # loss = alpha * (tf.reduce_mean(tf.square(universal_noise) + tf.square(universal_noise)))
    # print(loss)
    target = tf.placeholder(dtype=tf.int32, shape=[None, ])
    if attack_type == 'nontarget':
        # loss += K.mean(K.sparse_categorical_crossentropy(target, 1-prediction, from_logits=False))
        loss += -K.mean(K.sparse_categorical_crossentropy(target, prediction, from_logits=False))
    elif attack_type == 'target':
        loss += K.mean(K.sparse_categorical_crossentropy(target, prediction, from_logits=False))
    else:
        raise Exception('no such attack_type!')

    start_vars = set(x.name for x in tf.global_variables())
    lr_ph = tf.placeholder(shape=[], dtype=tf.float32)

    optimizer = tf.train.AdamOptimizer(learning_rate=lr_ph)
    train = optimizer.minimize(loss, var_list=[universal_noise])

    end_vars = tf.global_variables()
    new_vars = [x for x in end_vars if x.name not in start_vars]
    init = tf.variables_initializer(var_list=[universal_noise] + new_vars)

    sess = K.get_session()
    sess.run(init)

    nb_batch = len(x_train) // batch_size


    end = False

    epochs = 500
    lr = 1e-3
    v = np.zeros((x_train[0].shape))

    patience = 0
    patience_threshold = 10

    idx_list = [m for m in range(len(x_train))]

    # target
    if attack_type == 'target':
        y_true = np.ones(y_val.shape) * target_class
        stop_condition = 1
        acc_best = 0.
    else:
        y_true = np.copy(y_val)
        stop_condition = -1
        acc_best = 1.
        # stop_condition = 1
        # fr_best = 0.

    for epoch in range(epochs):
        idx_list = shuffle(idx_list)
        for i in range(nb_batch):
            target_idx = idx_list[i * batch_size:min((i + 1) * batch_size, len(x_train))]
            x_batch, y_batch = x_train[target_idx], y_train[target_idx]

            if attack_type == 'target':
                y_batch = np.ones(y_batch.shape) * target_class

            _, losses = sess.run(
                [train, loss],
                {
                    u_model.inputs[0]: x_batch,
                    alpha: al, lr_ph: lr,
                    target: y_batch,
                    # K.learning_phase(): 0
                }
            )

            if (i + epoch * nb_batch) % 100 == 0:
                # if i % 1 == 0:
                pred = np.argmax(u_model.predict(x_val), -1)
                y_pred = pred.squeeze()
                acc = np.sum(np.where(y_pred == y_true, 1, 0)).astype(np.float64) / len(y_pred)
                norm = np.mean(np.square(sess.run(universal_noise)))
                if attack_type == 'target':
                    print('epoch:{}/{}, batch:{}/{}, acc:{}, norm:{}'.format(epoch + 1, epochs, i + 1, nb_batch,
                                                                             acc, norm))
                else:
                    raw_pred = np.argmax(model.predict(x_val), -1).squeeze()
                    fooling_rate = np.sum(np.where(y_pred != raw_pred, 1, 0)).astype(np.float64) / len(y_pred)
                    print('epoch:{}/{}, batch:{}/{}, acc:{}, fooling rate:{}, norm:{}, loss:{}'.format(epoch + 1,
                                                                 epochs, i + 1, nb_batch, acc, fooling_rate, norm, losses))

                # if acc > threshold_acc and norm > threshold_norm:
                #     a = 5e2
                if stop_condition * acc > stop_condition * acc_best:
                    patience = 0
                    acc_best = acc
                    v = K.eval(universal_noise)
                    if save_path == None:
                        print('update v! but not save.')
                    else:
                        print('best acc:{}, now saving adversarial patch to {}.'.format(acc_best, save_path))
                        # np.savez(noise_filename, v=un_no)
                        np.savez(save_path, v=v)
                else:
                    patience += 1
                    if acc == 1:
                        print('best acc:{}, now saving adversarial patch to {}.'.format(acc_best, save_path))
                        np.savez(save_path, v=v)

                if patience == patience_threshold:
                    end = True
                    break

        if end:
            break
    return v
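Once UAP_target_pre has saved the universal perturbation, it can be reloaded and added to clean trials to measure its effect. A minimal sketch under assumptions (save_path, x_test and model below are placeholders, and noise_limit mirrors the default above):

import numpy as np

# reload the saved universal adversarial perturbation
v = np.load(save_path)['v']

# add it to every trial, keeping the perturbation within the allowed bound
noise_limit = 0.2
x_adv = x_test + np.clip(v, -noise_limit, noise_limit)

# compare clean vs adversarial predictions
clean_pred = np.argmax(model.predict(x_test), axis=-1)
adv_pred = np.argmax(model.predict(x_adv), axis=-1)
print('fooling rate:', np.mean(clean_pred != adv_pred))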
    history = {'val_loss': [], 'val_acc': []}
    '''
    Train the model
    '''
    epochs = 50
    batch_size = 200

    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)

    n_batches = N_train // batch_size

    for epoch in range(epochs):
        X_, Y_ = shuffle(X_train, Y_train)

        for i in range(n_batches):
            start = i * batch_size
            end = start + batch_size

            sess.run(train_step,
                     feed_dict={
                         x: X_[start:end],
                         t: Y_[start:end],
                         keep_prob: p_keep
                     })

        # evaluate using the validation data
        val_loss = loss.eval(session=sess,
                             feed_dict={
Example #53
0
def getData(dataPath, moreDataPath, trainSize, trainFlag, devFlag, testFlag):
    ##TODO: changed devSize to 1000 for trials with more trainSize data. It was 5000 for the default run
    devSize = 1000
    testSize = 1000
    numTraces = 1500  ##Number of traces collected per key

    ## Pre defining the arrays based on sizes of the data
    x_train = np.zeros((28000 * 256, numTraces))
    x_dev = np.zeros((devSize * 256, numTraces))
    x_test = np.zeros((testSize * 256, numTraces))

    y_train = np.zeros((28000 * 256, 1))
    y_dev = np.zeros((devSize * 256, 1))
    y_test = np.zeros((testSize * 256, 1))

    for index, val in enumerate(range(0, 256)):
        print("Started data processing for %d key\n" % (val))
        trainStr = dataPath + "train_" + str(val) + ".pkl.zip"
        devStr = dataPath + "dev_" + str(val) + ".pkl.zip"
        testStr = dataPath + "test_" + str(val) + ".pkl.zip"

        ##more training data path
        moreTrainStr = moreDataPath + "train_" + str(val) + ".pkl.zip"

        ## Checking if the file size is 0, before processing data
        ## This check is for cross-config analysis, where train and dev are empty
        #if (os.stat(trainStr).st_size != 0):
        if (trainFlag):
            x_train_inter, y_train_inter = process_inputs(trainStr)
            ## Trainsize will still be 15000, but we will take data from devSet to trainset
            x_train[trainSize * index:trainSize * (index) +
                    15000, :] = x_train_inter
            y_train[trainSize * index:trainSize * (index) + 15000,
                    0] = y_train_inter

            ## Adding 9000 more data
            x_train_inter_more, y_train_inter_more = process_inputs(
                moreTrainStr)
            x_train[trainSize * (index) + 15000:(trainSize * (index) + 15000) +
                    9000, :] = x_train_inter_more[0:9000, :]
            y_train[trainSize * (index) + 15000:(trainSize * (index) + 15000) +
                    9000, 0] = y_train_inter_more.reshape(9000, 1)[0:9000, 0]

            print("Train= %s\n" % (trainFlag))
        else:
            ## Assigning the array's to 0's
            ##NOTE: needs to change shape, but since we are always training, I am not changing this
            x_train[trainSize * index:trainSize * (index + 1), :] = np.zeros(
                (trainSize, numTraces))
            y_train[trainSize * index:trainSize * (index + 1), :] = np.zeros(
                (trainSize, 1))
            print("train= %s\n" % (trainFlag))

        #if (os.stat(devStr).st_size != 0):
        if (devFlag):
            ## get the data for each sub part
            x_dev_inter, y_dev_inter = process_inputs(devStr)
            print("x_dev_inter= %s, y_dev_inter= %s" %
                  (x_dev_inter.shape, y_dev_inter.shape))
            x_dev[devSize * index:devSize *
                  (index + 1), :] = x_dev_inter[0:devSize, :]
            y_dev[devSize * index:devSize * (index + 1),
                  0] = y_dev_inter.reshape(5000, 1)[0:devSize, 0]
            print("Dev= %s\n" % (devFlag))
            print("x_dev= %s, y_dev= %s" % (x_dev.shape, y_dev.shape))

            ## Adding 4000 traces to trainSet here
            x_train[trainSize * (index) + 15000 + 9000:(trainSize *
                                                        (index) + 15000) +
                    13000, :] = x_dev_inter[1000:5000, :]
            y_train[trainSize * (index) + 15000 +
                    9000:(trainSize * (index) + 15000) + 13000,
                    0] = y_dev_inter.reshape(5000, 1)[devSize:5000, 0]
            print("x_trainSize = %s, y_trainSize= %s" %
                  (x_train.shape, y_train.shape))
        else:
            x_dev[devSize * index:devSize * (index + 1), :] = np.zeros(
                (devSize, numTraces))
            y_dev[devSize * index:devSize * (index + 1), :] = np.zeros(
                (devSize, 1))
            print("dev= %s\n" % (devFlag))

        ## Test data is present so check is not performed
        if (testFlag):
            x_test_inter, y_test_inter = process_inputs(testStr)
            x_test[testSize * index:testSize * (index + 1), :] = x_test_inter
            y_test[testSize * index:testSize * (index + 1), 0] = y_test_inter
            print("Test= %s\n" % (testFlag))
            print("x_test= %s, y_test= %s" % (x_test.shape, y_test.shape))
        else:
            x_test[testSize * index:testSize * (index + 1), :] = np.zeros(
                (testSize, numTraces))
            y_test[testSize * index:testSize * (index + 1), :] = np.zeros(
                (testSize, 1))
            print("test= %s\n" % (testFlag))

        print("Finished data processing for %d key\n" % (val))

    ## Clear variables
    x_train_inter = None
    x_dev_inter = None
    x_test_inter = None
    y_train_inter = None
    y_dev_inter = None
    y_test_inter = None
    x_train_inter_more = None
    y_train_inter_more = None
    print("\nCleared variables\n")

    ##Not shuffling for debugging, should be removed
    ## Shuffling
    ## https://scikit-learn.org/stable/modules/generated/sklearn.utils.shuffle.html
    print("\nStarted shuffling of data\nx_train[0]= %s\ny_train[0]= %s" %
          (x_train[0], y_train[0]))
    print("\nx_train[12000]= %s\ny_train[12000]= %s" %
          (x_train[12000], y_train[12000]))
    x_train, y_train = shuffle(x_train, y_train, random_state=0)
    x_dev, y_dev = shuffle(x_dev, y_dev, random_state=0)
    x_test, y_test = shuffle(x_test, y_test, random_state=0)
    print("\nFinished shuffling of data\nx_train[0]= %s\ny_train[0]= %s" %
          (x_train[0], y_train[0]))
    print("\nx_train[12000]= %s\ny_train[12000]= %s" %
          (x_train[12000], y_train[12000]))

    ##NOTE: Remove:
    #Mimport pdb; pdb.set_trace()
    ## One hot assignment
    n_classes = 256
    y_train_oh = np_utils.to_categorical(y_train, n_classes)
    y_dev_oh = np_utils.to_categorical(y_dev, n_classes)
    y_test_oh = np_utils.to_categorical(y_test, n_classes)

    print("\nOne-hot encoded for outputs\n")
    ## Standardizing train, dev and test
    x_train_mean = x_train.mean(axis=0)
    x_train_std = x_train.std(axis=0)

    x_dev_mean = x_dev.mean(axis=0)
    x_dev_std = x_dev.std(axis=0)

    x_test_mean = x_test.mean(axis=0)
    x_test_std = x_test.std(axis=0)

    #M## Concatenating train and dev
    #Mx_full = np.concatenate((x_train, x_dev), axis=0)
    #Mx_full_mean = x_full.mean(axis=0)
    #Mx_full_std = x_full.std(axis=0)

    ## chunking the normalization process
    print("Strated normalizing\n")
    chunkSize = 28000
    chunkNum = int(len(x_train) / chunkSize)
    for chunkIndex in range(chunkNum):
        print("Train chunkIndx= %s, chunkNum = %s" % (chunkIndex, chunkNum))
        if (chunkIndex != chunkNum - 1):
            x_train[chunkIndex * chunkSize:(chunkIndex + 1) *
                    chunkSize] = (x_train[chunkIndex * chunkSize:
                                          (chunkIndex + 1) * chunkSize] -
                                  x_train_mean) / x_train_std
        else:
            x_train[chunkIndex *
                    chunkSize:] = (x_train[chunkIndex * chunkSize:] -
                                   x_train_mean) / x_train_std

    devChunkSize = 10000
    devChunkNum = int(len(x_dev) / devChunkSize)
    for devChunkIndex in range(devChunkNum):
        print("Dev chunkIndx= %s, chunkNum = %s" %
              (devChunkIndex, devChunkNum))
        if (devChunkIndex != devChunkNum - 1):
            x_dev[devChunkIndex * devChunkSize:(devChunkIndex + 1) *
                  devChunkSize] = (x_dev[devChunkIndex * devChunkSize:
                                         (devChunkIndex + 1) * devChunkSize] -
                                   x_train_mean) / x_train_std
        else:
            x_dev[devChunkIndex *
                  devChunkSize:] = (x_dev[devChunkIndex * devChunkSize:] -
                                    x_train_mean) / x_train_std

    ## Need to do the same for test too
    return (x_train, y_train_oh), (x_dev, y_dev_oh), (x_test, y_test_oh)
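The chunked loops above standardize the large trace matrices a slice at a time instead of allocating another full-size array. The same idea as a small reusable sketch (a hypothetical helper, not part of the source):

import numpy as np

def standardize_in_chunks(x, mean, std, chunk_size=28000):
    # normalize x in place, chunk_size rows at a time, to limit peak memory use
    for start in range(0, len(x), chunk_size):
        end = min(start + chunk_size, len(x))
        x[start:end] = (x[start:end] - mean) / std
    return x

# usage sketch on small random data
x = np.random.randn(100, 10)
x = standardize_in_chunks(x, x.mean(axis=0), x.std(axis=0), chunk_size=32)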
Example #54
0
## - 1s - loss: 0.0067 - mean_squared_error: 0.0027 - val_loss: 0.0049 - val_mean_squared_error: 0.0011
##train 
##mean_squared_error: 0.0005
##test 
##mean_squared_error: 0.0011

random.seed(datetime.now())

# load dataset
X = np.transpose(np.loadtxt("Xtrain.txt", dtype=float))
Y = np.transpose(np.loadtxt('Ytrain.txt', dtype=float))
m = X.shape[0]
print(str(m))

# shuffle dataset (mini batch)
X_shuffled, Y_shuffled = shuffle(X, Y)

# split dataset
m_train = math.floor(m*0.80)
print(str(m_train))
X_train = X_shuffled[:m_train,:].reshape(m_train,height,width,3)
Y_train = Y_shuffled[:m_train]
X_test = X_shuffled[m_train:,:].reshape(m-m_train,height,width,3)
Y_test = Y_shuffled[m_train:]

# create model
model = Sequential()

# CONV
model.add(Conv2D(8, kernel_size=(5, 5), strides=(1, 1), dilation_rate = (1,1), border_mode='valid', activation='relu', input_shape=inmage_shape))
model.add(MaxPooling2D(pool_size=(2, 1)))
#!/usr/bin/env python 

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn import datasets
from sklearn.utils import shuffle
import numpy as np

boston = datasets.load_boston()
X, Y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)
offset = int(X.shape[0] * 0.9)
X_train, Y_train = X[:offset], Y[:offset]
X_test, Y_test = X[offset:], Y[offset:]

regressor = BaggingRegressor(DecisionTreeRegressor(max_depth=8))
regressor.fit(X_train,Y_train)
score = regressor.score(X_test,Y_test)
print(score)
Example #56
0
def main():
	max_iter = 20
	print_period = 50

	train_X, test_X, train_Y, test_Y = get_normalized_data()
	learning_rate = 0.00004
	reg = 0.01
	train_Y_ind = indicator(train_Y)
	test_Y_ind = indicator(test_Y)

	N, D = train_X.shape
	batch_size = 500
	batch_num = N // batch_size

	M = 300
	K = 10
	W1 = np.random.randn(D, M) / np.sqrt(D)
	b1 = np.zeros(M)
	W2 = np.random.randn(M, K) / np.sqrt(M)
	b2 = np.zeros(K)

	#SAVE INITIAL WEIGHT AND BIAS
	W1_copy = W1.copy()
	b1_copy = b1.copy()
	W2_copy = W2.copy()
	b2_copy = b2.copy()

	#batch
	loss_batch = []
	error_batch =[]
	for i in range(max_iter):
		shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
		for j in range(batch_num):
			x = shuffle_X[j*batch_size : (j*batch_size+batch_size), :]
			y = shuffle_Y[j*batch_size : (j*batch_size+batch_size), :]
			pY, Z = forward(x, W1, W2, b1, b2)

			W2 -= learning_rate * (derivative_w2(Z, y, pY) + reg*W2)
			b2 -= learning_rate * (derivative_b2(y, pY) + reg*b2)
			W1 -= learning_rate * (derivative_w1(x, Z, y, pY, W2) + reg*W1)
			b1 -= learning_rate * (derivative_b1(Z, y, pY, W2) + reg*b1)

			if j % print_period == 0:
				p_test, Z_test = forward(test_X, W1, W2, b1, b2)
				l = cost(p_test, test_Y_ind)
				e = error_rate(p_test, test_Y)
				loss_batch.append(l)
				error_batch.append(e)
				print("cost at itertion i=%d, j=%d: %.6f" % (i, j, l))
				print("error_rate: ", e)
	p_final, z_final = forward(test_X, W1, W2, b1, b2)
	print("final error_rate:", error_rate(p_final, test_Y))



	#momentum
	W1 = W1_copy.copy()
	b1 = b1_copy.copy()
	W2 = W2_copy.copy()
	b2 = b2_copy.copy()

	lose_momentum = []
	error_momentum = []
	mu = 0.9
	dW1 = 0
	dW2 = 0
	db1 = 0
	db2 = 0

	for i in range(max_iter):
		shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
		for j in range (batch_num):
			x = shuffle_X[j*batch_size : (j*batch_size+batch_size), :]
			y = shuffle_Y[j*batch_size : (j*batch_size+batch_size), :]
			pY, Z = forward(x, W1, W2, b1, b2)
			# print("overflow?")
			gW2 = derivative_w2(Z, y, pY) + reg*W2
			gb2 = derivative_b2(y, pY) + reg*b2
			gW1 = derivative_w1(x, Z, y, pY, W2) + reg*W1
			gb1 = derivative_b1(Z, y, pY, W2) + reg*b1

			#UPDATE VELOCITIES
			dW2 = mu*dW2 - learning_rate*gW2
			db2 = mu*db2 - learning_rate*gb2
			dW1 = mu*dW1 - learning_rate*gW1
			db1 = mu*db1 - learning_rate*gb1

			#UPDATE WEIGHT
			W2 += dW2
			b2 += db2
			W1 += dW1
			b1 += db1

			if j % print_period == 0:
				p_test, Z_test = forward(test_X, W1, W2, b1, b2)
				l = cost(p_test, test_Y_ind)
				e = error_rate(p_test, test_Y)
				lose_momentum.append(l)
				error_momentum.append(e)
				print("cost at itertion i=%d, j=%d: %.6f" % (i, j, l))
				print("error_rate: ", e)
	p_final, z_final = forward(test_X, W1, W2, b1, b2)
	print("final error_rate:", error_rate(p_final, test_Y))


	#Nesterov momentum
	W1 = W1_copy.copy()
	b1 = b1_copy.copy()
	W2 = W2_copy.copy()
	b2 = b2_copy.copy()

	lose_nesterov = []
	error_nesterov = []
	mu = 0.9
	dW1 = 0
	db1 = 0
	dW2 = 0
	db2 = 0

	for i in range(max_iter):
		shuffle_X, shuffle_Y = shuffle(train_X, train_Y_ind)
		for j in range(batch_num):
			x = shuffle_X[j*batch_size : (j*batch_size+batch_size), :]
			y = shuffle_Y[j*batch_size : (j*batch_size+batch_size), :]
			pY, Z = forward(x, W1, W2, b1, b2)

			gW2 = derivative_w2(Z, y, pY) + reg*W2
			gb2 = derivative_b2(y, pY) + reg*b2
			gW1 = derivative_w1(x, Z, y, pY, W2) + reg*W1
			gb1 = derivative_b1(Z, y, pY, W2) + reg*b1

			#update velocities
			dW2 = mu*dW2 - learning_rate*gW2
			db2 = mu*db2 - learning_rate*gb2
			dW1 = mu*dW1 - learning_rate*gW1
			db1 = mu*db1 - learning_rate*gb1

			#update weight
			W2 += mu*dW2 - learning_rate*gW2
			b2 += mu*db2 - learning_rate*gb2
			W1 += mu*dW1 - learning_rate*gW1
			b1 += mu*db1 - learning_rate*gb1

			if j % print_period == 0:
				p_test, Z_test = forward(test_X, W1, W2, b1, b2)
				l = cost(p_test, test_Y_ind)
				e = error_rate(p_test, test_Y)
				lose_nesterov.append(l)
				error_nesterov.append(e)
				print("cost at itertion i=%d, j=%d: %.6f" % (i, j, l))
				print("error_rate: ", e)
	p_final, z_final = forward(test_X, W1, W2, b1, b2)
	print("final error_rate:", error_rate(p_final, test_Y))


	
	plt.plot(loss_batch, label="batch")
	plt.plot(lose_momentum, label="momentum")
	plt.plot(lose_nesterov, label="Nesterov")
	plt.legend()
	plt.show()
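The three loops above differ only in how each parameter is updated from its gradient. The update rules, isolated into a small sketch that mirrors the code (helper names are illustrative):

def sgd_step(w, g, lr):
    # plain batch gradient descent
    return w - lr * g

def momentum_step(w, dw, g, lr, mu=0.9):
    # classical momentum: accumulate a velocity, then move along it
    dw = mu * dw - lr * g
    return w + dw, dw

def nesterov_step(w, dw, g, lr, mu=0.9):
    # Nesterov-style approximation used above: update the velocity,
    # then apply the lookahead step mu*dw - lr*g to the weights
    dw = mu * dw - lr * g
    return w + mu * dw - lr * g, dw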
Example #57
0
print("\nConverting groundTruth labels to numpy array...")

# this part is for the groundTruth labels
with open(gt_path + 'drive.json') as f:
    data = json.load(f)

# convert to numpy array
data = np.asarray(data)

# extract speed
y = data[:, 1]

# shuffle
print("\nShuffling the data...")
X, y = shuffle(X, y, random_state=42)

# split into train and test
print("\nSplitting X into train and test...")
X_train = X[train_mask]
X_test = X[test_mask]

print("\nWriting X_train as HDF5...")
write_hdf5(X_train, out_path + "X_train_50.hdf5")

print("\nWriting X_test as HDF5...")
write_hdf5(X_test, out_path + "X_test_50.hdf5")

# split into train and test
print("\nSplitting y into train and test...")
y_train = y[train_mask]
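write_hdf5 is called above but not defined in this excerpt. A minimal sketch of what such a helper might look like, assuming h5py (the dataset name is an assumption):

import h5py

def write_hdf5(arr, outfile):
    # store a single numpy array in an HDF5 file under a 'data' dataset
    with h5py.File(outfile, 'w') as f:
        f.create_dataset('data', data=arr, compression='gzip')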
Example #58
0
def train_test_split_by_part(X,
                             y,
                             pdgid,
                             n_mu=2500,
                             n_el=2500,
                             n_had=2500,
                             n_fake=2500):

    try:
        mu_idx = sample_without_replacement(len(X[abs(pdgid) == 13]),
                                            n_mu,
                                            random_state=23)
        elec_idx = sample_without_replacement(len(X[abs(pdgid) == 11]),
                                              n_el,
                                              random_state=23)
        had_idx = sample_without_replacement(len(X[np.logical_and(
            abs(pdgid) > 37, pdgid != -999)]),
                                             n_had,
                                             random_state=23)
        fake_idx = sample_without_replacement(len(X[pdgid == -999]),
                                              n_fake,
                                              random_state=23)
    except:
        print(
            "Error: Not enough muons/electrons/hadrons/fakes in sample to create training data"
        )
        return [], [], [], [], [], []

    X_train = np.concatenate((X[abs(pdgid)==13][mu_idx],X[abs(pdgid)==11][elec_idx],\
                              X[np.logical_and(abs(pdgid)>37,pdgid!=-999)][had_idx],X[pdgid==-999][fake_idx]))
    y_train = np.concatenate((y[abs(pdgid)==13][mu_idx],y[abs(pdgid)==11][elec_idx],\
                              y[np.logical_and(abs(pdgid)>37,pdgid!=-999)][had_idx],y[pdgid==-999][fake_idx]))
    pdgid_train = np.concatenate((pdgid[abs(pdgid)==13][mu_idx],pdgid[abs(pdgid)==11][elec_idx],\
                                  pdgid[np.logical_and(abs(pdgid)>37,pdgid!=-999)][had_idx],pdgid[pdgid==-999][fake_idx]))

    X_test = np.concatenate((np.delete(X[abs(pdgid)==13],mu_idx,axis=0),np.delete(X[abs(pdgid)==11],elec_idx,axis=0),\
                             np.delete(X[np.logical_and(abs(pdgid)>37,pdgid!=-999)],had_idx,axis=0),\
                             np.delete(X[pdgid==-999],fake_idx,axis=0)))
    y_test = np.concatenate((np.delete(y[abs(pdgid)==13],mu_idx,axis=0),np.delete(y[abs(pdgid)==11],elec_idx,axis=0),\
                             np.delete(y[np.logical_and(abs(pdgid)>37,pdgid!=-999)],had_idx,axis=0),\
                             np.delete(y[pdgid==-999],fake_idx,axis=0)))
    pdgid_test = np.concatenate((np.delete(pdgid[abs(pdgid)==13],mu_idx,axis=0),np.delete(pdgid[abs(pdgid)==11],elec_idx,axis=0),\
                                 np.delete(pdgid[np.logical_and(abs(pdgid)>37,pdgid!=-999)],had_idx,axis=0),\
                                 np.delete(pdgid[pdgid==-999],fake_idx,axis=0)))

    mu_check = (np.sum(abs(pdgid_test) == 13) / np.sum(abs(pdgid) == 13) < .2)
    el_check = (np.sum(abs(pdgid_test) == 11) / np.sum(abs(pdgid) == 11) < .2)
    had_check = (
        np.sum(np.logical_and(abs(pdgid_test) > 37, pdgid_test != -999)) /
        np.sum(np.logical_and(abs(pdgid) > 37, pdgid != -999)) < .2)
    fake_check = (np.sum(pdgid_test == -999) / np.sum(pdgid == -999) < .2)
    if mu_check or el_check or had_check or fake_check:
        print(
            "Warning: The test set has less than 20% of muons/electrons/hadrons/fakes"
        )

    X_train, y_train, pdgid_train = shuffle(X_train,
                                            y_train,
                                            pdgid_train,
                                            random_state=23)
    X_test, y_test, pdgid_test = shuffle(X_test,
                                         y_test,
                                         pdgid_test,
                                         random_state=23)

    return X_train, y_train, pdgid_train, X_test, y_test, pdgid_test
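A usage sketch for train_test_split_by_part with toy arrays (the inputs below are hypothetical; real inputs would be the per-candidate feature matrix, labels and PDG IDs):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(12000, 10)
y = rng.randint(0, 2, size=12000)
pdgid = rng.choice([13, -13, 11, -11, 211, -999], size=12000)

X_train, y_train, pdgid_train, X_test, y_test, pdgid_test = \
    train_test_split_by_part(X, y, pdgid, n_mu=1000, n_el=1000, n_had=1000, n_fake=1000)
print(X_train.shape, X_test.shape)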
 def read_csvfile(self, filename):
     self.dataframe = pd.read_csv(filename)
     self.dataframe = shuffle(self.dataframe)
     return self.dataframe
Example #60
0
File: CNNTest.py Project: lovenets/DSR
          activation='relu',
          kernel_initializer = TruncatedNormal(stddev=0.1),
          kernel_regularizer = regularizers.l2(0.01),
          name="D3")(x)

x = BatchNormalization(axis = 1,name="D3_BN")(x)
x = Activation('relu',name = 'D3_relu')(x)

out = Dense(5,activation='softmax',name="OutPut")(x)

model = Model(inputs = inputs,outputs = out)
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
print("正在训练网络,请耐心等候......")
X_train,y_train = shuffle(X_train,y_train,random_state=0)
startTime = time.clock()
trainLog = model.fit(X_train,
          y_train,
          validation_split = 0.1,
          batch_size=64,
          epochs=10,
          verbose=1
          )
endTime = time.clock()
# Note: time.clock() behaves differently here on Windows and Linux
print("Network training finished, elapsed %f seconds" % ((float)(endTime - startTime)/10))

# Plot the model architecture diagram (there is still a small issue here to resolve)
plot_model(model,
           to_file='model.png',