Example #1
def loadData():

    data_df = pd.DataFrame.from_csv('feature_matrix.csv')
    # print data_df[-5:]

    #load feature headers
    filename = 'features.csv'
    f = open(filename, 'r')
    lines = f.readlines()
    f.close()
    FEATURES = []
    for line in lines[1:]:
        line = line.strip()
        FEATURES.append(line)

    #load label headers
    LABEL = 'label'

    #create input
    X = np.array(data_df[FEATURES].values)
    #create output
    y = np.array(data_df[LABEL].values)
    #create baseline
    baseline = np.array(data_df['return_SPY'].values)

    #normalize each feature around 0
    #single features require reshape
    baseline = preprocessing.scale(baseline).reshape(-1,1)
    X = preprocessing.scale(X)

    return X , baseline, y
Example #2
def arrange_data():

	df = DataFrame.from_csv(open('final.csv'))

	#load all games before January 1, 2013 (training set).	
	X0_list = ([df[df.columns[2:22]][datetime(2006,12,15):datetime(2007,4,8)],
					df[df.columns[2:22]][datetime(2007,12,15):datetime(2008,4,8)],
					df[df.columns[2:22]][datetime(2008,12,15):datetime(2009,4,8)],
					df[df.columns[2:22]][datetime(2009,12,15):datetime(2010,4,8)],
					df[df.columns[2:22]][datetime(2010,12,15):datetime(2011,4,8)],
					df[df.columns[2:22]][datetime(2012,12,15):datetime(2013,1,1)]])	
	Y0_list = ([df[df.columns[22:28]][datetime(2006,12,15):datetime(2007,4,8)],
					df[df.columns[22:28]][datetime(2007,12,15):datetime(2008,4,8)],
					df[df.columns[22:28]][datetime(2008,12,15):datetime(2009,4,8)],
					df[df.columns[22:28]][datetime(2009,12,15):datetime(2010,4,8)],
					df[df.columns[22:28]][datetime(2010,12,15):datetime(2011,4,8)],
					df[df.columns[22:28]][datetime(2012,12,15):datetime(2013,1,1)]])
	
	#games after January 1, 2013.
	X1 = df[df.columns[2:22]][datetime(2013,1,1):datetime(2013,4,18)]
	Y1 = df[df.columns[22:28]][datetime(2013,1,1):datetime(2013,4,18)]

	X0 = X0_list[0]
	for i in range(1,len(X0_list)):
		X0 = concat([X0,X0_list[i]])

	Y0 = Y0_list[0]
	for i in range(1,len(Y0_list)):
		Y0 = concat([Y0,Y0_list[i]])
	
	#convert to numpy arrays, leave Y unchanged for now
	X0 = preprocessing.scale(numpy.array(X0))
	X1 = preprocessing.scale(numpy.array(X1))
			
	return (X0,Y0,X1,Y1)
Example #3
File: mldatasets.py Project: yk/mldatasets
def create_data_provider(dataset, force_write_cache = False, center_data = True,
                         scale_data = True, add_bias_feature = True, normalize_datapoints = False,
                         center_labels = False, scale_labels = False,
                         transform_labels_to_plus_minus_one = True, test_size=0.0):
    data, labels = dataset.get_data(force_write_cache=force_write_cache)
    copy = False
    if scale_data:
        data = preprocessing.scale(data, copy=copy)
    elif center_data:
        data = preprocessing.scale(data, with_std=False, copy=copy)
    if scale_labels:
        labels = preprocessing.scale(labels, copy=copy)
    elif center_labels:
        labels = preprocessing.scale(labels, with_std=False, copy=copy)
    if add_bias_feature:
        data = np.hstack((data, np.ones((data.shape[0], 1))))
    if normalize_datapoints:
        data /= np.linalg.norm(data, axis=1)[:, np.newaxis]
    if transform_labels_to_plus_minus_one:
        labels = labels * 2.0 - 1.0
    test_provider = None
    if test_size > 0.0:
        data, data_test, labels, labels_test = cross_validation.train_test_split(data, labels, test_size=test_size)
        test_provider = DataProvider(data_test, labels_test)
    return DataProvider(data, labels, test_provider=test_provider)
Example #4
def classify():
    # read training data
    lbls1, X, y = readCsv(TRAIN_CSV, True)
    # read test data
    lbls2, Y, z = readTestCsv(TEST_CSV)
    
    # Conversion to numpy arrays
    X = np.array(X)
    X = X.astype(float)
    y = np.array(y)
    
    Y = np.array(Y)
    Y = Y.astype(float)
    
    # perform feature scaling for zero mean and unit variance
    # scale() returns a new array (copy=True by default), so assign the result back
    X = scale(X, with_mean = True, with_std = True)
    Y = scale(Y, with_mean = True, with_std = True)
    
    lin_svc = svm.LinearSVC(C = 4.0, dual = False)
    lin_svc.fit(X, y)
    
    bestmodel = lin_svc
    preds = bestmodel.predict(Y)
    
    writePredictions(lbls2, preds)
Example #5
def read_dataset(train_size, scale=False, normalize=False):
    logging.info('fetching the dataset')
    #
    d = sklearn.datasets.load_diabetes() # diabetes dataset
    #d = sklearn.datasets.load_boston() # Boston housing prices
    #
    data = d['data'].astype(np.float32)
    target = d['target'].astype(np.float32).reshape(len(d['target']), 1)
    #"Chainerのmnist.pyだと下記ののような書き方になっているが、ミニバッチの数が2以上だと動かない"らしい 
    #target = diabetes['target'].astype(np.float32) 
    # 本来訓練データで標準化・正規化して、そのパラメータをテストデータに適用すべき
    if normalize and scale:
        raise Exception('both normalize and scale can not be True')
    if normalize:
        data = preprocessing.normalize(data)
        target = preprocessing.normalize(target)
    if scale:
        data = preprocessing.scale(data)
        target = preprocessing.scale(target)
    # split into train and test
    x_train, x_test = np.split(data, [train_size])
    y_train, y_test = np.split(target, [train_size])
    assert len(x_train)==len(y_train)
    assert len(x_test)==len(y_test)
    return  ((x_train, y_train), (x_test, y_test), 
        {"SHAPE_TRAIN_X":x_train.shape,
          "SHAPE_TRAIN_Y":y_train.shape,
          "SHAPE_TEST_X":x_test.shape,
          "SHAPE_TEST_Y":y_test.shape,
          })
Example #6
 def standardize(self):
     """
     impute
     """
     print('Standardization')
     self.tr = scale(self.tr)
     self.te = scale(self.te)
Example #7
def test_scale_function_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always of zero
    X_csr = sp.csr_matrix(X)

    X_scaled = scale(X, with_mean=False)
    assert_false(np.any(np.isnan(X_scaled)))

    X_csr_scaled = scale(X_csr, with_mean=False)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    # test csc has same outcome
    X_csc_scaled = scale(X_csr.tocsc(), with_mean=False)
    assert_array_almost_equal(X_scaled, X_csc_scaled.toarray())

    # raises value error on axis != 0
    assert_raises(ValueError, scale, X_csr, with_mean=False, axis=1)

    assert_array_almost_equal(X_scaled.mean(axis=0),
                              [0., -0.01,  2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is not X)

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
Example #8
def split_into_chunks(data, train, predict, step, binary=True, scale=True):
    X, Y = [], []
    for i in range(0, len(data), step):
        try:
            x_i = data[i:i+train]
            y_i = data[i+train+predict]
            
            # Use it only for daily return time series
            if binary:
                if y_i > 0.:
                    y_i = [1., 0.]
                else:
                    y_i = [0., 1.]

                if scale: x_i = preprocessing.scale(x_i)
                
            else:
                timeseries = np.array(data[i:i+train+predict])
                if scale: timeseries = preprocessing.scale(timeseries)
                x_i = timeseries[:-1]
                y_i = timeseries[-1]
            
        except:
            break

        X.append(x_i)
        Y.append(y_i)

    return X, Y
Example #9
    def __init__(self, data, labels, validation_data=None, validation_labels=None,
                 hidden_layer_size=0, loss_function="mean-squared-error",
                 learning_rate=1.0, decreasing_rate=False):
        self.input_layer_size = data.shape[1]
        self.hidden_layer_size = hidden_layer_size
        self.output_layer_size = len(np.unique(labels))

        if loss_function not in ("mean-squared-error", "cross-entropy"):
            raise ValueError("Loss function must be 'mean-squared-error' or 'cross-entropy'.")

        self.loss_function = loss_function

        self.data = scale(data)
        self.labels = labels

        self.Y = np.zeros((data.shape[0], self.output_layer_size))
        for i in range(data.shape[0]):
            label_i = labels[i]
            self.Y[i][label_i] = 1

        self.learning_rate = learning_rate
        self.decreasing_rate = decreasing_rate

        if validation_data is not None:
            self.validation_data = scale(validation_data)
        else:
            self.validation_data = None

        self.validation_labels = validation_labels
def trainModel():
    # Model parameters
    W = tf.Variable([.1000], tf.float32)
    b = tf.Variable([-.1000], tf.float32)
    # Model input and output
    x = tf.placeholder(tf.float32, shape=None)
    linear_model = W * x + b
    y = tf.placeholder(tf.float32)
    # loss
    loss = tf.reduce_sum(tf.square(linear_model - y))  # sum of the squares
    # optimizer
    optimizer = tf.train.GradientDescentOptimizer(0.01)
    train = optimizer.minimize(loss)
    # training data
    x_train = preprocessing.scale(mouseClickX)
    y_train = preprocessing.scale(mouseClickY)
    init = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init)  # reset values to wrong
    for i in range(500):
        sess.run([train], {x: x_train, y: y_train})
        if i % 50 == 0:
            # to visualize the result and improvement
            try:
                ax.lines.remove(lines[0])
            except Exception:
                pass
            print(x_train, y_train, i)
            prediction_value = sess.run(linear_model, feed_dict={x: mouseClickX})
            # plot the prediction
            lines = ax.plot(mouseClickX, prediction_value, 'r-', lw=5)
            plt.pause(1)
Example #11
def fold_score_keras(events_A, events_B, model_A, df_A, df_B):
    """Returns scored events_B for a BDT_A."""

    # Get indices, train weights and classes for each of these splits.
    # w and Y need to be numpy arrays to work with skl.
    w_A = np.array([a.train_weight for a in events_A])
    w_B = np.array([a.train_weight for a in events_B])
    Y_A = np.array([a.classification for a in events_A])
    Y_B = np.array([a.classification for a in events_B])

    # Index our X training sets by row; convert to ndarrays.
    X_A = df_A.as_matrix()
    X_B = df_B.as_matrix()

    # Scale for the NN.
    X_A = scale(X_A)
    X_B = scale(X_B)

    # Fit model.
    model_A.fit(X_A, Y_A, sample_weight=w_A, validation_data=(X_B, Y_B, w_B),
                nb_epoch=1000, batch_size=32, callbacks=[EarlyStopping(patience=50)])
    model_A.save(datetime.now().strftime('%d%m%y_%H%S') + '_kerasmodel.h5')


    # Get scores of X_A for BDT_B and vice-versa.
    prob_tuples = model_A.predict_proba(X_B).tolist()
    # Only want the second element of the prob tuple (prob of signal).
    scores = [a[0] for a in prob_tuples]

    for e, s in zip(events_B, scores):
        e.set_decision_value(s)

    return events_B
def buildModel(size):
	with open('Sentiment Analysis Dataset.csv', 'rb') as csvfile:
		pos_tweets =[]
		neg_tweets =[]
		spamreader = csv.reader(csvfile, delimiter=',')
		for row in spamreader:
			if row[1] == '1':
				if not (len(pos_tweets) > size):
					pos_tweets.append(_cleanTweet(row[3]))
			else:
				if not (len(neg_tweets) > size):
					neg_tweets.append(_cleanTweet(row[3]))
	y = np.concatenate((np.ones(len(pos_tweets[0:size])), np.zeros(len(neg_tweets[0:size]))))
	x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos_tweets[0:size], neg_tweets[0:size])), y, test_size=0.2)
	x_train = _cleanText(x_train)
	x_test = _cleanText(x_test)
	n_dim = 100
	#Initialize model and build vocab
	imdb_w2v = Word2Vec(size=n_dim, min_count=10)
	imdb_w2v.build_vocab(x_train)
	imdb_w2v.train(x_train)
	train_vecs = np.concatenate([buildWordVector(z, n_dim,imdb_w2v) for z in x_train])
	train_vecs = scale(train_vecs)
	#Train word2vec on test tweets
	imdb_w2v.train(x_test)
	#Build test tweet vectors then scale
	test_vecs = np.concatenate([buildWordVector(z, n_dim,imdb_w2v) for z in x_test])
	test_vecs = scale(test_vecs)
	lr = SGDClassifier(loss='log', penalty='l1')
	lr.fit(train_vecs, y_train)
	imdb_w2v.save("imdb_w2v")
	f = open("Accuracy.txt","w")
	f.write(str(lr.score(test_vecs, y_test))+" "+str(size*2))
	f.close()
Example #13
def main():
    indata = np.load(inputs)
    training_data = indata['data_training']
    training_scaled = preprocessing.scale(training_data)
    training_labels = indata['label_training']
    validation_data = indata['data_val']
    validation_scaled = preprocessing.scale(validation_data)
    validation_labels = indata['label_val']
    ts = range(-12,6)
    cs = [pow(10, t) for t in ts]
    accuracy_results = []
    accuracy_results_scaled = []

    for c in cs:
        lin_clf = svm.LinearSVC(C=c)
        lin_clf.fit(training_data, training_labels)
        predictions = lin_clf.predict(validation_data)
        accuracy = metrics.accuracy_score(validation_labels, predictions)
        accuracy_results.append(accuracy)

        lin_clf.fit(training_scaled, training_labels)
        predictions = lin_clf.predict(validation_scaled)
        accuracy_scaled = metrics.accuracy_score(validation_labels, predictions)
        accuracy_results_scaled.append(accuracy_scaled)

    plt.plot(range(len(cs)), accuracy_results, label='un-scaled')
    plt.plot(range(len(cs)), accuracy_results_scaled, label='scaled')
    plt.xticks(range(len(cs)), cs, size='small')
    plt.legend()
    plt.show()
    print accuracy_results
    print accuracy_results_scaled
Example #14
    def run(self):
        roi_data = []
        seg_data = []

        provider_roi = self.roi_layer.dataProvider()
        provider_seg = self.seg_layer.dataProvider()

        feat_seg = QgsFeature()

        self.status.emit('building spatial index')
        time.sleep(0.3)
        index = QgsSpatialIndex()
        piter = 0
        feat_count = provider_seg.featureCount()
        for f in provider_seg.getFeatures():
            seg_data.append(f.attributes()[1:])
            index.insertFeature(f)
            piter += 1
            self.progress.emit(piter * 15 / feat_count)


        self.status.emit('extracting attributes')
        self.log.emit('extracting attributes from roi segments intersection')
        time.sleep(0.3)
        # intersect roi with segments and extract attributes
        piter = 0
        feat_count = provider_roi.featureCount()
        for feat_roi in provider_roi.getFeatures():
            geom = feat_roi.geometry()
            attr_roi = feat_roi.attributes()
            intersects = index.intersects(geom.boundingBox())
            for fid in intersects:
                ffilter = QgsFeatureRequest().setFilterFid(int(fid))
                provider_seg.getFeatures(ffilter).nextFeature(feat_seg)
                # filter geometries that do not intersect
                if geom.intersects(feat_seg.geometry()):
                    attr_seg = feat_seg.attributes()
                    roi_data.append(attr_seg[1:] + attr_roi)
            # emit progress
            piter += 1
            self.progress.emit(15 + (piter * 55 / feat_count))

        # read train data
        roi_data = np.array(roi_data)
        samples = roi_data[:,:-1]
        labels = roi_data[:,-1].astype(int)
        # svm fit and predict
        self.status.emit('svm: fitting data')
        time.sleep(0.3)
        classifier = svm.SVC(**self.svm_dict)
        classifier.fit(preprocessing.scale(samples), labels)
        self.progress.emit(85)

        self.status.emit('svm: predicting labels')
        time.sleep(0.3)
        seg_data = preprocessing.scale(seg_data)
        predictions = classifier.predict(seg_data).tolist()
        self.progress.emit(100)

        self.output = pickle.dumps(predictions)
def load_dataset(fname="../data/housing/housing.data",cols=(0,)):
  X = np.genfromtxt(fname,usecols=cols,delimiter = ',')
  #X = np.genfromtxt(fname,usecols=cols)

  num_features = X.shape[1]
  num_triplets = int(6*num_features*(num_features-1)*(num_features-2)/6);
    
  triplets = np.zeros((num_triplets,4*wx.shape[1]))

  print ':: loading dataset...please wait!'
  l = 0
  for i in range(num_features-2):
    for j in range(i+1,num_features-1):
      for k in range(j+1,num_features):
              
        permute_idx = itertools.permutations([i,j,k])
        for idx in permute_idx:
          x = scale(np.array(X[:,idx[0]]))[:,np.newaxis]
          y = scale(np.array(X[:,idx[1]]))[:,np.newaxis]
          z = scale(np.array(X[:,idx[2]]))[:,np.newaxis]
          
          triplets[l,:] = f3(x,y,z,np.hstack((x,y,z)))
          l = l + 1
          
  return (triplets,num_features,num_triplets)
def generate_X_y_arrays(f_train_set='%s/train_set.csv' % (data_path)):
    """
    Generate the classifier's training set X and label set y

    Args:
        f_train_set: CSV file containing the training set
    Returns:
        X: training samples, size=[n_samples, n_features]
        y: class labels, size=[n_samples, 1]
    """
    from sklearn import preprocessing
    import numpy as np
    X = []
    y = []

    with open(f_train_set, 'r') as fin:
        fin.readline()  # skip the header line
        for line in fin:
            cols = line.strip().split(',')
            X.append([float(i) for i in cols[1:]])
            y.append(int(cols[0]))  # the tag is in the first column: 0 or -1

    logger.debug('classifier input X_size=[%s, %s] y_size=[%s, 1]' % (len(X), len(X[0]), len(y)))
    X = preprocessing.scale(np.array(X))
    y = preprocessing.scale(np.array(y))
    return X, y
def load_qm7():
    datafile = '/home/hpc/pr63so/ga93yih2/gdb13/gdb13_atm.pkl'
    dataset = pickle.load(open(datafile, 'r'))
    split = 1
    P = dataset['P'][range(0, split)+ range(split+1, 5)].flatten()
    X = dataset['B'][P]
    Z = dataset['T'][P]
    Z = Z.reshape(Z.shape[0], 1)
    train_labels = Z
    Ptest = dataset['P'][split]
    TX = dataset['B'][Ptest]
    TZ = dataset['T'][Ptest]
    TZ = TZ.reshape(TZ.shape[0], 1)
    test_labels = TZ
    Z = scale(Z, axis=0)
    TZ = scale(TZ, axis=0)

    mean = X.mean(axis=0)
    std = (X - mean).std()

    X = (X - mean) / std
    TX = (TX - mean)/ std


    return X, Z, TX, TZ, train_labels, test_labels
def feature_scale(data, method):
    '''
    Feature scaling (making features dimensionless).
    Common approaches are standardization and interval (min-max) scaling.

    Standardization assumes the feature values follow a normal distribution;
    after standardization they follow a standard normal distribution.
    Min-max scaling uses the boundary values to rescale each feature into a
    fixed range, e.g. [0, 1].

    Standardization operates column-wise on the feature matrix, converting the
    features to z-scores so that they share a common scale.
    Normalization operates row-wise on the feature matrix; its goal is that
    sample vectors share a common standard (i.e. become "unit vectors") when
    computing similarity via dot products or other kernel functions.

    Workflow: do feature selection and split the train/test sets first, then apply this step.
    '''

    if method == 'scale':
        from sklearn.preprocessing import scale
        return scale(data)

    elif method == 'standard':
        # 1. Standardization; returns the standardized data
        from sklearn.preprocessing import StandardScaler
        return StandardScaler().fit_transform(data)
    elif method == 'minmax':
        # 2. Min-max scaling; returns data scaled to the [0, 1] range
        from sklearn.preprocessing import MinMaxScaler
        return MinMaxScaler().fit_transform(data)
    elif method == 'normal':
        # 3. Normalization; returns the normalized data
        from sklearn.preprocessing import Normalizer
        return Normalizer().fit_transform(data)
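A possible call pattern for the helper above, as a minimal sketch (the `train_features` array below is hypothetical):

import numpy as np

# Hypothetical usage: standardize a small random feature matrix with the
# 'standard' branch; the other method names work the same way.
train_features = np.random.rand(100, 5)
scaled_train = feature_scale(train_features, method='standard')
print(scaled_train.mean(axis=0), scaled_train.std(axis=0))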
Example #19
def getImages():
   digitsImagesNormalized = getImagesFromDir(digitsPath)
   lettersImagesNormalized = getImagesFromDir(lettersPath)

   digitsImagesNormalized = [skpre.scale(digitsImagesNormalized[0]), digitsImagesNormalized[1]]
   lettersImagesNormalized = [skpre.scale(lettersImagesNormalized[0]), lettersImagesNormalized[1]]

   allImages = []
   for i in digitsImagesNormalized[0]:
      allImages.append(i)

   for i in lettersImagesNormalized[0]:
      allImages.append(i)

   # Split into test and train.
   # Compute PCA - dimensionality reduction of the data. :)
   pca = computePCA(allImages)
   digitstransformedData = pca.transform(digitsImagesNormalized[0])
   letterstransformedData = pca.transform(lettersImagesNormalized[0])

   dtrainDataTF, dtestDataTF, dclassesTrainTF, dclassesTestTF = train_test_split(digitstransformedData, digitsImagesNormalized[1], train_size=0.65)

   ltrainDataTF, ltestDataTF, lclassesTrainTF, lclassesTestTF = train_test_split(letterstransformedData, lettersImagesNormalized[1], train_size=0.65)
   
   return [[dtrainDataTF, dclassesTrainTF], [dtestDataTF, dclassesTestTF]], [[ltrainDataTF, lclassesTrainTF], [ltestDataTF, lclassesTestTF]]
def scaled_logistic_regression(x_train, t_train, x_test, t_test):

	 x_train_new = preprocessing.scale(x_train)

	 x_test_new = preprocessing.scale(x_test)
	 
	 return logistic_regression(x_train_new, t_train, x_test_new, t_test)
Example #21
def extractFeatures(data, n):
    logging.info('Features: extracting {0}...'.format(n))

    # create DF
    columns = []
    col_names = ['open', 'high', 'low', 'close', 'volume']
    for col_name in col_names:
        for m in xrange(1, n+1):
            columns.append('{0}_{1}'.format(col_name, m))
    # pprint(columns)
    df = pd.DataFrame(dtype=float, columns=columns)

    pb = ProgressBar(maxval=len(data)).start()
    for i in xrange(n, len(data)+1):
        pb.update(i)
        slice = data.ix[i-n:i]
        # print slice
        scale(slice, axis=0, copy=False)
        # print slice
        cntr = 0
        item = {}
        for slice_index, slice_row in slice.iterrows():
            cntr += 1
            # print slice_index
            # print slice_row
            for col in slice.columns:
                item['{0}_{1}'.format(col, cntr)] = slice_row[col]
        # pprint(item)
        df.loc[i] = item
        # break
    pb.finish()

    logging.info('Features: extracted')
    return df
Example #22
def main():

	X, Y, X_test = import_data()

	X_n = preprocessing.scale(X)
	X_t_n = preprocessing.scale(X_test)

	X_train, X_test, y_train, y_test = cross_validation.train_test_split( \
	X_n, Y, test_size=0.2, random_state=0)

	alpha = np.arange(0.001, 2.0, 0.001, np.float)

	best_alpha = 0
	best_score = 0

	for a in alpha:
		clf = linear_model.Ridge (alpha = a)
		clf.fit(X_train, y_train)
		sc = clf.score(X_test, y_test)
		if sc > best_score:
			best_alpha = a
			best_score = sc

	
	clf = linear_model.Ridge (alpha = best_alpha)
	clf.fit(X_train, y_train)
	res = clf.predict(X_t_n)

	for var in res:
		print(var[0])
Example #23
def main():
    """TODO: Docstring for main.
    :returns: TODO

    """
    alpha = 1.
    decay = 0.0006
    iter_num = 600
    finetune_iter = 220
    hyper_params = {
            'hidden_layers_sizes':[196,], 'iter_nums':[400,],
            'alphas':[1.,], 'decays':[0.003,],
            'betas':[3,], 'rhos':[0.1,]
            }

    enc = OneHotEncoder(sparse=False)
    mnist = fetch_mldata('MNIST original', data_home='./')
    x_train, x_test, y_train, y_test = \
            train_test_split(scale(mnist.data.astype(float)).astype('float32'),
                             mnist.target.astype('float32'),
                             test_size=0.5, random_state=0)
    x_unlabeled = scale(mnist.data[mnist.target>=5,:].astype(float)).astype('float32')
    y_train = enc.fit_transform(y_train.reshape(y_train.shape[0],1)).astype('float32')

    t_x = T.matrix()
    params, extracted = pretrain_sae(x_unlabeled, hyper_params)
    extracted = function(inputs=[t_x], outputs=[sae_extract(t_x, params)])(x_train)[0]
    params.append(train_softmax(extracted, y_train, iter_num, alpha, decay))
    weights = finetune_sae(x_train, y_train, params, finetune_iter, alpha, decay)

    all_label = np.array(range(0, 10))
    pred = all_label[softmax2class_max(sae_predict(x_test, weights))]
    print accuracy_score(y_test, pred)
    print classification_report(y_test, pred)
    print confusion_matrix(y_test, pred)
Example #24
    def get_correlation_data(self, round_number, liste_id, dataset):
        points = []

        # First, retrieve the vote percentages for the given list
        poll_data = self.retrieve_total_votes_for_liste(round_number, liste_id)

        # store the data in a clean dict
        data_x, data_y = [],[]
        for dept_data in poll_data:
            data_x.append(dept_data["vote_percentage"])
            data_y.append(dataset[dept_data["_id"]] / 100)
            points.append({"dept_id" : dept_data["_id"],
                           "votes_percentage" : dept_data["vote_percentage"],
                           "other_percentage" : dataset[dept_data["_id"]] / 100})

        array_x, array_y = array(data_x), array(data_y)

        # normalize the vote data and the dataset values
        rescaled_x, rescaled_y  = preprocessing.scale(array_x), preprocessing.scale(array_y)

        # compute the colors for each department
        colors, max_val = self._compute_colors(rescaled_x, rescaled_y)

        # on the non-normalized data, compute the coefficients of the regression line
        reg_slope, reg_y_intercept = self._linear_regression(array_x, array_y)

        for i, x in enumerate(rescaled_x):
            points[i]["votes_normalized"] = rescaled_x[i]
            points[i]["other_normalized"] = rescaled_y[i]
            points[i]["color"] = colors[i]

        return {"points" : points,
                "graph_metadata": {"max" : max_val,
                                   "regression": {"slope" : reg_slope,
                                                  "intercept" : reg_y_intercept}}}
Example #25
File: base.py Project: mikimaus78/groupNMF
def load_all_data(f_name, scale=True, rnd=False):
    """Get data with labels, split into training, validation and test set."""
    data_file = h5py.File(f_name, 'r')
    x_test = data_file['x_test'][:]
    x_dev = data_file['x_dev'][:]
    x_train = data_file['x_train'][:]
    data_file.close()
    if scale:
        print "scaling..."
        x_test = preprocessing.scale(x_test, with_mean=False)
        x_dev = preprocessing.scale(x_dev, with_mean=False)
        x_train = preprocessing.scale(x_train, with_mean=False)
    print "Total dataset size:"
    print "n train samples: %d" % x_train.shape[0]
    print "n test samples: %d" % x_test.shape[0]
    print "n dev samples: %d" % x_dev.shape[0]
    print "n features: %d" % x_test.shape[1]
    if rnd:
        print "Radomizing training set..."
        np.random.shuffle(x_train)

    return dict(
        x_train=x_train,
        x_test=x_test,
        x_dev=x_dev,
    )
Example #26
File: tools.py Project: helloTC/ATT
def permutation_cross_validation(estimator, X, y, n_fold=3, isshuffle=True, cvmeth='shufflesplit', score_type='r2', n_perm=1000):
    """
    An easy way to evaluate the significance of a cross-validated score by permutations
    -------------------------------------------------
    Parameters:
        estimator: linear model estimator
        X: IV
        y: DV
        n_fold: fold number cross validation
        cvmeth: kfold or shufflesplit. 
                shufflesplit is the random permutation cross-validation iterator
        score_type: scoring type, 'r2' as default
        n_perm: permutation numbers
    Return:
        score: model scores
        permutation_scores: model scores when permutation labels
        pvalues: p value of permutation scores
    """
    try:
        from sklearn import cross_validation, preprocessing
    except ImportError:
        raise Exception('To call this function, please install sklearn')
    if X.ndim == 1:
        X = np.expand_dims(X, axis = 1)
    if y.ndim == 1:
        y = np.expand_dims(y, axis = 1)
    X = preprocessing.scale(X)
    y = preprocessing.scale(y)
    if cvmeth == 'kfold':
        cvmethod = cross_validation.KFold(y.shape[0], n_fold, shuffle = isshuffle)
    elif cvmeth == 'shufflesplit':
        testsize = 1.0/n_fold
        cvmethod = cross_validation.ShuffleSplit(y.shape[0], n_iter = 100, test_size = testsize, random_state = 0)
    score, permutation_scores, pvalues = cross_validation.permutation_test_score(estimator, X, y, scoring = score_type, cv = cvmethod, n_permutations = n_perm)
    return score, permutation_scores, pvalues
def try_lvc_clf(train_X,train_y,test_X,test_y):

    train_X=scale(train_X)

    lvc=LinearSVC(C=0.1)
    lvc.fit(train_X,train_y)
    
    dec_y=lvc.decision_function(train_X)
    
    #keep the 80% of samples with the smallest |decision value|
    num_sel=int(len(dec_y)*0.8)
    assert len(dec_y)==train_X.shape[0]
    assert num_sel<=train_X.shape[0]
    
    s_idx=np.argsort(np.abs(dec_y))
    
    assert len(s_idx)==train_X.shape[0]

    for i in s_idx:
        if np.isnan(train_y[i])==True:
            print("smoking index:%s"%i)

    n_train_X=train_X[s_idx[0:num_sel],:]
    n_train_y=train_y[s_idx[0:num_sel]]


    n_train_X=scale(n_train_X)

    lvc.fit(n_train_X,n_train_y)
    

    test_X=scale(test_X)
    pred_y=lvc.predict(test_X)
    return pred_y
Example #28
def get_feature_importances(data_table, obs_metadata, lines_table, use_con_flux=False):
    feature_importances_list = []
    X_colnames = None
    for line_name, line_wavelength in lines_table['source', 'wavelength_target']:
        subset = data_table[(data_table['source'] == line_name) & (data_table['wavelength_target'] == line_wavelength)]
        X, y, labels = get_X_and_y(subset, obs_metadata, use_con_flux)
        if X_colnames is None:
            X_colnames = X.colnames

        params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 1,
                'learning_rate': 0.01, 'loss': 'lad'}
        clf = ensemble.GradientBoostingRegressor(**params)
        X = ndarrayidze(X)

        # Scaling is optional, but I think I'm going to do it (for now) for all methods,
        # just so that when comparing between values here and with e.g. ICA there are fewer diffs
        X = skpp.scale(X)
        y = skpp.scale(y)

        clf.fit(X, y)
        feature_importances_list.append(clf.feature_importances_)

    fi = np.array(feature_importances_list)
    fi_table = Table(fi, names = X_colnames)
    fi_table.add_column(lines_table['source'])
    fi_table.add_column(lines_table['wavelength_target'])

    return fi_table
Example #29
	def scale(self):
		# FIXME: this cannot work this way, scaling must be done with
		# the joined set.
		if self.X is not None:
			self.X = preprocessing.scale(self.X)
		if self.X_test is not None:
			self.X_test = preprocessing.scale(self.X_test)
from sklearn import preprocessing
import numpy as np

data= np.array([[2.2,5.9,-1.8],[5.4,-3.2,-5.1],[-1.9,4.2, 3.2]])
data
bindata=preprocessing.Binarizer(threshold=1.5).transform(data)
bindata


#Mean removal

data.mean(axis=0)#array([ 1.9       ,  2.3       , -1.23333333])
data.std(axis=0)#highly variable array([2.98775278, 3.95052739, 3.41207008])

#so,
scaled_data=preprocessing.scale(data)
scaled_data.mean(axis=0)#array([0.00000000e+00, 0.00000000e+00, 7.40148683e-17])
scaled_data.std(axis=0)#array([1., 1., 1.])

#scaling
#work with same data
data

minmax_scaler=preprocessing.MinMaxScaler(feature_range=(0, 1))
data_minmax=minmax_scaler.fit_transform(data)
data_minmax


#Normalization
#bringing the values of each feature vector on a common scale
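The snippet above announces normalization but stops before showing it; a minimal sketch completing that step on the same `data` array could be:

# Normalization rescales each row (feature vector) to unit norm, giving all
# samples a common scale.
data_normalized_l1 = preprocessing.normalize(data, norm='l1')
data_normalized_l2 = preprocessing.normalize(data, norm='l2')
data_normalized_l1
data_normalized_l2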
Example #31
# k means clustering for hand written digits classification
# on dataset from sklearn
# learning ML with https://www.techwithtim.net/tutorials/machine-learning-python/k-means-1/

import numpy as np
import sklearn
from sklearn.preprocessing import scale
from sklearn.datasets import load_digits
from sklearn.cluster import KMeans
from sklearn import metrics

digits = load_digits()
# scale() standardizes each feature to zero mean and unit variance (not a fixed -1..1 range)
data = scale(digits.data)
y = digits.target

k = 10
samples, features = data.shape


def bench_k_means(estimator, name, data):
    estimator.fit(data)
    print('%-9s\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' %
          (name, estimator.inertia_,
           metrics.homogeneity_score(y, estimator.labels_),
           metrics.completeness_score(y, estimator.labels_),
           metrics.v_measure_score(y, estimator.labels_),
           metrics.adjusted_rand_score(y, estimator.labels_),
           metrics.adjusted_mutual_info_score(y, estimator.labels_),
           metrics.silhouette_score(
               data, estimator.labels_, metric='euclidean')))
print("Starting preprocessing filtered tweets")
tweets_filtered['ekphrasis_text'] = tweets_filtered['text'].progress_apply(ekphrasis_preprocessing)
print('time taken:', str(time.time() - start_time), 'seconds')


import os
#print('GENSIM_DATA_DIR', os.environ['GENSIM_DATA_DIR'] )
    
start_time = time.time()
print('loading glove')
glove_twitter = api.load("glove-twitter-200")
print('time taken:', str(time.time() - start_time), 'seconds')    

start_time = time.time()
print('calculating embeddings')    
filtered_data_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in tqdm(tweets_filtered["text"])]))
print('time taken:', str(time.time() - start_time), 'seconds')
print('per tweet:', (time.time() - start_time)/tweets_filtered.shape[0], 'seconds')


for column in ["is_unemployed", "lost_job_1mo", "job_search", "is_hired_1mo", "job_offer"]:

    print('\n\n!!!!!', column)

#     start = time.time()
#     learner = create_model(column, best_epochs[column])
#     print('load model:', str(time.time() - start_time), 'seconds')

#     print('Predictions of Filtered Tweets:')
#     start_time = time.time()
#     predictions_filtered = learner.predict_batch(tweets_filtered['text'].values.tolist())
Example #33
    plt.axis('tight')
    plt.xlabel('log alpha')
    plt.ylabel('coefficients')
    plt.title('coefficient trajectories for ' + name +
              ' regression at each alpha value')


''' 
LASSO regression
'''

lasso = Lasso(max_iter=10000, normalize=True)
coefs = []
for a in alphas:
    lasso.set_params(alpha=a)
    lasso.fit(scale(X), y)
    coefs.append(lasso.coef_)
plot_coefs(coefs, 'LASSO')

lassocv = LassoCV(alphas=None, cv=10, max_iter=100000, normalize=True)
lassocv.fit(X, y)
LASSO_best_alpha = lassocv.alpha_
lasso.set_params(
    alpha=LASSO_best_alpha
)  # fit LASSO regression with best alpha value after perform 10 folds CV
lasso.fit(X, y)
best_LASSO_MSE = mean_squared_error(y, lasso.predict(X))
LASSO_best_coefs = pd.Series(lasso.coef_, index=X.columns)
print("Best coefficients for LASSO regression: \n", LASSO_best_coefs)
'''
Ridge regression 
Example #34
def X_transcriptome(base,index,standardize=True):
    X = pd.read_pickle(base + r'/trscr.pkl')
    if standardize:
        preprocessing.scale(X,copy=False)
    return X.loc[index]
Example #35
forecast_col = 'Adj. Close'
df.fillna(-999999, inplace=True)


#Predict data 1% of the length of the dataframe in advance.
forecast_out = int(math.ceil(0.01*len(df)))
#print("Days in advance: "+ str(forecast_out))

df['label'] = df[forecast_col].shift(-forecast_out)
df.dropna(inplace=True)


#Features column, drop the label in our data set.
X = np.array(df.drop(['label'], 1))
y = np.array(df['label'])

X = preprocessing.scale(X) #Increases processing time.
y = np.array(df['label'])

#print("x is this long: "+ str(len(x)))
#print("Y is this long: " + str(len(y)))

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.2)

clf = LinearRegression()
clf.fit(X_train, y_train)

accuracy = clf.score(X_test, y_test)

print(accuracy)
Example #36
]]

# Lesson 3 - Replace Nan data to -9999. Create label and forecast out.
forecast_col = 'Adj. Close'
training_data.fillna(-9999, inplace=True)

forecast_out = int(math.ceil(0.01 * len(training_data)))

training_data['label'] = training_data[forecast_col].shift(-forecast_out)
training_data.dropna(inplace=True)

# Lesson 3-4 - Regression training and testing.

training_data.dropna(inplace=True)
X = np.array(training_data.drop(['label'], 1))
X = preprocessing.scale(X)
X_lately = X[-forecast_out:]
X = X[:-forecast_out]

training_data.dropna(inplace=True)
y = np.array(training_data['label'])
y_lately = y[-forecast_out:]
y = y[:-forecast_out]

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2)

# Linear regression
classifier = LinearRegression(n_jobs=-1)
classifier.fit(X_train, y_train)
accuracy = classifier.score(X_test, y_test)
Example #37
File: xgboost_pre.py Project: cycle13/moji
        if file[:3] == 'sfc' and file[-5:] == '.grib':
            inputfile = os.path.join(rootpath, file)

            sfcfile = Nio.open_file(inputfile, 'r')

            # Index 0 refers to the forecast at time step 0; here this is just the list of 2000 stations from one file.
            GetStationsAndOnetimesFromEC(ll, sfc_varinames, sfcfile, inputfile)
#training set
stationArray = numpy.array(stationsVlist)
#prediction labels
trainlebelArray = numpy.array(trainlebellist)
a_train, a_test = train_test_split(stationArray,
                                   test_size=0.33,
                                   random_state=7)
#standardize the data before training
x_scaled = preprocessing.scale(stationArray)
stationArray = x_scaled
#xgboost: split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(stationArray,
                                                    trainlebelArray,
                                                    test_size=0.33,
                                                    random_state=7)
xgbtrain = xgboost.DMatrix(x_train, label=y_train)
xgbtest = xgboost.DMatrix(x_test, label=y_test)
#xgbtrain.save_binary('train.buffer')
#print len(x_train),len(x_test),len(y_train),len(y_test)
#print xgbtest
#error rates on the training and validation sets
watchlist = [(xgbtrain, 'xgbtrain'), (xgbtest, 'xgbeval')]
params = {
    'booster': 'gbtree',
Example #38
    kn = KNeighborsClassifier(n_neighbors=k)
    kn.fit(X, Y)
    array = cross_val_score(estimator=kn, X=X, y=Y, cv=kf, scoring='accuracy')
    m = array.mean()
    kMeans.append(m)

m = max(kMeans)
indices = [i for i, j in enumerate(kMeans) if j == m]

print(indices[0] + 1)
print(np.round(m, decimals=2))

# Scale the features with sklearn.preprocessing.scale.
# Find the optimal k again with cross-validation.

X_scale = scale(X)

kMeans = list()
for k in range(1, 51):
    kn = KNeighborsClassifier(n_neighbors=k)
    array = cross_val_score(estimator=kn,
                            X=X_scale,
                            y=Y,
                            cv=kf,
                            scoring='accuracy')
    m = array.mean()
    kMeans.append(m)

# Which value of k turned out to be optimal after bringing the features to a common scale?

m = max(kMeans)
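# Sketch completing the question above, mirroring the unscaled case earlier in
# this example: report the best k and its cross-validated accuracy after scaling.
indices = [i for i, j in enumerate(kMeans) if j == m]
print(indices[0] + 1)
print(np.round(m, decimals=2))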
num_one_targets = int(np.sum(targets_all))
zero_targets_counter = 0
indices_to_remove = []

for i in range(targets_all.shape[0]):
    if targets_all[i] == 0:
        zero_targets_counter +=1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i)

unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis = 0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis = 0)
# Standardize the Data

scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)

# Shuffle the Data

shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]

# Splitting the Dataset into Train, Validation, Testing Dataset

samples_count = shuffled_inputs.shape[0]
train_samples_count = int(0.8*samples_count)
validation_samples_count = int(0.1*samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count
Example #40
# What features may distinguish cities? based on business sense and exploratory analysis

num_list = [
    'duration', 'days_in_advance', 'orig_destination_distance', 'is_mobile',
    'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt'
]
city_data = sample.dropna(axis=0)[num_list + ['user_location_city']]
city_groups = city_data.groupby(
    'user_location_city').mean().reset_index().dropna(axis=0)

# Step 2: shall I standardise the data?
# What is the magnitude of data range?

city_groups_std = city_groups.copy()
for i in num_list:
    city_groups_std[i] = preprocessing.scale(city_groups_std[i])

# Step 3: select clustering method and number of clusters
# The Elbow methods? choose a K so that the sum of the square error of the distances decrease drastically
# using an ad-hoc k=3 here, there are methods to help derive the optimal number for k
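# Sketch (not part of the original example): an elbow-style scan that records the
# KMeans inertia (within-cluster sum of squared distances) for a range of k values;
# the "elbow" of this curve suggests a reasonable number of clusters.
inertias = []
for k_try in range(1, 11):
    km_try = cluster.KMeans(n_clusters=k_try, max_iter=300, random_state=None)
    inertias.append(km_try.fit(city_groups_std[num_list]).inertia_)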

km = cluster.KMeans(n_clusters=3, max_iter=300, random_state=None)
city_groups_std['cluster'] = km.fit_predict(city_groups_std[num_list])

# Principal Component Analysis
pca = decomposition.PCA(n_components=2, whiten=True)
pca.fit(city_groups[num_list])
city_groups_std['x'] = pca.fit_transform(city_groups_std[num_list])[:, 0]
city_groups_std['y'] = pca.fit_transform(city_groups_std[num_list])[:, 1]
plt.scatter(city_groups_std['x'],
            city_groups_std['y'],
Example #41
def l2_norm(x):
    # Note: despite its name, this divides each row by its standard deviation
    # (preprocessing.scale along axis=1 without centering), not by its L2 norm.
    return preprocessing.scale(x, axis=1, with_mean=False, with_std=True)
Example #42
def run_SAM(in_data,
            skeleton=None,
            is_mixed=False,
            device="cpu",
            train=10000,
            test=1,
            batch_size=-1,
            lr_gen=.001,
            lr_disc=.01,
            lambda1=0.001,
            lambda2=0.0000001,
            nh=None,
            dnh=None,
            verbose=True,
            losstype="fgan",
            functionalComplexity="n_hidden_units",
            sampletype="sigmoidproba",
            dagstart=0,
            dagloss=False,
            dagpenalization=0.05,
            dagpenalization_increase=0.0,
            categorical_threshold=50,
            linear=False,
            numberHiddenLayersG=2,
            numberHiddenLayersD=2,
            idx=0):

    list_nodes = list(in_data.columns)
    if is_mixed:
        onehotdata = []
        for i in range(len(list_nodes)):
            # print(pd.get_dummies(in_data.iloc[:, i]).values.shape[1])
            if pd.get_dummies(
                    in_data.iloc[:,
                                 i]).values.shape[1] < categorical_threshold:
                onehotdata.append(pd.get_dummies(in_data.iloc[:, i]).values)
            else:
                onehotdata.append(scale(in_data.iloc[:, [i]].values))
        cat_sizes = [i.shape[1] for i in onehotdata]

        data = np.concatenate(onehotdata, 1)
    else:
        data = scale(in_data[list_nodes].values)
        cat_sizes = None

    nb_var = len(list_nodes)
    data = data.astype('float32')
    data = th.from_numpy(data).to(device)
    if batch_size == -1:
        batch_size = data.shape[0]

    lambda1 = lambda1 / data.shape[0]
    lambda2 = lambda2 / data.shape[0]

    rows, cols = data.size()
    # Get the list of indexes to ignore
    if skeleton is not None:
        skeleton = th.from_numpy(skeleton.astype('float32'))

    sam = SAM_generators((batch_size, cols),
                         nh,
                         skeleton=skeleton,
                         cat_sizes=cat_sizes,
                         linear=linear,
                         numberHiddenLayersG=numberHiddenLayersG).to(device)

    sam.reset_parameters()
    g_optimizer = th.optim.Adam(list(sam.parameters()), lr=lr_gen)

    if losstype != "mse":
        discriminator = SAM_discriminator(
            cols,
            dnh,
            numberHiddenLayersD,
            mask=sam.categorical_matrix,
        ).to(device)
        discriminator.reset_parameters()
        d_optimizer = th.optim.Adam(discriminator.parameters(), lr=lr_disc)
        criterion = th.nn.BCEWithLogitsLoss()
    else:
        criterion = th.nn.MSELoss()
        disc_loss = th.zeros(1)

    if sampletype == "sigmoid":
        graph_sampler = SimpleMatrixConnection(len(list_nodes),
                                               mask=skeleton).to(device)
    elif sampletype == "sigmoidproba":
        graph_sampler = MatrixSampler(len(list_nodes),
                                      mask=skeleton,
                                      gumble=False).to(device)
    elif sampletype == "gumbleproba":
        graph_sampler = MatrixSampler(len(list_nodes),
                                      mask=skeleton,
                                      gumble=True).to(device)
    else:
        raise ValueError('Unknown Graph sampler')

    graph_sampler.weights.data.fill_(2)

    graph_optimizer = th.optim.Adam(graph_sampler.parameters(), lr=lr_gen)

    if not linear and functionalComplexity == "n_hidden_units":
        neuron_sampler = MatrixSampler((nh, len(list_nodes)),
                                       mask=False,
                                       gumble=True).to(device)
        neuron_optimizer = th.optim.Adam(list(neuron_sampler.parameters()),
                                         lr=lr_gen)

    _true = th.ones(1).to(device)
    _false = th.zeros(1).to(device)
    output = th.zeros(len(list_nodes), len(list_nodes)).to(device)

    data_iterator = DataLoader(data,
                               batch_size=batch_size,
                               shuffle=True,
                               drop_last=True)

    # RUN
    if verbose:
        pbar = tqdm(range(train + test))
    else:
        pbar = range(train + test)
    for epoch in pbar:
        for i_batch, batch in enumerate(data_iterator):

            if losstype != "mse":
                d_optimizer.zero_grad()

            # Train the discriminator

            drawn_graph = graph_sampler()

            if not linear and functionalComplexity == "n_hidden_units":
                drawn_neurons = neuron_sampler()

            if linear or functionalComplexity != "n_hidden_units":
                generated_variables = sam(batch, drawn_graph)
            else:
                generated_variables = sam(batch, drawn_graph, drawn_neurons)

            if losstype != "mse":
                disc_vars_d = discriminator(generated_variables.detach(),
                                            batch)
                true_vars_disc = discriminator(batch)

                if losstype == "gan":
                    disc_loss = sum([criterion(gen, _false.expand_as(gen)) for gen in disc_vars_d]) / nb_var \
                                     + criterion(true_vars_disc, _true.expand_as(true_vars_disc))
                    # Gen losses per generator: multiply by the number of channels
                elif losstype == "fgan":

                    disc_loss = th.mean(th.exp(disc_vars_d - 1), [0, 2]).sum(
                    ) / nb_var - th.mean(true_vars_disc)

                disc_loss.backward()
                d_optimizer.step()

            ### OPTIMIZING THE GENERATORS
            g_optimizer.zero_grad()
            graph_optimizer.zero_grad()

            if not linear and functionalComplexity == "n_hidden_units":
                neuron_optimizer.zero_grad()

            if losstype == "mse":
                gen_loss = criterion(generated_variables, batch)
            else:
                disc_vars_g = discriminator(generated_variables, batch)

                if losstype == "gan":
                    # Gen losses per generator: multiply by the number of channels
                    gen_loss = sum([
                        criterion(gen, _true.expand_as(gen))
                        for gen in disc_vars_g
                    ])
                elif losstype == "fgan":
                    gen_loss = -th.mean(th.exp(disc_vars_g - 1), [0, 2]).sum()

            filters = graph_sampler.get_proba()
            struc_loss = lambda1 * drawn_graph.sum()

            if linear:
                func_loss = 0
            else:
                if functionalComplexity == "n_hidden_units":
                    func_loss = lambda2 * drawn_neurons.sum()

                elif functionalComplexity == "l2_norm":
                    l2_reg = th.Tensor([0.]).to(device)
                    for param in sam.parameters():
                        l2_reg += th.norm(param)

                    func_loss = lambda2 * l2_reg

            regul_loss = struc_loss + func_loss

            # Optional: prune edges and sam parameters before dag search

            if dagloss and epoch > train * dagstart:
                dag_constraint = notears_constr(filters * filters)
                #dag_constraint = notears_constr(drawn_graph)

                loss = gen_loss + regul_loss + (
                    dagpenalization + (epoch - train * dagstart) *
                    dagpenalization_increase) * dag_constraint
            else:
                loss = gen_loss + regul_loss
            if verbose and epoch % 20 == 0 and i_batch == 0:
                pbar.set_postfix(gen=gen_loss.item() / cols,
                                 disc=disc_loss.item(),
                                 regul_loss=regul_loss.item(),
                                 tot=loss.item())

            if epoch < train + test - 1:
                loss.backward()

            if epoch >= train:
                output.add_(filters.data)

            g_optimizer.step()
            graph_optimizer.step()
            if not linear and functionalComplexity == "n_hidden_units":
                neuron_optimizer.step()

    return output.div_(test).cpu().numpy()
Example #43
    def classify(self, proto, df):
        from sklearn import preprocessing
        from sklearn.externals import joblib
        #from sklearn import preprocessing
        print('proto:', proto)
        #print(df)
        if proto == "tcp":
            tcp_packet = preprocessing.scale(
                df.drop(['ipv4src', 'ipv4dst'], axis=1))
            tcpclf = joblib.load('tcp_clf_kn.pkl')
            result = tcpclf.predict(tcp_packet)
            result = result[0].split(
                '_')  # result sth like 'http_norm_request'
            eth_tp = 0x0800  # ipv4
            ip_pt = 6  # tcp
            if result[1] == 'norm':  # for QoS, skip 1st 2nd HS
                flg = self.tcpFlg(
                    df.tcpFlgint
                )  # useless, convert the flg bit back to readable str
                self.countHS = self.countHS + 1  # count for the tcp handshake
                info = ''.join(
                    map(str,
                        (self.countHS, ') ', df.ipv4src.values[0], ':',
                         df.tcpSport.values[0], ' -> ', df.ipv4dst.values[0],
                         ':', df.tcpDport.values[0], flg.values[0], '(',
                         df.tcpFlgint.values[0], ')')))
                print(info)  # just print out the ip port tcpFlags
                if self.countHS < 5:  # look into first 4 initial handshakes, if not SYN(2) or SYN/ACK(18) or ACK(16) or FIN/ACK(1/17) or FIN/PSH/ACK(25)
                    if df.tcpFlgint.values[0] not in [
                            1, 2, 16, 18, 17, 24, 25
                    ]:
                        print('invalid 3 way handshake... blocked')
                        return (99, eth_tp, ip_pt, "bad", df.ipv4src.values[0],
                                df.ipv4dst.values[0], df.tcpSport.values[0],
                                df.tcpDport.values[0])
                if self.countHS > 4:
                    self.countHS = 0
                    return (1, eth_tp, ip_pt, 'norm', df.ipv4src.values[0],
                            df.ipv4dst.values[0], None, None)
                #if re.match(r'(http*)', result[0]):
                return (1, None, None, 'later', None, None, None, None
                        )  # delay flow install, mon mon sin
        else:
            self.countHS = 0

        if proto == "http":
            #df.drop(['Accept','Host','httpPath'],axis=1,inplace=True)
            empcol = [
                'Host', 'Accept', 'Connection_Keep-Alive',
                'Connection_keep-alive', 'httpMethod_GET', 'httpMethod_POST',
                'httpProto_HTTP/1.0', 'httpProto_HTTP/1.1',
                'uAgentBrowser_Chrome', 'uAgentBrowser_Firefox',
                'uAgentBrowser_Wget', 'uAgentBrowser_curl', 'uAgentOS_Linux',
                'uAgentOS_Other', 'uAgentOS_Windows 7'
            ]
            empDF = pd.DataFrame(columns=empcol)
            new_features = pd.concat([empDF, pd.get_dummies(df)],
                                     axis=0,
                                     join_axes=[empDF.columns]).fillna(value=0)
            new_features['Host'].fillna(0, inplace=True)
            new_features['Accept'].fillna(0, inplace=True)
            new_features['Host'][new_features.Host != 0] = 1
            new_features['Accept'][new_features.Accept != 0] = 1
            #print(new_features)
            httpclf = joblib.load('http_clf_KN.pkl')
            result = httpclf.predict(new_features)
            print(result)
            return result[0]  # return user agent to decide how to handle
Example #44
    os.makedirs(plotpath)

for toy_label, toy_X in toy_dataset_list:

    print('\n##### Now running dataset %s through tier 1 #####' % toy_label)

    #Create directory if directory does not exist
    toy_filepath = '%s%s/' % (filepath, toy_label)
    toy_plotpath = '%splotly_js/' % toy_filepath

    if not os.path.exists(toy_filepath):
        os.makedirs(toy_filepath)
    if not os.path.exists(toy_plotpath):
        os.makedirs(toy_plotpath)

    toy_X_scaled = scale(toy_X)

    toy_X_rows, toy_X_cols = toy_X_scaled.shape

    #default gamma
    def_gamma = 1 / toy_X_cols

    #Tier1 gamma values
    #t1_gamma_list = [def_gamma/10000, def_gamma/1000, def_gamma/100, def_gamma/10, def_gamma, def_gamma*10, def_gamma*100, def_gamma*1000, def_gamma*10000, def_gamma*10000]
    #t1_gamma_list = [def_gamma/100, def_gamma/10, def_gamma]
    t1_gamma_list = [def_gamma]

    # Dict of gammas w/ t1 Matrices
    amat_dict = dict()

    #Scale initial data to centre
Example #45
df['label'] = df[forcast_column].shift(-forecast_out)

#############################################################################
#
#               HL_PCT  PCT_change  Adj. Close  Adj. Volume       label
# Date
# 2004-08-19  3.712563    0.324968   50.322842   44659000.0  214.973603
# 2004-08-20  0.710922    7.227007   54.322689   22834300.0  212.395645
# 2004-08-23  3.729433   -1.227880   54.869377   18256100.0  202.394773
# 2004-08-24  6.417469   -5.726357   52.597363   15247300.0  203.083148
# 2004-08-25  1.886792    1.183658   53.164113    9188600.0  207.686157
#
#############################################################################

X = np.array(df.drop(['label'], 1))
X = preprocessing.scale(X)  # normalize X

X_lately = X[-forecast_out:]
X = X[:-forecast_out:]

df.dropna(inplace=True)
y = np.array(df['label'])

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2)  # making 20% testing data

LrModel = LinearRegression(
)  # n_jobs = 10 parameter means it will work 10 jobs parallel
LrModel.fit(X_train, y_train)
accuracy = LrModel.score(X_test, y_test)
"""
#Load the dataset
auto_data = pd.read_csv("auto-data.csv")
auto_data.dtypes
auto_data.describe()
auto_data.head()

#Look at scatter plots
plt.scatter(auto_data.HP, auto_data.PRICE)
plt.cla()
plt.scatter(auto_data['MPG-CITY'], auto_data['MPG-HWY'])
plt.cla()

#Center and scale
from sklearn import preprocessing
auto_data['HP'] = preprocessing.scale(auto_data['HP'].astype('float64'))
auto_data['RPM'] = preprocessing.scale(auto_data['RPM'].astype('float64'))
auto_data['MPG-CITY'] = preprocessing.scale(
    auto_data['MPG-CITY'].astype('float64'))
auto_data['MPG-HWY'] = preprocessing.scale(
    auto_data['MPG-HWY'].astype('float64'))
auto_data['PRICE'] = preprocessing.scale(auto_data['PRICE'].astype('float64'))
auto_data.describe()
"""
In order to demonstrate the clusters being formed on a 
2-dimensional plot, we will only use 100 samples and 
2 attributes - HP and PRICE to create 4 clusters.

"""

from sklearn.cluster import KMeans
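The docstring above describes clustering 100 samples on the HP and PRICE attributes into 4 clusters; a minimal sketch of that step (assuming `auto_data` has been loaded and scaled as in the commented-out block) might look like:

# Hypothetical sketch: 4 clusters from 100 samples of the two named attributes.
kmeans_model = KMeans(n_clusters=4, n_init=10, random_state=0)
cluster_sample = auto_data[['HP', 'PRICE']].head(100)
cluster_labels = kmeans_model.fit_predict(cluster_sample)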
Example #47
print(ink)
# MEAN
ink_mean = [np.mean(ink[labels == i]) for i in range(10)]
print(ink_mean)

# STANDARD DEV
ink_std = [np.std(ink[labels == i]) for i in range(10)]
print(ink_std)

print(zero_digits)
zero_digits_mean = [np.mean(zero_digits[labels == i]) for i in range(10)]
zero_digits_std = [np.std(zero_digits[labels == i]) for i in range(10)]
print(zero_digits_mean)
print(zero_digits_std)

ink = prep.scale(ink).reshape(-1, 1)
zero_digits = prep.scale(zero_digits).reshape(-1, 1)

x_ink = ink
x_ink_zero = pd.DataFrame(data=np.column_stack((ink, zero_digits)))
x_zero_digits = zero_digits

zero = np.array([sum(row) for row in zero])
zero = prep.scale(zero).reshape(-1, 1)
x_zero = zero

# <number>_extra => concat the ink feature with the zero count feature
zero_extra = np.array([np.count_nonzero(row == 0) for row in zero])
zero_extra = prep.scale(zero_extra).reshape(-1, 1)
x_zero_extra = pd.DataFrame(data=np.column_stack((zero, zero_extra)))
import numpy as np
import sklearn
from sklearn.preprocessing import scale
from sklearn.datasets import load_digits
from sklearn.cluster import KMeans
from sklearn import metrics

digits = load_digits()
data = scale(digits.data)  # standardize each feature to zero mean and unit variance
y = digits.target

k = len(np.unique(y))
samples, features = data.shape

def bench_k_means(estimator, name, data):
    estimator.fit(data)
    print('%-9s\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, estimator.inertia_,
             metrics.homogeneity_score(y, estimator.labels_),
             metrics.completeness_score(y, estimator.labels_),
             metrics.v_measure_score(y, estimator.labels_),
             metrics.adjusted_rand_score(y, estimator.labels_),
             metrics.adjusted_mutual_info_score(y,  estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_,
                                      metric='euclidean')))

clf = KMeans(n_clusters=k, init="random", n_init=10)
bench_k_means(clf,"1", data)


예제 #49
0
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']]
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Close']) / df['Adj. Close'] * 100.0
df['PCT_change'] = (df['Adj. Close'] -
                    df['Adj. Open']) / df['Adj. Open'] * 100.0
df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]
# the label is the future Adj. Close price
# let's create that column
forecast_col = 'Adj. Close'
df.fillna(-99999, inplace=True)  # NA treatment: fill with an outlier value

forecast_out = int(math.ceil(0.01 * len(df)))
print(forecast_out)
# forecast horizon: 1% of the dataset length into the future
df['label'] = df[forecast_col].shift(-forecast_out)
df.dropna(inplace=True)
#features are everything except label
x = np.array(df.drop(['label'], axis=1))
y = np.array(df['label'])
# standardize the features (zero mean, unit variance)
x = preprocessing.scale(x)
#print(len(x),len(y))

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    x, y, test_size=0.2)

clf = svm.SVR()  #switching it to SVM
#try svm.SVR(kernel='poly')
clf.fit(X_train, Y_train)
accuracy = clf.score(X_test, Y_test)
print(accuracy)
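# Following the hint above, a hedged sketch comparing a few SVR kernels on the same split
# (not part of the original code).
for kern in ('linear', 'poly', 'rbf'):
    clf_k = svm.SVR(kernel=kern)
    clf_k.fit(X_train, Y_train)
    print(kern, clf_k.score(X_test, Y_test))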
예제 #50
0
knn_1 = KNeighborsClassifier(n_neighbors=5)
knn_1.fit(X_train_minmax, y_train.values.ravel())
score = accuracy_score(y_test, knn_1.predict(X_test_minmax))
score
"""Why Normalization?
Normalization rescales the values into a range of [0,1]. This might be useful in some cases where all parameters need to have the same positive scale.
"""

print(X_train_minmax)

# Standardizing the train and test data
from sklearn.preprocessing import scale

X_train_scale = scale(X_train[[
    'Acousticness', 'Danceability', 'Energy', 'Instrumentalness', 'Key',
    'Liveness', 'Loudness', 'Mode', 'PreviousHit', 'Speechiness', 'Tempo',
    'Valence'
]])
X_test_scale = scale(X_test[[
    'Acousticness', 'Danceability', 'Energy', 'Instrumentalness', 'Key',
    'Liveness', 'Loudness', 'Mode', 'PreviousHit', 'Speechiness', 'Tempo',
    'Valence'
]])

print(X_train_scale)
knn_2 = KNeighborsClassifier(n_neighbors=15)
knn_2.fit(X_train_scale, y_train.values.ravel())
score = accuracy_score(y_test, knn_2.predict(X_test_scale))
score
"""Standardization is the process where the features are rescaled so that they’ll have the properties of a standard normal distribution with μ=0 and σ=1, where μ is the mean (average) and σ is the standard deviation from the mean. 
예제 #51
0
# x = np.array([2.5,0.5,2.2,1.9,3.1,2.3,2,1,1.5,1.1])
# y = np.array([2.4,0.7,2.9,2.2,3,2.7,1.6,1.1,1.6,0.9])
x = np.array([1, 2, 3])
y = np.array([4, 5, 6])

x_mean = np.mean(x)
y_mean = np.mean(y)

scaled_x = x - x_mean
scaled_y = y - y_mean
# data = np.matrix([[scaled_x[i], scaled_y[i]] for i in range(len(scaled_x))])
data = np.matrix(list(zip(scaled_x, scaled_y)))

standard = StandardScaler()
data_standard = standard.fit_transform(np.array(list(zip(x, y))))

plt.scatter(scaled_x, scaled_y)
plt.scatter(x, y)
plt.show()

cov = np.cov(scaled_x, scaled_y)
cov = np.cov(data.T)

eig_val, eig_vec = np.linalg.eig(cov)
eig_pairs = [(np.abs(eig_val[i]), eig_vec[:, i]) for i in range(len(eig_val))]
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

feature = eig_pairs[0][1]
new_data_reduced = np.transpose(np.dot(feature, np.transpose(data)))
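# A hedged cross-check (not in the original code): sklearn's PCA on the same centred
# data should give the same 1-D projection, up to sign, as the hand-rolled
# eigendecomposition above.
from sklearn.decomposition import PCA
pca = PCA(n_components=1)
pca_projection = pca.fit_transform(np.asarray(data))
print(np.allclose(np.abs(pca_projection.ravel()),
                  np.abs(np.asarray(new_data_reduced).ravel())))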
예제 #52
0
seed = 3453
np.random.seed(seed)

split = 1
P = np.hstack(dataset['P'][list(range(0, split)) + list(range(split + 1, 5))])
X = dataset['B'][P]
Z = dataset['T'][P]
#Z = Z.reshape(Z.shape[0], 1)
train_labels = Z

Ptest = dataset['P'][split]
TX = dataset['B'][Ptest]
TZ = dataset['T'][Ptest]
#TZ = TZ.reshape(TZ.shape[0], 1)
test_labels = TZ
Z = scale(Z, axis=0)
TZ = scale(TZ, axis=0)
weights = []

batch_size = 25
#max_iter = max_passes * X.shape[ 0] / batch_size
max_iter = 1000
n_report = X.shape[0] // batch_size

stop = climin.stops.AfterNIterations(max_iter)
pause = climin.stops.ModuloNIterations(n_report)

optimizer = 'gd', {'step_rate': 0.001, 'momentum': 0}

typ = 'plain'
if typ == 'plain':
예제 #53
0

# suppress scikit-learn future warnings
def warn(*args, **kwargs):
    pass


import warnings
warnings.warn = warn

from numpy import mean, std
from sklearn.preprocessing import StandardScaler, scale
scaler = StandardScaler()
scaler.fit(x)
x_transformed = scaler.transform(x)
x_scaled = scale(x)
y_scaled = scale(y)
# print(x_scaled)
# print(x_transformed)
#print(mean(x))
#print(mean(x_scaled))
#print(mean(x_transformed))
#print(std(x))
#print(std(x_scaled))
#print(std(x_transformed))

#print(mean(x_scaled))
#print(std(x_scaled))

from sklearn.linear_model import Lars
from sklearn.model_selection import train_test_split
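# The excerpt stops after these imports; a minimal hedged sketch of how they might be
# used with the scaled arrays above, assuming x_scaled is a 2-D feature matrix (as the
# StandardScaler.fit call above implies). Not the original code.
x_tr, x_te, y_tr, y_te = train_test_split(x_scaled, y_scaled, test_size=0.2, random_state=42)
lars = Lars()
lars.fit(x_tr, y_tr)
print(lars.score(x_te, y_te))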
예제 #54
0
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)

#####K MEANS CLUSTER
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from scipy.spatial.distance import cdist, pdist, euclidean
from sklearn.cluster import KMeans
from sklearn import metrics

X = Fullplayerlistf._get_numeric_data().dropna(axis=1)

del X['UFA']
del X['Age']

df = pd.DataFrame(X)
X = scale(X)

Player = Fullplayerlistf['Player']

#DETERMINE # OF VARIABLES TO USE
pca = PCA(n_components=28)
pca.fit(X)
var = pca.explained_variance_ratio_
var1 = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4) * 100)
print(var1)
plt.plot(var1)

pca = PCA(n_components=21)
pca.fit(X)
X1 = pca.fit_transform(X)
loadings_df = pd.DataFrame(pca.components_, columns=df.columns)
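# The K-means section above never reaches the clustering itself; a hedged elbow-curve
# sketch (not the original code) on the PCA-reduced matrix X1, using the KMeans import above.
k_range = range(1, 11)
inertias = [KMeans(n_clusters=k, n_init=10, random_state=42).fit(X1).inertia_ for k in k_range]
plt.plot(list(k_range), inertias)
plt.xlabel('k')
plt.ylabel('within-cluster sum of squares')
plt.show()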
print('Mode found by method 1:', argmaxcount1)
print('Mode found by method 2:', argmaxcount2)

# median
medianumber = np.median(data)
print('Median:', medianumber)

# range (max - min)
ptp = np.ptp(data)
print('Range:', ptp)

# standard deviation
standard = np.std(data)
print('Standard deviation:', standard)

# preprocess the data so it has zero mean and unit variance
data_preprocess = preprocessing.scale(data)
print(data_preprocess)
print('Standard deviation after preprocessing:', data_preprocess.std())

# histogram of the preprocessed data
plt.hist(data_preprocess,
         bins=40,
         density=False,
         facecolor="blue",
         edgecolor="black",
         alpha=0.7)
plt.xlabel("Value range of the random data")
plt.ylabel("Frequency of the random data")
plt.title("Frequency histogram of the randomly generated data")
plt.show()
예제 #56
0
def feature_scale(x):
    b, h, w, c = x.shape
    x = scale(x.reshape([b, -1]), axis=1)  # standardize each flattened sample
    return x.reshape([b, h, w, c])
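# Hypothetical usage sketch (shapes made up; assumes numpy is imported as np):
# per-sample standardization of a small batch of images.
x_batch = np.random.rand(8, 32, 32, 3)
x_batch_scaled = feature_scale(x_batch)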
                timestep) == 0:  #return remainder after division

        T1_result.append(T1)
        T2_result.append(T2)
        water_level_result.append(water_level)
        surface_runoff_result.append(surface_runoff / timestep * 1000)  #mm/day
        subsurface_runoff_result.append(water_out_subsurface_runoff * daySec *
                                        1000)  #mm/day
        # try to avoid this append procedure; it slows the code down a lot

        # ADD SENSIBLE AND LATENT HEAT HERE SO YOU CAN PLOT THEM

    count = count + 1
#%%
from sklearn import preprocessing
surface_runoff_scaled = preprocessing.scale(surface_runoff_result)
plt.figure()
plt.plot(surface_runoff_result)
plt.show()
#%%
a = np.zeros(10)
print(a)

new_a = a
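# note: new_a is just another name for the same array object; use a.copy()
# if an independent copy is needed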

for i in range(0, len(a)):
    new_a[i] = i
print(new_a)
#%%
'''Complete Water Balance'''
'''
예제 #58
0
def long_features(pat, outfile, datapath, timer):
    f = datapath + "/*mat"

    pat_num = pat
    ff = glob.glob(f)

    label = [str(os.path.basename(n)) for n in ff]
    print(label)

    output = []
    featureList = []

    mytimer = []
    bands = [0.1, 4, 8, 12, 30, 70]

    for j in range(16):

        mydata = []
        for i in range(len(ff)):
            output = []
            outputtimer = []
            featureList = []
            featureListimer = []
            if os.path.basename(ff[i]) == "1_45_1.mat":
                continue
            data = get_data(ff[i])
            data = preprocessing.scale(data, axis=1, with_std=True)
            featureList.append("File")
            # featureListimer.append('File')
            output.append(label[i])
            # outputtimer.append(label[i])
            featureList.append("pat")
            # featureListimer.append('pat')
            output.append(pat_num)
            # outputtimer.append(pat_num)
            welsh = []

            hold = spsig.decimate(data[j, :], 5, zero_phase=True)

            # start = time.time()
            # featureList.append('sigma%i' % (j))
            # output.append(hold.std())
            # total_time = time.time() - start
            # featureListimer.append("sigma%i" % (j))
            # outputtimer.append(total_time)

            # start = time.time()
            featureList.append("kurt%i" % (j))
            output.append(spstat.kurtosis(hold))
            # total_time = time.time() - start
            # featureListimer.append('kurt%i'%(j))
            # outputtimer.append(total_time)

            # start = time.time()
            featureList.append("skew%i" % (j))
            output.append(spstat.skew(hold))
            # total_time = time.time() - start
            # featureListimer.append('skew%i'%(j))
            # outputtimer.append(total_time)

            # start = time.time()
            # featureList.append('zero%i'%(j))
            # output.append(((hold[:-1] * hold[1:]) < 0).sum())
            # total_time = time.time() - start
            # featureListimer.append('zero%i'%(j))
            # outputtimer.append(total_time)

            diff = np.diff(hold, n=1)
            diff2 = np.diff(hold, n=2)

            # start = time.time()
            # featureList.append('sigmad1%i'%(j))
            # output.append(diff.std())
            # total_time = time.time() - start
            # featureListimer.append('sigmad1%i'%(j))
            # outputtimer.append(total_time)

            # start = time.time()
            # featureList.append('sigmad2%i'%(j))
            # output.append(diff2.std())
            # total_time = time.time() - start
            # featureListimer.append('sigmad2%i'%(j))
            # outputtimer.append(total_time)

            # start = time.time()
            featureList.append("zerod%i" % (j))
            output.append(((diff[:-1] * diff[1:]) < 0).sum())
            # total_time = time.time() - start
            # featureListimer.append('zerod%i'%(j))
            # outputtimer.append(total_time)

            # start = time.time()
            # featureList.append('zerod2%i'%(j))
            # output.append(((diff2[:-1] * diff2[1:]) < 0).sum())
            # total_time = time.time() - start
            # featureListimer.append('zerod2%i'%(j))
            # outputtimer.append(total_time)

            # start = time.time()
            featureList.append("RMS%i" % (j))
            output.append(np.sqrt((hold**2).mean()))
            # total_time = time.time() - start
            # featureListimer.append('RMS%i'%(j))
            # outputtimer.append(total_time)

            # start = time.time()
            f, psd = spsig.welch(hold, fs=80)
            print(f)
            print(psd)
            print("yes")
            # total_time = time.time() - start
            # welsh.append(total_time)

            psd[0] = 0

            # start = time.time()
            featureList.append("MaxF%i" % (j))
            output.append(psd.argmax())
            # total_time = time.time() - start
            # featureListimer.append('MaxF%i'%(j))
            # outputtimer.append(total_time)

            # start = time.time()
            featureList.append("SumEnergy%i" % (j))
            output.append(psd.sum())
            # total_time = time.time() - start
            # featureListimer.append('SumEnergy%i'%(j))
            # outputtimer.append(total_time)

            psd /= psd.sum()
            for c in range(1, len(bands)):
                # start = time.time()
                featureList.append("BandEnergy%i%i" % (j, c))
                output.append(psd[(f > bands[c - 1]) & (f < bands[c])].sum())
                # total_time = time.time() - start
                # featureListimer.append('BandEnergy%i%i'%(j,c))
                # outputtimer.append(total_time)

            # start = time.time()
            # featureList.append('entropy%i'%(j))
            # output.append(-1.0*np.sum(psd[f>bands[0]]*np.log10(psd[f>bands[0]])))
            # total_time = time.time() - start
            # featureListimer.append('entropy%i'%(j))
            # outputtimer.append(total_time)

            # pdb.exit()
            # start = time.time()
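            # Hjorth parameters: mobility = std(x') / std(x); complexity = std(x'') * std(x) / std(x')^2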
            featureList.append("Mobility%i" % (j))
            output.append(np.std(diff) / hold.std())
            # total_time = time.time() - start
            # featureListimer.append('Mobility%i'%(j))
            # outputtimer.append(total_time)

            # start = time.time()
            featureList.append("Complexity%i" % (j))
            output.append(np.std(diff2) * np.std(hold) / (np.std(diff)**2.0))
            # total_time = time.time() - start
            # featureListimer.append('Complexity%i'%(j))
            # outputtimer.append(total_time)

            mydata.append(
                pd.DataFrame({
                    "Features": output
                }, index=featureList).T)
            # mytimer.append(pd.DataFrame({'Features':outputtimer},index=featureListimer).T)

            welsh_df = pd.DataFrame(welsh, columns=["value"])

            trainSample = pd.concat(mydata, ignore_index=True)

        new_outfile = outfile[:-4] + "_" + str(j) + ".csv"
        trainSample.to_csv(new_outfile)

    return 1
예제 #59
0
    
        else:                             # condition where student received an incomplete
            new.append(2)
    return(new)                           # 1-dimensional array returned

X = df.drop('G3', axis=1)                 # this is the design matrix
y = list(df.G3)                           # this is the discrete response vector
y_new = response_conv(y)                  # this is the multinomial response vector

clf = DecisionTreeClassifier()
clf.fit(X,y)

model = SelectFromModel(clf,prefit=True)
newX = model.transform(X)                 # select most influential predictors

X_scale = preprocessing.scale(newX)       # scaled design matrix
X_norm = preprocessing.normalize(newX)    # normalized design matrix

random.seed(42)
X1_train, X1_test, y1_train, y1_test = train_test_split(newX, y_new, test_size=0.33, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X_scale, y_new, test_size=0.33, random_state=42)
X3_train, X3_test, y3_train, y3_test = train_test_split(X_norm, y_new, test_size=0.33, random_state=42)
########################################################################################################################
combos = cartesian([['gini','entropy'],['best','random'],['auto','log2'],np.arange(1,(X1_train.shape[0]-1))])

def opt(X,y):
    acc = []

    for c,s,mf,md in combos:
        dt = DecisionTreeClassifier(criterion=c,splitter=s,max_features=mf,max_depth=int(md),random_state=42)
        scores = cross_val_score(dt, X, y, cv=10, scoring='accuracy')
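        # Hedged completion of the truncated helper (not the original code): keep the mean
        # CV accuracy for this combination and return the best-scoring parameter set.
        acc.append(scores.mean())
    best_idx = int(np.argmax(acc))
    return combos[best_idx], acc[best_idx]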
예제 #60
-1
def normalize_data(tr_x,ts_x,normz=None,axis=0):
    if normz == 'scale':
        tr_x = scale(tr_x,axis=axis)
        ts_x = scale(ts_x,axis=axis)
    elif normz == 'minmax':
        minmax_scaler = MinMaxScaler()
        if axis==0:
            for c_i in range(tr_x.shape[1]):
                # fit on the training column and reuse the same min/max for the test column
                tr_x[:,c_i] = minmax_scaler.fit_transform(tr_x[:,c_i].reshape(-1,1)).ravel()
                ts_x[:,c_i] = minmax_scaler.transform(ts_x[:,c_i].reshape(-1,1)).ravel()
        elif axis==1:
            for r_i in range(tr_x.shape[0]):
                tr_x[r_i,:] = minmax_scaler.fit_transform(tr_x[r_i,:].reshape(-1,1)).ravel()
                ts_x[r_i,:] = minmax_scaler.fit_transform(ts_x[r_i,:].reshape(-1,1)).ravel()
    elif normz == 'sigmoid':
        if axis==0:
            col_max = np.max(tr_x,axis=0)
            cols_non_norm = np.argwhere(col_max>1).tolist()
            tr_x[:,cols_non_norm] = -0.5 + (1 / (1 + np.exp(-tr_x[:,cols_non_norm])))
            # TODO: implement col_max col_non_norm for test set
            ts_x[:,cols_non_norm] = -0.5 + (1/(1+np.exp(-ts_x[:,cols_non_norm])))
        elif axis==1:
            row_max = np.max(tr_x,axis=1)
            rows_non_norm = np.argwhere(row_max>1).tolist()
            tr_x[rows_non_norm,:] = -0.5 + (1 / (1 + np.exp(-tr_x[rows_non_norm,:])))
            # TODO: implement row_max row_non_norm for test set
            ts_x[rows_non_norm,:] = -0.5 + (1/(1+np.exp(-ts_x[rows_non_norm,:])))

    return tr_x,ts_x
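# Hypothetical usage sketch with random data (not from the original snippet); assumes
# numpy as np plus sklearn's scale and MinMaxScaler are imported, as the function body implies.
demo_tr = np.random.rand(20, 4) * 10.0
demo_ts = np.random.rand(5, 4) * 10.0
demo_tr, demo_ts = normalize_data(demo_tr, demo_ts, normz='scale', axis=0)
print(demo_tr.mean(axis=0), demo_tr.std(axis=0))  # roughly 0 and 1 per column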