def loadData():
    # DataFrame.from_csv was removed from pandas; read_csv with index_col=0 is the equivalent
    data_df = pd.read_csv('feature_matrix.csv', index_col=0)
    # print(data_df[-5:])

    # load feature headers
    filename = 'features.csv'
    with open(filename, 'r') as f:
        lines = f.readlines()
    FEATURES = [line.strip() for line in lines[1:]]

    # load label header
    LABEL = 'label'

    # create input
    X = np.array(data_df[FEATURES].values)
    # create output
    y = np.array(data_df[LABEL].values)
    # create baseline
    baseline = np.array(data_df['return_SPY'].values)

    # normalize each feature around 0; single features require reshape
    baseline = preprocessing.scale(baseline).reshape(-1, 1)
    X = preprocessing.scale(X)
    return X, baseline, y
def arrange_data(): df = DataFrame.from_csv(open('final.csv')) #load all games before January 1, 2013 (training set). X0_list = ([df[df.columns[2:22]][datetime(2006,12,15):datetime(2007,4,8)], df[df.columns[2:22]][datetime(2007,12,15):datetime(2008,4,8)], df[df.columns[2:22]][datetime(2008,12,15):datetime(2009,4,8)], df[df.columns[2:22]][datetime(2009,12,15):datetime(2010,4,8)], df[df.columns[2:22]][datetime(2010,12,15):datetime(2011,4,8)], df[df.columns[2:22]][datetime(2012,12,15):datetime(2013,1,1)]]) Y0_list = ([df[df.columns[22:28]][datetime(2006,12,15):datetime(2007,4,8)], df[df.columns[22:28]][datetime(2007,12,15):datetime(2008,4,8)], df[df.columns[22:28]][datetime(2008,12,15):datetime(2009,4,8)], df[df.columns[22:28]][datetime(2009,12,15):datetime(2010,4,8)], df[df.columns[22:28]][datetime(2010,12,15):datetime(2011,4,8)], df[df.columns[22:28]][datetime(2012,12,15):datetime(2013,1,1)]]) #games after January 1, 2013. X1 = df[df.columns[2:22]][datetime(2013,1,1):datetime(2013,4,18)] Y1 = df[df.columns[22:28]][datetime(2013,1,1):datetime(2013,4,18)] X0 = X0_list[0] for i in range(1,len(X0_list)): X0 = concat([X0,X0_list[i]]) Y0 = Y0_list[0] for i in range(1,len(Y0_list)): Y0 = concat([Y0,Y0_list[i]]) #convert to numpy arrays, leave Y unchanged for now X0 = preprocessing.scale(numpy.array(X0)) X1 = preprocessing.scale(numpy.array(X1)) return (X0,Y0,X1,Y1)
def create_data_provider(dataset, force_write_cache = False, center_data = True, scale_data = True, add_bias_feature = True, normalize_datapoints = False, center_labels = False, scale_labels = False, transform_labels_to_plus_minus_one = True, test_size=0.0): data, labels = dataset.get_data(force_write_cache=force_write_cache) copy = False if scale_data: data = preprocessing.scale(data, copy=copy) elif center_data: data = preprocessing.scale(data, with_std=False, copy=copy) if scale_labels: labels = preprocessing.scale(labels, copy=copy) elif center_labels: labels = preprocessing.scale(labels, with_std=False, copy=copy) if add_bias_feature: data = np.hstack((data, np.ones((data.shape[0], 1)))) if normalize_datapoints: data /= np.linalg.norm(data, axis=1)[:, np.newaxis] if transform_labels_to_plus_minus_one: labels = labels * 2.0 - 1.0 test_provider = None if test_size > 0.0: data, data_test, labels, labels_test = cross_validation.train_test_split(data, labels, test_size=test_size) test_provider = DataProvider(data_test, labels_test) return DataProvider(data, labels, test_provider=test_provider)
def classify():
    # read training data
    lbls1, X, y = readCsv(TRAIN_CSV, True)
    # read test data
    lbls2, Y, z = readTestCsv(TEST_CSV)

    # conversion to numpy arrays
    X = np.array(X).astype(float)
    y = np.array(y)
    Y = np.array(Y).astype(float)

    # perform feature scaling for zero mean and unit variance;
    # scale() returns a new array, so the result must be assigned back
    X = scale(X, with_mean=True, with_std=True)
    Y = scale(Y, with_mean=True, with_std=True)

    lin_svc = svm.LinearSVC(C=4.0, dual=False)
    lin_svc.fit(X, y)
    bestmodel = lin_svc

    preds = bestmodel.predict(Y)
    writePredictions(lbls2, preds)
def read_dataset(train_size, scale=False, normalize=False):
    logging.info('fetching the dataset')
    d = sklearn.datasets.load_diabetes()    # diabetes
    # d = sklearn.datasets.load_boston()    # Boston housing prices
    data = d['data'].astype(np.float32)
    target = d['target'].astype(np.float32).reshape(len(d['target']), 1)
    # Chainer's mnist.py reportedly writes this as below, but it does not work with minibatch sizes of 2 or more
    # target = diabetes['target'].astype(np.float32)

    # Properly, the scaler should be fit on the training data only and those parameters applied to the test data
    if normalize and scale:
        raise Exception('both normalize and scale can not be True')
    if normalize:
        data = preprocessing.normalize(data)
        target = preprocessing.normalize(target)
    if scale:
        data = preprocessing.scale(data)
        target = preprocessing.scale(target)

    # split
    x_train, x_test = np.split(data, [train_size])
    y_train, y_test = np.split(target, [train_size])
    assert len(x_train) == len(y_train)
    assert len(x_test) == len(y_test)

    return ((x_train, y_train), (x_test, y_test), {
        "SHAPE_TRAIN_X": x_train.shape,
        "SHAPE_TRAIN_Y": y_train.shape,
        "SHAPE_TEST_X": x_test.shape,
        "SHAPE_TEST_Y": y_test.shape,
    })
def standardize(self):
    """ Standardize the train and test feature matrices. """
    print('Standardization')
    self.tr = scale(self.tr)
    self.te = scale(self.te)
def test_scale_function_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always of zero
    X_csr = sp.csr_matrix(X)

    X_scaled = scale(X, with_mean=False)
    assert_false(np.any(np.isnan(X_scaled)))

    X_csr_scaled = scale(X_csr, with_mean=False)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    # test csc has same outcome
    X_csc_scaled = scale(X_csr.tocsc(), with_mean=False)
    assert_array_almost_equal(X_scaled, X_csc_scaled.toarray())

    # raises value error on axis != 0
    assert_raises(ValueError, scale, X_csr, with_mean=False, axis=1)

    assert_array_almost_equal(X_scaled.mean(axis=0),
                              [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is not X)

    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
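# The test above relies on scale() refusing to center sparse input (centering would
# destroy sparsity). A minimal self-contained sketch of the same behaviour, assuming
# only numpy, scipy and scikit-learn are installed:
import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import scale, MaxAbsScaler

X_dense = np.array([[1., 0., 2.], [0., 3., 0.], [4., 0., 5.]])
X_sparse = sp.csr_matrix(X_dense)

# with_mean=False only divides each column by its standard deviation,
# so zeros stay zero and the sparsity pattern is preserved
print(scale(X_sparse, with_mean=False).toarray())

# MaxAbsScaler is another sparse-friendly option: it maps each column into [-1, 1]
print(MaxAbsScaler().fit_transform(X_sparse).toarray())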
def split_into_chunks(data, train, predict, step, binary=True, scale=True): X, Y = [], [] for i in range(0, len(data), step): try: x_i = data[i:i+train] y_i = data[i+train+predict] # Use it only for daily return time series if binary: if y_i > 0.: y_i = [1., 0.] else: y_i = [0., 1.] if scale: x_i = preprocessing.scale(x_i) else: timeseries = np.array(data[i:i+train+predict]) if scale: timeseries = preprocessing.scale(timeseries) x_i = timeseries[:-1] y_i = timeseries[-1] except: break X.append(x_i) Y.append(y_i) return X, Y
def __init__(self, data, labels, validation_data=None, validation_labels=None, hidden_layer_size=0, loss_function="mean-squared-error", learning_rate=1.0, decreasing_rate=False): self.input_layer_size = data.shape[1] self.hidden_layer_size = hidden_layer_size self.output_layer_size = len(np.unique(labels)) if loss_function not in ("mean-squared-error", "cross-entropy"): raise ValueError("Loss function must be 'mean-squared-error' or 'cross-entropy'.") self.loss_function = loss_function self.data = scale(data) self.labels = labels self.Y = np.zeros((data.shape[0], self.output_layer_size)) for i in range(data.shape[0]): label_i = labels[i] self.Y[i][label_i] = 1 self.learning_rate = learning_rate self.decreasing_rate = decreasing_rate if validation_data is not None: self.validation_data = scale(validation_data) else: self.validation_data = None self.validation_labels = validation_labels
def trainModel(): # Model parameters W = tf.Variable([.1000], tf.float32) b = tf.Variable([-.1000], tf.float32) # Model input and output x = tf.placeholder(tf.float32, shape=None) linear_model = W * x + b y = tf.placeholder(tf.float32) # loss loss = tf.reduce_sum(tf.square(linear_model - y)) # sum of the squares # optimizer optimizer = tf.train.GradientDescentOptimizer(0.01) train = optimizer.minimize(loss) # training data x_train = preprocessing.scale(mouseClickX) y_train = preprocessing.scale(mouseClickY) init = tf.global_variables_initializer() sess = tf.Session() sess.run(init) # reset values to wrong for i in range(500): sess.run([train], {x: x_train, y: y_train}) if i % 50 == 0: # to visualize the result and improvement try: ax.lines.remove(lines[0]) except Exception: pass print(x_train, y_train, i) prediction_value = sess.run(linear_model, feed_dict={x: mouseClickX}) # plot the prediction lines = ax.plot(mouseClickX, prediction_value, 'r-', lw=5) plt.pause(1)
def fold_score_keras(events_A, events_B, model_A, df_A, df_B):
    """Returns scored events_B for a BDT_A."""
    # Get indices, train weights and classes for each of these splits.
    # w and Y need to be numpy arrays to work with skl.
    w_A = np.array([a.train_weight for a in events_A])
    w_B = np.array([a.train_weight for a in events_B])
    Y_A = np.array([a.classification for a in events_A])
    Y_B = np.array([a.classification for a in events_B])

    # Index our X training sets by row; convert to ndarrays
    # (DataFrame.as_matrix() was removed from pandas; .values is the replacement).
    X_A = df_A.values
    X_B = df_B.values

    # Scale for the NN.
    X_A = scale(X_A)
    X_B = scale(X_B)

    # Fit model (nb_epoch is the old Keras spelling of epochs).
    model_A.fit(X_A, Y_A, sample_weight=w_A,
                validation_data=(X_B, Y_B, w_B),
                nb_epoch=1000, batch_size=32,
                callbacks=[EarlyStopping(patience=50)])
    model_A.save(datetime.now().strftime('%d%m%y_%H%S') + '_kerasmodel.h5')

    # Score X_B with model_A.
    prob_tuples = model_A.predict_proba(X_B).tolist()
    # Keep only the signal probability from each predicted row (here the first element).
    scores = [a[0] for a in prob_tuples]

    for e, s in zip(events_B, scores):
        e.set_decision_value(s)

    return events_B
def buildModel(size): with open('Sentiment Analysis Dataset.csv', 'rb') as csvfile: pos_tweets =[] neg_tweets =[] spamreader = csv.reader(csvfile, delimiter=',') for row in spamreader: if row[1] == '1': if not (len(pos_tweets) > size): pos_tweets.append(_cleanTweet(row[3])) else: if not (len(neg_tweets) > size): neg_tweets.append(_cleanTweet(row[3])) y = np.concatenate((np.ones(len(pos_tweets[0:size])), np.zeros(len(neg_tweets[0:size])))) x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos_tweets[0:size], neg_tweets[0:size])), y, test_size=0.2) x_train = _cleanText(x_train) x_test = _cleanText(x_test) n_dim = 100 #Initialize model and build vocab imdb_w2v = Word2Vec(size=n_dim, min_count=10) imdb_w2v.build_vocab(x_train) imdb_w2v.train(x_train) train_vecs = np.concatenate([buildWordVector(z, n_dim,imdb_w2v) for z in x_train]) train_vecs = scale(train_vecs) #Train word2vec on test tweets imdb_w2v.train(x_test) #Build test tweet vectors then scale test_vecs = np.concatenate([buildWordVector(z, n_dim,imdb_w2v) for z in x_test]) test_vecs = scale(test_vecs) lr = SGDClassifier(loss='log', penalty='l1') lr.fit(train_vecs, y_train) imdb_w2v.save("imdb_w2v") f = open("Accuracy.txt","w") f.write(str(lr.score(test_vecs, y_test))+" "+str(size*2)) f.close()
def main():
    indata = np.load(inputs)
    training_data = indata['data_training']
    training_scaled = preprocessing.scale(training_data)
    training_labels = indata['label_training']
    validation_data = indata['data_val']
    validation_scaled = preprocessing.scale(validation_data)
    validation_labels = indata['label_val']

    ts = range(-12, 6)
    cs = [pow(10, t) for t in ts]
    accuracy_results = []
    accuracy_results_scaled = []
    for c in cs:
        lin_clf = svm.LinearSVC(C=c)

        lin_clf.fit(training_data, training_labels)
        predictions = lin_clf.predict(validation_data)
        accuracy = metrics.accuracy_score(validation_labels, predictions)
        accuracy_results.append(accuracy)

        lin_clf.fit(training_scaled, training_labels)
        predictions = lin_clf.predict(validation_scaled)
        accuracy_scaled = metrics.accuracy_score(validation_labels, predictions)
        accuracy_results_scaled.append(accuracy_scaled)

    plt.plot(range(len(cs)), accuracy_results, label='un-scaled')
    plt.plot(range(len(cs)), accuracy_results_scaled, label='scaled')
    plt.xticks(range(len(cs)), cs, size='small')
    plt.legend()
    plt.show()
    print(accuracy_results)
    print(accuracy_results_scaled)
def run(self): roi_data = [] seg_data = [] provider_roi = self.roi_layer.dataProvider() provider_seg = self.seg_layer.dataProvider() feat_seg = QgsFeature() self.status.emit('building spatial index') time.sleep(0.3) index = QgsSpatialIndex() piter = 0 feat_count = provider_seg.featureCount() for f in provider_seg.getFeatures(): seg_data.append(f.attributes()[1:]) index.insertFeature(f) piter += 1 self.progress.emit(piter * 15 / feat_count) self.status.emit('extracting attributes') self.log.emit('extracting attributes from roi segments intersection') time.sleep(0.3) # intersect roi with segments and extract attributes piter = 0 feat_count = provider_roi.featureCount() for feat_roi in provider_roi.getFeatures(): geom = feat_roi.geometry() attr_roi = feat_roi.attributes() intersects = index.intersects(geom.boundingBox()) for fid in intersects: ffilter = QgsFeatureRequest().setFilterFid(int(fid)) provider_seg.getFeatures(ffilter).nextFeature(feat_seg) # filter geometries that does not intersect if geom.intersects(feat_seg.geometry()): attr_seg = feat_seg.attributes() roi_data.append(attr_seg[1:] + attr_roi) # emit progress piter += 1 self.progress.emit(15 + (piter * 55 / feat_count)) # read train data roi_data = np.array(roi_data) samples = roi_data[:,:-1] labels = roi_data[:,-1].astype(int) # svm fit and predict self.status.emit('svm: fitting data') time.sleep(0.3) classifier = svm.SVC(**self.svm_dict) classifier.fit(preprocessing.scale(samples), labels) self.progress.emit(85) self.status.emit('svm: predicting labels') time.sleep(0.3) seg_data = preprocessing.scale(seg_data) predictions = classifier.predict(seg_data).tolist() self.progress.emit(100) self.output = pickle.dumps(predictions)
def load_dataset(fname="../data/housing/housing.data", cols=(0,)):
    X = np.genfromtxt(fname, usecols=cols, delimiter=',')
    # X = np.genfromtxt(fname, usecols=cols)

    num_features = X.shape[1]
    # number of ordered triplets of distinct features: n*(n-1)*(n-2)
    num_triplets = int(6 * num_features * (num_features - 1) * (num_features - 2) / 6)
    # wx is not defined in this excerpt; it is assumed to come from the enclosing module
    triplets = np.zeros((num_triplets, 4 * wx.shape[1]))

    print(':: loading dataset...please wait!')
    l = 0
    for i in range(num_features - 2):
        for j in range(i + 1, num_features - 1):
            for k in range(j + 1, num_features):
                permute_idx = itertools.permutations([i, j, k])
                for idx in permute_idx:
                    x = scale(np.array(X[:, idx[0]]))[:, np.newaxis]
                    y = scale(np.array(X[:, idx[1]]))[:, np.newaxis]
                    z = scale(np.array(X[:, idx[2]]))[:, np.newaxis]
                    triplets[l, :] = f3(x, y, z, np.hstack((x, y, z)))
                    l = l + 1

    return (triplets, num_features, num_triplets)
def generate_X_y_arrays(f_train_set='%s/train_set.csv' % (data_path)):
    """
    Generate the classifier's training matrix X and label vector y.
    Args:
        f_train_set: csv file holding the training set
    Returns:
        X: training samples, size=[n_samples, n_features]
        y: class labels, size=[n_samples, 1]
    """
    from sklearn import preprocessing
    import numpy as np

    X = []
    y = []
    with open(f_train_set, 'r') as fin:
        fin.readline()  # skip the header line
        for line in fin:
            cols = line.strip().split(',')
            X.append([float(i) for i in cols[1:]])
            y.append(int(cols[0]))  # the tag is in the first column, 0 or -1

    logger.debug('classifier input X_size=[%s, %s] y_size=[%s, 1]' % (len(X), len(X[0]), len(y)))
    X = preprocessing.scale(np.array(X))
    y = preprocessing.scale(np.array(y))
    return X, y
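# Note: the function above also runs preprocessing.scale over the class labels,
# turning the 0 / -1 tags into arbitrary floats. For a classifier it is usually
# only the feature matrix that needs standardizing; a minimal self-contained sketch:
import numpy as np
from sklearn import preprocessing

X = np.array([[1.0, 200.0], [2.0, 250.0], [3.0, 300.0]])
y = np.array([0, -1, 0])

X = preprocessing.scale(X)  # standardize the features only
# y is left untouched so the classifier still sees the original 0 / -1 tags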
def load_qm7():
    datafile = '/home/hpc/pr63so/ga93yih2/gdb13/gdb13_atm.pkl'
    # pickle files should be opened in binary mode
    dataset = pickle.load(open(datafile, 'rb'))
    split = 1
    # use every fold except `split` for training (list() is needed because
    # range objects cannot be concatenated with + in Python 3)
    P = dataset['P'][list(range(0, split)) + list(range(split + 1, 5))].flatten()
    X = dataset['B'][P]
    Z = dataset['T'][P]
    Z = Z.reshape(Z.shape[0], 1)
    train_labels = Z
    Ptest = dataset['P'][split]
    TX = dataset['B'][Ptest]
    TZ = dataset['T'][Ptest]
    TZ = TZ.reshape(TZ.shape[0], 1)
    test_labels = TZ
    Z = scale(Z, axis=0)
    TZ = scale(TZ, axis=0)
    mean = X.mean(axis=0)
    std = (X - mean).std()
    X = (X - mean) / std
    TX = (TX - mean) / std
    return X, Z, TX, TZ, train_labels, test_labels
def feature_scale(data, method):
    '''
    Feature scaling (making features dimensionless).

    Common approaches are standardization and interval (min-max) scaling.
    Standardization assumes the feature values follow a normal distribution;
    after standardization they follow a standard normal distribution.
    Interval scaling uses the boundary values to map the feature range onto a
    fixed interval such as [0, 1].

    Standardization works column-wise on the feature matrix: it computes
    z-scores so that all features share the same scale.
    Normalization works row-wise: its goal is that sample vectors share a
    common norm when similarities are computed via dot products or other
    kernels, i.e. every sample becomes a "unit vector".

    Workflow: do feature selection and the train/test split first, then apply this step.
    '''
    if method == 'scale':
        from sklearn.preprocessing import scale
        return scale(data)
    elif method == 'standard':
        # 1. standardization; returns the standardized data
        from sklearn.preprocessing import StandardScaler
        return StandardScaler().fit_transform(data)
    elif method == 'minmax':
        # 2. interval scaling; returns data scaled to the [0, 1] range
        from sklearn.preprocessing import MinMaxScaler
        return MinMaxScaler().fit_transform(data)
    elif method == 'normal':
        # 3. normalization; returns the row-normalized data
        from sklearn.preprocessing import Normalizer
        return Normalizer().fit_transform(data)
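# A quick usage sketch for the helper above, assuming a small toy matrix:
import numpy as np

data = np.array([[1.0, 200.0], [2.0, 300.0], [3.0, 400.0]])
print(feature_scale(data, 'scale'))    # column-wise z-scores
print(feature_scale(data, 'minmax'))   # each column mapped to [0, 1]
print(feature_scale(data, 'normal'))   # each row scaled to unit L2 norm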
def getImages():
    digitsImagesNormalized = getImagesFromDir(digitsPath)
    lettersImagesNormalized = getImagesFromDir(lettersPath)

    digitsImagesNormalized = [skpre.scale(digitsImagesNormalized[0]), digitsImagesNormalized[1]]
    lettersImagesNormalized = [skpre.scale(lettersImagesNormalized[0]), lettersImagesNormalized[1]]

    allImages = []
    for i in digitsImagesNormalized[0]:
        allImages.append(i)
    for i in lettersImagesNormalized[0]:
        allImages.append(i)

    # Compute PCA - dimensionality reduction of the data. :)
    pca = computePCA(allImages)
    digitstransformedData = pca.transform(digitsImagesNormalized[0])
    letterstransformedData = pca.transform(lettersImagesNormalized[0])

    # Split into train and test sets.
    dtrainDataTF, dtestDataTF, dclassesTrainTF, dclassesTestTF = train_test_split(
        digitstransformedData, digitsImagesNormalized[1], train_size=0.65)
    ltrainDataTF, ltestDataTF, lclassesTrainTF, lclassesTestTF = train_test_split(
        letterstransformedData, lettersImagesNormalized[1], train_size=0.65)

    return ([[dtrainDataTF, dclassesTrainTF], [dtestDataTF, dclassesTestTF]],
            [[ltrainDataTF, lclassesTrainTF], [ltestDataTF, lclassesTestTF]])
def scaled_logistic_regression(x_train, t_train, x_test, t_test):
    x_train_new = preprocessing.scale(x_train)
    x_test_new = preprocessing.scale(x_test)
    return logistic_regression(x_train_new, t_train, x_test_new, t_test)
def extractFeatures(data, n): logging.info('Features: extracting {0}...'.format(n)) # create DF columns = [] col_names = ['open', 'high', 'low', 'close', 'volume'] for col_name in col_names: for m in xrange(1, n+1): columns.append('{0}_{1}'.format(col_name, m)) # pprint(columns) df = pd.DataFrame(dtype=float, columns=columns) pb = ProgressBar(maxval=len(data)).start() for i in xrange(n, len(data)+1): pb.update(i) slice = data.ix[i-n:i] # print slice scale(slice, axis=0, copy=False) # print slice cntr = 0 item = {} for slice_index, slice_row in slice.iterrows(): cntr += 1 # print slice_index # print slice_row for col in slice.columns: item['{0}_{1}'.format(col, cntr)] = slice_row[col] # pprint(item) df.loc[i] = item # break pb.finish() logging.info('Features: extracted') return df
def main(): X, Y, X_test = import_data() X_n = preprocessing.scale(X) X_t_n = preprocessing.scale(X_test) X_train, X_test, y_train, y_test = cross_validation.train_test_split( \ X_n, Y, test_size=0.2, random_state=0) alpha = np.arange(0.001, 2.0, 0.001, np.float) best_alpha = 0 best_score = 0 for a in alpha: clf = linear_model.Ridge (alpha = a) clf.fit(X_train, y_train) sc = clf.score(X_test, y_test) if sc > best_score: best_alpha = a best_score = sc clf = linear_model.Ridge (alpha = best_alpha) clf.fit(X_train, y_train) res = clf.predict(X_t_n) for var in res: print(var[0])
def main(): """TODO: Docstring for main. :returns: TODO """ alpha = 1. decay = 0.0006 iter_num = 600 finetune_iter = 220 hyper_params = { 'hidden_layers_sizes':[196,], 'iter_nums':[400,], 'alphas':[1.,], 'decays':[0.003,], 'betas':[3,], 'rhos':[0.1,] } enc = OneHotEncoder(sparse=False) mnist = fetch_mldata('MNIST original', data_home='./') x_train, x_test, y_train, y_test = \ train_test_split(scale(mnist.data.astype(float)).astype('float32'), mnist.target.astype('float32'), test_size=0.5, random_state=0) x_unlabeled = scale(mnist.data[mnist.target>=5,:].astype(float)).astype('float32') y_train = enc.fit_transform(y_train.reshape(y_train.shape[0],1)).astype('float32') t_x = T.matrix() params, extracted = pretrain_sae(x_unlabeled, hyper_params) extracted = function(inputs=[t_x], outputs=[sae_extract(t_x, params)])(x_train)[0] params.append(train_softmax(extracted, y_train, iter_num, alpha, decay)) weights = finetune_sae(x_train, y_train, params, finetune_iter, alpha, decay) all_label = np.array(range(0, 10)) pred = all_label[softmax2class_max(sae_predict(x_test, weights))] print accuracy_score(y_test, pred) print classification_report(y_test, pred) print confusion_matrix(y_test, pred)
def get_correlation_data(self, round_number, liste_id, dataset):
    points = []
    # First retrieve the vote percentages for the given list
    poll_data = self.retrieve_total_votes_for_liste(round_number, liste_id)

    # Arrange the data into a clean structure
    data_x, data_y = [], []
    for dept_data in poll_data:
        data_x.append(dept_data["vote_percentage"])
        data_y.append(dataset[dept_data["_id"]] / 100)
        points.append({"dept_id": dept_data["_id"],
                       "votes_percentage": dept_data["vote_percentage"],
                       "other_percentage": dataset[dept_data["_id"]] / 100})

    array_x, array_y = array(data_x), array(data_y)

    # Standardize the vote data and the external dataset
    rescaled_x, rescaled_y = preprocessing.scale(array_x), preprocessing.scale(array_y)

    # Compute the colour for each departement
    colors, max_val = self._compute_colors(rescaled_x, rescaled_y)

    # On the non-normalized data, compute the coefficients of the regression line
    reg_slope, reg_y_intercept = self._linear_regression(array_x, array_y)

    for i, x in enumerate(rescaled_x):
        points[i]["votes_normalized"] = rescaled_x[i]
        points[i]["other_normalized"] = rescaled_y[i]
        points[i]["color"] = colors[i]

    return {"points": points,
            "graph_metadata": {"max": max_val,
                               "regression": {"slope": reg_slope,
                                              "intercept": reg_y_intercept}}}
def load_all_data(f_name, scale=True, rnd=False):
    """Get data with labels, split into training, validation and test set."""
    data_file = h5py.File(f_name, 'r')
    x_test = data_file['x_test'][:]
    x_dev = data_file['x_dev'][:]
    x_train = data_file['x_train'][:]
    data_file.close()

    if scale:
        print("scaling...")
        x_test = preprocessing.scale(x_test, with_mean=False)
        x_dev = preprocessing.scale(x_dev, with_mean=False)
        x_train = preprocessing.scale(x_train, with_mean=False)

    print("Total dataset size:")
    print("n train samples: %d" % x_train.shape[0])
    print("n test samples: %d" % x_test.shape[0])
    print("n dev samples: %d" % x_dev.shape[0])
    print("n features: %d" % x_test.shape[1])

    if rnd:
        print("Randomizing training set...")
        np.random.shuffle(x_train)

    return dict(
        x_train=x_train,
        x_test=x_test,
        x_dev=x_dev,
    )
def permutation_cross_validation(estimator, X, y, n_fold=3, isshuffle=True, cvmeth='shufflesplit', score_type='r2', n_perm=1000): """ An easy way to evaluate the significance of a cross-validated score by permutations ------------------------------------------------- Parameters: estimator: linear model estimator X: IV y: DV n_fold: fold number cross validation cvmeth: kfold or shufflesplit. shufflesplit is the random permutation cross-validation iterator score_type: scoring type, 'r2' as default n_perm: permutation numbers Return: score: model scores permutation_scores: model scores when permutation labels pvalues: p value of permutation scores """ try: from sklearn import cross_validation, preprocessing except ImportError: raise Exception('To call this function, please install sklearn') if X.ndim == 1: X = np.expand_dims(X, axis = 1) if y.ndim == 1: y = np.expand_dims(y, axis = 1) X = preprocessing.scale(X) y = preprocessing.scale(y) if cvmeth == 'kfold': cvmethod = cross_validation.KFold(y.shape[0], n_fold, shuffle = isshuffle) elif cvmeth == 'shufflesplit': testsize = 1.0/n_fold cvmethod = cross_validation.ShuffleSplit(y.shape[0], n_iter = 100, test_size = testsize, random_state = 0) score, permutation_scores, pvalues = cross_validation.permutation_test_score(estimator, X, y, scoring = score_type, cv = cvmethod, n_permutations = n_perm) return score, permutation_scores, pvalues
def try_lvc_clf(train_X,train_y,test_X,test_y): train_X=scale(train_X) lvc=LinearSVC(C=0.1) lvc.fit(train_X,train_y) dec_y=lvc.decision_function(train_X) #choose the smallest 90% num_sel=int(len(dec_y)*0.8) assert len(dec_y)==train_X.shape[0] assert num_sel<=train_X.shape[0] s_idx=np.argsort(np.abs(dec_y)) assert len(s_idx)==train_X.shape[0] for i in s_idx: if np.isnan(train_y[i])==True: print("smoking index:%s"%i) n_train_X=train_X[s_idx[0:num_sel],:] n_train_y=train_y[s_idx[0:num_sel]] n_train_X=scale(n_train_X) lvc.fit(n_train_X,n_train_y) test_X=scale(test_X) pred_y=lvc.predict(test_X) return pred_y
def get_feature_importances(data_table, obs_metadata, lines_table, use_con_flux=False): feature_importances_list = [] X_colnames = None for line_name, line_wavelength in lines_table['source', 'wavelength_target']: subset = data_table[(data_table['source'] == line_name) & (data_table['wavelength_target'] == line_wavelength)] X, y, labels = get_X_and_y(subset, obs_metadata, use_con_flux) if X_colnames is None: X_colnames = X.colnames params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 1, 'learning_rate': 0.01, 'loss': 'lad'} clf = ensemble.GradientBoostingRegressor(**params) X = ndarrayidze(X) # Scaling is optional, but I think I'm going to do it (for now) for all methods, # just in comparing between valued here and with e.g. ICA there are fewer diffs X = skpp.scale(X) y = skpp.scale(y) clf.fit(X, y) feature_importances_list.append(clf.feature_importances_) fi = np.array(feature_importances_list) fi_table = Table(fi, names = X_colnames) fi_table.add_column(lines_table['source']) fi_table.add_column(lines_table['wavelength_target']) return fi_table
def scale(self):
    # FIXME: this cannot work this way, scaling must be done with
    # the joined set.
    if self.X is not None:
        self.X = preprocessing.scale(self.X)
    if self.X_test is not None:
        self.X_test = preprocessing.scale(self.X_test)
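# One way to resolve the FIXME above: fit a single StandardScaler on the training
# set and reuse its mean/std for the test set, so both splits share one transform.
# A minimal self-contained sketch, with made-up arrays standing in for self.X / self.X_test:
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])   # stand-in for self.X
X_test = np.array([[1.5, 15.0], [2.5, 25.0]])           # stand-in for self.X_test

scaler = StandardScaler()
X = scaler.fit_transform(X)        # learn mean/std on the training data only
X_test = scaler.transform(X_test)  # apply the same parameters to the test data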
from sklearn import preprocessing
import numpy as np

data = np.array([[2.2, 5.9, -1.8], [5.4, -3.2, -5.1], [-1.9, 4.2, 3.2]])
data

bindata = preprocessing.Binarizer(threshold=1.5).transform(data)
bindata

# Mean removal
data.mean(axis=0)  # array([ 1.9 , 2.3 , -1.23333333])
data.std(axis=0)   # highly variable: array([2.98775278, 3.95052739, 3.41207008])
# so,
scaled_data = preprocessing.scale(data)
scaled_data.mean(axis=0)  # array([0.00000000e+00, 0.00000000e+00, 7.40148683e-17])
scaled_data.std(axis=0)   # array([1., 1., 1.])

# Scaling (work with the same data)
data
minmax_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_minmax = minmax_scaler.fit_transform(data)
data_minmax

# Normalization
# bringing the values of each feature vector on a common scale
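# The walkthrough above stops right before the normalization step; a minimal
# continuation on the same `data` array, using sklearn's normalize helper:
data_l1 = preprocessing.normalize(data, norm='l1')  # each row sums to 1 in absolute value
data_l2 = preprocessing.normalize(data, norm='l2')  # each row has unit Euclidean length
print(data_l1)
print(data_l2)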
# k means clustering for hand written digits classification
# on dataset from sklearn
# learning ML with https://www.techwithtim.net/tutorials/machine-learning-python/k-means-1/

import numpy as np
import sklearn
from sklearn.preprocessing import scale
from sklearn.datasets import load_digits
from sklearn.cluster import KMeans
from sklearn import metrics

digits = load_digits()
# using scale to standardize the data (zero mean, unit variance per feature)
data = scale(digits.data)
y = digits.target

k = 10
samples, features = data.shape


def bench_k_means(estimator, name, data):
    estimator.fit(data)
    print('%-9s\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, estimator.inertia_,
             metrics.homogeneity_score(y, estimator.labels_),
             metrics.completeness_score(y, estimator.labels_),
             metrics.v_measure_score(y, estimator.labels_),
             metrics.adjusted_rand_score(y, estimator.labels_),
             metrics.adjusted_mutual_info_score(y, estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_, metric='euclidean')))
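# The benchmark helper above is never invoked in this excerpt; a plausible call,
# following the same tutorial's pattern, would be:
clf = KMeans(n_clusters=k, init="random", n_init=10)
bench_k_means(clf, "1", data)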
print("Starting preprocessing filtered tweets") tweets_filtered['ekphrasis_text'] = tweets_filtered['text'].progress_apply(ekphrasis_preprocessing) print('time taken:', str(time.time() - start_time), 'seconds') import os #print('GENSIM_DATA_DIR', os.environ['GENSIM_DATA_DIR'] ) start_time = time.time() print('loading glove') glove_twitter = api.load("glove-twitter-200") print('time taken:', str(time.time() - start_time), 'seconds') start_time = time.time() print('calculating embeddings') filtered_data_vecs_glove_mean = scale(np.concatenate([get_w2v_general(z, 200, glove_twitter,'mean') for z in tqdm(tweets_filtered["text"])])) print('time taken:', str(time.time() - start_time), 'seconds') print('per tweet:', (time.time() - start_time)/tweets_filtered.shape[0], 'seconds') for column in ["is_unemployed", "lost_job_1mo", "job_search", "is_hired_1mo", "job_offer"]: print('\n\n!!!!!', column) # start = time.time() # learner = create_model(column, best_epochs[column]) # print('load model:', str(time.time() - start_time), 'seconds') # print('Predictions of Filtered Tweets:') # start_time = time.time() # predictions_filtered = learner.predict_batch(tweets_filtered['text'].values.tolist())
plt.axis('tight') plt.xlabel('log alpha') plt.ylabel('coefficients') plt.title('coefficient trajectories for ' + name + ' regression at each alpha value') ''' LASSO regression ''' lasso = Lasso(max_iter=10000, normalize=True) coefs = [] for a in alphas: lasso.set_params(alpha=a) lasso.fit(scale(X), y) coefs.append(lasso.coef_) plot_coefs(coefs, 'LASSO') lassocv = LassoCV(alphas=None, cv=10, max_iter=100000, normalize=True) lassocv.fit(X, y) LASSO_best_alpha = lassocv.alpha_ lasso.set_params( alpha=LASSO_best_alpha ) # fit LASSO regression with best alpha value after perform 10 folds CV lasso.fit(X, y) best_LASSO_MSE = mean_squared_error(y, lasso.predict(X)) LASSO_best_coefs = pd.Series(lasso.coef_, index=X.columns) print("Best coefficients for LASSO regression: \n", LASSO_best_coefs) ''' Ridge regression
def X_transcriptome(base, index, standardize=True):
    X = pd.read_pickle(base + r'/trscr.pkl')
    if standardize:
        # scale() returns an ndarray; assign it back so the DataFrame keeps its index/columns
        X[:] = preprocessing.scale(X)
    return X.loc[index]
forecast_col = 'Adj. Close' df.fillna(-999999, inplace=True) #Predict data 1% of the length of the dataframe in advance. forecast_out = int(math.ceil(0.01*len(df))) #print("Days in advance: "+ str(forecast_out)) df['label'] = df[forecast_col].shift(-forecast_out) df.dropna(inplace=True) #Features column, drop the label in our data set. X = np.array(df.drop(['label'], 1)) y = np.array(df['label']) X = preprocessing.scale(X) #Increases processing time. y = np.array(df['label']) #print("x is this long: "+ str(len(x))) #print("Y is this long: " + str(len(y))) X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.2) clf = LinearRegression() clf.fit(X_train, y_train) accuracy = clf.score(X_test, y_test) print(accuracy)
]]

# Lesson 3 - Replace Nan data to -9999. Create label and forecast out.
forecast_col = 'Adj. Close'
training_data.fillna(-9999, inplace=True)
forecast_out = int(math.ceil(0.01 * len(training_data)))
training_data['label'] = training_data[forecast_col].shift(-forecast_out)
training_data.dropna(inplace=True)

# Lesson 3-4 - Regression training and testing.
training_data.dropna(inplace=True)
X = np.array(training_data.drop(['label'], 1))
X = preprocessing.scale(X)
X_lately = X[-forecast_out:]
X = X[:-forecast_out]

training_data.dropna(inplace=True)
y = np.array(training_data['label'])
y_lately = y[-forecast_out:]
y = y[:-forecast_out]

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2)

# Linear regression
classifier = LinearRegression(n_jobs=-1)
classifier.fit(X_train, y_train)
accuracy = classifier.score(X_test, y_test)
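# In the tutorial snippet above, preprocessing.scale() is applied to the full feature
# matrix before the train/test split. A common alternative is to bundle the scaler with
# the model in a Pipeline so its parameters are learned from the training fold only.
# A minimal sketch, assuming X_train / X_test hold the raw (unscaled) features:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

pipe = make_pipeline(StandardScaler(), LinearRegression(n_jobs=-1))
pipe.fit(X_train, y_train)           # scaler and regressor both fit on the training data
accuracy = pipe.score(X_test, y_test)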
if file[:3] == 'sfc' and file[-5:] == '.grib':
    inputfile = os.path.join(rootpath, file)
    sfcfile = Nio.open_file(inputfile, 'r')
    # Parameter 0 means the forecast at time step 0; this is just the list of the 2000 stations of one file.
    GetStationsAndOnetimesFromEC(ll, sfc_varinames, sfcfile, inputfile)

# training features
stationArray = numpy.array(stationsVlist)
# training labels
trainlebelArray = numpy.array(trainlebellist)

a_train, a_test = train_test_split(stationArray, test_size=0.33, random_state=7)

# standardize the data before training
x_scaled = preprocessing.scale(stationArray)
stationArray = x_scaled

# xgboost: split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(stationArray, trainlebelArray,
                                                     test_size=0.33, random_state=7)
xgbtrain = xgboost.DMatrix(x_train, label=y_train)
xgbtest = xgboost.DMatrix(x_test, label=y_test)
# xgbtrain.save_binary('train.buffer')
# print len(x_train),len(x_test),len(y_train),len(y_test)
# print xgbtest

# error rates on the training and evaluation sets
watchlist = [(xgbtrain, 'xgbtrain'), (xgbtest, 'xgbeval')]
params = {
    'booster': 'gbtree',
    # (body of the preceding cross-validation loop over k; the loop header is outside this excerpt)
    kn = KNeighborsClassifier(n_neighbors=k)
    kn.fit(X, Y)
    array = cross_val_score(estimator=kn, X=X, y=Y, cv=kf, scoring='accuracy')
    m = array.mean()
    kMeans.append(m)

m = max(kMeans)
indices = [i for i, j in enumerate(kMeans) if j == m]
print(indices[0] + 1)
print(np.round(m, decimals=2))

# Scale the features with sklearn.preprocessing.scale.
# Then find the optimal k on cross-validation again.
X_scale = scale(X)

kMeans = list()
for k in range(1, 51):
    kn = KNeighborsClassifier(n_neighbors=k)
    array = cross_val_score(estimator=kn, X=X_scale, y=Y, cv=kf, scoring='accuracy')
    m = array.mean()
    kMeans.append(m)

# Which value of k is optimal after bringing the features to a common scale?
m = max(kMeans)
num_one_targets = int(np.sum(targets_all)) zero_targets_counter = 0 indices_to_remove = [] for i in range(targets_all.shape[0]): if targets_all[i] == 0: zero_targets_counter +=1 if zero_targets_counter > num_one_targets: indices_to_remove.append(i) unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis = 0) targets_equal_priors = np.delete(targets_all, indices_to_remove, axis = 0) # Standardize the Data scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors) # Shuffle the Data shuffled_indices = np.arange(scaled_inputs.shape[0]) np.random.shuffle(shuffled_indices) shuffled_inputs = scaled_inputs[shuffled_indices] shuffled_targets = targets_equal_priors[shuffled_indices] # Splitting the Dataset into Train, Validation, Testing Dataset samples_count = shuffled_inputs.shape[0] train_samples_count = int(0.8*samples_count) validation_samples_count = int(0.1*samples_count) test_samples_count = samples_count - train_samples_count - validation_samples_count
# What features may distinguish cities? based on business sense and exploratory analysis num_list = [ 'duration', 'days_in_advance', 'orig_destination_distance', 'is_mobile', 'is_package', 'srch_adults_cnt', 'srch_children_cnt', 'srch_rm_cnt' ] city_data = sample.dropna(axis=0)[num_list + ['user_location_city']] city_groups = city_data.groupby( 'user_location_city').mean().reset_index().dropna(axis=0) # Step 2: shall I standardise the data? # What is the magnitude of data range? city_groups_std = city_groups.copy() for i in num_list: city_groups_std[i] = preprocessing.scale(city_groups_std[i]) # Step 3: select clustering method and number of clusters # The Elbow methods? choose a K so that the sum of the square error of the distances decrease drastically # using an ad-hoc k=3 here, there are methods to help derive the optimal number for k km = cluster.KMeans(n_clusters=3, max_iter=300, random_state=None) city_groups_std['cluster'] = km.fit_predict(city_groups_std[num_list]) # Principal Component Analysis pca = decomposition.PCA(n_components=2, whiten=True) pca.fit(city_groups[num_list]) city_groups_std['x'] = pca.fit_transform(city_groups_std[num_list])[:, 0] city_groups_std['y'] = pca.fit_transform(city_groups_std[num_list])[:, 1] plt.scatter(city_groups_std['x'], city_groups_std['y'],
def l2_norm(x):
    # Note: despite the name, this divides each row by its standard deviation
    # (scale with with_mean=False, with_std=True along axis=1), not by its L2 norm.
    return preprocessing.scale(x, axis=1, with_mean=False, with_std=True)
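# For comparison, row-wise L2 normalization in the strict sense would use
# sklearn.preprocessing.normalize; a minimal self-contained sketch:
import numpy as np
from sklearn import preprocessing

x = np.array([[3.0, 4.0], [1.0, 1.0]])
x_unit = preprocessing.normalize(x, norm='l2', axis=1)  # each row now has Euclidean length 1
print(x_unit)  # [[0.6, 0.8], [0.7071..., 0.7071...]]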
def run_SAM(in_data, skeleton=None, is_mixed=False, device="cpu", train=10000, test=1, batch_size=-1, lr_gen=.001, lr_disc=.01, lambda1=0.001, lambda2=0.0000001, nh=None, dnh=None, verbose=True, losstype="fgan", functionalComplexity="n_hidden_units", sampletype="sigmoidproba", dagstart=0, dagloss=False, dagpenalization=0.05, dagpenalization_increase=0.0, categorical_threshold=50, linear=False, numberHiddenLayersG=2, numberHiddenLayersD=2, idx=0): list_nodes = list(in_data.columns) if is_mixed: onehotdata = [] for i in range(len(list_nodes)): # print(pd.get_dummies(in_data.iloc[:, i]).values.shape[1]) if pd.get_dummies( in_data.iloc[:, i]).values.shape[1] < categorical_threshold: onehotdata.append(pd.get_dummies(in_data.iloc[:, i]).values) else: onehotdata.append(scale(in_data.iloc[:, [i]].values)) cat_sizes = [i.shape[1] for i in onehotdata] data = np.concatenate(onehotdata, 1) else: data = scale(in_data[list_nodes].values) cat_sizes = None nb_var = len(list_nodes) data = data.astype('float32') data = th.from_numpy(data).to(device) if batch_size == -1: batch_size = data.shape[0] lambda1 = lambda1 / data.shape[0] lambda2 = lambda2 / data.shape[0] rows, cols = data.size() # Get the list of indexes to ignore if skeleton is not None: skeleton = th.from_numpy(skeleton.astype('float32')) sam = SAM_generators((batch_size, cols), nh, skeleton=skeleton, cat_sizes=cat_sizes, linear=linear, numberHiddenLayersG=numberHiddenLayersG).to(device) sam.reset_parameters() g_optimizer = th.optim.Adam(list(sam.parameters()), lr=lr_gen) if losstype != "mse": discriminator = SAM_discriminator( cols, dnh, numberHiddenLayersD, mask=sam.categorical_matrix, ).to(device) discriminator.reset_parameters() d_optimizer = th.optim.Adam(discriminator.parameters(), lr=lr_disc) criterion = th.nn.BCEWithLogitsLoss() else: criterion = th.nn.MSELoss() disc_loss = th.zeros(1) if sampletype == "sigmoid": graph_sampler = SimpleMatrixConnection(len(list_nodes), mask=skeleton).to(device) elif sampletype == "sigmoidproba": graph_sampler = MatrixSampler(len(list_nodes), mask=skeleton, gumble=False).to(device) elif sampletype == "gumbleproba": graph_sampler = MatrixSampler(len(list_nodes), mask=skeleton, gumble=True).to(device) else: raise ValueError('Unknown Graph sampler') graph_sampler.weights.data.fill_(2) graph_optimizer = th.optim.Adam(graph_sampler.parameters(), lr=lr_gen) if not linear and functionalComplexity == "n_hidden_units": neuron_sampler = MatrixSampler((nh, len(list_nodes)), mask=False, gumble=True).to(device) neuron_optimizer = th.optim.Adam(list(neuron_sampler.parameters()), lr=lr_gen) _true = th.ones(1).to(device) _false = th.zeros(1).to(device) output = th.zeros(len(list_nodes), len(list_nodes)).to(device) data_iterator = DataLoader(data, batch_size=batch_size, shuffle=True, drop_last=True) # RUN if verbose: pbar = tqdm(range(train + test)) else: pbar = range(train + test) for epoch in pbar: for i_batch, batch in enumerate(data_iterator): if losstype != "mse": d_optimizer.zero_grad() # Train the discriminator drawn_graph = graph_sampler() if not linear and functionalComplexity == "n_hidden_units": drawn_neurons = neuron_sampler() if linear or functionalComplexity != "n_hidden_units": generated_variables = sam(batch, drawn_graph) else: generated_variables = sam(batch, drawn_graph, drawn_neurons) if losstype != "mse": disc_vars_d = discriminator(generated_variables.detach(), batch) true_vars_disc = discriminator(batch) if losstype == "gan": disc_loss = sum([criterion(gen, _false.expand_as(gen)) for gen in 
disc_vars_d]) / nb_var \ + criterion(true_vars_disc, _true.expand_as(true_vars_disc)) # Gen Losses per generator: multiply py the number of channels elif losstype == "fgan": disc_loss = th.mean(th.exp(disc_vars_d - 1), [0, 2]).sum( ) / nb_var - th.mean(true_vars_disc) disc_loss.backward() d_optimizer.step() ### OPTIMIZING THE GENERATORS g_optimizer.zero_grad() graph_optimizer.zero_grad() if not linear and functionalComplexity == "n_hidden_units": neuron_optimizer.zero_grad() if losstype == "mse": gen_loss = criterion(generated_variables, batch) else: disc_vars_g = discriminator(generated_variables, batch) if losstype == "gan": # Gen Losses per generator: multiply py the number of channels gen_loss = sum([ criterion(gen, _true.expand_as(gen)) for gen in disc_vars_g ]) elif losstype == "fgan": gen_loss = -th.mean(th.exp(disc_vars_g - 1), [0, 2]).sum() filters = graph_sampler.get_proba() struc_loss = lambda1 * drawn_graph.sum() if linear: func_loss = 0 else: if functionalComplexity == "n_hidden_units": func_loss = lambda2 * drawn_neurons.sum() elif functionalComplexity == "l2_norm": l2_reg = th.Tensor([0.]).to(device) for param in sam.parameters(): l2_reg += th.norm(param) func_loss = lambda2 * l2_reg regul_loss = struc_loss + func_loss # Optional: prune edges and sam parameters before dag search if dagloss and epoch > train * dagstart: dag_constraint = notears_constr(filters * filters) #dag_constraint = notears_constr(drawn_graph) loss = gen_loss + regul_loss + ( dagpenalization + (epoch - train * dagstart) * dagpenalization_increase) * dag_constraint else: loss = gen_loss + regul_loss if verbose and epoch % 20 == 0 and i_batch == 0: pbar.set_postfix(gen=gen_loss.item() / cols, disc=disc_loss.item(), regul_loss=regul_loss.item(), tot=loss.item()) if epoch < train + test - 1: loss.backward() if epoch >= train: output.add_(filters.data) g_optimizer.step() graph_optimizer.step() if not linear and functionalComplexity == "n_hidden_units": neuron_optimizer.step() return output.div_(test).cpu().numpy()
def classify(self, proto, df): from sklearn import preprocessing from sklearn.externals import joblib #from sklearn import preprocessing print('proto:', proto) #print(df) if proto == "tcp": tcp_packet = preprocessing.scale( df.drop(['ipv4src', 'ipv4dst'], axis=1)) tcpclf = joblib.load('tcp_clf_kn.pkl') result = tcpclf.predict(tcp_packet) result = result[0].split( '_') # result sth like 'http_norm_request' eth_tp = 0x0800 # ipv4 ip_pt = 6 # tcp if result[1] == 'norm': # for QoS, skip 1st 2nd HS flg = self.tcpFlg( df.tcpFlgint ) # useless, convert the flg bit back to readable str self.countHS = self.countHS + 1 # count for the tcp handshake info = ''.join( map(str, (self.countHS, ') ', df.ipv4src.values[0], ':', df.tcpSport.values[0], ' -> ', df.ipv4dst.values[0], ':', df.tcpDport.values[0], flg.values[0], '(', df.tcpFlgint.values[0], ')'))) print(info) # just print out the ip port tcpFlags if self.countHS < 5: # look into first 4 initial handshakes, if not SYN(2) or SYN/ACK(18) or ACK(16) or FIN/ACK(1/17) or FIN/PSH/ACK(25) if df.tcpFlgint.values[0] not in [ 1, 2, 16, 18, 17, 24, 25 ]: print('invalid 3 way handshake... blocked') return (99, eth_tp, ip_pt, "bad", df.ipv4src.values[0], df.ipv4dst.values[0], df.tcpSport.values[0], df.tcpDport.values[0]) if self.countHS > 4: self.countHS = 0 return (1, eth_tp, ip_pt, 'norm', df.ipv4src.values[0], df.ipv4dst.values[0], None, None) #if re.match(r'(http*)', result[0]): return (1, None, None, 'later', None, None, None, None ) # delay flow install, mon mon sin else: self.countHS = 0 if proto == "http": #df.drop(['Accept','Host','httpPath'],axis=1,inplace=True) empcol = [ 'Host', 'Accept', 'Connection_Keep-Alive', 'Connection_keep-alive', 'httpMethod_GET', 'httpMethod_POST', 'httpProto_HTTP/1.0', 'httpProto_HTTP/1.1', 'uAgentBrowser_Chrome', 'uAgentBrowser_Firefox', 'uAgentBrowser_Wget', 'uAgentBrowser_curl', 'uAgentOS_Linux', 'uAgentOS_Other', 'uAgentOS_Windows 7' ] empDF = pd.DataFrame(columns=empcol) new_features = pd.concat([empDF, pd.get_dummies(df)], axis=0, join_axes=[empDF.columns]).fillna(value=0) new_features['Host'].fillna(0, inplace=True) new_features['Accept'].fillna(0, inplace=True) new_features['Host'][new_features.Host != 0] = 1 new_features['Accept'][new_features.Accept != 0] = 1 #print(new_features) httpclf = joblib.load('http_clf_KN.pkl') result = httpclf.predict(new_features) print(result) return result[0] # return user agent to decide how to handle
os.makedirs(plotpath) for toy_label, toy_X in toy_dataset_list: print('\n##### Now running dataset %s through tier 1 #####' % toy_label) #Create directory if directory does not exist toy_filepath = '%s%s/' % (filepath, toy_label) toy_plotpath = '%splotly_js/' % toy_filepath if not os.path.exists(toy_filepath): os.makedirs(toy_filepath) if not os.path.exists(toy_plotpath): os.makedirs(toy_plotpath) toy_X_scaled = scale(toy_X) toy_X_rows, toy_X_cols = toy_X_scaled.shape #default gamma def_gamma = 1 / toy_X_cols #Tier1 gamma values #t1_gamma_list = [def_gamma/10000, def_gamma/1000, def_gamma/100, def_gamma/10, def_gamma, def_gamma*10, def_gamma*100, def_gamma*1000, def_gamma*10000, def_gamma*10000] #t1_gamma_list = [def_gamma/100, def_gamma/10, def_gamma] t1_gamma_list = [def_gamma] # Dict of gammas w/ t1 Matrices amat_dict = dict() #Scale initial data to centre
df['label'] = df[forcast_column].shift(-forecast_out) ############################################################################# # # HL_PCT PCT_change Adj. Close Adj. Volume label # Date # 2004-08-19 3.712563 0.324968 50.322842 44659000.0 214.973603 # 2004-08-20 0.710922 7.227007 54.322689 22834300.0 212.395645 # 2004-08-23 3.729433 -1.227880 54.869377 18256100.0 202.394773 # 2004-08-24 6.417469 -5.726357 52.597363 15247300.0 203.083148 # 2004-08-25 1.886792 1.183658 53.164113 9188600.0 207.686157 # ############################################################################# X = np.array(df.drop(['label'], 1)) X = preprocessing.scale(X) # normalize X X_lately = X[-forecast_out:] X = X[:-forecast_out:] df.dropna(inplace=True) y = np.array(df['label']) X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, test_size=0.2) # making 20% testing data LrModel = LinearRegression( ) # n_jobs = 10 parameter means it will work 10 jobs parallel LrModel.fit(X_train, y_train) accuracy = LrModel.score(X_test, y_test)
""" #Load the dataset auto_data = pd.read_csv("auto-data.csv") auto_data.dtypes auto_data.describe() auto_data.head() #Look at scatter plots plt.scatter(auto_data.HP, auto_data.PRICE) plt.cla() plt.scatter(auto_data['MPG-CITY'], auto_data['MPG-HWY']) plt.cla() #Center and scale from sklearn import preprocessing auto_data['HP'] = preprocessing.scale(auto_data['HP'].astype('float64')) auto_data['RPM'] = preprocessing.scale(auto_data['RPM'].astype('float64')) auto_data['MPG-CITY'] = preprocessing.scale( auto_data['MPG-CITY'].astype('float64')) auto_data['MPG-HWY'] = preprocessing.scale( auto_data['MPG-HWY'].astype('float64')) auto_data['PRICE'] = preprocessing.scale(auto_data['PRICE'].astype('float64')) auto_data.describe() """ In order to demonstrate the clusters being formed on a 2-dimensional plot, we will only use 100 samples and 2 attributes - HP and PRICE to create 4 clusters. """ from sklearn.cluster import KMeans
print(ink)
# MEAN
ink_mean = [np.mean(ink[labels == i]) for i in range(10)]
print(ink_mean)
# STANDARD DEV
ink_std = [np.std(ink[labels == i]) for i in range(10)]
print(ink_std)

print(zero_digits)
zero_digits_mean = [np.mean(zero_digits[labels == i]) for i in range(10)]
zero_digits_std = [np.std(zero_digits[labels == i]) for i in range(10)]
print(zero_digits_mean)
print(zero_digits_std)

ink = prep.scale(ink).reshape(-1, 1)
# scale the zero-count feature itself (the original scaled `ink` a second time here)
zero_digits = prep.scale(zero_digits).reshape(-1, 1)

x_ink = ink
x_ink_zero = pd.DataFrame(data=np.column_stack((ink, zero_digits)))
x_zero_digits = zero_digits

zero = np.array([sum(row) for row in zero])
zero = prep.scale(zero).reshape(-1, 1)
x_zero = zero

# <number>_extra => concat the ink feature with the zero count feature
zero_extra = np.array([np.count_nonzero(row == 0) for row in zero])
zero_extra = prep.scale(zero_extra).reshape(-1, 1)
x_zero_extra = pd.DataFrame(data=np.column_stack((zero, zero_extra)))
import numpy as np
import sklearn
from sklearn.preprocessing import scale
from sklearn.datasets import load_digits
from sklearn.cluster import KMeans
from sklearn import metrics

digits = load_digits()
data = scale(digits.data)  # standardize each feature to zero mean and unit variance
y = digits.target

k = len(np.unique(y))
samples, features = data.shape


def bench_k_means(estimator, name, data):
    estimator.fit(data)
    print('%-9s\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, estimator.inertia_,
             metrics.homogeneity_score(y, estimator.labels_),
             metrics.completeness_score(y, estimator.labels_),
             metrics.v_measure_score(y, estimator.labels_),
             metrics.adjusted_rand_score(y, estimator.labels_),
             metrics.adjusted_mutual_info_score(y, estimator.labels_),
             metrics.silhouette_score(data, estimator.labels_, metric='euclidean')))


clf = KMeans(n_clusters=k, init="random", n_init=10)
bench_k_means(clf, "1", data)
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']] df['HL_PCT'] = (df['Adj. High'] - df['Adj. Close']) / df['Adj. Close'] * 100.0 df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0 df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']] #label is future price Adj. Close of future #lets create the variable forecast_col = 'Adj. Close' df.fillna(-99999, inplace=True) #NA's treatement forcast_out = int(math.ceil(0.01 * len(df))) print(forcast_out) #10% of total time in future value to predict df['label'] = df[forecast_col].shift(-forcast_out) df.dropna(inplace=True) #features are everything except label x = np.array(df.drop(['label'], 1)) y = np.array(df['label']) #scale along the training data x = preprocessing.scale(x) #print(len(x),len(y)) X_train, X_test, Y_train, Y_test = cross_validation.train_test_split( x, y, test_size=0.2) clf = svm.SVR() #switching it to SVM #try svm.SVR(kernel='poly') clf.fit(X_train, Y_train) accuracy = clf.score(X_test, Y_test) print(accuracy)
knn_1 = KNeighborsClassifier(n_neighbors=5) knn_1.fit(X_train_minmax, y_train.values.ravel()) score = accuracy_score(y_test, knn_1.predict(X_test_minmax)) score """Why Normalization? Normalization rescales the values into a range of [0,1]. This might be useful in some cases where all parameters need to have the same positive scale. """ print(X_train_minmax) # Standardizing the train and test data from sklearn.preprocessing import scale X_train_scale = scale(X_train[[ 'Acousticness', 'Danceability', 'Energy', 'Instrumentalness', 'Key', 'Liveness', 'Loudness', 'Mode', 'PreviousHit', 'Speechiness', 'Tempo', 'Valence' ]]) X_test_scale = scale(X_test[[ 'Acousticness', 'Danceability', 'Energy', 'Instrumentalness', 'Key', 'Liveness', 'Loudness', 'Mode', 'PreviousHit', 'Speechiness', 'Tempo', 'Valence' ]]) print(X_train_scale) knn_2 = KNeighborsClassifier(n_neighbors=15) knn_2.fit(X_train_scale, y_train.values.ravel()) score = accuracy_score(y_test, knn_2.predict(X_test_scale)) score """Standardization is the process where the features are rescaled so that they’ll have the properties of a standard normal distribution with μ=0 and σ=1, where μ is the mean (average) and σ is the standard deviation from the mean.
# x = np.array([2.5,0.5,2.2,1.9,3.1,2.3,2,1,1.5,1.1])
# y = np.array([2.4,0.7,2.9,2.2,3,2.7,1.6,1.1,1.6,0.9])
x = np.array([1, 2, 3])
y = np.array([4, 5, 6])

x_mean = np.mean(x)
y_mean = np.mean(y)
scaled_x = x - x_mean
scaled_y = y - y_mean

# data = np.matrix([[scaled_x[i], scaled_y[i]] for i in range(len(scaled_x))])
data = np.matrix(list(zip(scaled_x, scaled_y)))

# scale() is a plain function, not a transformer object, so it cannot replace the scaler here
standard = StandardScaler()
data_standard = standard.fit_transform(np.array(list(zip(x, y))))

plt.scatter(scaled_x, scaled_y)
plt.scatter(x, y)
plt.show()

cov = np.cov(scaled_x, scaled_y)
cov = np.cov(data.T)
eig_val, eig_vec = np.linalg.eig(cov)
eig_pairs = [(np.abs(eig_val[i]), eig_vec[:, i]) for i in range(len(eig_val))]
eig_pairs.sort(reverse=True)
feature = eig_pairs[0][1]
new_data_reduced = np.transpose(np.dot(feature, np.transpose(data)))
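# A quick cross-check of the manual eigen-decomposition above using sklearn's PCA
# (same toy x/y points, one principal component); results should agree up to sign:
from sklearn.decomposition import PCA
import numpy as np

pca = PCA(n_components=1)
projected = pca.fit_transform(np.array(list(zip(x, y))))
print(pca.components_)  # should match the leading eigenvector (up to sign)
print(projected)        # should match new_data_reduced (up to sign)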
seed = 3453 np.random.seed(seed) split = 1 P = np.hstack(dataset['P'][range(0, split) + range(split + 1, 5)]) X = dataset['B'][P] Z = dataset['T'][P] #Z = Z.reshape(Z.shape[0], 1) train_labels = Z Ptest = dataset['P'][split] TX = dataset['B'][Ptest] TZ = dataset['T'][Ptest] #TZ = TZ.reshape(TZ.shape[0], 1) test_labels = TZ Z = scale(Z, axis=0) TZ = scale(TZ, axis=0) weights = [] batch_size = 25 #max_iter = max_passes * X.shape[ 0] / batch_size max_iter = 1000 n_report = X.shape[0] / batch_size stop = climin.stops.AfterNIterations(max_iter) pause = climin.stops.ModuloNIterations(n_report) optimizer = 'gd', {'step_rate': 0.001, 'momentum': 0} typ = 'plain' if typ == 'plain':
#supress scikit future warnings def warn(*args, **kwargs): pass import warnings warnings.warn = warn from numpy import mean, std from sklearn.preprocessing import StandardScaler, scale scaler = StandardScaler() scaler.fit(x) x_transformed = scaler.transform(x) x_scaled = scale(x) y_scaled = scale(y) # print(x_scaled) # print(x_transformed) #print(mean(x)) #print(mean(x_scaled)) #print(mean(x_transformed)) #print(std(x)) #print(std(x_scaled)) #print(std(x_transformed)) #print(mean(x_scaled)) #print(std(x_scaled)) from sklearn.linear_model import Lars from sklearn.model_selection import train_test_split
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns) #####K MEANS CLUSTER from sklearn.decomposition import PCA from sklearn.preprocessing import scale from scipy.spatial.distance import cdist, pdist, euclidean from sklearn.cluster import KMeans from sklearn import metrics X = Fullplayerlistf._get_numeric_data().dropna(axis=1) del X['UFA'] del X['Age'] df = pd.DataFrame(X) X = scale(X) Player = Fullplayerlistf['Player'] #DETERMINE # OF VARIABLES TO USE pca = PCA(n_components=28) pca.fit(X) var = pca.explained_variance_ratio_ var1 = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4) * 100) print(var1) plt.plot(var1) pca = PCA(n_components=21) pca.fit(X) X1 = pca.fit_transform(X) loadings_df = pd.DataFrame(pca.components_, columns=df.columns)
print('mode found by method 1:', argmaxcount1)
print('mode found by method 2:', argmaxcount2)

# median
medianumber = np.median(data)
print('median:', medianumber)

# range (max - min)
ptp = np.ptp(data)
print('range:', ptp)

# standard deviation
standard = np.std(data)
print('standard deviation:', standard)

# Preprocess the data so that it has zero mean and unit variance
data_preprocess = preprocessing.scale(data)
print(data_preprocess)
print('standard deviation of the processed data:', data_preprocess.std())

# Plot as a histogram (the old normed= argument was removed from matplotlib; density= replaces it)
plt.hist(data_preprocess,
         bins=40,
         density=False,
         facecolor="blue",
         edgecolor="black",
         alpha=0.7)
plt.xlabel("bins of the random data")
plt.ylabel("frequency of the random data")
plt.title("frequency histogram of the randomly generated data")
plt.show()
def feature_scale(x):
    # flatten each sample, standardize along axis=1 (per sample), then restore the shape
    b, h, w, c = x.shape
    x = scale(x.reshape([b, -1]), 1)
    return x.reshape([b, h, w, c])
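# Usage sketch for the helper above, assuming a small random batch of "images":
import numpy as np

batch = np.random.rand(2, 4, 4, 3).astype(np.float32)
scaled = feature_scale(batch)
# every individual sample now has (approximately) zero mean and unit variance
print(scaled.reshape(2, -1).mean(axis=1), scaled.reshape(2, -1).std(axis=1))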
timestep) == 0: #return remainder after division T1_result.append(T1) T2_result.append(T2) water_level_result.append(water_level) surface_runoff_result.append(surface_runoff / timestep * 1000) #mm/day subsurface_runoff_result.append(water_out_subsurface_runoff * daySec * 1000) #mm/day #try to avoid this append procedure, sloving down the code alot? #ADD SENSIBLE AND LATENT HEAT HERE SO YOU CAN PLOt THEM count = count + 1 #%% from sklearn import preprocessing surface_runoff_scaled = preprocessing.scale(surface_runoff_result) plt.figure() plt.plot(surface_runoff_result) plt.show() #%% a = np.zeros(10) print(a) new_a = a for i in range(0, len(a)): new_a[i] = i print(new_a) #%% '''Complete Water Balance''' '''
def long_features(pat, outfile, datapath, timer): f = datapath + "/*mat" pat_num = pat ff = glob.glob(f) label = [str(os.path.basename(n)) for n in ff] print(label) output = [] featureList = [] mytimer = [] bands = [0.1, 4, 8, 12, 30, 70] for j in range(16): mydata = [] for i in range(len(ff)): output = [] outputtimer = [] featureList = [] featureListimer = [] if os.path.basename(ff[i]) == "1_45_1.mat": continue data = get_data(ff[i]) data = preprocessing.scale(data, axis=1, with_std=True) featureList.append("File") # featureListimer.append('File') output.append(label[i]) # outputtimer.append(label[i]) featureList.append("pat") # featureListimer.append('pat') output.append(pat_num) # outputtimer.append(pat_num) welsh = [] hold = spsig.decimate(data[j, :], 5, zero_phase=True) # start = time.time() # featureList.append('sigma%i' % (j)) # output.append(hold.std()) total_time = time.time() - start featureListimer.append("sigma%i" % (j)) outputtimer.append(total_time) # start = time.time() featureList.append("kurt%i" % (j)) output.append(spstat.kurtosis(hold)) # total_time = time.time() - start # featureListimer.append('kurt%i'%(j)) # outputtimer.append(total_time) # start = time.time() featureList.append("skew%i" % (j)) output.append(spstat.skew(hold)) # total_time = time.time() - start # featureListimer.append('skew%i'%(j)) # outputtimer.append(total_time) # start = time.time() # featureList.append('zero%i'%(j)) # output.append(((hold[:-1] * hold[1:]) < 0).sum()) # total_time = time.time() - start # featureListimer.append('zero%i'%(j)) # outputtimer.append(total_time) diff = np.diff(hold, n=1) diff2 = np.diff(hold, n=2) # start = time.time() # featureList.append('sigmad1%i'%(j)) # output.append(diff.std()) # total_time = time.time() - start # featureListimer.append('sigmad1%i'%(j)) # outputtimer.append(total_time) # start = time.time() # featureList.append('sigmad2%i'%(j)) # output.append(diff2.std()) # total_time = time.time() - start # featureListimer.append('sigmad2%i'%(j)) # outputtimer.append(total_time) # start = time.time() featureList.append("zerod%i" % (j)) output.append(((diff[:-1] * diff[1:]) < 0).sum()) # total_time = time.time() - start # featureListimer.append('zerod%i'%(j)) # outputtimer.append(total_time) # start = time.time() # featureList.append('zerod2%i'%(j)) # output.append(((diff2[:-1] * diff2[1:]) < 0).sum()) # total_time = time.time() - start # featureListimer.append('zerod2%i'%(j)) # outputtimer.append(total_time) # start = time.time() featureList.append("RMS%i" % (j)) output.append(np.sqrt((hold**2).mean())) # total_time = time.time() - start # featureListimer.append('RMS%i'%(j)) # outputtimer.append(total_time) # start = time.time() f, psd = spsig.welch(hold, fs=80) print(f) print(psd) print("yes") # total_time = time.time() - start # welsh.append(total_time) psd[0] = 0 # start = time.time() featureList.append("MaxF%i" % (j)) output.append(psd.argmax()) # total_time = time.time() - start # featureListimer.append('MaxF%i'%(j)) # outputtimer.append(total_time) # start = time.time() featureList.append("SumEnergy%i" % (j)) output.append(psd.sum()) # total_time = time.time() - start # featureListimer.append('SumEnergy%i'%(j)) # outputtimer.append(total_time) psd /= psd.sum() for c in range(1, len(bands)): # start = time.time() featureList.append("BandEnergy%i%i" % (j, c)) output.append(psd[(f > bands[c - 1]) & (f < bands[c])].sum()) # total_time = time.time() - start # featureListimer.append('BandEnergy%i%i'%(j,c)) # outputtimer.append(total_time) # start = time.time() # 
featureList.append('entropy%i'%(j)) # output.append(-1.0*np.sum(psd[f>bands[0]]*np.log10(psd[f>bands[0]]))) # total_time = time.time() - start # featureListimer.append('entropy%i'%(j)) # outputtimer.append(total_time) # pdb.exit() # start = time.time() featureList.append("Mobility%i" % (j)) output.append(np.std(diff) / hold.std()) # total_time = time.time() - start # featureListimer.append('Mobility%i'%(j)) # outputtimer.append(total_time) # start = time.time() featureList.append("Complexity%i" % (j)) output.append(np.std(diff2) * np.std(hold) / (np.std(diff)**2.0)) # total_time = time.time() - start # featureListimer.append('Complexity%i'%(j)) # outputtimer.append(total_time) mydata.append( pd.DataFrame({ "Features": output }, index=featureList).T) # mytimer.append(pd.DataFrame({'Features':outputtimer},index=featureListimer).T) welsh_df = pd.DataFrame(welsh, columns=["value"]) trainSample = pd.concat(mydata, ignore_index=True) new_outfile = outfile[:-4] + "_" + str(j) + ".csv" trainSample.to_csv(new_outfile) return 1
else: # condition where student received an incomplete new.append(2) return(new) # 1-dimensional array returned X = df.drop('G3',1) # this is the design matrix y = list(df.G3) # this is the discrete response vector y_new = response_conv(y) # this is the multinomial response vector clf = DecisionTreeClassifier() clf.fit(X,y) model = SelectFromModel(clf,prefit=True) newX = model.transform(X) # select most influential predictors X_scale = preprocessing.scale(newX) # scaled design matrix X_norm = preprocessing.normalize(newX) # normalized design matrix random.seed(42) X1_train, X1_test, y1_train, y1_test = train_test_split(newX, y_new, test_size=0.33, random_state=42) X2_train, X2_test, y2_train, y2_test = train_test_split(X_scale, y_new, test_size=0.33, random_state=42) X3_train, X3_test, y3_train, y3_test = train_test_split(X_norm, y_new, test_size=0.33, random_state=42) ######################################################################################################################## combos = cartesian([['gini','entropy'],['best','random'],['auto','log2'],np.arange(1,(X1_train.shape[0]-1))]) def opt(X,y): acc = [] for c,s,mf,md in combos: dt = DecisionTreeClassifier(criterion=c,splitter=s,max_features=mf,max_depth=int(md),random_state=42) scores = cross_val_score(dt, X, y, cv=10, scoring='accuracy')
def normalize_data(tr_x, ts_x, normz=None, axis=0):
    # string comparison should use ==, not the identity operator `is`
    if normz == 'scale':
        tr_x = scale(tr_x, axis=axis)
        ts_x = scale(ts_x, axis=axis)
    elif normz == 'minmax':
        minmax_scaler = MinMaxScaler()
        if axis == 0:
            for c_i in range(tr_x.shape[1]):
                # MinMaxScaler expects 2D input, so reshape the column and flatten the result
                tr_x[:, c_i] = minmax_scaler.fit_transform(tr_x[:, c_i].reshape(-1, 1)).ravel()
                ts_x[:, c_i] = minmax_scaler.fit_transform(ts_x[:, c_i].reshape(-1, 1)).ravel()
        elif axis == 1:
            for r_i in range(tr_x.shape[0]):
                tr_x[r_i, :] = minmax_scaler.fit_transform(tr_x[r_i, :].reshape(-1, 1)).ravel()
                ts_x[r_i, :] = minmax_scaler.fit_transform(ts_x[r_i, :].reshape(-1, 1)).ravel()
    elif normz == 'sigmoid':
        if axis == 0:
            col_max = np.max(tr_x, axis=0)
            cols_non_norm = np.argwhere(col_max > 1).tolist()
            tr_x[:, cols_non_norm] = -0.5 + (1 / (1 + np.exp(-tr_x[:, cols_non_norm])))
            # TODO: implement col_max col_non_norm for test set
            ts_x[:, cols_non_norm] = -0.5 + (1 / (1 + np.exp(-ts_x[:, cols_non_norm])))
        elif axis == 1:
            row_max = np.max(tr_x, axis=1)
            rows_non_norm = np.argwhere(row_max > 1).tolist()
            tr_x[rows_non_norm, :] = -0.5 + (1 / (1 + np.exp(-tr_x[rows_non_norm, :])))
            # TODO: implement row_max row_non_norm for test set
            ts_x[rows_non_norm, :] = -0.5 + (1 / (1 + np.exp(-ts_x[rows_non_norm, :])))

    return tr_x, ts_x
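# The min-max branch above refits the scaler on the test columns; if the intent is to
# reuse the training-set minimum/maximum on the test set (as the TODOs in the sigmoid
# branch suggest), a compact alternative on whole 2-D arrays is the following sketch:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

tr_x = np.array([[0.0, 10.0], [5.0, 20.0], [10.0, 30.0]])  # stand-in training features
ts_x = np.array([[2.5, 15.0], [7.5, 25.0]])                # stand-in test features

minmax_scaler = MinMaxScaler()
tr_x = minmax_scaler.fit_transform(tr_x)  # learn per-column min/max on the training set
ts_x = minmax_scaler.transform(ts_x)      # reuse the same min/max for the test set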