def readTestData():
    testData = np.loadtxt('data/test.csv', delimiter=',', skiprows=1)
    xTest = testData[:, 1:31]
    scale = MMS()
    allX = scale.fit_transform(xTest)
    indexTest = list(testData[:, 0])
    return [allX, indexTest]
def test_min_max_scaler_iris():
    X = iris.data
    scaler = MinMaxScaler()
    # default params
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 0)
    assert_array_almost_equal(X_trans.max(axis=0), 1)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # not default params: min=1, max=2
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 1)
    assert_array_almost_equal(X_trans.max(axis=0), 2)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # min=-.5, max=.6
    scaler = MinMaxScaler(feature_range=(-.5, .6))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), -.5)
    assert_array_almost_equal(X_trans.max(axis=0), .6)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # raises on invalid range
    scaler = MinMaxScaler(feature_range=(2, 1))
    assert_raises(ValueError, scaler.fit, X)
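A minimal sketch (added here, not part of the original test) that checks the closed-form mapping MinMaxScaler applies against a hand computation on toy data:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 40.0]])
feature_range = (-0.5, 0.6)

scaler = MinMaxScaler(feature_range=feature_range)
X_sk = scaler.fit_transform(X)

# Manual equivalent: rescale each column to [0, 1], then into feature_range.
lo, hi = feature_range
X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_manual = X_std * (hi - lo) + lo

assert np.allclose(X_sk, X_manual)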
def sample_from_generator(history, nb_samples, latent_dim=12, valid_split=0.3, random_split=True, hidden_dims=None, **kwargs): scaler = MinMaxScaler() scaler.fit(history) scaled = scaler.transform(history) nb_train = history.shape[0] if not valid_split: nb_valid = 0 elif isinstance(valid_split, float): nb_valid = nb_train - int(np.floor(nb_train*valid_split)) else: nb_valid = valid_split if nb_valid > 0: if random_split: ind = np.arange(nb_train) np.random.shuffle(ind) x_valid = scaled[ind[-nb_valid:], :] x_train = scaled[ind[:-nb_valid], :] else: x_valid = scaled[-nb_valid:, :] x_train = scaled[:-nb_valid, :] else: x_valid = None x_train = scaled _, generator = build_model(latent_dim, x_train, x_valid=x_valid, hidden_dims=hidden_dims, **kwargs) normal_sample = np.random.standard_normal((nb_samples, latent_dim)) draws = generator.predict(normal_sample) return scaler.inverse_transform(draws)
mpl.rc('figure', figsize=(8, 7)) from matplotlib.pylab import rcParams rcParams['figure.figsize'] = 20, 10 from sklearn.preprocessing import MinMaxScaler from sklearn.neural_network import MLPClassifier as MLP import pandas_datareader.data as web from pandas import Series, DataFrame from sklearn.linear_model import LinearRegression import datetime, math from sklearn.neighbors import KNeighborsRegressor as knn import matplotlib.dates as mdates clf = LinearRegression() #n_jobs=-1) days = 240 sight = 480 scaler = MinMaxScaler(feature_range=(0, 1)) start = datetime.datetime(2010, 1, 1) end = datetime.datetime.today() + datetime.timedelta(days=days) dayss = (end - start).days predicted_list = [end - datetime.timedelta(days=x) for x in range(days)] predicted_list.reverse() stock = input("Stock: ").upper() df = web.DataReader(stock, 'yahoo', start, end) data = df['Adj Close'] X, y = [], [] fig, ax = plt.subplots() formatter = mdates.DateFormatter("%Y") date_list = list(data.reset_index()["Date"]) for i in range(0, len(data) - (sight + 1)):
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

data = pd.read_csv('monthly-milk-production-pounds.csv', index_col='Month')
print(data.head())
data.plot()
plt.show()

data.index = pd.to_datetime(data.index)
train_data = data.head(156)
test_data = data.tail(12)

scl = MinMaxScaler()
train_scaled = scl.fit_transform(train_data)
test_scaled = scl.transform(test_data)
print(train_scaled)
print(test_scaled)


def next_batch(training_data, steps):
    random_start = np.random.randint(0, len(training_data) - steps)
    y_data = np.array(training_data[random_start:random_start + steps + 1]).reshape(1, steps + 1)
    return y_data[:, :-1].reshape(-1, steps, 1), y_data[:, 1:].reshape(-1, steps, 1)


num_inputs = 1
num_outputs = 1
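Once a model is trained on the scaled series, its outputs live in the [0, 1] range; a hedged sketch of mapping them back to the original units with the same fitted scaler (`preds` is only a placeholder for real model output):

preds = test_scaled.copy()  # placeholder for model predictions, shape (n, 1), still scaled
preds_original_units = scl.inverse_transform(preds)
print(preds_original_units[:5])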
import matplotlib.pyplot as plt
import pandas as pd

train_date = pd.Timestamp('2015-06-20')
train = naver2.loc[:train_date, ['Close']]
test = naver2.loc[train_date:, ['Close']]

ax = train.plot()
test.plot(ax=ax)
plt.legend(['train', 'val', 'test'])
plt.show()

from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()
train_sc = sc.fit_transform(train)
test_sc = sc.transform(test)
train_sc.shape  # train_sc is a NumPy array; use DataFrame methods only on the frames built below

train_sc_df = pd.DataFrame(train_sc, columns=['Scaled'], index=train.index)
test_sc_df = pd.DataFrame(test_sc, columns=['Scaled'], index=test.index)
train_sc_df.head()

for s in range(1, 13):
    train_sc_df['shift_{}'.format(s)] = train_sc_df['Scaled'].shift(s)
    test_sc_df['shift_{}'.format(s)] = test_sc_df['Scaled'].shift(s)
sel_rows = df_merged_volume[lambda r: ((r.timeofday >= 6) & (r.timeofday < 10)) | ( (r.timeofday >= 15) & (r.timeofday < 19))] sel_rows = sel_rows[useful_cols] #split to train and test set train_rows = sel_rows[:-24 * 7] test_rows = sel_rows[-24 * 7:] #reserve 1 week for test #get numpy array from panda dataframe train_arr = train_rows.values test_arr = test_rows.values #scale feature array to range -1 to 1 scaler = MinMaxScaler(feature_range=(-1, 1)) scaler = scaler.fit(train_arr) train_scaled_arr = scaler.transform(train_arr) test_scaled_arr = scaler.transform(test_arr) #sample subsequence from the time series train_seqs = [] nSegments = train_arr.shape[ 0] // 12 # each segment holds 4hr data (12 datapoints, 20min each) for segment in range(nSegments): for t in range(6): startIdx = segment * 12 + t train_seqs.append(train_scaled_arr[startIdx:startIdx + 7]) train_seqs = np.stack(train_seqs) test_seqs = []
data.describe()

# Create dummy variables for categorical features
data['area code'] = data['area code'].astype(str)
categorical_columns = ['state', 'area code']
df_dummies = pd.get_dummies(data[categorical_columns])
data = pd.merge(data, df_dummies, how="inner", left_index=True,
                right_index=True).drop(columns=categorical_columns)

# Min-Max Scaling
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
column_list_for_scaling = data.columns.tolist()[:17]
data[column_list_for_scaling] = min_max_scaler.fit_transform(
    data[column_list_for_scaling])

X = data.drop(['churn'], axis=1)
y = data.churn

# CLASSIFIER SELECTION PIPELINE
# https://www.kaggle.com/sandipdatta/customer-churn-analysis
# from "Stratified Cross Validation - Since the Response values are not balanced" on
# ensemble.GradientBoostingClassifier
# svm.SVC
# ensemble.RandomForestClassifier
# neighbors.KNeighborsClassifier
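Scaling a hand-picked list of columns works, but a ColumnTransformer keeps the column selection and the scaler in one object; a hedged sketch reusing the names above (it assumes the churn target is not among the scaled columns):

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

numeric_cols = column_list_for_scaling          # columns to scale, as above
preprocess = ColumnTransformer(
    transformers=[('minmax', MinMaxScaler(), numeric_cols)],
    remainder='passthrough')                    # leave dummy columns untouched
X_prepared = preprocess.fit_transform(X)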
class PrepareDataset: def __init__(self, path_to_data, dataset_name='full_dataset.csv'): self.path_to_data = path_to_data self.dataset_name = dataset_name # self.pred_template_name = pred_template_name self.initial_df = pd.read_csv(os.path.join(self.path_to_data, self.dataset_name)) # self.pred_template = pd.read_csv(os.path.join(self.path_to_data, self.pred_template_name)) self.time_periods = np.unique(self.initial_df['time_period'], return_index=True)[0] self.time_periods_index = np.unique(self.initial_df['time_period'], return_index=True)[1] self.transformed_df = None self.scaler = None self.transformed_scaled_df = None self.pca = None def collapse_basic_indicators(self, df=None, indicators_range=range(1, 71)): if df is None: df = copy.copy(self.initial_df) print('Begin to collapse basic indicators') print('{} indicators were selected'.format(indicators_range)) variable_list = ["X" + str(i) + '_' for i in indicators_range] for var in variable_list: df[var + 'avg'] = df.filter(regex=var).mean(axis=1) for var in variable_list: df[var + 'std'] = df.filter(regex=var).std(axis=1) for var in variable_list: df[var + 'avg' + '_pctile'] = stat.rankdata(df[var + 'avg']) / df[var + 'avg'].shape[0] for var in variable_list: df[var + 'std' + '_pctile'] = stat.rankdata(df[var + 'std']) / df[var + 'std'].shape[0] model_data_new = pd.concat([df.iloc[:, 0:5], df.filter(regex='avg'), df.filter(regex='std'), ], axis=1) model_data_new['is_second_half'] = model_data_new['time_period'].apply(lambda x: 1 if x.endswith('1') else 0) model_data_new.fillna(0, inplace=True) self.transformed_df = model_data_new print('Basic indicators collapsed') def drop_outliers(self, quantile = None): if quantile is None: quantile = 0.001 print('Start to drop rows with outliers. {} quantile will be removed from each side'.format(quantile/2.)) idx_to_drop = [] features_list = list(self.transformed_df.filter(regex='avg$').columns) for col in features_list: condition1 = self.transformed_df.Train == 1 condition2 = self.transformed_df[col] > self.transformed_df[col].quantile(1. - quantile/2.) condition3 = self.transformed_df[col] < self.transformed_df[col].quantile(quantile/2.) to_drop = list(self.transformed_df[(condition1) & (condition2)].index) idx_to_drop += to_drop to_drop = list(self.transformed_df[(condition1) & (condition3)].index) idx_to_drop += to_drop self.transformed_df = self.transformed_df.drop(list(set(idx_to_drop))) print('Done. 
{} rows were removed'.format(len(list(set(idx_to_drop))))) def add_tech_indicators(self, df=None): if df is None: df = self.transformed_df if self.transformed_df is not None else self.initial_df print('Start to form basic indicators') data_agg_avg = df.groupby(['time_period']).mean() period_agg_df = pd.DataFrame(index=self.time_periods) period_agg_df['Close'] = data_agg_avg['Norm_Ret_F6M'] # Shift Norm_Ret_F6M to the next period to avoid future looking period_agg_df['Close'] = period_agg_df['Close'].shift(1) period_agg_df['Close'].fillna(method='bfill', inplace=True) period_agg_df = ti.MA(period_agg_df, 2) period_agg_df = ti.MA(period_agg_df, 3) period_agg_df = ti.EMA(period_agg_df, 2) period_agg_df = ti.EMA(period_agg_df, 3) period_agg_df = ti.MOM(period_agg_df, 2) period_agg_df = ti.MOM(period_agg_df, 3) period_agg_df = ti.ROC(period_agg_df, 2) period_agg_df = ti.ROC(period_agg_df, 3) period_agg_df = ti.MACD(period_agg_df, 2, 3) period_agg_df = ti.KST(period_agg_df, 1, 2, 3, 4, 1, 2, 3, 4) period_agg_df = ti.TSI(period_agg_df, 2, 2) period_agg_df = ti.COPP(period_agg_df, 2) period_agg_df = ti.COPP(period_agg_df, 3) period_agg_df = ti.STDDEV(period_agg_df, 2) period_agg_df = ti.STDDEV(period_agg_df, 3) # concatenate technical indicator with the transformed dataset df = df.join(period_agg_df, on='time_period') df.fillna(method='ffill', inplace=True) df.fillna(method='bfill', inplace=True) self.transformed_df = df print('Done') def generate_synthetic_indicators(self, types=None): features_list = list(self.transformed_df.filter(regex='avg$').columns) if types is None: types = ['substract', 'multiply'] for i1, col1 in enumerate(features_list): print('Processing column {}'.format(col1)) for i2, col2 in enumerate(features_list): if 'substract' in types: self.transformed_df['%s_%s_1' % (col1, col2)] = self.transformed_df[col1] - \ self.transformed_df[col2] if 'add' in types: self.transformed_df['%s_%s_2' % (col1, col2)] = self.transformed_df[col1] + \ self.transformed_df[col2] if 'divide' in types: self.transformed_df['%s_%s_3' % (col1, col2)] = self.transformed_df[col1] / \ (self.transformed_df[col2]+0.01) if 'multiply' in types: self.transformed_df['%s_%s_4' % (col1, col2)] = self.transformed_df[col1] * \ self.transformed_df[col2] print('Done') def scale_df(self, scaled_columns=None): if scaled_columns is None: scaled_columns = [x for x in self.transformed_df.columns if x.endswith('pctile')] columns_for_scale = set(self.transformed_df.iloc[:, 5:].columns) - set(scaled_columns) self.scaler = MinMaxScaler() df_for_scale = copy.copy(self.transformed_df) df_for_scale.loc[:, columns_for_scale] = self.scaler.fit_transform(df_for_scale.loc[:, columns_for_scale]) self.transformed_scaled_df = df_for_scale def apply_pca_to_scaled_df(self, n_components=0.99, svd_solver='full'): self.pca = PCA(n_components=n_components, svd_solver=svd_solver, random_state=17) self.pca.fit(self.transformed_scaled_df.iloc[:, 5:]) pca_arr = self.pca.transform(self.transformed_scaled_df.iloc[:, 5:]) columns_pca = ['pca_' + str(x) for x in range(1, pca_arr.shape[1]+1)] pca_df = pd.DataFrame(pca_arr, columns=columns_pca) self.transformed_scaled_df = pd.concat([self.transformed_scaled_df.iloc[:, :5], pca_df], axis=1)
idmax = int(0.8 * ndat) a = mfrw.fitrw([dat[:idmax, 0]], [dat[:idmax, 1]], [sig[:idmax]], floin=1. / 200, fhiin=2.0, ploton=1, dtresin=-1, nits=1, tplotlims=[-10.0, 120.0, 0.1]) # load the dataset else: dataframe = read_csv(tit_input, usecols=[1], engine='python', skipfooter=3) dataset = dataframe.values dataset = dataset.astype('float32') # normalize the dataset scaler = MinMaxScaler(feature_range=(0.0, 1.0)) dataset = scaler.fit_transform(dataset) # split into train and test sets train_size = idtrain_end #int(len(dataset) * 0.09) test_size = len(dataset) - train_size train, test = dataset[0:train_size, :], dataset[train_size:len(dataset), :] # reshape into X=t and Y=t+1 trainX, trainY = create_dataset(train, look_back) testX, testY = create_dataset(test, look_back) # reshape input to be [samples, time steps, features] trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1])) testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
x = tf.placeholder(tf.float32, shape=[None, 10])
y = tf.placeholder(tf.float32, shape=[None, 1])

# practice exercise
dataset = load_diabetes()
x_data = dataset.data
y_data = dataset.target.reshape(-1, 1)
print(x_data.shape)
print(y_data.shape)

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.1, random_state=42)

scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

w = tf.Variable(tf.random_normal([10, 1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')

hypothesis = tf.matmul(x, w) + b
cost = tf.reduce_mean(tf.square(hypothesis - y))
train = tf.train.GradientDescentOptimizer(learning_rate=0.3475).minimize(cost)

with tf.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
from keras.models import Sequential
from keras.utils import np_utils
from keras.layers.core import Dense, Activation, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from keras.utils.vis_utils import plot_model
import csv
import pandas as pd
import numpy as np

sc = MinMaxScaler()

# Read data
data = pd.read_csv('./desharnais.csv')
Y = data.iloc[:, 6].values
data = data.drop('YearEnd', axis=1)
data = data.drop('Effort', axis=1)
X = data.iloc[:, 2:]
Y = Y.reshape(-1, 1)

# Note: the second fit_transform refits `sc`, so afterwards `sc` holds Y's
# min/max (the X range is discarded); use separate scalers if X must be inverted later.
X_normalised = sc.fit_transform(X)
Y_normalised = sc.fit_transform(Y)

total_length = len(data)
train_length = int(0.8 * total_length)
test_length = int(0.2 * total_length)

X_train = X_normalised[:train_length]
X_test = X_normalised[train_length:]
Y_train = Y_normalised[:train_length]
Y_test = Y_normalised[train_length:]
def normalize(dataset):
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(dataset)
    return scaled_data
df = web.DataReader('AAPL', data_source='yahoo', start='2012-01-01', end='2020-08-19') ## pulling data plt.figure(figsize=(16,8)) plt.plot(df['Close']) plt.title('Close Price of MSFT') plt.xlabel('Date', fontsize=18) plt.ylabel('Close Price', fontsize=18) # plt.show() ## process data data = df.filter(['Close']) dataset = data.values trainDataLength = math.ceil(len(dataset) * 0.8) ## scale data scaler = MinMaxScaler(feature_range=(0,1)) scaled_data = scaler.fit_transform(dataset) trainData = scaled_data[0:trainDataLength , :] ## separate into xtrain and ytrain xtrain = [] ytrain = [] for i in range(60, len(trainData)): # past 60 days xtrain.append(trainData[i-60:i,0]) ytrain.append(trainData[i,0]) ## convert to numpy arrays xtrain, ytrain = np.array(xtrain), np.array(ytrain) ## reshape data to 3D for LSTM model xtrain = np.reshape(xtrain, (xtrain.shape[0], xtrain.shape[1], 1))
This CV training loop standardizes X and standardizes Y every iteration for each CV fold.
"""
fold_id = str(i + 1)
print('fold: ', fold_id)
cv_train, cv_test = training.iloc[cv_train_idx[i], :].copy(), training.iloc[cv_test_idx[i], :].copy()

# below: X standardization (fit on the CV training fold, apply to both folds)
cv_train_scaler_X = StandardScaler()
cv_train[cv_train.columns[~cv_train.columns.isin(['subject', 'PCL', 'group'])]] = cv_train_scaler_X.fit_transform(
    cv_train[cv_train.columns[~cv_train.columns.isin(['subject', 'PCL', 'group'])]])
cv_test[cv_test.columns[~cv_test.columns.isin(['subject', 'PCL', 'group'])]] = cv_train_scaler_X.transform(
    cv_test[cv_test.columns[~cv_test.columns.isin(['subject', 'PCL', 'group'])]])

# below: Y min-max scaling (fit on the CV training fold only, then transform the test fold)
cv_train_scaler_Y = MinMaxScaler(feature_range=(0, 1))
cv_train[cv_train.columns[cv_train.columns.isin(['PCL'])]] = cv_train_scaler_Y.fit_transform(
    cv_train[cv_train.columns[cv_train.columns.isin(['PCL'])]])
cv_test[cv_test.columns[cv_test.columns.isin(['PCL'])]] = cv_train_scaler_Y.transform(
    cv_test[cv_test.columns[cv_test.columns.isin(['PCL'])]])

# transform into numpy arrays
cv_train_X, cv_train_Y = longitudinal_cv_xy_array(input=cv_train, Y_colnames=['PCL'],
                                                  remove_colnames=['subject', 'group'],
                                                  n_features=n_features)
cv_test_X, cv_test_Y = longitudinal_cv_xy_array(input=cv_test, Y_colnames=['PCL'],
                                                remove_colnames=['subject', 'group'],
                                                n_features=n_features)

# train
cv_m, cv_m_history, cv_pred, cv_m_test_rmse, cv_m_test_rsq = lstm_cv_train(trainX=cv_train_X, trainY=cv_train_Y,
                                                                           testX=cv_test_X, testY=cv_test_Y,
                                                                           lstm_model='stacked',
def write_submission(preds, output):
    sample = pd.read_csv('sampleSubmission.csv')
    preds = pd.DataFrame(preds, index=sample.id.values, columns=sample.columns[1:])
    preds.to_csv(output, index_label='id')


def load_test():
    test = pd.read_csv('test.csv')
    test = test.drop('id', axis=1)
    return test.values


X, y = load_train_data()

scaler = MinMaxScaler()  # Tested with and without
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
X_FinalTest = load_test()

model = Sequential()
# Input layer
model.add(Dense(93, input_dim=93, kernel_initializer='normal', activation='relu'))  # Tested with Neurons: 100, 93; PRELU activation, RELU activation
model.add(Dropout(0.13))  # Tested with: 0.0, 0.1, 0.13
model.add(BatchNormalization())  # with and without
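Note that the snippet above fits the scaler on all of X before splitting; a hedged sketch of the leakage-free ordering (split first, then fit the scaler on the training portion only), reusing the same names:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)      # reuse the train-fitted min/max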
employee_df['BusinessTravel'].unique()

X_numerical = employee_df[['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction',
                           'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
                           'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating',
                           'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
                           'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
                           'YearsSinceLastPromotion', 'YearsWithCurrManager']]

X_all = pd.concat([X_cat, X_numerical], axis=1)

# DATA NORMALIZATION so that no feature is treated as more important than another
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X_all)  # predictor attributes
y = employee_df['Attrition']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)  # 25% of the data for testing, the rest for the algorithm to learn from

# X_train holds the data the model learns the prediction from
# y_train holds the corresponding target values the model learns to predict
X_train.shape, y_train

# X_test -> predictor attributes
# y_test -> class (target)
X_test.shape, y_test
# EmployeeNumber is a unique identifier, so drop it (along with constant columns)
df_train.drop(['Over18', 'StandardHours', 'EmployeeNumber'], axis=1, inplace=True)
df_test.drop(['Over18', 'StandardHours', 'EmployeeNumber'], axis=1, inplace=True)

# target variable
target_var = 'Attrition'

# continuous variables
continuous_var = [
    'Age', 'MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'
]

scaler = MinMaxScaler()
df_train[continuous_var] = scaler.fit_transform(np.log(df_train[continuous_var] + 0.001))

pca_result = pca.fit_transform(df_train[continuous_var])
# print(pca.components_)
# print(pca_result)

label = df_train[target_var].map({0: 'red', 1: 'green'})
first_pc = pca.components_[0]
second_pc = pca.components_[1]

fig = plt.figure()
# plot each sample projected along the first two principal component directions
for ii, jj in pca_result:
    plt.scatter(first_pc[0] * ii, first_pc[1] * ii, c='r')
    plt.scatter(second_pc[0] * jj, second_pc[1] * jj, c='c')
modelName = "naiveLstm" resultDicmainStr = dataSrc + "_" + modelName + "_resultDicmain_4Time3InOut100ep_200_100" baseAddress = "lstm_multi_stacked\\" dataFileBir = 'finalBirminghamDataArrDF.csv' resultGraphBase = "\\multiLstm\\multiExpResul\\" # load the new file dataset = read_csv(dataFileBir, header=0, infer_datetime_format=True, parse_dates=['date'], index_col=0).fillna(0) values = dataset.iloc[:, :-18].values print("values.shape: ", values.shape) scaler = MinMaxScaler(feature_range=(0, 1)) scaledValues = scaler.fit_transform(values) print(scaledValues.shape) # div factor is week n_train = 1187 + 1 #n_train+8064 trainFrom, trainTo, testFrom, testTo, divFactor = 0, n_train, n_train, len( values), 1 train, test = split_dataset2(values, trainFrom, trainTo, testFrom, testTo, divFactor) print("train.shape: ", train.shape) print("test.shape: ", test.shape) #def RunGetResult(dataSrc,expNum,inN,outN,train,test): for expNum in expNumRange:
nt = int(T / dt) # data = y1 and y2; target = slope = (y^(n+1)-y^n)/dt # y1(t) = cos(wt), y2(t) = -w*sin(wt) data = [[np.cos(omega * dt * i), -omega * np.sin(omega * dt * i)] for i in range(nt)] data = np.array(data, dtype=np.float64) data = data target = [[(data[i, 0] - data[i - 1, 0]) / dt, (data[i, 1] - data[i - 1, 1]) / dt] for i in range(1, nt)] target = np.array(target, dtype=np.float64) target = target # normalization scaler = MinMaxScaler(feature_range=(0, 1)) #data = scaler.fit_transform(data) #target = scaler.fit_transform(target) # data for training # data_train = data[1:100].reshape(99,1,2) # target_train = target.reshape(99,1,2) data_train = data[1:100].reshape(99, 1, 2) target_train = target #.reshape(1,99,2) model = Sequential() model.add(LSTM(4, input_shape=(1, 2), activation='tanh')) model.add(Dense(2)) # compile the model
def __init__(self, limits):
    self._fit_inner = False
    self.limits = np.array(limits)
    self.cdf = norm(0, 1).cdf
    self.icdf = norm(0, 1).ppf
    self.scaler = MinMaxScaler()
data.loc[data["Embarked"] == "S", "Embarked"] = 0
data.loc[data["Embarked"] == "C", "Embarked"] = 1
data.loc[data["Embarked"] == "Q", "Embarked"] = 2
data["Embarked"] = data["Embarked"].fillna(3)
# data.loc[data["Embarked"]==None,"Embarked"]=3
print(data["Embarked"].describe())
print(data["Embarked"].unique())
print(data["Embarked"].value_counts())

print("-------------- additional features ---------------")
print(data["Ticket"].describe())
print(data["Ticket"].unique())
print(data["Ticket"].value_counts())

print("-------------- Fare normalization ---------------")
data_scaler = MinMaxScaler(feature_range=(0, 1))
data_Fare = np.array(data["Fare"].values)
# MinMaxScaler expects a 2D array, so reshape the single column to (n, 1)
data_rescaledX = data_scaler.fit_transform(data_Fare.reshape(-1, 1))
data["Fare_scaler"] = data_rescaledX

print("-------------- Name feature processing ---------------")
data["NameLength"] = data["Name"].apply(lambda x: len(x))


def getTitle(name):
def fit_modele(self, config, final=False): """ Fait un fit rapide (100 itérations par défaut) d'un LSTM selon les paramètres contenus dans config (h, n, f, d) """ resultat = None key = str(config) iter = 500 if final else 100 # entraînement final du modèle retenu nbre_couches = config.get("nbre_couches") taille = config.get("taille_entree") nbre_neurones = config.get("nbre_neurones") activation = config.get("activation") dropout = config.get("dropout") nbre_retards = np.count_nonzero( np.isnan(self.serie.data['Série stationnarisée'])) # MinMaxScaler donnees_brutes = self.serie.data['Série stationnarisée'][0:self.serie.index_fin_entrainement].dropna( ).values scaler = MinMaxScaler(feature_range=(-1, 1)) scaler = scaler.fit(np.array(donnees_brutes).reshape(-1, 1)) serie_reduite = scaler.transform(np.array( self.serie.data['Série stationnarisée'].dropna().values).reshape(-1, 1)) a = np.empty((1, nbre_retards)) a[:] = np.nan serie_reduite = np.concatenate((a, np.array(serie_reduite)), axis=0) self.serie.data['Série stationnarisée réduite'] = serie_reduite X_train, y_train = decouper_serie_apprentissage_supervise( self.serie.data['Série stationnarisée réduite'][0:self.serie.index_fin_entrainement].dropna().values, taille) X_test, y_test = decouper_serie_apprentissage_supervise( self.serie.data['Série stationnarisée réduite'][self.serie.index_fin_entrainement:self.serie.index_fin_test].dropna().values, taille) n_features = 1 # une variable explicative X_train = X_train.reshape( (X_train.shape[0], X_train.shape[1], n_features)) X_test = X_test.reshape( (X_test.shape[0], X_test.shape[1], n_features)) # Création du réseau de neurones model = Sequential() # couche d'entrée for i in range(0, nbre_couches): model.add(kLSTM(nbre_neurones, activation=activation, return_sequences=True, input_shape=(taille, n_features))) model.add(Dropout(dropout)) # ajout d'un dropout # dernière couche (pas de retour) model.add(kLSTM(nbre_neurones, activation=activation)) model.add(Dropout(dropout)) # ajout d'un dropout # couche de sortie (1 dimension) model.add(Dense(1)) methode_optimisation = optimizers.Nadam() model.compile(optimizer=methode_optimisation, loss='mse') # Critère d'arret prématuré, aucune amélioration sur le jeu de test # pendant plus de 20 itérations critere_stop = EarlyStopping( monitor='val_loss', min_delta=0, patience=20) # Fit du modèle historique = model.fit( X_train, y_train, validation_data=(X_test, y_test), epochs=iter, verbose=final, callbacks=[critere_stop], shuffle=False) if final: # sauvegarde de l'historique d'entrainement si modèle final self.historique = historique # Prédiction sur jeu de test + validation serie_predite = [] serie_predite_temp = [] # stock les prédictions réduites serie_predite_dynamique = [] # walk-forward validation (one step ahead) for i in range(0, len(self.serie.data['Test'].dropna())+len(self.serie.data['Validation'].dropna())): x_input = self.serie.data['Série stationnarisée réduite'][self.serie.index_fin_entrainement - taille+i:self.serie.index_fin_entrainement+i].values x_input = x_input.reshape((1, taille, n_features)) yhat = model.predict(x_input, verbose=0)[0][0] serie_predite_temp.append(yhat) # inversion de la mise à l'échelle padding = np.zeros(taille-1).reshape(1, taille - 1) yhat = np.append(padding, [yhat]).reshape(1, -1) yhat = scaler.inverse_transform(yhat) yhat = yhat[0][-1] # déstationnarisation yhat = yhat + \ self.serie.data['Série'][self.serie.index_fin_entrainement+i-nbre_retards] serie_predite.append(yhat) # prévision dynamique if final: 
anciennes_predictions = [] for i in range(0, len(self.serie.data['Test'].dropna())+len(self.serie.data['Validation'].dropna())): decoupe = -taille + i if decoupe < 0: x_input_dynamique = np.append( self.serie.data['Série stationnarisée réduite'][decoupe:].values, anciennes_predictions) else: x_input_dynamique = np.array(anciennes_predictions)[-taille:] x_input_dynamique = x_input_dynamique.reshape( (1, taille, n_features)) yhat_dynamique = model.predict( x_input_dynamique, verbose=0)[0][0] anciennes_predictions.append(yhat_dynamique) # inversion de la mise à l'échelle padding = np.zeros(taille-1).reshape(1, taille - 1) yhat_dynamique = np.append( padding, [yhat_dynamique]).reshape(1, -1) yhat_dynamique = scaler.inverse_transform(yhat_dynamique) yhat_dynamique = yhat_dynamique[0][-1] # déstationnarisation yhat_dynamique = yhat_dynamique + \ self.serie.data['Série'][self.serie.index_fin_entrainement+i-nbre_retards] serie_predite_dynamique.append(yhat_dynamique) # ajout d'un padding avec des nan a = np.empty((1, len(self.serie.data['Entraînement'].dropna()))) a[:] = np.nan serie_predite = np.concatenate((a[0], np.array(serie_predite)), axis=0) # calcul du MSE uniquement sur le jeu de test resultat = mean_squared_error( serie_predite[self.serie.index_fin_entrainement:self.serie.index_fin_test], self.serie.data['Test'].dropna()) if final: self.serie.data[self.__class__.__name__] = serie_predite # ajout d'un padding avec des nan a = np.empty((1, len(self.serie.data['Entraînement'].dropna()))) a[:] = np.nan serie_predite_dynamique = np.concatenate( (a[0], np.array(serie_predite_dynamique)), axis=0) self.serie.data[self.__class__.__name__ + "_dynamique"] = serie_predite_dynamique self.modele = model print("Fit du modèle " + key + " : " + str(resultat)) return (key, resultat)
# Separate the variables for training

# In[279]:

training_vars = [var for var in X_train.columns if var not in ['PassengerId', 'Survived']]
training_vars

# In[280]:

# fit scaler
scaler = MinMaxScaler()  # create an instance
scaler.fit(X_train[training_vars])  # fit the scaler to the train set and then transform it

# ### Phase #4 Modeling & #5 Evaluation
# ##### Machine Learning algorithm building
# #### xgboost

# In[281]:

xgb_model = xgb.XGBClassifier()
eval_set = [(X_test[training_vars], y_test)]
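A hedged alternative to fitting the scaler by hand: bundle it with the classifier in a Pipeline so each cross-validation fold refits the scaler on its own training split (this assumes `y_train` exists alongside `X_train`, as elsewhere in the notebook):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score

pipe = Pipeline([
    ('scale', MinMaxScaler()),
    ('clf', xgb.XGBClassifier()),
])
scores = cross_val_score(pipe, X_train[training_vars], y_train, cv=5)
print(scores.mean())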
#=============================================================================# ############################# import functions ################################ #=============================================================================# import numpy as np import os import pandas as pd from scipy import signal from scipy.signal import resample from sklearn.preprocessing import MinMaxScaler scaler = MinMaxScaler() from Utils.utils_aug_ft import augment_train_set_ft, segment_signal from Utils.FeatureExtraction import ExtraFeatures #=============================================================================# ########################## read data: simulation ############################## #=============================================================================# ## 20 mm wheel flat simulation data FolderPath = os.path.abspath( os.path.join(os.getcwd(), "./Data/Augmentation/input_data/")) sim_normal = pd.read_csv(FolderPath + '/sim_norm_20mm.txt', engine='python') # can also be 30mm or 50mm WF sim_WF = pd.read_csv(FolderPath + '/sim_WF_20mm.txt', engine='python') # time shifting for more data samples FolderPath1 = ['v70'] # choose the speed from dataframe FolderPath2 = ['normal', 'WF'] FolderPath3 = ['Good', 'Bad'] for i in range(len(FolderPath1)): for j in range(len(FolderPath2)): a_varname = 'sim_' + FolderPath2[j] + '_' + FolderPath1[i] df_varname = 'sim_' + FolderPath2[j] globals()[a_varname] = segment_signal(globals()[df_varname],
nosym=True) c = FixAtoms(indices=[atom.index for atom in atoms if atom.position[2]<8]) atoms.set_constraint(c) identified_images = Trajectory('identified_images.traj','a', properties = ['energy','forces']) traj_md = Trajectory('md.traj','a', properties=['energy','forces']) path = ['/work/common/hxin_lab/jiamin/non_adiabatic/Langevin/Training_3nd/sym70_2L70/newriver/amp-checkpoint.amp', '/work/common/hxin_lab/jiamin/non_adiabatic/Langevin/Training_3nd/sym70_2L65/newriver/amp-checkpoint.amp', '/work/common/hxin_lab/jiamin/non_adiabatic/Langevin/Training_3nd/sym70_2L60/newriver/amp-checkpoint.amp', ] fingerprints = np.loadtxt('/work/common/hxin_lab/jiamin/non_adiabatic/Langevin/Training_3nd/trajs/fingerprints/total_fingerp.txt') scaler_fp = MinMaxScaler(feature_range=(-1,1), copy=True) scaler_fp.fit(fingerprints) scaled_fp = scaler_fp.transform(fingerprints) atoms_chg = io.read('/work/common/hxin_lab/jiamin/non_adiabatic/Langevin/Training_3nd/trajs/fingerprints/identified.traj',index=':') chg = np.zeros(len(atoms_chg)*2) i = 0 for atom in atoms_chg: for index in range(12,14): chg[i]=atom.get_charges()[index] i += 1 scaler_chg = MinMaxScaler(feature_range=(-1,1), copy=True) scaler_chg.fit(chg.reshape(-1,1)) scaled_chg = scaler_chg.transform(chg.reshape(-1,1)) X_train, X_test, Y_train, Y_test = train_test_split(scaled_fp[:], scaled_chg[:], test_size = 0.2, random_state = None)
def create_segments(df, cluster_num=5, do_minmax=False, do_pca=True, generate_report=True): ''' Creates cluster segments and generates report Input: df: the final dataset that is ready for clustering (e.g. starbucks_imputed) cluster_num: number of clusters to be created do_minmax (bool): flag condition to use MinMax Scaler if True, otherwise use Standard Scaler do_pca (bool): flag condition to perfom PCA if True generate_report (bool): whether to generate report (i.e. plot segments interpreter, customer segments by size, metrics) ''' #one-hot encoding starbucks_ohe = df.copy() starbucks_ohe.drop('person', axis=1, inplace=True) categorical_col = starbucks_ohe.columns[(starbucks_ohe.dtypes == 'category') | (starbucks_ohe.dtypes == 'object')] starbucks_ohe = pd.get_dummies(starbucks_ohe, columns=categorical_col) # feature scaling if do_minmax == False: scaler = StandardScaler().fit(starbucks_ohe) else: scaler = MinMaxScaler().fit(starbucks_ohe) starbucks_scaled = scaler.transform(starbucks_ohe) # PCA if do_pca == True: pca = PCA() X_pca = pca.fit_transform(starbucks_scaled) cum_expl_var_ratio = np.cumsum(pca.explained_variance_ratio_) #choose number of components that explain ~80% of variance components_num = len(cum_expl_var_ratio[cum_expl_var_ratio <= 0.805]) print(f"number of pca components that explain 80%: {components_num}") pca = PCA(components_num).fit(starbucks_scaled) starbucks_pca = pca.transform(starbucks_scaled) # clustering clusterer = KMeans(n_clusters=cluster_num, n_init=10, init='k-means++').fit(starbucks_pca) starbucks_preds = clusterer.predict(starbucks_pca) print(f"silhouette_score for {cluster_num} clusters: {metrics.silhouette_score(starbucks_pca, clusterer.labels_, metric='euclidean'):.3f}") print(82 * '_') plot_elbow_curve(starbucks_pca) else: pca = None starbucks_pca = None # clustering clusterer = KMeans(n_clusters=cluster_num, n_init=10, init='k-means++').fit(starbucks_scaled) starbucks_preds = clusterer.predict(starbucks_scaled) print("silhouette_score:", metrics.silhouette_score(starbucks_scaled, clusterer.labels_, metric='euclidean')) print(82 * '_') plot_elbow_curve(starbucks_scaled) # assign customer segments to data starbucks_predicted = df.copy() starbucks_predicted['segments'] = starbucks_preds #generate report if generate_report == True: cluster_df = create_cluster_df(pca, clusterer, scaler, cluster_num, starbucks_ohe, do_minmax, do_pca) plot_segments_interpreter(cluster_df) plot_customer_segments(starbucks_predicted, cluster_num) plot_metrics(starbucks_predicted) #return starbucks_ohe, scaler, starbucks_scaled, pca, starbucks_pca, clusterer, starbucks_preds return starbucks_predicted
'/Users/yaofeifan/Documents/Tsinghua/Lesson/模式识别/Project2/data/北京空气质量/2019/All.csv', header=0, index_col=0) dataset2020 = read_csv( '/Users/yaofeifan/Documents/Tsinghua/Lesson/模式识别/Project2/data/北京空气质量/2020/All.csv', header=0, index_col=0) dataset = pd.concat([dataset2019], axis=0) dataset = dataset.drop(columns=["hour", "AQI", "PM10", 'grade']) order = ['PM2.5', 'CO', 'NO2', 'O3', 'SO2'] dataset = dataset[order] values = dataset.values values = values.astype('float32') # normalize features scaler = MinMaxScaler(feature_range=(0, 1)) scaled = scaler.fit_transform(values) reframed = series_to_supervised(scaled, 1, 1) reframed.drop(reframed.columns[[6, 7, 8, 9]], axis=1, inplace=True) thetrain = reframed print(thetrain.head()) # load test dataset dataset2 = read_csv( '/Users/yaofeifan/Documents/Tsinghua/Lesson/模式识别/Project2/data/北京空气质量/2014/All.csv', header=0, index_col=0) dataset2 = dataset2.drop(columns=["hour", "AQI", "PM10", 'grade']) order = ['PM2.5', 'CO', 'NO2', 'O3', 'SO2'] dataset2 = dataset2[order] values2 = dataset2.values
# -------------- from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler import pandas as pd # Code starts here df=pd.read_csv(path) #print(df.head()) print(df.attr1089.value_counts()) X=df.iloc[:,0:len(df.columns)-1] y=df.iloc[:,-1] X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=4) scaler=MinMaxScaler() scaler.fit(X_train) X_train=scaler.transform(X_train) X_test=scaler.transform(X_test) # Code ends here # -------------- from sklearn.metrics import classification_report from sklearn.linear_model import LogisticRegression from sklearn.metrics import roc_auc_score lr=LogisticRegression() lr.fit(X_train,y_train) y_pred=lr.predict(X_test) roc_score=roc_auc_score(y_test,y_pred) print(roc_score) # --------------
# Recurrent Neural Network # Part 1 - Data Preprocessing # Importing the libraries import numpy as np import matplotlib.pyplot as plt import pandas as pd # Importing the training set dataset_train = pd.read_csv('trainingset.csv') training_set = dataset_train.iloc[:, 1:2].values # Feature Scaling from sklearn.preprocessing import MinMaxScaler sc = MinMaxScaler(feature_range=(0, 1)) training_set_scaled = sc.fit_transform(training_set) # Creating a data structure with 60 timesteps and 1 output X_train = [] y_train = [] for i in range(60, 639): X_train.append(training_set_scaled[i - 60:i, 0]) y_train.append(training_set_scaled[i, 0]) X_train, y_train = np.array(X_train), np.array(y_train) # Reshaping X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1)) # Part 2 - Building the RNN
dataX.append(a) dataY.append(dataset[i + look_back, 0]) return np.array(dataX), np.array(dataY) # fix random seed for reproducibility np.random.seed(5) # load the dataset df = read_csv(input_file, header=None, index_col=None, delimiter=',') # take close price column[5] all_y = df[5].values dataset=all_y.reshape(-1, 1) # normalize the dataset scaler = MinMaxScaler(feature_range=(0, 1)) dataset = scaler.fit_transform(dataset) # split into train and test sets, 50% test data, 50% training data train_size = int(len(dataset) * 0.5) test_size = len(dataset) - train_size train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:] # reshape into X=t and Y=t+1, timestep 240 look_back = 240 trainX, trainY = create_dataset(train, look_back) testX, testY = create_dataset(test, look_back) # reshape input to be [samples, time steps, features] trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1])) testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
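A hedged follow-up (not in the original excerpt): predictions from a network trained on the scaled series must be inverse-transformed before computing errors in price units; `model` stands in for the fitted LSTM.

trainPredict = model.predict(trainX)                      # still in the [0, 1] scale
trainPredict = scaler.inverse_transform(trainPredict)     # back to price units
trainY_orig = scaler.inverse_transform(trainY.reshape(-1, 1))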
def preprocess_prediciton(iq): Actives = ['EURUSD','GBPUSD','EURJPY','AUDUSD'] active = 'EURUSD' main = pd.DataFrame() current = pd.DataFrame() for active in Actives: if active == 'EURUSD': main = fast_data(iq,active).drop(columns = {'from','to'}) else: current = fast_data(iq,active) current = current.drop(columns = {'from','to','open','min','max'}) current.columns = [f'close_{active}',f'volume_{active}'] main = main.join(current) df = main """ graphical analysis components """ df.isnull().sum().sum() # there are no nans df.fillna(method="ffill", inplace=True) df = df.loc[~df.index.duplicated(keep = 'first')] df['MA_20'] = df['close'].rolling(window = 20).mean() df['MA_50'] = df['close'].rolling(window = 50).mean() df['L14'] = df['min'].rolling(window=14).min() df['H14'] = df['max'].rolling(window=14).max() df['%K'] = 100*((df['close'] - df['L14']) / (df['H14'] - df['L14']) ) df['%D'] = df['%K'].rolling(window=3).mean() df['EMA_20'] = df['close'].ewm(span = 20, adjust = False).mean() df['EMA_50'] = df['close'].ewm(span = 50, adjust = False).mean() rsi_period = 14 chg = df['close'].diff(1) gain = chg.mask(chg<0,0) df['gain'] = gain loss = chg.mask(chg>0,0) df['loss'] = loss avg_gain = gain.ewm(com = rsi_period - 1, min_periods = rsi_period).mean() avg_loss = loss.ewm(com = rsi_period - 1, min_periods = rsi_period).mean() df['avg_gain'] = avg_gain df['avg_loss'] = avg_loss rs = abs(avg_gain/avg_loss) df['rsi'] = 100-(100/(1+rs)) """ Finishing preprocessing """ df = df.drop(columns = {'open','min','max','avg_gain','avg_loss','L14','H14','gain','loss'}) df = df.dropna() df = df.fillna(method="ffill") df = df.dropna() df.sort_index(inplace = True) scaler = MinMaxScaler() indexes = df.index df_scaled = scaler.fit_transform(df) pred = pd.DataFrame(df_scaled,index = indexes) sequential_data = [] prev_days = deque(maxlen = SEQ_LEN) for i in pred.iloc[len(pred) -SEQ_LEN :len(pred) , :].values: prev_days.append([n for n in i[:]]) if len(prev_days) == SEQ_LEN: sequential_data.append([np.array(prev_days)]) X = [] for seq in sequential_data: X.append(seq) return np.array(X)
del train['ID'] #%% # Create independent and dependent variable y = train.iloc[:, -1] x = train.iloc[:, 0:-1] #%% import matplotlib.pyplot as plt import collections cy = collections.Counter(y) plt.bar(cy.keys(), cy.values()) plt.show() #%% y = pd.get_dummies(y).values #%% # Preprocessing: Minmax scaling x_scaling = MinMaxScaler().fit_transform(x) # By scaling operation score increase from 0.49045 to score=0.52721 #%% x_train, x_test, y_train, y_test = train_test_split(x_scaling, y, test_size=0.3, random_state=10) #%% model = XGBClassifier() model.fit(x_train, y_train) # score=0.52721 #%% prediction = model.predict(x_test) #%% score = f1_score(y_test, prediction, average='weighted') #score=0.52721
class NumericColumn(BaseEstimator, TransformerMixin):
    '''
    Take a numeric value column and standardize it.
    '''

    def __init__(self):
        '''
        Set up the internal transformation.
        '''
        self._transformer = MinMaxScaler()

    def fit(self, X, y=None):
        '''
        Fit the standardization.
        '''
        zeroed = pd.DataFrame(np.array(X).reshape(-1, 1)).fillna(0)
        self._transformer.fit(zeroed)
        return self

    def transform(self, X):
        '''
        Transform a column of data into numerical percentage values.

        Parameters
        ----------
        X : pandas series or numpy array
        '''
        zeroed = pd.DataFrame(np.array(X).reshape(-1, 1)).fillna(0)
        return self._transformer.transform(zeroed).astype(np.float32)
def predict_new(self, input):
    model = self.train_model()
    assert len(input) == 5 and type(input) == list
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(self.data)
    inp = scaler.transform([input])
    print(scaler.inverse_transform(model.predict(numpy.array(inp).reshape(1, 1, 5))))
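predict_new refits the scaler on self.data at every call; a hedged sketch of fitting it once and persisting it with joblib (the file name, `training_data`, and `new_input` are illustrative stand-ins):

from joblib import dump, load
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(training_data)   # training_data stands in for self.data above
dump(scaler, 'scaler.pkl')  # illustrative filename

# later, at prediction time, reload instead of refitting:
scaler = load('scaler.pkl')
inp = scaler.transform([new_input])  # new_input: a 5-element feature list, as above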
def loaddataset(self, path, module):
    df = pd.read_csv(path)
    subdf = df[['PassengerId', 'Pclass', 'Sex', 'Age', 'Embarked', 'Fare', 'SibSp', 'Parch']]
    SibSp = subdf['SibSp']
    Parch = subdf['Parch']
    # fill missing Age and Fare with the column means
    Age = subdf['Age'].fillna(value=subdf.Age.mean())
    Fare = subdf['Fare'].fillna(value=subdf.Fare.mean())
    dummies_Sex = pd.get_dummies(subdf['Sex'], prefix='Sex')
    dummies_Embarked = pd.get_dummies(subdf['Embarked'], prefix='Embarked')
    dummies_Pclass = pd.get_dummies(subdf['Pclass'], prefix='Pclass')
    PassengerId = subdf['PassengerId']
    # scale Age & Fare to [0, 1]; reshape to (n, 1) because the scaler expects 2D input
    scaler = MinMaxScaler()
    age_scaled = scaler.fit_transform(Age.values.reshape(-1, 1))
    fare_scaled = scaler.fit_transform(Fare.values.reshape(-1, 1))
    Age_Scaled = pd.DataFrame(age_scaled, columns=['Age_Scaled'])
    Fare_Scaled = pd.DataFrame(fare_scaled, columns=['Fare_Scaled'])
    if module == 'train':
        self.trainlabel = df.Survived
        self.trainset = pd.concat([dummies_Pclass, dummies_Sex, dummies_Embarked, Age_Scaled, Fare_Scaled, SibSp, Parch], axis=1)
    elif module == 'test':
        self.testset = pd.concat([PassengerId, dummies_Pclass, dummies_Sex, dummies_Embarked, Age_Scaled, Fare_Scaled, SibSp, Parch], axis=1)
def cluster(final_data_dict, cluster_range, list_or_dict):
    final_data_list = clustering_module.convert_to_list(final_data_dict)
    respondent_IDs = np.array([int(k) for k in final_data_dict.keys()])
    feature_names = list(next(iter(final_data_dict.values())).keys())
    final_data_list_imputed = clustering_module.preprocess(final_data_list)

    Scaler = MinMaxScaler()
    final_data_list_scaled = Scaler.fit_transform(final_data_list_imputed)

    # Transformed is distance of each respondent from each cluster center
    # Predicted is the cluster membership of each respondent
    merging_list = clustering_module.convert_to_list(final_data_dict, remove_NaN=0)
    data = list(merging_list)
    ignore_set_added = set(['ids'])
    for num_clusters in cluster_range:
        transformed, predicted, score = clustering_module.clustering(final_data_list_scaled, num_clusters)
        cluster_name = "%s_clusters" % num_clusters
        ignore_set_added.add(cluster_name)
        data, feature_names = clustering_module.add_new_data_to_rows(predicted, data, feature_names, [cluster_name])

    data, feature_names = clustering_module.add_new_data_to_rows(respondent_IDs, data, feature_names, ["ids"], "before")

    if list_or_dict == "dict":
        temp = dictionary_conversion.create_dictionary(data, feature_names)
        num_converted = dictionary_conversion.convert_values_to_int(temp)
        # Set of features that should be different due to being categorical
        ignore_set_changed = set(['busgrn', 'peopgrn', 'sex', 'race', 'topprob1', 'topprob2'])
        verdict = compare_respondent_dicts(respondent_IDs, num_converted, final_data_dict, ignore_set_changed, ignore_set_added)
        return num_converted, verdict
    elif list_or_dict == "list":
        return data, feature_names
def rank_to_dict(ranks, names, order=1, ratio=1):
    minmax = MinMaxScaler()
    ranks = minmax.fit_transform(order * np.array([ranks]).T).T[0]
    if np.mean(ranks) == 0:
        ranks += 1
    ranks = [round(x, 2) for x in ranks]
    return dict(zip(names, ranks))
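A small illustrative call (the scores and names are made up): rank_to_dict squeezes a score vector into [0, 1], rounds to two decimals, and keys the result by feature name.

names = ['age', 'income', 'tenure']
scores = np.array([0.2, 1.7, 0.9])
print(rank_to_dict(scores, names))   # values: 0.0, 1.0, 0.47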
def vary_border(pred_true,y,num_iter=101): mms = MinMaxScaler() pred=pred_true.copy() pred=mms.fit_transform(pred) best_score = 0 for k1 in range(num_iter): c1 = k1/(num_iter-1) for k2 in range(num_iter): c2 = k2/(num_iter-1) for k3 in range(num_iter): c3 = k3/(num_iter-1) if c1 < c2 and c1 < c3 and c2 < c3 and c1 > 0.25 and c1 < 0.5 and c3 < 0.9: tmp_pred = pred.copy() mask1 = tmp_pred < c1 mask2 = (tmp_pred >=c1) * (tmp_pred < c2) mask3 = (tmp_pred >=c2) * (tmp_pred < c3) mask4 = tmp_pred >=c3 tmp_pred[mask1] = 1 tmp_pred[mask2] = 2 tmp_pred[mask3] = 3 tmp_pred[mask4] = 4 score = quadratic_weighted_kappa(y,tmp_pred) if score > best_score: best_score = score best_coef = [c1,c2,c3] best_pred = tmp_pred.copy() #print(best_score,best_coef) return best_pred, best_coef
def Iris(training_size, test_size, n, PLOT_DATA): class_labels = [r'A', r'B', r'C'] data, target = datasets.load_iris(True) sample_train, sample_test, label_train, label_test = train_test_split(data, target, test_size=1, random_state=42) # Now we standarize for gaussian around 0 with unit variance std_scale = StandardScaler().fit(sample_train) sample_train = std_scale.transform(sample_train) sample_test = std_scale.transform(sample_test) # Scale to the range (-1,+1) samples = np.append(sample_train, sample_test, axis=0) minmax_scale = MinMaxScaler((-1, 1)).fit(samples) sample_train = minmax_scale.transform(sample_train) sample_test = minmax_scale.transform(sample_test) # Pick training size number of samples from each distro training_input = {key: (sample_train[label_train == k, :])[:training_size] for k, key in enumerate(class_labels)} test_input = {key: (sample_train[label_train == k, :])[training_size:( training_size+test_size)] for k, key in enumerate(class_labels)} if PLOT_DATA: for k in range(0, 3): plt.scatter(sample_train[label_train == k, 0][:training_size], sample_train[label_train == k, 1][:training_size]) plt.title("Iris dataset") plt.show() return sample_train, training_input, test_input, class_labels
def scale(self):
    # Scaling is an important part of this process: many of our algorithms
    # require our data to be scaled or otherwise standardized. We
    # do this by scaling features to values between [0,1]. This preserves
    # zero entries in our sparse matrix which is always a desirable
    # quality when working with this sort of data.

    # Scaling is sort of a convoluted process because Scipy/Scikit
    # doesn't offer a way to do this natively. We transpose the matrix,
    # convert it to LIL format (which isn't inefficient in this operation),
    # and divide each row (column in the original matrix) by the row's
    # sum before transposing and converting back to CSR.

    # However, if the matrix is not sparse, we don't have to worry about
    # this and can simply use one of Scikit's utility methods.

    # TODO: Maybe look at profiling to ensure that this strategy really
    # is the least expensive one.
    if self.sparse:
        self.vecs = self.vecs.tolil()
        self.vecs = self.vecs.transpose()
        num_features, _ = self.vecs.shape
        for i in range(num_features):
            self.vecs[i] /= self.vecs[i].sum()
        self.vecs = self.vecs.transpose()
        self.vecs = self.vecs.tocsr()
    else:
        mms = MinMaxScaler(copy=False)
        self.vecs = mms.fit_transform(self.vecs)
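For the sparse branch above there is a built-in equivalent: with non-negative data (e.g. counts), dividing each column by its sum is the same as L1-normalizing the columns, which scikit-learn can do directly on a CSR matrix without the LIL round-trip. A hedged standalone sketch:

import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize

X = sparse.csr_matrix(np.array([[1.0, 0.0], [3.0, 2.0]]))
X_colnorm = normalize(X, norm='l1', axis=0)   # each column divided by its sum, output stays sparse
print(X_colnorm.toarray())                    # [[0.25, 0.0], [0.75, 1.0]]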
def NB_coefficients(year=2010): poi_dist = getFourSquarePOIDistribution(useRatio=False) F_taxi = getTaxiFlow(normalization="bydestination") W2 = generate_geographical_SpatialLag_ca() Y = retrieve_crime_count(year=year) C = generate_corina_features() D = C[1] popul = C[1][:,0].reshape(C[1].shape[0],1) Y = np.divide(Y, popul) * 10000 f2 = np.dot(W2, Y) ftaxi = np.dot(F_taxi, Y) f = np.concatenate( (D, f2, ftaxi, poi_dist), axis=1 ) mms = MinMaxScaler(copy=False) mms.fit(f) mms.transform(f) header = C[0] + [ 'spatiallag', 'taxiflow'] + \ ['POI food', 'POI residence', 'POI travel', 'POI arts entertainment', 'POI outdoors recreation', 'POI education', 'POI nightlife', 'POI professional', 'POI shops', 'POI event'] df = pd.DataFrame(f, columns=header) np.savetxt("Y.csv", Y, delimiter=",") df.to_csv("f.csv", sep=",", index=False) # NB permute nbres = subprocess.check_output( ['Rscript', 'nbr_eval.R', 'ca', 'coefficient'] ) print nbres ls = nbres.strip().split(" ") coef = [float(e) for e in ls] print coef return coef, header
def Breast_cancer(training_size, test_size, n, PLOT_DATA): class_labels = [r'A', r'B'] data, target = datasets.load_breast_cancer(True) sample_train, sample_test, label_train, label_test = train_test_split(data, target, test_size=0.3, random_state=12) # Now we standarize for gaussian around 0 with unit variance std_scale = StandardScaler().fit(sample_train) sample_train = std_scale.transform(sample_train) sample_test = std_scale.transform(sample_test) # Now reduce number of features to number of qubits pca = PCA(n_components=n).fit(sample_train) sample_train = pca.transform(sample_train) sample_test = pca.transform(sample_test) # Scale to the range (-1,+1) samples = np.append(sample_train, sample_test, axis=0) minmax_scale = MinMaxScaler((-1, 1)).fit(samples) sample_train = minmax_scale.transform(sample_train) sample_test = minmax_scale.transform(sample_test) # Pick training size number of samples from each distro training_input = {key: (sample_train[label_train == k, :])[:training_size] for k, key in enumerate(class_labels)} test_input = {key: (sample_train[label_train == k, :])[training_size:( training_size+test_size)] for k, key in enumerate(class_labels)} if PLOT_DATA: for k in range(0, 2): plt.scatter(sample_train[label_train == k, 0][:training_size], sample_train[label_train == k, 1][:training_size]) plt.title("PCA dim. reduced Breast cancer dataset") plt.show() return sample_train, training_input, test_input, class_labels
def readTrainingData(): data = np.loadtxt( 'data/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s'.encode('utf-8')) }) allY = data[:, 32] allX = data[:, 1:31] allW = data[:, 31] scale = MMS() allX = scale.fit_transform(allX) np.random.seed(42) r = np.random.rand(allY.shape[0]) xTrain = allX[r<=0.4] yTrain = allY[r<=0.4] wTrain = allW[r<=0.4] xValid = allX[r>0.7] yValid = allY[r>0.7] wValid = allW[r>0.7] v = np.random.rand(yValid.shape[0]) xCrossValid = xValid[v<=0.5] yCrossValid = yValid[v<=0.5] wCrossValid = wValid[v<=0.5] xTestValid = xValid[v>0.5] yTestValid = yValid[v>0.5] wTestValid = wValid[v>0.5] return [xTrain, yTrain, wTrain, xCrossValid, yCrossValid, wCrossValid, xTestValid, yTestValid, wTestValid]
def getips(conf, net, superpixels_num, layer='inner_product_target'): (options, args) = parser.parse_args() layer = options.layer data = net.blobs[layer].data #data = net.blobs['InnerProduct1'].data feature_len = data.shape[1] try: negative_numbers = conf.model['number_of_negatives'] except: negative_numbers = 1 reps = np.zeros((superpixels_num*negative_numbers, feature_len)) for i in xrange(superpixels_num): if i%1000==1: print i net.forward() reps[i] = np.sum(net.blobs[layer].data, axis=1) reps_slice = reps[..., 0] from sklearn.preprocessing import MinMaxScaler clf = MinMaxScaler() reps_slice = clf.fit_transform(reps_slice) if negative_numbers > 1: reps_slice = np.square(reps_slice) #reps_slice[reps_slice<np.mean(reps_slice)] = 0 for i in xrange(reps_slice.shape[0]): reps[i] = reps_slice[i] # print net.blobs['inner_product_target'].data[1:10] return reps
def fit(self, X, y):
    X = np.matrix(X)
    y = np.matrix(y)

    self._outputNormalizer = MinMaxScaler()
    self._inputNormalizer = MinMaxScaler()
    self._outputNormalizer = self._outputNormalizer.fit(y)
    self._inputNormalizer = self._inputNormalizer.fit(X)

    self._inputDimension = X.shape[1]
    self._outputDimension = y.shape[1]  # For now, hardcoded to 1-dimensional regression problems.

    if (not self._warmStart or self._weights is None):
        self._initializeWeights()
        self._lastDelta = None

    batchFeatures, batchTargets = self._batchify(np.matrix(self._inputNormalizer.transform(X)),
                                                 self._batchSize,
                                                 np.matrix(self._outputNormalizer.transform(y)))

    # do for each step until the maximum steps:
    for i in range(self._maxSteps):
        reducedLearningRate = self._learningRate * self._shrinkage ** self._step
        for j in range(len(batchFeatures)):
            deltaW = self._learnFromBatch(batchFeatures[j], batchTargets[j])
            if (self._lastDelta is None):
                self._lastDelta = deltaW
            for k in range(len(self._weights)):
                self._lastDelta[k] = ((1 - self._momentum) * deltaW[k] + self._momentum * self._lastDelta[k])
                self._weights[k] = self._weights[k] + reducedLearningRate * self._lastDelta[k]
        # self._positifyWeights()
        self._step += 1
        # print(step)
    return self
def analysis_7(df_Coredata): """ 多次元多項式モデル """ #https://www.jeremyjordan.me/polynomial-regression/ X = df_Coredata[['d','e','f','g','i']] y = df_Coredata['j'] # グラフのスタイルを指定 sns.set(style = 'whitegrid', context = 'notebook') # 変数のペアの関係をプロット #sns.pairplot(df_Coredata) #plt.show() #X_train, X_test, y_train, y_test = train_test_split(X,y,random_state = 0) #lr = linear_model.LinearRegression().fit(X_train, y_train) #print("Trainng set score: {:.2f}".format(lr.score(X_train, y_train))) #print("Test set score: {:.2f}".format(lr.score(X_test, y_test))) ### データのスケール変換 # 標準化 std_Scaler = StandardScaler() data_std = std_Scaler.fit_transform(X) mmx_Scaler =MinMaxScaler() X_scaled = mmx_Scaler.fit_transform(X) #X_test_scaled = scaler.transform(X_test) #print(X_train_scaled) poly = PolynomialFeatures(degree = 2).fit(data_std) print(poly.get_feature_names())
def minmaxscaling(df):
    # MinMaxScaling between 0 and 1 is bad when you have outliers.
    # https://stats.stackexchange.com/a/10298
    scaler = MinMaxScaler(feature_range=(0, 1))
    # the min-max scaler wants features in the columns and samples in the rows -> ok
    df = scaler.fit_transform(df)
    return df, scaler
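Following the comment above about outliers, a hedged drop-in alternative is RobustScaler, which centers on the median and scales by the interquartile range so extreme values have less pull:

from sklearn.preprocessing import RobustScaler

def robustscaling(df):
    scaler = RobustScaler()            # median / interquartile range
    scaled = scaler.fit_transform(df)
    return scaled, scaler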
def runAlgorithm(data, categories, function, iterations = 5, num_partitions = 2): results_table = np.empty([iterations*num_partitions,4], dtype=float) scaler = MinMaxScaler() data = scaler.fit_transform(data) for i in range(iterations): # Se realiza una partición aleatoria print("Iteration ", i) partition = makePartitions(data, categories, random_ppio) for j in range(num_partitions): print("Sub iteration ", j) start = time.time() training_data = partition[0][j] training_categ = partition[1][j] test_data = np.array([partition[0][k][l] for k in range(num_partitions) if k!=j for l in range(len(partition[0][k]))], float) test_categ = np.array([partition[1][k][l] for k in range(num_partitions) if k!=j for l in range(len(partition[1][k]))]) solution, train_rate = function(training_data, training_categ) end = time.time() nbrs = neighbors.KNeighborsClassifier(3) nbrs.fit(training_data[:,solution],training_categ) rate = 100*nbrs.score(test_data[:,solution], test_categ) results_table[i*num_partitions+j,0] = train_rate/len(training_data)*100 results_table[i*num_partitions+j,1] = rate results_table[i*num_partitions+j,2] = (1 - sum(solution)/len(training_data[0]))*100 results_table[i*num_partitions+j,3] = end-start print("Rate = " + str(rate) + "\nTime = " + str(end-start) + " s") return results_table
def cal_result(model,year): """ 计算1个模型的各个统计量 :param model: 模型 :return: 统计量列表 """ X = load_data(year)[0] y1 = load_data(year)[1][0] # 票房 y2= load_data(year)[1][1] # 微博评分 y3= load_data(year)[1][2] # 豆瓣评分 y4 = load_data(year)[1][3] # 时光网评分 scaler = MinMaxScaler().fit(X) X = scaler.transform(X) # print model(X, y1)[0] # print model(X, y2)[0] # print model(X, y3)[0] # print model(X, y4)[0] result = cal_one_model(model(X, y1)[0], cal_avg(model(X, y2)[0], model(X, y3)[0], model(X, y4)[0])) result1 = [] result1.append(model(X, y1)[1]) result1.append(model(X, y2)[1]) result1.append(model(X, y3)[1]) result1.append(model(X, y4)[1]) # print result1 # scaler = StandardScaler().fit(result1) # result1 = scaler.transform(result1) return result, result1
def train(mode): if mode == "NextWeek": DATA = "MLprojectOutput/week34567to8Formated/part-00000" else: DATA = "MLprojectOutput/week34567to9Formated/part-00000" X, Y = readData(DATA, 10000, -1) X_Scaler = MinMaxScaler().fit(X) joblib.dump(X_Scaler, 'Predict{0}_Scaler.pkl'.format(mode)) X = X_Scaler.transform(X) dtrain = xgb.DMatrix(X, label = Y) param = { 'booster':"gbtree", 'eta':0.3, 'max_depth':6, 'subsample':0.85, 'colsample_bytree':0.7, 'silent':0, 'objective':'reg:linear', 'nthread':10, 'eval_metric':'rmse'} __model = xgb.train(param.items(), dtrain) __model.save_model('Predict{0}.model'.format(mode)) X_TEST, Y_TEST = readData(DATA, 0, 10000) X_TEST = X_Scaler.transform(X_TEST) dtest = xgb.DMatrix(X_TEST) Y_pred = list(map(lambda x: int(x), __model.predict(dtest))) evaluate(Y_TEST,Y_pred)
def uniform_to_normal(df, continuous_features):
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df[continuous_features].dropna()),
                             columns=continuous_features)
    uniform = set()
    alpha = 0.05
    for c in continuous_features:
        statistic, pvalue = kstest(df_scaled[c], scipy.stats.uniform().cdf)
        # Keep the features for which the KS test cannot reject uniformity.
        if pvalue > alpha:
            uniform.add(c)
    zero_to_one = [f for f in uniform if df[f].min() > 0 and df[f].min() < 0.001
                   and df[f].max() < 1 and df[f].max() > 0.999]
    zero_to_ten = [f for f in uniform if df[f].min() > 0 and df[f].min() < 0.01
                   and df[f].max() < 10 and df[f].max() > 9.99]
    zero_to_hundred = [f for f in uniform if df[f].min() > 0 and df[f].min() < 0.1
                       and df[f].max() < 100 and df[f].max() > 99.9]
    for f in uniform:
        f_min = 0 if f in zero_to_one or f in zero_to_ten or f in zero_to_hundred else df[f].min()
        f_max = 1 if f in zero_to_one else (10 if f in zero_to_ten else
                                            100 if f in zero_to_hundred else df[f].max())
        # We could use df_scaled here, but rescaling with what we believe are the
        # true min and max (rather than the observed ones) should give better results.
        df[f] = df[f].map(lambda x: norm.ppf((x - f_min) / (f_max - f_min)))
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    return uniform
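# A small illustrative sketch (assumed toy data, not from the original project):
# mapping uniformly distributed values through the inverse normal CDF (norm.ppf)
# is the inverse probability integral transform, which is what uniform_to_normal
# applies column by column after rescaling to (0, 1).
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
u = rng.uniform(0.0, 100.0, size=10000)                   # roughly Uniform(0, 100)
p = np.clip((u - 0.0) / (100.0 - 0.0), 1e-9, 1 - 1e-9)    # guard against ppf(0) = -inf
z = norm.ppf(p)
print(round(z.mean(), 2), round(z.std(), 2))              # close to 0 and 1, i.e. ~N(0, 1)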
def _scaled_data(self):
    """Load scaled data.

    Args:
        None

    Returns:
        (scaler, train_scaled, test_scaled): Fitted scaler and the scaled
            train and test arrays

    """
    # Initialize key variables
    (_train, _test) = self._data()

    # Fit scaler on the training data only
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = scaler.fit(_train)

    # Transform train
    train = _train.reshape(_train.shape[0], _train.shape[1])
    train_scaled = scaler.transform(train)

    # Transform test
    test = _test.reshape(_test.shape[0], _test.shape[1])
    test_scaled = scaler.transform(test)

    # Return
    return scaler, train_scaled, test_scaled
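# A minimal standalone sketch (toy arrays, not the class's real data) of the same
# pattern _scaled_data uses: fit the (-1, 1) scaler on the training split only,
# reuse it on the test split, and invert results back to the original units.
import numpy as np
from sklearn.preprocessing import MinMaxScaler

train = np.array([[10.0], [20.0], [30.0]])
test = np.array([[25.0], [40.0]])              # 40 lies outside the training range

scaler = MinMaxScaler(feature_range=(-1, 1)).fit(train)
train_scaled = scaler.transform(train)         # spans exactly [-1, 1]
test_scaled = scaler.transform(test)           # 40 maps above 1, which is expected
print(test_scaled.ravel())                     # [0.5 2. ]
print(scaler.inverse_transform(test_scaled))   # back to [[25.], [40.]]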
def normalize_data(tr_x, ts_x, normz=None, axis=0):
    if normz == 'scale':
        tr_x = scale(tr_x, axis=axis)
        ts_x = scale(ts_x, axis=axis)
    elif normz == 'minmax':
        minmax_scaler = MinMaxScaler()
        if axis == 0:
            for c_i in range(tr_x.shape[1]):
                # Fit on the training column and reuse the same scaler for the test column
                tr_x[:, c_i] = minmax_scaler.fit_transform(tr_x[:, c_i].reshape(-1, 1)).ravel()
                ts_x[:, c_i] = minmax_scaler.transform(ts_x[:, c_i].reshape(-1, 1)).ravel()
        elif axis == 1:
            # Each row is scaled independently, so fitting per row is intended here
            for r_i in range(tr_x.shape[0]):
                tr_x[r_i, :] = minmax_scaler.fit_transform(tr_x[r_i, :].reshape(-1, 1)).ravel()
                ts_x[r_i, :] = minmax_scaler.fit_transform(ts_x[r_i, :].reshape(-1, 1)).ravel()
    elif normz == 'sigmoid':
        if axis == 0:
            col_max = np.max(tr_x, axis=0)
            cols_non_norm = np.argwhere(col_max > 1).tolist()
            tr_x[:, cols_non_norm] = -0.5 + (1 / (1 + np.exp(-tr_x[:, cols_non_norm])))
            # TODO: implement col_max col_non_norm for test set
            ts_x[:, cols_non_norm] = -0.5 + (1 / (1 + np.exp(-ts_x[:, cols_non_norm])))
        elif axis == 1:
            row_max = np.max(tr_x, axis=1)
            rows_non_norm = np.argwhere(row_max > 1).tolist()
            tr_x[rows_non_norm, :] = -0.5 + (1 / (1 + np.exp(-tr_x[rows_non_norm, :])))
            # TODO: implement row_max row_non_norm for test set
            ts_x[rows_non_norm, :] = -0.5 + (1 / (1 + np.exp(-ts_x[rows_non_norm, :])))
    return tr_x, ts_x
def test_stratified_shuffle_split(clf, dataset, feature_list, folds=1000, scale_features=True):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    # Scale features
    if scale_features:
        scaler = MinMaxScaler()
        features = scaler.fit_transform(features)

    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        for ii in train_idx:
            features_train.append(features[ii])
            labels_train.append(labels[ii])
        for jj in test_idx:
            features_test.append(features[jj])
            labels_test.append(labels[jj])

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives + false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
        print('Total predictions: ' + str(total_predictions))
        print('Accuracy: ' + str(accuracy))
        print('Precision: ' + str(precision))
        print('Recall: ' + str(recall))
        print('F1: ' + str(f1))
        print('F2: ' + str(f2))
        print("")
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of true positive predictions.")
def plot_prediction_relevance(results, EFA=True, classifier='ridge',
                              rotate='oblimin', change=False,
                              size=4.6, dpi=300, ext='png', plot_dir=None):
    """ Plots the relative relevance of each factor for predicting all outcomes """
    predictions = results.load_prediction_object(EFA=EFA, change=change,
                                                 classifier=classifier,
                                                 rotate=rotate)['data']
    targets = list(predictions.keys())
    predictors = predictions[targets[0]]['predvars']
    importances = abs(np.vstack([predictions[k]['importances'] for k in targets]))
    # scale to 0-1
    scaler = MinMaxScaler()
    scaled_importances = scaler.fit_transform(importances.T).T
    # make proportion
    scaled_importances = scaled_importances / np.expand_dims(scaled_importances.sum(1), 1)
    # convert to dataframe
    scaled_df = pd.DataFrame(scaled_importances, index=targets, columns=predictors)
    melted = scaled_df.melt(var_name='Factor', value_name='Importance')
    plt.figure(figsize=(8, 12))
    f = sns.boxplot(y='Factor', x='Importance', data=melted, width=.5)
    if plot_dir is not None:
        filename = 'prediction_relevance'
        save_figure(f, path.join(plot_dir, filename),
                    {'bbox_inches': 'tight', 'dpi': dpi})
        plt.close()
def test_min_max_scaler_zero_variance_features():
    """Check min max scaler on toy data with zero variance features"""
    X = [[0., 1., 0.5],
         [0., 1., -0.1],
         [0., 1., 1.1]]

    X_new = [[+0., 2., 0.5],
             [-1., 1., 0.0],
             [+0., 1., 1.5]]

    # default params
    scaler = MinMaxScaler()
    X_trans = scaler.fit_transform(X)
    X_expected_0_1 = [[0., 0., 0.5],
                      [0., 0., 0.0],
                      [0., 0., 1.0]]
    assert_array_almost_equal(X_trans, X_expected_0_1)

    X_trans_new = scaler.transform(X_new)
    X_expected_0_1_new = [[+0., 1., 0.500],
                          [-1., 0., 0.083],
                          [+0., 0., 1.333]]
    assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2)

    # not default params
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    X_expected_1_2 = [[1., 1., 1.5],
                      [1., 1., 1.0],
                      [1., 1., 2.0]]
    assert_array_almost_equal(X_trans, X_expected_1_2)
def get_training_data_by_category(category, limit=0):
    limit_pos = limit * 0.2
    limit_neg = limit * 0.8

    N_pos = DataDAO.count_training_data_by_category(category)
    if N_pos < limit_pos:
        limit_pos = N_pos
        limit_neg = N_pos * 5

    training_data = []
    training_target = []

    positive = DataDAO.get_training_data_by_category(category)
    for ind, sample in enumerate(positive):
        if limit != 0 and ind >= limit_pos:
            break
        training_data.append(sample)
        training_target.append(1)

    negative = DataDAO.get_training_data_by_other_categories(category)
    for ind, sample in enumerate(negative):
        if limit != 0 and ind >= limit_neg:
            break
        training_data.append(sample)
        training_target.append(0)

    scaler = MinMaxScaler()
    training_data_scaled = scaler.fit_transform(training_data)
    # training_data_scaled = scale(training_data, axis=0)
    tr_data_sparse = csr_matrix(training_data_scaled)

    return tr_data_sparse, training_target, scaler
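# A hedged consumer sketch (the classifier choice, the 'sports' category, and the
# raw feature vector are assumptions for illustration): the returned scaler must
# also be applied to any new sample before the trained classifier sees it.
from sklearn.svm import LinearSVC

X_train, y_train, fitted_scaler = get_training_data_by_category('sports', limit=1000)
clf = LinearSVC().fit(X_train, y_train)

new_samples = [[0.3, 12.0, 7.5]]                   # hypothetical raw feature vector
new_scaled = fitted_scaler.transform(new_samples)  # same min/max as the training data
print(clf.predict(new_scaled))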
def train_model(feats_csv):
    df = pd.read_csv(feats_csv).iloc[:, 1:]
    y = np.ravel(df.iloc[:, -1:])
    X = np.array(df.iloc[:, :-1])

    ############ 15 best features selected using the ANOVA F-value score function ###############
    selector = SelectKBest(f_classif, k=15).fit(X, y)
    X_new = selector.transform(X)
    selected_features = selector.get_support(indices=True)

    ############ KNN manhattan ###############
    ##### preprocessing: data scaling ########
    min_max_scaler = MinMaxScaler()
    X_new = min_max_scaler.fit_transform(X_new)

    model = KNeighborsClassifier(n_neighbors=1, algorithm='brute',
                                 metric='manhattan', weights='uniform')
    model.fit(X_new, y)

    newdir = '../kNN_clfr'
    os.makedirs(newdir, exist_ok=True)
    joblib.dump(model, os.path.join(newdir, 'kNN.pkl'))
    return
def sdae_syn(X_s, P, h_layer, activations, noise, epoch, loss, batch_size):
    """Generate synthetic samples using stacked denoising autoencoders

    Parameters
    ----------
    X_s: positive class samples (numpy array, values must be within the range 0 to 1)
    P: oversampling percentage
    h_layer: hidden layer sizes (list)
    activations: activation functions (list, same length as h_layer)
    noise: [None, gaussian, mask]
    epoch: epochs for each layer (list, same length as h_layer)
    loss: 'rmse' or 'cross-entropy'
    batch_size: mini-batch size

    For more details on the input parameters see https://github.com/rajarsheem/libsdae
    """
    n_samples = int(X_s.shape[0] * P / 100)
    print("generating %d samples" % n_samples)
    X_init = np.random.standard_normal(size=(n_samples, X_s.shape[1]))
    scaler = MinMaxScaler()
    X_init = scaler.fit_transform(X_init)
    model = StackedAutoEncoder(dims=h_layer, activations=activations, noise=noise,
                               epoch=epoch, loss=loss, batch_size=batch_size,
                               lr=0.007, print_step=2000)
    model.fit(X_s)
    syn_Z = model.transform(X_init)
    return syn_Z