def sample_from_generator(history, nb_samples, latent_dim=12,
                          valid_split=0.3, random_split=True,
                          hidden_dims=None, **kwargs):
    scaler = MinMaxScaler()
    scaler.fit(history)
    scaled = scaler.transform(history)

    nb_train = history.shape[0]
    if not valid_split:
        nb_valid = 0
    elif isinstance(valid_split, float):
        nb_valid = nb_train - int(np.floor(nb_train * valid_split))
    else:
        nb_valid = valid_split

    if nb_valid > 0:
        if random_split:
            ind = np.arange(nb_train)
            np.random.shuffle(ind)
            x_valid = scaled[ind[-nb_valid:], :]
            x_train = scaled[ind[:-nb_valid], :]
        else:
            x_valid = scaled[-nb_valid:, :]
            x_train = scaled[:-nb_valid, :]
    else:
        x_valid = None
        x_train = scaled

    _, generator = build_model(latent_dim, x_train, x_valid=x_valid,
                               hidden_dims=hidden_dims, **kwargs)
    normal_sample = np.random.standard_normal((nb_samples, latent_dim))
    draws = generator.predict(normal_sample)
    return scaler.inverse_transform(draws)
def scale_data(pitchers):
    num_cols = ['Decisions', 'Wins_Over_Decisions', 'Wins_Over_Starts',
                'Relief_Appearances', 'Shutout_Percentage',
                'Outs_Recorded_Per_Appearance', 'Hits_Allowed_Per_Appearance',
                'Earned_Runs_Per_Appearance', 'Runs_Per_Appearance',
                'Home_Runs_Per_Appearance', 'Walks_Per_Appearance',
                'Strikeouts_Per_Appearance', 'ERA']
    num_data = pitchers[num_cols]
    scaler = MinMaxScaler()
    scaler.fit(num_data)
    num_data = scaler.transform(num_data)
    num_data = pd.DataFrame(num_data)
    num_data.columns = num_cols
    pitchers = pitchers[['Player_and_Year']]
    pitchers = pd.merge(pitchers, num_data, how='inner',
                        left_index=True, right_index=True)
    return pitchers
def data_organizer( instances, outcomes ): """ Operations to organize data as desired """ excluded_features = set([]) #print( "Using only SAT subject tests" ) #included_features = set(["SATCRDG", "SATMATH", "SATWRTG"]) #print( "Using SAT total and HSGPA" ) #included_features = set(["SATTotal", "HSGPA"]) #print( "Using gender, firstgen, famincome, firstlang" ) #included_features = set(["gender", "Firgen", "famincome", "FirstLang"]) print( "Using all features" ) included_features = set(["gender", "Firgen", "famincome", "SATCRDG", "SATMATH", "SATWRTG", "SATTotal", "HSGPA", "ACTRead", "ACTMath", "ACTEngWrit", "APIScore", "FirstLang", "HSGPAunweighted"]) #print( "SAT subject tests and HSGPA" ) #included_features = set(["SATCRDG", "SATMATH", "SATWRTG", "HSGPA" ]) # Remove instances without GPA data new_instances = [] new_outcomes = [] for instance,outcome in zip(instances,outcomes): temp={} for name,val in zip(ALL_LABELS, instance): temp[name] = val u1,u2,gpa = outcome if not math.isnan( gpa ): temp_list = [] skip = False for key in temp.keys(): if key in included_features: if math.isnan(temp[key]): skip = True temp_list.append( temp[key] ) if not skip: new_outcomes.append( [value for value in outcome] ) new_instances.append( temp_list ) instances = new_instances outcomes = new_outcomes # Fill in NaN values with median instance_list = [] for idx,instance in enumerate(instances): instance_list.append( [ value for value in instance ] ) bandaid = Imputer( strategy='median' ) instances = bandaid.fit_transform( instance_list ) # Scale to [0,1] scaler = MinMaxScaler( feature_range=(0,1), copy=False) scaler.fit( instances ) instances = scaler.fit_transform(instances) return instances, outcomes, scaler
def transform(self, fp):
    fm = FeaturePool(fp).meta()
    x = FeaturePool(fp).array()
    scaler = MinMaxScaler(feature_range=self.feature_range)
    scaler.fit(x)
    for f in FeaturePool.from_array(fm, scaler.transform(x)):
        yield f
def preprocess_datasets(X_train, X_test, args):
    if 'scale' in args.preprocessing:
        print('Scaling features to range [-1,1] ...')
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler.fit(np.vstack(X_train))
        X_train = [scaler.transform(X_curr) for X_curr in X_train]
        X_test = [scaler.transform(X_curr) for X_curr in X_test]
    return X_train, X_test
def preprocess_datasets(train, test, args):
    if 'scale' in args.preprocessing:
        print('Scaling features to range [-1,1] ...')
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler.fit(np.vstack(train.X))
        processed_train = Dataset([scaler.transform(X_curr) for X_curr in train.X],
                                  train.y, train.target_names, train.groups)
        processed_test = Dataset([scaler.transform(X_curr) for X_curr in test.X],
                                 test.y, test.target_names, test.groups)
    else:
        processed_train = train
        processed_test = test
    return processed_train, processed_test
def test_minmaxscaler_vs_sklearn():
    # Compare msmbuilder.preprocessing.MinMaxScaler
    # with sklearn.preprocessing.MinMaxScaler
    minmaxscalerr = MinMaxScalerR()
    minmaxscalerr.fit(np.concatenate(trajs))

    minmaxscaler = MinMaxScaler()
    minmaxscaler.fit(trajs)

    y_ref1 = minmaxscalerr.transform(trajs[0])
    y1 = minmaxscaler.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
def preprocess_data(X, scaler=None):
    if not scaler:
        # add log to data
        X = np.log(1 + X)
        scaler = MinMaxScaler()
        scaler.fit(X)
    X = scaler.transform(X)
    # add gaussian noise
    mu, sigma = 0, 0.1  # mean and standard deviation
    s = np.random.normal(mu, sigma)
    # X = X + s
    return X, scaler
class SerialDataScaler:

    def __init__(self, data):
        data = numpy.reshape(data, (len(data), 1))
        data = data.astype("float32")
        self.scaler = MinMaxScaler(feature_range=(0, 1))
        self.scaler.fit(data)

    def transform(self, X):
        # return X
        return self.scaler.transform(numpy.reshape(X, (len(X), 1)))

    def inverse_transform(self, x):
        return self.scaler.inverse_transform(x)
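# A minimal usage sketch for the SerialDataScaler class above; it is not from the
# original source, and the sample series is made up for illustration.
import numpy

series = numpy.array([12.0, 15.5, 9.3, 20.1, 17.8])  # hypothetical 1-D series

serial_scaler = SerialDataScaler(series)                  # fits MinMaxScaler on the reshaped series
scaled = serial_scaler.transform(series)                  # values mapped into [0, 1]
restored = serial_scaler.inverse_transform(scaled)        # back to the original scale
print(scaled.ravel(), restored.ravel())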
def organize_data(train_size=59872): #Used 59872, which is 80%, rounded in a fashion to use large mini-batches that align in size with open('dev_df.pkl', 'r') as f: dev_df = pd.DataFrame(cPickle.load(f)) # Training/CV set gender_age_train = pd.read_csv('gender_age_train.csv', index_col=0).drop(['gender', 'age'], axis=1) gender_age_train = gender_age_train.join(dev_df) # Test set gender_age_test = pd.read_csv('gender_age_test.csv', index_col=0) gender_age_test = gender_age_test.join(dev_df) # Labels will be in y array; features will be in X matrix; need to encode labels # for phone_brand, device_model, and group X = np.array(gender_age_train) X_test = np.array(gender_age_test) # Row 0 is the group to be classified, so put it in y array then delete it y = X[:,0] from sklearn.preprocessing import LabelEncoder le_y = LabelEncoder() y = le_y.fit_transform(y) X = np.delete(X,0,1) # Reformat all labeled columns with label encoders le_phone_brand = LabelEncoder() le_phone_brand.fit(np.hstack((X[:,0], X_test[:,0]))) X[:,0] = le_phone_brand.transform(X[:,0]) X_test[:,0] = le_phone_brand.transform(X_test[:,0]) le_device_model = LabelEncoder() le_device_model.fit(np.hstack((X[:,1], X_test[:,1]))) X[:,1] = le_device_model.transform(X[:,1]) X_test[:,1] = le_device_model.transform(X_test[:,1]) # Standardize features from sklearn.preprocessing import MinMaxScaler scaler = MinMaxScaler() scaler.fit(np.vstack((X, X_test))) X = scaler.transform(X) X_test = scaler.transform(X_test) # Create CV set from sklearn.cross_validation import train_test_split X_train, X_cv, y_train, y_cv = train_test_split(X, y, train_size=train_size, random_state=0) return X_train, X_cv, y_train, y_cv, X_test
class log_minmax(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    '''Transformer that first takes log1p(X) then calls the MinMaxScaler transformer'''

    def __init__(self):
        self.mm_tran = MinMaxScaler()

    def fit(self, X, y=None):
        self.mm_tran.fit(np.log1p(X), y)
        return self

    def transform(self, X):
        Xt = self.mm_tran.transform(np.log1p(X))
        return Xt

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)
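# A brief usage sketch for the log_minmax transformer above, not taken from the
# original source: because it follows the sklearn transformer interface, it can be
# dropped into a Pipeline. The data and the Ridge model are illustrative only.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge

X = np.random.lognormal(size=(100, 3))  # hypothetical skewed, non-negative features
y = np.random.rand(100)

pipe = Pipeline([('log_mm', log_minmax()), ('model', Ridge())])
pipe.fit(X, y)
print(pipe.predict(X[:5]))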
def pipeline(): # test = data[data.watch==1] test_a_b = test[['item_id','store_code','a','b']] test_y = test.label test_x = test.drop(['label','watch','item_id','store_code','a','b'],axis=1) test_x.fillna(test_x.median(),inplace=True) train = data[(data.watch!=0)&(data.watch!=1)] train_y = train.label a = list(train.a) b = list(train.b) train_weight = [] for i in range(len(a)): #train_weight.append(min(a[i],b[i])) train_weight.append(a[i]+b[i]) train_weight = np.array(train_weight) train_x = train.drop(['label','watch','item_id','store_code','a','b'],axis=1) train_x.fillna(train_x.median(),inplace=True) scaler = MinMaxScaler() scaler.fit(train_x) train_x = scaler.transform(train_x) test_x = scaler.transform(test_x) #model = SVR(kernel='rbf',cache_size=2000,gamma=0.01,C=3.5)#1 #model = SVR(kernel='rbf',cache_size=2000,gamma=0.01,C=3.5)#2 #model = SVR(kernel='rbf',cache_size=2000,gamma=0.01,C=3.5)#3 model = SVR(kernel='rbf',cache_size=2000,gamma=0.01,C=3.5) #train model.fit(train_x,train_y, sample_weight=train_weight) #predict test set test_a_b['pred'] = model.predict(test_x) test_a_b['y'] = test_y cost = cal_cost(test_y.values,test_a_b.pred.values,test_a_b.a.values,test_a_b.b.values) test_a_b.to_csv('test/val_{0}.csv'.format(cost[1]),index=None)
class NumericalFeatureMinMaxScaler(object):

    def __init__(self, feature_names):
        self.feature_names = feature_names
        self.scaler = None

    def fit(self, data):
        self.scaler = MinMaxScaler(copy=True)
        self.scaler.fit(np.asarray(data.loc[:, self.feature_names]))
        return self

    def transform(self, data):
        SCALE_SUFFIX = "%s_SCALED"
        scaled_data = self.scaler.transform(np.asarray(data.loc[:, self.feature_names]))
        scaled_data = pd.DataFrame(
            data=scaled_data,
            columns=[SCALE_SUFFIX % f for f in self.feature_names],
            index=data.index
        )
        data = pd.concat([data, scaled_data], axis=1)
        return data
def feature_scaling(feature_matrix, target, reductor=None, scaler=None):
    lda = LDA(n_components=2)
    minmax = MinMaxScaler(feature_range=(-1, 1))
    if not reductor:
        reductor = lda.fit(feature_matrix, target)
    feature_matrix_lda = reductor.transform(feature_matrix)
    if not scaler:
        scaler = minmax.fit(feature_matrix_lda)
    feature_matrix_scaled = scaler.transform(feature_matrix_lda)
    return feature_matrix_scaled, reductor, scaler
def organize_data_kf(): with open('dev_df.pkl', 'r') as f: dev_df = pd.DataFrame(cPickle.load(f)) # Training/CV set gender_age_train = pd.read_csv('gender_age_train.csv', index_col=0).drop(['gender', 'age'], axis=1) gender_age_train = gender_age_train.join(dev_df) # Test set gender_age_test = pd.read_csv('gender_age_test.csv', index_col=0) gender_age_test = gender_age_test.join(dev_df) # Labels will be in y array; features will be in X matrix; need to encode labels # for phone_brand, device_model, and group X = np.array(gender_age_train) X_test = np.array(gender_age_test) # Row 0 is the group to be classified, so put it in y array then delete it y = X[:,0] from sklearn.preprocessing import LabelEncoder le_y = LabelEncoder() y = le_y.fit_transform(y) X = np.delete(X,0,1) # Reformat all labeled columns with label encoders le_phone_brand = LabelEncoder() le_phone_brand.fit(np.hstack((X[:,0], X_test[:,0]))) X[:,0] = le_phone_brand.transform(X[:,0]) X_test[:,0] = le_phone_brand.transform(X_test[:,0]) le_device_model = LabelEncoder() le_device_model.fit(np.hstack((X[:,1], X_test[:,1]))) X[:,1] = le_device_model.transform(X[:,1]) X_test[:,1] = le_device_model.transform(X_test[:,1]) # Standardize features from sklearn.preprocessing import MinMaxScaler scaler = MinMaxScaler() scaler.fit(np.vstack((X, X_test))) X = scaler.transform(X) X_test = scaler.transform(X_test) return X, y, X_test
def features_pca(features_fin, n_comp):
    scaler = MinMaxScaler()
    finance_scaler = scaler.fit(features_fin)
    features_fin = finance_scaler.transform(features_fin)
    pca = PCA(n_components=n_comp).fit(features_fin)
    # print 'pca', pca
    features_fin = pca.transform(features_fin)
    features_fin = np.array(features_fin)
    return pca, features_fin
def get_prediction_of_classifier(
        name,
        classifier,
        train_features_partial,
        train_labels_partial,
        test_features_partial,
):
    """Calculate prediction and write a result to the csv file

    Use partial or full data depending on situation
    @param train_labels_partial: maybe the same as train_features
    """
    # normalize
    normalizer = MinMaxScaler()
    normalizer.fit(np.concatenate([train_features_partial, test_features_partial]))  # todo: check
    train_features_partial = normalizer.transform(train_features_partial)
    test_features_partial = normalizer.transform(test_features_partial)

    # predict
    classifier.fit(train_features_partial, train_labels_partial)
    test_labels_predicted = classifier.predict(test_features_partial)
    write_vector(name, test_labels_predicted)
    return test_labels_predicted
def predict(self, test_X):
    # fitting done here
    # not efficient on the long term
    test_X = np.array(test_X)
    enc = OneHotEncoder()
    scal = MinMaxScaler()
    data = np.vstack((self.train_X, test_X))
    enc.fit(self.get_cal(data))
    scal.fit(self.get_cant(data))

    new_train_X1 = enc.transform(self.get_cal(self.train_X))
    new_train_X2 = scal.transform(self.get_cant(self.train_X))
    new_train_X = scipy.sparse.hstack((new_train_X1, new_train_X2))

    new_test_X1 = enc.transform(self.get_cal(test_X))
    new_test_X2 = scal.transform(self.get_cant(test_X))
    new_test_X = scipy.sparse.hstack((new_test_X1, new_test_X2))

    self.model.fit(new_train_X, self.train_Y)
    R = self.model.predict(new_test_X)
    return R
def scale(train, test):
    # fit scaler
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = scaler.fit(train)
    # transform train
    train = train.reshape(train.shape[0], train.shape[1])
    train_scaled = scaler.transform(train)
    # transform test
    test = test.reshape(test.shape[0], test.shape[1])
    test_scaled = scaler.transform(test)
    return scaler, train_scaled, test_scaled
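# A minimal usage sketch for the scale helper above, assuming 2-D (samples, features)
# arrays; the data and the inverse-transform step are illustrative, not from the source.
import numpy as np

train = np.array([[10.0, 1.0], [20.0, 2.0], [30.0, 3.0]])  # hypothetical training data
test = np.array([[25.0, 2.5], [35.0, 3.5]])                # hypothetical test data

scaler, train_scaled, test_scaled = scale(train, test)

# predictions made in the scaled space can be mapped back with the returned scaler
preds_scaled = test_scaled  # stand-in for model output
preds = scaler.inverse_transform(preds_scaled)
print(preds)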
def splitTrainTest(data, Labels, train_percent, random_state, minmax=False):
    random.seed(random_state)
    indexList = list(range(len(data)))  # materialize so shuffle works on Python 3
    random.shuffle(indexList)
    trainIndexList = indexList[:int(len(data) * train_percent)]
    testIndexList = indexList[int(len(data) * train_percent):]
    train, trainLabels, test, testLabels = (data[trainIndexList], Labels[trainIndexList],
                                            data[testIndexList], Labels[testIndexList])
    if minmax:
        # fit the scaler on the training split only, then apply it to both splits
        fit = MinMaxScaler().fit(train)
        train = fit.transform(train)
        test = fit.transform(test)
    return train, trainLabels, test, testLabels
def scale_data(train, test):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    # determine max and min values on the training set (per feature) and scale it
    scaler = scaler.fit(train)
    train_scaled = scaler.transform(train)
    # apply the found parameters to the test set (DO NOT compute them again)
    test_scaled = scaler.transform(test)
    return train_scaled, test_scaled
def transform_and_split(features, labels):
    ########
    # Takes in two dataframes with the features and labels of a dataset and
    # outputs a dictionary with keys for the training and testing sets of each
    ########
    print('Performing preliminary data split')
    x_train, x_test, y_train, y_test = train_test_split(features, labels,
                                                        test_size=0.25,
                                                        random_state=33)
    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    scaler2 = MinMaxScaler()
    scaler2.fit(x_train)
    x_train = scaler2.transform(x_train)
    x_test = scaler2.transform(x_test)

    data_dict = {'x_test': x_test, 'x_train': x_train,
                 'y_test': y_test, 'y_train': y_train}
    return data_dict
def plot_relative_scaling():
    # make synthetic data
    X, _ = make_blobs(n_samples=50, centers=5, random_state=4, cluster_std=2)
    # split it into training and test set
    X_train, X_test = train_test_split(X, random_state=5, test_size=.1)

    # plot the training and test set
    fig, axes = plt.subplots(1, 3, figsize=(13, 4))
    axes[0].scatter(X_train[:, 0], X_train[:, 1], c='b', label="training set", s=60)
    axes[0].scatter(X_test[:, 0], X_test[:, 1], marker='^', c='r', label="test set", s=60)
    axes[0].legend(loc='upper left')
    axes[0].set_title("original data")

    # scale the data using MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # visualize the properly scaled data
    axes[1].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1], c='b', label="training set", s=60)
    axes[1].scatter(X_test_scaled[:, 0], X_test_scaled[:, 1], marker='^', c='r', label="test set", s=60)
    axes[1].set_title("scaled data")

    # rescale the test set separately, so that test set min is 0 and test set max is 1
    # DO NOT DO THIS! For illustration purposes only
    test_scaler = MinMaxScaler()
    test_scaler.fit(X_test)
    X_test_scaled_badly = test_scaler.transform(X_test)

    # visualize wrongly scaled data
    axes[2].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1], c='b', label="training set", s=60)
    axes[2].scatter(X_test_scaled_badly[:, 0], X_test_scaled_badly[:, 1], marker='^', c='r', label="test set", s=60)
    axes[2].set_title("improperly scaled data")
class Normalizer(BaseEstimator, TransformerMixin): def __init__(self, method='standardize'): """ Constructor @param method: method of normalization. The ones currently supported are: 'standardize': (x-mean)/sd 'rescale': (x-min)/(max-min) @return: nothing. """ assert method in ['standardize', 'rescale'], 'Unexpected method %s'%method self.method = method if method == 'standardize': self._scaler = StandardScaler() else: self._scaler = MinMaxScaler() def fit(self, X, y=None, **params): """ @return: the caller itself """ self._scaler.fit(X, y) return self def transform(self, X, **params): """ @return: transformed data """ return self._scaler.transform(X) def fit_transform(self, X, y=None, **params): """ @return: transformed data """ return self._scaler.fit_transform(X, y, **params)
def pipeline(): # test = data[data.watch==0] test_a_b = test[['item_id','store_code','a','b']] test_x = test.drop(['label','watch','item_id','store_code','a','b'],axis=1) test_x.fillna(test_x.median(),inplace=True) train = data[data.watch!=0] train_y = train.label a = list(train.a) b = list(train.b) train_weight = [] for i in range(len(a)): #train_weight.append(min(a[i],b[i])) train_weight.append(a[i]+b[i]) train_weight = np.array(train_weight) train_x = train.drop(['label','watch','item_id','store_code','a','b'],axis=1) train_x.fillna(train_x.median(),inplace=True) scaler = MinMaxScaler() scaler.fit(train_x) train_x = scaler.transform(train_x) test_x = scaler.transform(test_x) model = SVR(kernel='linear',cache_size=2000) #train model.fit(train_x,train_y, sample_weight=train_weight) #predict test set test_a_b['pred'] = model.predict(test_x) test_a_b.to_csv('test/test_all.csv',index=None)
def get_input(self):
    # Input data.
    # Load the training, validation and test data into constants that are
    # attached to the graph.
    self.x_train, self.y_train, self.x_validation, self.y_validation = self.get_train_validationset()
    self.x_train, self.y_train, self.x_validation, self.y_validation = \
        self.x_train.as_matrix(), self.y_train.as_matrix().reshape((-1, 1)), \
        self.x_validation.as_matrix(), self.y_validation.as_matrix().reshape((-1, 1))
    # self.x_train, self.y_train, self.x_validation, self.y_validation = \
    #     self.x_train.astype(np.float32), self.y_train.astype(np.float32), \
    #     self.x_validation.astype(np.float32), self.y_validation.astype(np.float32)

    sc = MinMaxScaler()
    sc.fit(self.x_train)
    self.x_train = sc.transform(self.x_train)
    self.x_validation = sc.transform(self.x_validation)

    self.inputlayer_num = len(self.get_used_features())
    self.outputlayer_num = 1

    # Input placeholders
    with tf.name_scope('input'):
        self.x = tf.placeholder(tf.float32, [None, self.inputlayer_num], name='x-input')
        self.y_true = tf.placeholder(tf.float32, [None, self.outputlayer_num], name='y-input')
        self.keep_prob = tf.placeholder(tf.float32, name='drop_out')
    return
from sklearn.model_selection import train_test_split, cross_val_score

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8,
                                                    shuffle=True, random_state=66)

# 1
from tensorflow.keras.utils import to_categorical
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

print(x_train.shape)
print(x_test.shape)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train).reshape(142, 13)  # transform x_train, then store the scaled values back into x_train
x_test = scaler.transform(x_test).reshape(36, 13)     # transform x_test separately, then store the scaled values back into x_test

# 2
def build_model(drop=0.5, optimizer='adam'):
    inputs = Input(shape=(13,), name='input')
    x = Dense(512, activation='relu', name='hidden1')(inputs)
    x = Dropout(drop)(x)
    x = Dense(256, activation='relu', name='hidden2')(x)
    x = Dropout(drop)(x)
    x = Dense(128, activation='relu', name='hidden3')(x)
    x = Dropout(drop)(x)
    outputs = Dense(3, activation='softmax', name='output')(x)
    model = Model(inputs=inputs, outputs=outputs)
import plotly import plotly.graph_objs as go import plotly.plotly as py import tensorflow as tf from keras.layers import LSTM, Dense from keras.models import Sequential from keras.preprocessing.sequence import TimeseriesGenerator from plotly.offline import plot from sklearn.preprocessing import MinMaxScaler filename = 'AggregatedData/Data_Combined.csv' df = pd.read_csv(filename) df['Mean'] = df['Mean'].astype('int32') scaler = MinMaxScaler() scaler.fit(df['Mean'].values.reshape((-1, 1))) df['Mean'] = scaler.transform(df['Mean'].values.reshape((-1, 1))).reshape(-1) # scaler = MinMaxScaler() # scaler.fit(df['Count'].values.reshape((-1,1))) # df['Count'] = scaler.transform(df['Count'].values.reshape((-1,1))).reshape(-1) locs = df['Neighbourhood'].unique() drop_locations = [ 'HARLEM-WEST', 'JAVITS CENTER', 'SOUTHBRIDGE', 'MANHATTAN-UNKNOWN', 'ROOSEVELT ISLAND' ] locations = [] for x in locs: if x not in drop_locations: locations.append(x)
# In[195]: # Scatter plots for continuous data print("Scatter plots for continuous data and the target variable") import seaborn as sns sns.set(style="ticks", color_codes=True) g = sns.pairplot(df_for_plots[continuous_column]) plt.show() # In[196]: # Scaling/Normalization print("Performing Scaling/Normalization of the dataset") from sklearn.preprocessing import MinMaxScaler scaler = MinMaxScaler() scaler.fit(df[continuous_column]) df[continuous_column] = scaler.transform(df[continuous_column]) # In[222]: # Probability Distribution of all the continuous values columns print("Probability Distribution of all the continuous values columns") df[continuous_column].hist() plt.show() # # Correlation based independence check for columns (drop columns > 0.3) # In[202]: print("Checking if there are any columns with correlation greater than 0.4") corr = df.corr().iloc[:, :3]
def main(): # List files available print(os.listdir("../input/")) # Training data app_train = pd.read_csv('../input/application_train.csv') print('Training data shape: ', app_train.shape) app_train.head() # Testing data features app_test = pd.read_csv('../input/application_test.csv') print('Testing data shape: ', app_test.shape) app_test.head() # 样本不均衡 target_counts = app_train['TARGET'].value_counts() target_ratio0 = target_counts[0] / (target_counts[0] + target_counts[1]) target_ratio1 = target_counts[1] / (target_counts[0] + target_counts[1]) print('target_ratio0=', target_ratio0, ', target_ratio1=', target_ratio1) app_train['TARGET'].astype(int).plot.hist() plt.xlabel('TARGET') #缺失值处理 missing_values = missing_values_table(app_train) app_train = fillNanData(app_train, missing_values) app_test = fillNanData(app_test, missing_values) app_test = fillNanData(app_test, missing_values_table(app_test)) # 类别分析 # Number of each type of column print(app_train.dtypes.value_counts()) print(app_test.dtypes.value_counts()) # Number of unique classes in each object column print( app_train.select_dtypes(include=['object']).apply(pd.Series.nunique, axis=0)) print( app_test.select_dtypes(include=['object']).apply(pd.Series.nunique, axis=0)) # Create a label encoder object le = LabelEncoder() le_count = 0 # Iterate through the columns for col in app_train: if app_train[col].dtype == 'object': print(col, ":", len(list(app_train[col].unique()))) # If 2 or fewer unique categories if len(list(app_train[col].unique())) <= 2: # Train on the training data le.fit(app_train[col]) # Transform both training and testing data app_train[col] = le.transform(app_train[col]) app_test[col] = le.transform(app_test[col]) # Keep track of how many columns were label encoded le_count += 1 print('%d columns were label encoded.' 
% le_count) # one-hot encoding of categorical variables app_train = pd.get_dummies(app_train) app_test = pd.get_dummies(app_test) print('Training Features shape: ', app_train.shape) print('Testing Features shape: ', app_test.shape) # 特征对齐 train_labels = app_train['TARGET'] # Align the training and testing data, keep only columns present in both dataframes app_train, app_test = app_train.align(app_test, join='inner', axis=1) print('Training Features shape: ', app_train.shape) print('Testing Features shape: ', app_test.shape) print('Testing Features shape: ', app_test.shape) app_train['TARGET'] = train_labels #相关性分析 # Find correlations with the target and sort correlations = app_train.corr()['TARGET'] # Display correlations abscorrelations = abs(correlations) abscorrelations.plot() plt.ylabel('correlations') plt.show() abscorrelations = abscorrelations.sort_values() print('Most Positive Correlations: \n', correlations.tail(15)) # 数据填充与归一化 from sklearn.preprocessing import MinMaxScaler, Imputer # Drop the target from the training data if 'TARGET' in app_train: train = app_train.drop(['TARGET'], axis=1).copy() else: train = app_train.copy() features = list(train.columns) # Copy of the testing data test = app_test.copy() # Median imputation of missing values imputer = Imputer(strategy='median') # Scale each feature to 0-1 scaler = MinMaxScaler(feature_range=(0, 1)) # Fit on the training data imputer.fit(train) # Transform both training and testing data train = imputer.transform(train) test = imputer.transform(app_test) # Repeat with the scaler scaler.fit(train) train = scaler.transform(train) test = scaler.transform(test) print('Training data shape: ', train.shape) print('Testing data shape: ', test.shape) # LR from sklearn.linear_model import LogisticRegression # Make the model with the specified regularization parameter log_reg = LogisticRegression(penalty='l2', C=0.0001, class_weight='balanced', max_iter=500, solver='sag', verbose=1, n_jobs=-1) # Train on the training data log_reg.fit(train, train_labels) # predict 返回两列,第一列为0的概率,第二列为1的概率 log_reg_pred = log_reg.predict_proba(test)[:, 1] submit = app_test[['SK_ID_CURR']] submit['TARGET'] = log_reg_pred # Save the submission to a csv file submit.to_csv('log_reg_baseline.csv', index=False) print('save log_reg_baseline.csv to file!')
def preprocessing_features(df_train, df_test, process_continuous): to_delete_features = ['default','pdays'] continuous_features = ['age', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'] categorical_ordered_features = ['education', 'housing', 'loan', 'contact','month', 'day_of_week','poutcome'] categorical_unordered_features = ['job', 'marital'] unknown_present_features = ['job','marital','education','housing','loan'] ### Delete Features for feat in to_delete_features: print "\n--------- deleting feature --------- ",feat del df_train[feat] del df_test[feat] ### Fill unknowns in the features with feature-mode for feat in unknown_present_features: print "\n--------- Replacing unknowns in feature --------- ",feat feature_value_counts = df_train[feat].value_counts() print "Replaced with: ",feature_value_counts.idxmax() df_train.loc[df_train[feat] == "unknown",feat] = feature_value_counts.idxmax() df_test.loc[df_test[feat] == "unknown",feat] = feature_value_counts.idxmax() ### Label Categorical Ordered Features label_dict = {'education':{'illiterate':0, 'basic.4y':4, 'basic.6y':6, 'basic.9y':9, 'high.school':11, 'professional.course':13, 'university.degree':14}, 'housing':{'no':0,'yes':1}, 'loan':{'no':0,'yes':1}, 'contact':{'telephone':0,'cellular':1}, 'month':{'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,'sep':9,'oct':10,'nov':11,'dec':12}, 'day_of_week':{'mon':1,'tue':2,'wed':3,'thu':4,'fri':5,'sat':6,'sun':7}, 'poutcome':{'nonexistent':0,'failure':1,'success':2}} for feat in categorical_ordered_features: print "\n--------- Labelling feature --------- ",feat df_train = df_train.replace({feat:label_dict[feat]}) df_test = df_test.replace({feat:label_dict[feat]}) print "Labelled as: ",label_dict[feat] ### One hot encoding Categorical Un-ordered Features for feat in categorical_unordered_features: print "\n--------- One Hot Encoding feature --------- ",feat label_encoder = LabelEncoder() label_encoder.fit(df_train[feat]) df_train[feat] = label_encoder.transform(df_train[feat]) df_test[feat] = label_encoder.transform(df_test[feat]) one_hot_encoder = OneHotEncoder(sparse=False) one_hot_encoder.fit(df_train[categorical_unordered_features]) one_hot_encoded_array_train = one_hot_encoder.transform(df_train[categorical_unordered_features]) one_hot_encoded_df_train = pd.DataFrame(one_hot_encoded_array_train, index=df_train.index) one_hot_encoded_array_test = one_hot_encoder.transform(df_test[categorical_unordered_features]) one_hot_encoded_df_test = pd.DataFrame(one_hot_encoded_array_test, index=df_test.index) df_train = pd.concat([df_train,one_hot_encoded_df_train], axis=1) #concatenate old columns with new one hot encoded columns df_test = pd.concat([df_test,one_hot_encoded_df_test], axis=1) #concatenate old columns with new one hot encoded columns df_train = df_train.drop(categorical_unordered_features, axis=1) #Delete columns which were one hot encoded df_test = df_test.drop(categorical_unordered_features, axis=1) #Delete columns which were one hot encoded ### Normalization or Standardization of Continuous Features if process_continuous == "Standardize": print "\n--------- Standardizing Continuous Features (Mean=0, Standard Deviation=1) --------- " standardization = StandardScaler() standardization.fit(df_train[continuous_features]) df_train[continuous_features] = standardization.transform(df_train[continuous_features]) df_test[continuous_features] = standardization.transform(df_test[continuous_features]) elif 
process_continuous == "Normalize": print "\n--------- Normalizing Continuous Features (Min=0, Max=1) --------- " min_max_scaling = MinMaxScaler() min_max_scaling.fit(df_train[continuous_features]) df_train[continuous_features] = min_max_scaling.transform(df_train[continuous_features]) df_test[continuous_features] = min_max_scaling.transform(df_test[continuous_features]) ### Return pre-processed df return df_train, df_test
if using_difference == True:
    Diff = '_Diff'

if using_difference == True:
    # use dataset_difference for training
    dataset = dataset_difference
else:
    # use the raw series for training
    dataset = ts_values_array

# split into train and test sets
train_size = int(len(dataset) * 0.8)
print('train_size: %i' % train_size)

dataset_2d = atleast_2d(dataset).T
scaler = MinMaxScaler(feature_range=(0, 1))
dataset_scaled = scaler.fit_transform(dataset_2d)

train, test = dataset_scaled[0:train_size, :], dataset_scaled[train_size:, :]

# data shape should be (len_ts, n_features)
train_input = train[:-1, :]
train_target = train[1:, :]
test_input = test[:-1, :]
test_target = test[1:, :]

model_esn = SimpleESN(n_readout=1000, n_components=1000,
print('#================================================#') print(' Training Datasize: '+str(X_train.shape[0])+' and test datasize: ' + str(X_test.shape[0]) + '. ') print('#================================================#') # # Set First Run to Off First_run = False # #### Pre-Process Data # In[10]: # Initialize Scaler scaler = MinMaxScaler() scaler.fit(X_train) # Train Scaler X_train_scaled = scaler.transform(X_train) # Map to Test Set X_test_scaled = scaler.transform(X_test) # ### Visualize Data # #### Train # In[11]: if is_visuallty_verbose:
plt.xlabel("eso") plt.ylabel("poi") #plt.show() else: print( "outlierCleaner() is returning an empty list, no refitting to be done") ### Task 3: Create new feature(s) ### Store to my_dataset for easy export below. # create np arrays for the rescaler salary = np.nan_to_num(np.reshape((np.array(Salary)), (len(Salary), 1))) bonus = np.nan_to_num(np.reshape((np.array(Bonus)), (len(Bonus), 1))) eso = np.nan_to_num(np.reshape((np.array(ESO)), (len(ESO), 1))) #rescale salary, bonus and exercised sti=ock options scaler = MinMaxScaler() scaled = scaler.fit(salary) scaled_salary = scaled.transform(salary) scaled = scaler.fit(bonus) scaled_bonus = scaled.transform(bonus) scaled = scaler.fit(eso) scaled_eso = scaled.transform(eso) #append data_dict with rescaled values and create new #value for the percent of emails received from POI count = 0 for name in names: data_dict[name]['salary'] = scaled_salary[count][0] data_dict[name]['bonus'] = scaled_bonus[count][0] data_dict[name]['exercised_stock_options'] = scaled_eso[count][0] count = count + 1 data_dict[name]['Percent_Emails_from_POI'] = float( data_dict[name]['from_poi_to_this_person']) / float(
def minmax_scaling(df):
    scale = MinMaxScaler()
    df = scale.fit_transform(df)  # fit_transform both fits the scaler and returns the scaled values
    return df
class MNIST: def __init__(self): # Load the dataset (self.x_train, self.y_train), (self.x_test, self.y_test) = mnist.load_data() self.x_train_ = None self.x_val = None self.y_train_ = None self.y_val = None # Convert to float32 self.x_train = self.x_train.astype(np.float32) self.y_train = self.y_train.astype(np.float32) self.x_test = self.x_test.astype(np.float32) self.y_test = self.y_test.astype(np.float32) # Reshape the x-Data to shape (num_examples, width, height, depth) self.x_train = np.expand_dims(self.x_train, axis=-1) # 1 dim mehr für die depth info self.x_test = np.expand_dims(self.x_test, axis=-1) # Save important data attributes as variables self.train_size = self.x_train.shape[0] self.test_size = self.x_test.shape[0] self.val_size = 0 self.width = self.x_train.shape[1] self.height = self.x_train.shape[2] self.depth = self.x_train.shape[3] self.num_classes = 10 # np.max(self.y_train) +1 # Reshape the y-Data to One-Hot encoding self.y_train = to_categorical(self.y_train, num_classes=self.num_classes) self.y_test = to_categorical(self.y_test, num_classes=self.num_classes) def get_train_set(self): return self.x_train, self.y_train def get_test_set(self): return self.x_test, self.y_test def get_splitted_train_validation_set(self): # train = 60.000 # train_: 40.200, val: 19.800 self.x_train_, self.x_val, self.y_train_, self.y_val = train_test_split(self.x_train, self.y_train, test_size=0.33) self.val_size = self.x_val.shape[0] self.train_splitted_size = self.x_train_.shape[0] return self.x_train_, self.x_val, self.y_train_, self.y_val def data_augmentation(self, augment_size=5000): # Create an instance of the image data genrator class image_generator = ImageDataGenerator( rotation_range=10, # 15 Grad +/- drehen zoom_range=0.05, # 10% zoomen width_shift_range=0.05, # schieben hor. 10% height_shift_range=0.05, # schieben vert. 10% fill_mode='constant', cval=0.0) # fit the data generator image_generator.fit(self.x_train, augment=True) # Get random train images for the data augmentation rand_idxs = np.random.randint(self.train_size, size=augment_size) x_augmented = self.x_train[rand_idxs].copy() y_augmented = self.y_train[rand_idxs].copy() x_augmented = image_generator.flow(x_augmented, np.zeros(augment_size), batch_size = augment_size, shuffle=False).next()[0] # next() gibt eine Liste zurück und die daten sind an stelle 0 # Append the augmented images to the train set self.x_train = np.concatenate((self.x_train, x_augmented)) self.y_train = np.concatenate((self.y_train, y_augmented)) self.train_size = self.x_train.shape[0] def data_preprocessing(self, preprocess_mode='standard'): # Preprcess the data if preprocess_mode == 'standard': self.scaler = StandardScaler() else: self.scaler = MinMaxScaler(feature_range=(0 ,1)) self.scaler.fit(self.x_train.reshape(self.train_size, 784)) self.x_train = self.scaler.transform(self.x_train.reshape(self.train_size, 784)) self.x_test = self.scaler.transform(self.x_test.reshape(self.test_size, 784)) self.x_train = self.x_train.reshape(self.train_size, self.width, self.height, self.depth) self.x_test = self.x_test.reshape(self.test_size, self.width, self.height, self.depth)
clf = classifiers[1] f.write("Random Forest:\n") print(cross_val_score(clf,x_all,y_all,scoring='accuracy',cv=10)) accuracy=cross_val_score(clf,x_all,y_all,scoring='accuracy',cv=10).mean()*100 f.write("CV accuracy score = {0:.3f}\n".format(accuracy)) clf = classifiers[2] f.write("Logistic Regression:\n") print(cross_val_score(clf,x_all,y_all,scoring='accuracy',cv=10)) accuracy=cross_val_score(clf,x_all,y_all,scoring='accuracy',cv=10).mean()*100 f.write("CV accuracy score = {0:.3f}\n".format(accuracy)) #Split off test and train data and normalize data x_trn, x_tst, y_trn, y_tst = train_test_split(x_all, y_all, test_size=0.4, random_state=42) print(x_trn.shape) scaler = MinMaxScaler() scaler.fit(x_trn) x_trn_n=scaler.transform(x_trn) x_tst_n=scaler.transform(x_tst) #Build the model and predict the parameters for Desicion Tree and calculate the weights of top10 features contributing to income greater than 50k on the screen clf = classifiers[0] model=clf.fit(x_trn_n,y_trn) imp1=model.feature_importances_ var2imp1=dict(zip(list(df1),imp1)) var2imp1_sorted=pd.DataFrame(columns=['variable','weight']) for key in sorted(var2imp1, key=lambda k:abs(var2imp1[k]),reverse=True): temp=pd.DataFrame([[key,var2imp1[key]]],columns=['variable','weight']) var2imp1_sorted=var2imp1_sorted.append(temp) print("Top 10 important variables-Decision Tree:") print(var2imp1_sorted[0:10]) f.write("Top 10 Weighted Variables - Decision Tree:"+"\n") f.write("Rank\tVariable\tWeight\n")
def data_preprocess( file_name: str, args: Dict, impute_method: str = "mode", scaling_method: str = "minmax", ): """Load the data and preprocess into 3d numpy array. Preprocessing includes: 1. Remove outliers 2. Extract sequence length for each patient id 3. Impute missing data 4. Normalize data 5. Sort dataset according to sequence length Args: - file_name (str): CSV file name - args (dict): parameters for preprocessing data - impute_method (str): The imputation method ("median" or "mode") - scaling_method (str): The scaler method ("standard" or "minmax") Returns: - data: preprocessed data - time: ndarray of ints indicating the length for each data - params: the parameters to rescale the data """ padding_value = args.padding_value if not isinstance(padding_value, float): raise ValueError("Must provide padding value with type `float`") ######################### # Load data ######################### index = 'Idx' label = 'Label' # Load .csv file, columns are typically as follows: # | Index | Time | Feature_1 | ... | Feature_n | Label | print("Loading data...\n") ori_data = pd.read_csv(file_name) # Remove spurious column, so that column 0 is now 'Index'. if ori_data.columns[0] == "Unnamed: 0": ori_data = ori_data.drop(["Unnamed: 0"], axis=1) ######################### # Remove outliers from dataset ######################### no = ori_data.shape[0] z_scores = stats.zscore(ori_data, axis=0, nan_policy='omit') z_filter = np.nanmax(np.abs(z_scores), axis=1) < 3 ori_data = ori_data[z_filter] print(f"Dropped {no - ori_data.shape[0]} rows (outliers)\n") # Parameters uniq_id = np.unique(ori_data[index]) no = len(uniq_id) dim = len(ori_data.columns) - 1 # Ignore index ######################### # Impute, scale and pad data ######################### # Initialize scaler if scaling_method == "minmax": scaler = MinMaxScaler() scaler.fit(ori_data) params = [scaler.data_min_, scaler.data_max_] elif scaling_method == "standard": scaler = StandardScaler() scaler.fit(ori_data) params = [scaler.mean_, scaler.var_] # Imputation values if impute_method == "median": impute_vals = ori_data.median() elif impute_method == "mode": impute_vals = stats.mode(ori_data).mode[0] else: raise ValueError("Imputation method should be `median` or `mode`") # TODO: Sanity check for padding value # if np.any(ori_data == padding_value): # print(f"Padding value `{padding_value}` found in data") # padding_value = np.nanmin(ori_data.to_numpy()) - 1 # print(f"Changed padding value to: {padding_value}\n") args.padding_value = padding_value # Output initialization output = np.empty([no, args.max_seq_len, dim]) # Shape:[no, max_seq_len, dim] output.fill(args.padding_value) labels = np.empty([no, 1], dtype=np.int) time = np.empty([no], dtype=np.int) print("Preprocessing data...\n") # For each uniq id for i in tqdm(range(no)): # Extract the time-series data and label with a certain admissionid curr_data = ori_data[ori_data[index] == uniq_id[i]].to_numpy() curr_label = int(curr_data[0, -1]) # Impute missing data curr_data = imputer(curr_data, impute_vals) # Normalize data curr_data = scaler.transform(curr_data) # Extract time and assign to the preprocessed data (Excluding ID) curr_no = len(curr_data) # Pad data to `max_seq_len` if curr_no >= args.max_seq_len: output[i, :, :] = curr_data[:args.max_seq_len, 1:] # Shape: [1, max_seq_len, dim] time[i] = args.max_seq_len else: output[ i, :curr_no, :] = curr_data[:, 1:] # Shape: [1, max_seq_len, dim] time[i] = curr_no # Pad label for data labels[i] = curr_label return output, time, labels, params
# Initial values
ONE_BATCH_SIZE = 60
BATCH_SIZE = 10
EPOCHS = 30
TRAIN_TEST_SPLIT_POINT = 0.8
X_TRAIN = []
Y_TRAIN = []
X_TEST = []
Y_TEST = []

# Load Dataframe
data = load_standard_data_frame(get_hourly_last_3_months_data('BTC'))

# Scale data
sc = MinMaxScaler(feature_range=(0, 1))
sc.fit(data)
data = sc.transform(data)

# Split into training and test set
training_set, test_set = splitter(data, TRAIN_TEST_SPLIT_POINT)

# Prepare samples
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = prepare_samples(ONE_BATCH_SIZE, training_set, test_set,
                                                   X_TRAIN, Y_TRAIN, X_TEST, Y_TEST, True)

# Build LSTM model
model = Sequential()
model.add(LSTM(units=16, return_sequences=False, input_shape=(X_TRAIN.shape[1], 6)))
y = np.loadtxt('D:/ser/Train/P_Train_27KB.txt')
y = np.array(y)
y = pd.DataFrame(y, columns=['id', 'P'])
s = y.P
id = y.drop('P', axis=1)
Y = np.array(s)

# y = np.loadtxt('D:/ser/Train/P_Train_27KB.txt')
# y = np.array(y)
# y = pd.DataFrame(y, columns=['id', 'P'])
# s = y.P
# Y = np.array(s)
# print X.shape

Y = Y.reshape(-1, 1)
le = MinMaxScaler(feature_range=(-3, 3))
le.fit(Y)
Y = le.transform(Y)

train_x, valid_x, train_y, valid_y = train_test_split(X, Y, test_size=0.2, random_state=0)
svr_model = SVR(kernel='rbf', C=10.0, epsilon=0.01)
svr_model.fit(train_x, train_y)
pre_test = svr_model.predict(valid_x)
print("the result of 1582 dimensional features")

# metrics model
mse = metrics.mean_squared_error(valid_y, pre_test)
rmse = np.sqrt(mse)
r2 = metrics.r2_score(valid_y, pre_test)  # r2_score expects (y_true, y_pred)
import math
shuffle_df = shuffle(df_sample)
df_train = shuffle_df[0:2400]
df_test = shuffle_df[2400:]

train_feature = np.array(df_train.values[:, 0:29])
train_label = np.array(df_train.values[:, -1])
test_feature = np.array(df_test.values[:, 0:29])
test_label = np.array(df_test.values[:, -1])

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(train_feature)
train_feature_trans = scaler.transform(train_feature)
test_feature_trans = scaler.transform(test_feature)

from keras.layers import Dense
from keras.layers import Dropout
import matplotlib.pyplot as plt


def show_train_history(train_history, train, validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()
class DnnModel(AbstractModel, ABC): def __init__(self, params): super().__init__(params) self.w2v_model_parameter = { 'max_len': 150, 'sg': 1, 'hs': 1, 'min_count': 0, 'window': 1, 'size': 5, 'iter': 30, 'workers': 8 } self.w2v = Word2Vector(**self.w2v_model_parameter) self.epoch = params.epoch self.scaler = None self.regression = KerasRegression(encoding_dim=1) self.data = None def build_word2vector(self, data): self.data = list(data) if self.w2v.model: self.w2v.update(self.data) else: self.w2v.fit(self.data) def fit(self, data): self.build_word2vector(data) list_vec = [] list_cost = [] for sql, duration_time in self.data: if check_illegal_sql(sql): continue filter_template = templatize_sql(sql) vector = self.w2v.str2vec(filter_template) list_vec.append(vector) list_cost.append(duration_time) features = np.array(list_vec) labels = np.array(list_cost) labels = labels.reshape(-1, 1) self.scaler = MinMaxScaler(feature_range=(0, 1)) self.scaler.fit(labels) labels = self.scaler.transform(labels) self.regression.fit(features, labels, epochs=self.epoch) def transform(self, data): feature_list = [] data_backup = list(data) error_list = [] for idx_error, sql in enumerate(data_backup): if check_illegal_sql(sql): error_list.append(idx_error) continue filter_template = templatize_sql(sql) vector = self.w2v.str2vec(filter_template) feature_list.append(vector) features = np.array(feature_list) predictions = self.regression.predict(features) predictions = np.abs(predictions) score = self.scaler.inverse_transform(predictions) if error_list: for item in error_list: score = np.insert(score, item, -1) score = np.hstack( (np.array(data_backup).reshape(-1, 1), score.reshape(-1, 1))).tolist() return score def load(self, filepath): realpath = os.path.realpath(filepath) if os.path.exists(realpath): dnn_path = os.path.join(realpath, 'dnn_model.h5') word2vector_path = os.path.join(realpath, 'w2v.model') scaler_path = os.path.join(realpath, 'scaler.pkl') self.regression.load(dnn_path) self.w2v.load(word2vector_path) with open(scaler_path, 'rb') as f: self.scaler = pickle.load(f) else: logging.error("{} not exist.".format(realpath)) def save(self, filepath): realpath = os.path.realpath(filepath) if not os.path.exists(realpath): os.makedirs(realpath, mode=0o700) if oct(os.stat(realpath).st_mode)[-3:] != '700': os.chmod(realpath, stat.S_IRWXU) dnn_path = os.path.join(realpath, 'dnn_model.h5') word2vector_path = os.path.join(realpath, 'w2v.model') scaler_path = os.path.join(realpath, 'scaler.pkl') self.regression.save(dnn_path) self.w2v.save(word2vector_path) with open(scaler_path, 'wb') as f: pickle.dump(self.scaler, f) print("DNN model is stored in '{}'".format(realpath))
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
%matplotlib inline

mm_s = MinMaxScaler()
iris = load_iris()
y = iris.target[(iris.target == 0) | (iris.target == 1)]
X = iris.data[(iris.target == 0) | (iris.target == 1)][:, [2, 3]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=1, stratify=y)
mm_s.fit(X)
X_train_std = mm_s.transform(X_train)
X_test_std = mm_s.transform(X_test)

legd = LogisticsRegressionGD(eta=0.01, n_iter=1000, random_state=1)
legd.fit(X_train_std, y_train)
plot_decision_regions(X_train_std, y_train, classifier=legd)
plt.xlabel("petal length [standardized]")
plt.ylabel("petal width [standardized]")
plt.legend(loc="upper left")
plt.tight_layout()
plt.show()

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
# import the MinMaxScaler preprocessing tool
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt

# generate a dataset with 500 samples and 5 classes
X, y = make_blobs(n_samples=500, centers=5, random_state=8)
# split the dataset into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)

# preprocess the data with MinMaxScaler so that all values are non-negative
scaler = MinMaxScaler()
scaler.fit(X_train)
scaler.fit(X_test)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# fit the data with multinomial Naive Bayes
mnb = MultinomialNB()
mnb.fit(X_train_scaled, y_train)

# limit the maximum values of the x and y axes
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
# use a different background color for each class
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                     np.arange(y_min, y_max, .02))
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

path = './data/'
name = 'boston'
x = np.load(path + name + 'x.npy')
y = np.load(path + name + 'y.npy')
print(x.shape)  # (506, 13)
print(y.shape)  # (506,)

# data preprocessing: scaler
scaler = MinMaxScaler()
scaler.fit(x)
x = scaler.transform(x)

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)
print(x_train.shape, x_test.shape)  # (404, 13) (102, 13)
print(y_train.shape, y_test.shape)  # (404,) (102,)

# modeling
path = './save/boston/modelSave'
path2 = './save/boston/'

#### 1. load model
import matplotlib.pyplot as plot
from sklearn import datasets

# load the data
housing = pd.read_csv('kc_train.csv')
target = pd.read_csv('kc_train2.csv')   # sale prices
t = pd.read_csv('kc_test.csv')          # test data

# data preprocessing
housing.info()  # check for missing values

# feature scaling
from sklearn.preprocessing import MinMaxScaler
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(housing)  # internal fit; the scaler's parameters are updated
scaler_housing = minmax_scaler.transform(housing)
scaler_housing = pd.DataFrame(scaler_housing, columns=housing.columns)

mm = MinMaxScaler()
mm.fit(t)
scaler_t = mm.transform(t)
scaler_t = pd.DataFrame(scaler_t, columns=t.columns)

# choose a gradient-descent-based linear regression model
from sklearn.linear_model import LinearRegression
LR_reg = LinearRegression()
# fit the model
LR_reg.fit(scaler_housing, target)

# use mean squared error to evaluate the model
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# The goal is to predict energy consumption by appliances.
Data = pd.read_csv('Energy_data.csv')
# print(Data.head())
# print(Data.isnull().sum().sort_values(ascending=True))  # no null values

df = Data.drop(columns=['date', 'lights'])
print(df)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df)
scaled_feat = scaler.transform(df)
df_MinMaxSc = pd.DataFrame(data=scaled_feat, columns=df.columns)

features_df = df_MinMaxSc.drop(columns=['Appliances'])
target_variable = df_MinMaxSc['Appliances']

x = df.iloc[:, 3].values
y = df.iloc[:, 11].values

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
# convert the data into a 2-dimensional array.
# regressor.fit(X_train, Y_train)  # training our machine learning model using these data.
train_data, cv_data = train_test_split(selected_data, test_size=0.3, random_state=42)

train_x = train_data.drop('Churn Status', axis=1)
train_y = train_data['Churn Status']
cv_x = cv_data.drop('Churn Status', axis=1)
cv_y = cv_data['Churn Status']

train_x.drop(1400, inplace=True)
train_y.drop(1400, inplace=True)

from sklearn.preprocessing import MinMaxScaler
std_scaler = MinMaxScaler()
std_scaler.fit(train_x)
# train_x_std = std_scaler.transform(train_x)
# cv_x_std = std_scaler.transform(cv_x)
train_x_std = train_x
cv_x_std = cv_x

lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
lda.fit(train_x_std, train_y)
train_preds_lda = lda.predict(train_x_std)
cv_preds_lda = lda.predict(cv_x_std)
train_acc_lda = accuracy_score(train_preds_lda, train_y)
cv_acc_lda = accuracy_score(cv_preds_lda, cv_y)
class DealingWithData: def __init__(self, df_data): self.df_data = df_data pass def count_na(self): return dict(NANs=self.df_data.isna().sum()) def fillna(self): """ 2 of the colums have equal number of nan's # TODO : find corrolation """ self.df_data.fillna(value=0, inplace=True) pass def correct_dates(self): self.df_data["time"] = pd.to_datetime(self.df_data["time"], utc=True) pass def drop_objects_col(self): for col in self.df_data.columns: if self.df_data[col].dtype == str("object"): # print(typ) self.df_data = self.df_data.drop(typ, axis=1) """ visualization should have its own class. # TODO : decouple. """ def plot_dist(self, x_plot="time", y_plot="generation biomass"): self.__viz.plot_distribution(self, x_plot=x_plot, y_plot=y_plot) pass def plot_dist_by_idx(self, idx_x=0, idx_y=1): # same as x_plot = self.df_data.columns[idx_x] y_plot = self.df_data.columns[idx_y] return self.__viz.plot_distribution(self, x_plot=x_plot, y_plot=y_plot) pass def scale_and_split_Xy(self, y_cols_idx: list, test_size=0.25) -> dict: # Extract. X = self.df_data.copy().drop("time", axis=1) self.y_cols = [X.columns[idx] for idx in y_cols_idx] y = X[self.y_cols] X = X.drop(self.y_cols, axis="columns") self.x_cols = X.columns print( f""" X data columns: {self.x_cols}\n y data colunms: {self.y_cols}\n """ ) from sklearn.preprocessing import MinMaxScaler # Scale data self.x_scaler = MinMaxScaler() self.x_scaler.fit(X) X = self.x_scaler.transform(X) self.y_scaler = MinMaxScaler() self.y_scaler.fit(y) y = self.y_scaler.transform(y) # split data from sklearn.model_selection import train_test_split x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=test_size) return [x_train, y_train, x_test, y_test] class __viz: def plot_distribution(self, x_plot, y_plot): df_plot = dict( x=[k for k in range(self.df_data[x_plot].count())], y=self.df_data[y_plot], ) fig = px.scatter( df_plot, x="x", y="y", color="y", marginal_y="violin", marginal_x="box", trendline="ols", template="simple_white", ) return fig # fig.show() def plt_model_scater(self, x, y, y_list=[], params={}): plt.figure() plt.scatter(x, y, color="red") for y in y_list: plt.plot(x, y, color="blue") # plt.title("Generation Fossil Hard Coal vs Price Day Ahead") # plt.xlabel("Generation Fossil Hard Coal") # plt.ylabel("Price Day Ahead") return plt
continuous_features = [
    'Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Delicassen'
]
data[continuous_features].describe()

# convert categorical variables to binary columns
for col in categorical_features:
    dummies = pd.get_dummies(data[col], prefix=col)
    data = pd.concat([data, dummies], axis=1)
    data.drop(col, axis=1, inplace=True)
data.head()

# scale features to give equal importance to each
mms = MinMaxScaler()
mms.fit(data)
data_transformed = mms.transform(data)

Sum_of_squared_distances = []
K = range(1, 15)
for k in K:
    km = KMeans(n_clusters=k)
    km = km.fit(data_transformed)
    Sum_of_squared_distances.append(km.inertia_)

plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
# load features and labels
feat = np.load('/home/s1820002/atsit/data/feat_34_hfs.npy')
vad = np.load('/home/s1820002/IEMOCAP-Emotion-Detection/y_egemaps.npy')

# remove outliers: clip labels below 1 and above 5
vad = np.where(vad == 5.5, 5.0, vad)
vad = np.where(vad == 0.5, 1.0, vad)

scaled_vad = True
scaled_feature = False

# scale the labels to [-1, 1]; optionally standardize the features
if scaled_vad:
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = scaler.fit(vad)            # .reshape(vad.shape[0]*vad.shape[1], vad.shape[2])
    scaled_vad = scaler.transform(vad)  # .reshape(vad.shape[0]*vad.shape[1], vad.shape[2])
    vad = scaled_vad
else:
    vad = vad

if scaled_feature == True:
    scaler = StandardScaler()
    scaler = scaler.fit(feat.reshape(feat.shape[0] * feat.shape[1], feat.shape[2]))
    scaled_feat = scaler.transform(feat.reshape(feat.shape[0] * feat.shape[1], feat.shape[2]))
    scaled_feat = scaled_feat.reshape(feat.shape[0], feat.shape[1], feat.shape[2])
    feat = scaled_feat
# model.add(LeakyReLU()) model.add(Dropout(0.5)) model.add(Conv1D( strides=2, filters=nb_features, kernel_size=2)) ''' model.load_weights( 'weights/bitcoin2015to2017_close_CNN_2_relu-44-0.00023.hdf5') model.compile(loss='mse', optimizer='adam') # In[336]: predicted = model.predict(validation_datas) predicted_inverted = [] # In[7]: for i in range(original_datas.shape[1]): scaler.fit(original_datas[:, i].reshape(-1, 1)) predicted_inverted.append(scaler.inverse_transform(predicted[:, :, i])) print(np.array(predicted_inverted).shape) #get only the close data ground_true = ground_true[:, :, 0].reshape(-1) ground_true_times = ground_true_times.reshape(-1) ground_true_times = pd.to_datetime(ground_true_times, unit='s') # since we are appending in the first dimension predicted_inverted = np.array(predicted_inverted)[0, :, :].reshape(-1) print(np.array(predicted_inverted).shape) validation_output_times = pd.to_datetime(validation_output_times.reshape(-1), unit='s') # In[337]: ground_true_df = pd.DataFrame()
import numpy as np
from keras.models import load_model
from sklearn.preprocessing import MinMaxScaler
from datagen import generate_data, normalize

min_max_scalar = MinMaxScaler()
data = generate_data(3000)
np.savetxt('3k.csv', data, delimiter=',')
min_max_scalar.fit(data)

testcase = np.array([
    [4, 0, 0.8, 2, 500],
    [0.1, 0, 0.8, 2, 500],
    [4, 0, 0.8, 2, 250],
    [4, 0, 0.8, 2, 550],
])

# temp = min_max_scalar.transform(testcase).reshape(-1, 1)
norm_testdata = normalize(testcase, 3000)
norm_data = normalize(data, 3000)

model = load_model('94.h5')
print(np.sum(abs(model.predict(norm_testdata) - norm_testdata), axis=1))
# print(np.mean(np.sum(abs(model.predict(norm_data) - norm_data), axis=1)))
fun = np.load("f_final.npy") vel = np.load("u_final.npy") velBC = vel.copy() velBC[:, :, :, 1:] = 0 # BC inputs to keep velocities zero and 1 at boundaries velBCx = velBC[:, 0, :, :].reshape(velBC.shape[0], 1, velBC.shape[-2], velBC.shape[-1]) / np.max(velBC) velBCy = velBC[:, 1, :, :].reshape(velBC.shape[0], 1, velBC.shape[-2], velBC.shape[-1]) / np.max(velBC) re_scaler = MinMaxScaler(feature_range=(0.2, 0.7)) feq_scaler = MinMaxScaler(feature_range=(0.2, 0.7)) vel_scaler = MinMaxScaler(feature_range=(0.2, 0.7)) Re_scaled = re_scaler.fit_transform(Re.reshape(Re.shape[0], 1)) feq_scaler.fit(feq.ravel()) vel_scaler.fit(vel.ravel()) num = vel.shape[0] print("Shape of inputs : ") print("Re : " + str(Re.shape)) print("feq : " + str(feq.shape)) print("fun : " + str(fun.shape)) print("vel : " + str(vel.shape)) print("Number of training samples is " + str(num)) print("Original resolution of input/output is " + str(vel.shape[2:])) #decreasing the resolution by half to make it easier to train # first removing middle rows and columns # feq = np.delete(feq,int(num/2),axis=1);feq = np.delete(feq,int(num/2),axis=2)
        # (tail of the ts_download_btc(inter) helper: remaining keyword arguments of the download call)
        interval=inter,
        auto_adjust=True,
        prepost=True,
        threads=True,
        proxy=None)
    return data[["Open", "High", "Close"]]


df = ts_download_btc("3d")
df.fillna(method="ffill", inplace=True)
df.dropna(inplace=True)
print(df)

cl = df
train = cl[0:int(len(cl) * 0.80)]
scl = MinMaxScaler()
scl.fit(train.values.reshape(-1, 1))
cl = scl.transform(cl.values.reshape(-1, 1))


def processData(data, lb):
    X, Y = [], []
    for i in range(len(data) - lb - 1):
        X.append(data[i:(i + lb), 0])
        Y.append(data[(i + lb), 0])
    return np.array(X), np.array(Y)


lb = 10
X, y = processData(cl, lb)
X_train, X_test = X[:int(X.shape[0] * 0.90)], X[int(X.shape[0] * 0.90):]
y_train, y_test = y[:int(y.shape[0] * 0.90)], y[int(y.shape[0] * 0.90):]
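Because the series is scaled before windowing, anything a model predicts from these windows stays in the scaler's [0, 1] range; mapping it back to price units reuses the same scl object. A small sketch of that step (the model call in the comment is hypothetical):

import numpy as np

def to_price_scale(values, scaler):
    # invert the min-max scaling fitted on the training slice
    return scaler.inverse_transform(np.asarray(values).reshape(-1, 1)).ravel()

# e.g. after training:
#   pred_prices = to_price_scale(model.predict(X_test.reshape(-1, lb, 1)), scl)
#   true_prices = to_price_scale(y_test, scl)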
# (module-level imports such as numpy, matplotlib, seaborn, scipy.interpolate and the
#  project's Dataset/TOPICS helpers are assumed to exist elsewhere in this module)
def plot_gender_development_over_time(no_terms_or_topics_to_show=8,
                                      data='topics',
                                      display_selector='most_frequent',
                                      selected_terms_or_topics=None,
                                      show_plot=True,
                                      store_to_filename=None,
                                      title=None):
    """
    :param no_terms_or_topics_to_show: int
    :param data: 'topics', 'terms', 'terms_of_topic'
    :param display_selector: 'most_frequent', 'most_divergent', 'most_variable'
    :param selected_terms_or_topics: topic_id or list of terms
    :param show_plot: bool
    :param store_to_filename: bool or str
    :return:
    """

    if data == 'terms_of_topic':
        if not isinstance(selected_terms_or_topics, int):
            raise ValueError(
                "When displaying 'terms_of_topic', please pass a topic_id for param "
                "selected_terms_or_topics")

    # 0: find terms or topics to display
    d = Dataset()
    if data == 'topics':
        selected_terms_or_topics = [f'topic.{id}' for id in range(1, 71)]
        title_name = 'topics'
    elif data == 'terms':
        vocab = []
        for t in selected_terms_or_topics:
            vocab.append(t)
        d.get_document_term_matrix(vocabulary=vocab, store_in_df=True)
        title_name = 'terms'
    elif data == 'terms_of_topic':
        vocab = []
        topic_id = selected_terms_or_topics
        for term in TOPICS[topic_id]['terms_prob']:
            if term in d.vocabulary:
                vocab.append(term)
        selected_terms_or_topics = vocab
        d.get_document_term_matrix(vocabulary=vocab, store_in_df=True)
        title_name = f'terms of topic {topic_id}'
    else:
        raise ValueError('"data" has to be "terms", "topics", or "terms_of_topic"')

    if not title:
        if display_selector == 'most_frequent':
            title = f'Most frequent {title_name} for female (top) and male authors (bottom)'
        elif display_selector == 'most_divergent':
            title = f'Most divergent {title_name} for female (top) and male authors (bottom)'
        else:
            title = f'Most variable {title_name} for female (top) and male authors (bottom)'

    df = d.df

    # 1: Load data
    data = {}
    for t in selected_terms_or_topics:
        data[t] = defaultdict(list)
    min_freq_total = 1
    max_freq_total = 0

    for idx, year in enumerate(range(1982, 2013)):
        time_slice = df[(df.ThesisYear >= year - 2) & (df.ThesisYear <= year + 2)]
        time_slice_female = time_slice[time_slice.AdviseeGender == 'female']
        time_slice_male = time_slice[time_slice.AdviseeGender == 'male']
        for t in selected_terms_or_topics:
            freq_total = time_slice[t].mean()
            freq_female = time_slice_female[t].mean()
            freq_male = time_slice_male[t].mean()

            # if t == 'gender' and year == 2008:
            #     embed()

            # if a term doesn't appear, it is neutral
            if (freq_male + freq_female) == 0:
                freq_score = 0.5
            else:
                freq_score = freq_female / (freq_female + freq_male)

            data[t]['year'].append(year)
            data[t]['freq_score'].append(freq_score)
            data[t]['freq_total'].append(freq_total)

            if freq_total < min_freq_total:
                min_freq_total = freq_total
            if freq_total > max_freq_total:
                max_freq_total = freq_total

            data[t]['mean_freq_score'] = np.mean(data[t]['freq_score'])
            data[t]['mean_freq_total'] = np.mean(data[t]['freq_total'])
            data[t]['freq_score_range'] = max(data[t]['freq_score']) - min(data[t]['freq_score'])

    # 2: Set up plot
    fig = plt.figure(figsize=(12, 12))
    gs = gridspec.GridSpec(nrows=1,
                           ncols=1,
                           figure=fig,
                           width_ratios=[1],
                           height_ratios=[1],
                           wspace=0.2,
                           hspace=0.05)

    ax = fig.add_subplot(gs[0, 0])
    ax.set_ylim(0, 1)
    ax.set_xlim(1985, 2010)
    ax.set_axisbelow(True)
    ax.grid(which='major', axis='both')

    dot_scaler = MinMaxScaler((0.0, 50.0))
    dot_scaler.fit(np.array([min_freq_total, max_freq_total]).reshape(-1, 1))
    legends = []

    def draw_line(t, t_data, df):
        """
        Draws one line depending on t (term or topic string) and t_data (dict of data belonging to t)

        :param t: str
        :param t_data: dict
        :return:
        """
        y = t_data['freq_score']
        x = t_data['year']
        frequencies = t_data['freq_total']
        if t.startswith('topic.'):
            legend = TOPICS[int(t[6:])]['name']
        else:
            legend = '{:10s} ({})'.format(t, df[t].sum())

        x_spline = np.linspace(min(x), max(x), (2010 - 1985 + 1) * 1000)
        spl = make_interp_spline(x, y, k=1)  # BSpline object
        y_spline = spl(x_spline)

        line_interpolater = interp1d(x, frequencies)
        line_widths = line_interpolater(x_spline)
        line_widths = dot_scaler.transform(line_widths.reshape(-1, 1)).flatten()

        try:
            color = sns.color_palette()[len(legends)]
        except IndexError:
            color = sns.cubehelix_palette(100, start=2, rot=0, dark=0, light=.95)[len(legends)]

        ax.scatter(x_spline, y_spline, s=line_widths, antialiased=True, color=color)
        legends.append(mpatches.Patch(color=color, label=legend))

    # 3: Plot
    if display_selector == 'most_frequent':
        ax.set_title(title, weight='bold', fontsize=18)
        sorted_items = sorted(data.items(), key=lambda k_v: k_v[1]['mean_freq_total'], reverse=True)
        for t, t_data in sorted_items[:no_terms_or_topics_to_show]:
            draw_line(t, t_data, df)
    elif display_selector == 'most_divergent':
        ax.set_title(title, weight='bold', fontsize=18)
        sorted_items = sorted(data.items(), key=lambda k_v: k_v[1]['mean_freq_score'], reverse=True)
        no_disp = no_terms_or_topics_to_show // 2
        for t, t_data in sorted_items[:no_disp] + sorted_items[::-1][:no_disp]:
            draw_line(t, t_data, df)
    elif display_selector == 'most_variable':
        ax.set_title(title, weight='bold', fontsize=18)
        # sort by mean_freq_score second to preserve colors between plots
        sorted_items = sorted(data.items(), key=lambda k_v: k_v[1]['freq_score_range'], reverse=True)
        sorted_items = sorted_items[:no_terms_or_topics_to_show]
        sorted_items = sorted(sorted_items, key=lambda k_v: k_v[1]['mean_freq_score'], reverse=True)
        for t, t_data in sorted_items:
            draw_line(t, t_data, df)
    else:
        raise ValueError('display_selector has to be most_frequent, most_variable, or most_divergent')

    ax.legend(handles=legends, loc=4)

    if show_plot:
        plt.show()
    if store_to_filename:
        fig.savefig(Path('data', store_to_filename))
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler


class NumericColumn(BaseEstimator, TransformerMixin):
    '''
    Take a numeric value column and rescale it to the [0, 1] range.
    '''

    def __init__(self):
        '''
        Set up the internal transformation.
        '''
        self._transformer = MinMaxScaler()

    def fit(self, X, y=None):
        '''
        Fit the scaling, treating missing values as 0.
        '''
        zeroed = pd.DataFrame(np.array(X).reshape(-1, 1)).fillna(0)
        self._transformer.fit(zeroed)
        return self

    def transform(self, X):
        '''
        Transform a column of data into numerical percentage values.

        Parameters
        ----------
        X : pandas series or numpy array
        '''
        zeroed = pd.DataFrame(np.array(X).reshape(-1, 1)).fillna(0)
        return self._transformer.transform(zeroed).astype(np.float32)
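Since NumericColumn follows scikit-learn's fit/transform protocol, it can be applied directly to a column or composed into a Pipeline. A short usage sketch with made-up data:

import pandas as pd

ages = pd.Series([22, 35, None, 58])     # made-up column with a missing value
column_scaler = NumericColumn().fit(ages)
print(column_scaler.transform(ages))     # NaN treated as 0, result scaled into [0, 1]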
# NOTE: this snippet uses Python 2 print statements
def NB_coefficients(year=2010):
    poi_dist = getFourSquarePOIDistribution(useRatio=False)
    F_taxi = getTaxiFlow(normalization="bydestination")
    W2 = generate_geographical_SpatialLag_ca()
    Y = retrieve_crime_count(year=year)
    C = generate_corina_features()
    D = C[1]

    popul = C[1][:, 0].reshape(C[1].shape[0], 1)
    Y = np.divide(Y, popul) * 10000

    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)

    f = np.concatenate((D, f2, ftaxi, poi_dist), axis=1)
    mms = MinMaxScaler(copy=False)
    mms.fit(f)
    mms.transform(f)  # copy=False, so f is scaled in place
    header = C[0] + ['spatiallag', 'taxiflow'] + \
        ['POI food', 'POI residence', 'POI travel', 'POI arts entertainment',
         'POI outdoors recreation', 'POI education', 'POI nightlife',
         'POI professional', 'POI shops', 'POI event']
    df = pd.DataFrame(f, columns=header)

    np.savetxt("Y.csv", Y, delimiter=",")
    df.to_csv("f.csv", sep=",", index=False)

    # NB permute
    nbres = subprocess.check_output(['Rscript', 'nbr_eval.R', 'ca', 'coefficient'])
    print nbres

    ls = nbres.strip().split(" ")
    coef = [float(e) for e in ls]
    print coef
    return coef, header
def predict_new(self, input):
    model = self.train_model()
    assert len(input) == 5 and type(input) == list
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler.fit(self.data)
    inp = scaler.transform([input])
    print(scaler.inverse_transform(model.predict(numpy.array(inp).reshape(1, 1, 5))))
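predict_new above refits a fresh scaler from self.data on every call; a lighter-weight variant fits the scaler once and reuses it for each prediction. A sketch of that idea with hypothetical class and attribute names (the model is assumed to come from a separate training step, as above):

import numpy
from sklearn.preprocessing import MinMaxScaler

class PredictorSketch:
    def __init__(self, data):
        self.data = numpy.asarray(data)
        # fit the scaler once, up front
        self.scaler = MinMaxScaler(feature_range=(0, 1)).fit(self.data)
        self.model = None  # assumed to be set by a train_model() step

    def predict_new(self, sample):
        assert isinstance(sample, list) and len(sample) == 5
        scaled = self.scaler.transform([sample]).reshape(1, 1, 5)
        return self.scaler.inverse_transform(self.model.predict(scaled))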
def _scaled_data(self):
    """Load scaled data.

    Args:
        None

    Returns:
        (scaler, train_scaled, test_scaled): Tuple of the fitted scaler and
            the scaled train and test arrays

    """
    # Initialize key variables
    (_train, _test) = self._data()

    # Fit scaler on the training data only
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = scaler.fit(_train)

    # Transform train
    train = _train.reshape(_train.shape[0], _train.shape[1])
    train_scaled = scaler.transform(train)

    # Transform test
    test = _test.reshape(_test.shape[0], _test.shape[1])
    test_scaled = scaler.transform(test)

    # Return
    return scaler, train_scaled, test_scaled
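Fitting the scaler on the training split only, as above, keeps information about the test period out of preprocessing; test values outside the fitted range then simply land outside (-1, 1) instead of silently reshaping the training scale. A tiny self-contained illustration:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

train = np.array([[1.0], [2.0], [3.0]])
test = np.array([[4.0]])                          # outside the training range

scaler = MinMaxScaler(feature_range=(-1, 1)).fit(train)
print(scaler.transform(train).ravel())            # [-1.  0.  1.]
print(scaler.transform(test).ravel())             # [2.] -- beyond the fitted range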