Example #1
def transform(categorical_columns, numerical_columns, data):
    cat = ('categorical', ohe(), categorical_columns)
    num = ('numeric', ss(), numerical_columns)
    col_trans = ColumnTransformer([cat, num])
    df_trans_scaled = col_trans.fit_transform(data)
    col_names = get_column_names_from_ColumnTransformer(col_trans)
    col_names.extend(numerical_columns)  # numeric columns keep their original names
    # Rebuild a DataFrame from the transformed array using the recovered column names
    df_trans_scaled = pd.DataFrame(df_trans_scaled, columns=col_names)
    return df_trans_scaled, col_names
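
# The helper get_column_names_from_ColumnTransformer() is not defined in this snippet. A minimal
# sketch of what it could look like (an assumption, not the original helper): it returns only the
# expanded one-hot names, so the caller appends the numeric names itself; requires a scikit-learn
# version with get_feature_names_out (>= 1.0).
def get_column_names_from_ColumnTransformer(column_transformer):
    names = []
    for name, transformer, cols in column_transformer.transformers_:
        # only the one-hot step ('categorical') contributes expanded column names
        if name == 'categorical' and hasattr(transformer, 'get_feature_names_out'):
            names.extend(transformer.get_feature_names_out(cols))
    return names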
Example #2
 def fit(self, data):
     if not isinstance(data, pd.DataFrame):  # Needs to be dataframe
         data = pd.DataFrame(data)
     self.p = data.shape[1]
     self.cidx = np.where(data.dtypes == 'object')[0]
     self.nidx = np.where(~(data.dtypes == 'object'))[0]
     self.cenc = ohe(sparse=False,
                     dtype=int,
                     handle_unknown='ignore',
                     drop=self.drop)
     self.cenc.categories_ = [
         list(data.iloc[:, x].value_counts().index) for x in self.cidx
     ]
     self.cenc.drop_idx_ = np.repeat(0, len(self.cenc.categories_))
     # Total feature size: categories + num
     self.p2 = sum([len(x) - 1
                    for x in self.cenc.categories_]) + len(self.nidx)
     self.nenc = ss()
     self.nenc.mean_ = data.iloc[:, self.nidx].mean().values
     self.nenc.scale_ = data.iloc[:, self.nidx].std().values
     self.nenc.n_features_in_ = self.nidx.shape[0]
     self.cn = list(self.cenc.get_feature_names(data.columns[self.cidx].astype(str))) + \
               data.columns[self.nidx].to_list()
     self.lst_enc = [self.cenc, self.nenc]
     self.lst_cidx = [self.cidx, self.nidx]
     self.lst_iter = [len(z) > 0 for z in self.lst_cidx]
Example #3
def xg_eval(learning_rate,n_estimators, max_depth,n_components):
    # 12.1 Make pipeline. Pass parameters directly here
    pipe_xg1 = make_pipeline (ss(),                        # Why repeat this here for each evaluation?
                              PCA(n_components=int(round(n_components))),
                              XGBClassifier(
                                           silent = False,
                                           n_jobs=2,
                                           learning_rate=learning_rate,
                                           max_depth=int(round(max_depth)),
                                           n_estimators=int(round(n_estimators))
                                           )
                             )

    # 12.2 Now fit the pipeline and evaluate
    cv_result = cross_val_score(estimator = pipe_xg1,
                                X= X_train,
                                y = y_train,
                                cv = 2,
                                n_jobs = 2,
                                scoring = 'f1'
                                ).mean()             # take the average of all results


    # 12.3 Finally return maximum/average value of result
    return cv_result
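
# A hedged usage sketch: xg_eval takes float-valued hyperparameters and returns a CV score, the
# shape expected by bayes_opt's BayesianOptimization. The bounds and iteration counts below are
# illustrative assumptions, not taken from the original; X_train and y_train must already exist.
from bayes_opt import BayesianOptimization

xg_bo = BayesianOptimization(
    f=xg_eval,
    pbounds={'learning_rate': (0.01, 0.3),
             'n_estimators': (50, 300),
             'max_depth': (2, 8),
             'n_components': (5, 25)},
    random_state=42)
xg_bo.maximize(init_points=3, n_iter=10)
print(xg_bo.max)      # best score and parameter set found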
Example #4
def get_data(tt_split=0.8):
    not_fraud = pd.read_csv('../data/data.csv', header='infer')
    not_fraud['is_fraud'] = 0
    not_fraud['is_not_fraud'] = 1
    fraud = pd.read_csv('../data/data_fraud.csv', header='infer')
    fraud['is_fraud'] = 1
    fraud['is_not_fraud'] = 0
    res = shuffle(pd.concat([fraud, not_fraud]))
    res['slum'] = 0
    res['middle'] = 0
    res['posh'] = 0
    res.loc[res.area == 'slum', 'slum'] = 1
    res.loc[res.area == 'middle', 'middle'] = 1
    res.loc[res.area == 'posh', 'posh'] = 1
    x = res[[
        'lower_education', 'higher_education', 'jewellery', 'car', 'bike',
        'tax', 'misc_credit', 'misc_debit', 'slum', 'middle', 'posh'
    ]]
    scaler = ss().fit(x)
    x = scaler.transform(x)
    y = res[['is_fraud', 'is_not_fraud']]
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        train_size=tt_split,
                                                        random_state=42)
    return (x_train, x_test, y_train, y_test, scaler)
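
# The slum/middle/posh indicator columns built inside get_data() are a hand-rolled one-hot
# encoding of 'area'; a sketch of the equivalent with pandas (names as used inside the function):
area_dummies = pd.get_dummies(res['area'])                        # -> 'middle', 'posh', 'slum'
res = pd.concat([res.drop(columns='area'), area_dummies], axis=1)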
Example #5
def preprocess(data, flag):
    cont_v = [n for n in range(np.shape(data)[1]) if n != 3]
    normalized_data = ss().fit_transform(data[:, cont_v])
    outlier_row, outlier_col = np.where(np.abs(normalized_data) > 3)
    if (flag):
        for i in range(0, len(outlier_col)):
            normalized_data[outlier_row[i]][outlier_col[i]] = np.sign(
                normalized_data[outlier_row[i]][outlier_col[i]]) * 3
    return normalized_data
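
# A vectorized sketch of the clipping step inside preprocess(): values with |z| > 3 are capped
# at +/- 3, everything else is unchanged, which is exactly what the elementwise loop above does.
normalized_data = np.clip(normalized_data, -3, 3)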
Example #6
    def preprocess(self, path):
        names = []
        data = {}  # must be a dict, not a list: a list cannot be indexed by column name
        if 'car' in path:
            names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'cls']
            data = pd.read_csv(path, names=names, header=None, na_values='?')
        elif 'iris' in path:
            names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'cls']
            data = pd.read_csv(path, names=names, header=None, na_values='?')
        elif 'adult' in path:
            names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                     'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                     'hours-per-week', 'native-country', 'cls']
            data = pd.read_csv(path, names=names, header=None, sep=r',\s', na_values='?', engine='python')

        # convert NaN to most frequent attribute value in each column
        for attr in names[:len(names) - 1]:
            value = data[attr].value_counts().idxmax()
            data = data.fillna({attr: value})

        # convert categorical values into numerical values except the 'cls' column
        import sklearn.preprocessing as pp
        enc = {}  # must be a dict, not a list: a list cannot be indexed by column name
        for column in data.columns[: len(names) - 1]:
            if data.dtypes[column] == object:
                enc[column] = pp.LabelEncoder()
                data[column] = enc[column].fit_transform(data[column])

        # get dummy variable of target value(cls), convert categorical values to numerical
        data = pd.get_dummies(data, columns=['cls'], prefix=['cls'])

        # set new header names, since get_dummies adds more columns
        new_names = list(data)
        col_names = new_names[0: len(names) - 1]
        X = data[col_names]
        cls_name = new_names[len(names) - 1:]
        y = data[cls_name]
        # split data into training and test dataset
        from sklearn.model_selection import train_test_split as tt_split
        X_train, X_test, y_train, y_test = tt_split(X, y)

        # standardizing and scaling
        from sklearn.preprocessing import StandardScaler as ss
        scaler = ss()
        scaler.fit(X_train)  # only fit the training data
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # convert y_test DataFrame to numpy.ndarray, and combine attributes and target values
        y_test = y_test.to_numpy()
        test_data = np.concatenate((X_test, y_test), axis=1)
        # ','.join(new_names) is to convert header name list to str and use ',' as delimiter
        self.__processed_data_to_csv(test_data, self.test, ','.join(new_names))

        return X_train, y_train
Example #7
def model_stop(df):
    """
    Create a model from a dataframe.
    """
    #create a model from a dataframe

    #df = pd.get_dummies(df,columns=['day'])
    #features = ['day_'+str(i) for i in range(0,7)]
    #for f in features:
    #    if f not in df.columns:
    #        df[f] = 0i
    df = df[df['traveltime'] > 0]
    df = df[df['traveltime'] < df['traveltime'].quantile(0.95)]
    df = df[df['traveltime'] > df['traveltime'].quantile(0.05)]
    features = ['rain', 'temp', 'hour', 'day']
    scaler_X = ss()
    X = scaler_X.fit_transform(df[features])
    scaler_Y = ss()

    Y_real = df['traveltime']
    Y = scaler_Y.fit_transform(df['traveltime'].values.reshape(-1, 1))

    model = mlp().fit(X, Y)
    return model, X, features, scaler_X, scaler_Y, Y_real
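
# A hedged usage sketch for the objects returned above: predictions come back on the scaled
# target, so invert scaler_Y before comparing with Y_real ('df' is assumed to be loaded elsewhere).
model, X, features, scaler_X, scaler_Y, Y_real = model_stop(df)
y_pred = scaler_Y.inverse_transform(model.predict(X).reshape(-1, 1))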
Example #8
def predict_boston():
    print "回归预测房价"
    # 加载 boston 数据
    boston = lb()
    # 输出数据特征
    print boston['DESCR']
    # 验证集合样本大小占比
    cratio = 0.4
    # 输出数据主键
    print boston.keys()
    # 数据分成验证集和训练集
    X, Xtest, Y, Ytest = cv.train_test_split(boston['data'],
                                             boston['target'],
                                             test_size=cratio)
    # 加载数据归一化
    S = ss()
    X = S.fit_transform(X)  # 归一化拟合
    Xtest = S.fit_transform(Xtest)  # 归一化拟合
    # 加载岭回归
    lr = limd.Ridge()
    # 模型训练
    model = lr.fit(X, Y)
    print "模型\n", model
    print("训练拟合评分\n %.3f" % lr.score(X, Y))
    # 模型预测
    Ypred = model.predict(Xtest)
    print("预测均方误差\n %.3f" % metrics.mean_squared_error(Ytest, Ypred))
    print("系数\n %s " % lr.coef_)
    print "截距\n", lr.intercept_
    # Plot the results
    fig = plt.figure()
    # Draw the line y = x with a set color and width
    plt.plot([min(Ytest), max(Ytest)], [min(Ytest), max(Ytest)], 'r', lw=5)
    # Color points by relative error
    color = abs(Ypred - Ytest) / Ytest
    # Draw the scatter plot (x: predicted, y: real)
    p = plt.scatter(Ypred, Ytest, c=color, marker='.')
    # Color scale
    plt.colorbar()
    plt.ylabel("Real Price")
    plt.xlabel("Predicted Price")
    # Show the figure
    plt.show()
    return
Example #9
def timeseries_scaling(X, is_training_data=True, list_of_transformers=None):
    #X += epsilon
    #X.shape = (nsamples, timesteps, features)
    #is_training_data=True fits new scalers (pass list_of_transformers=None);
    #is_training_data=False reuses the previously fitted list_of_transformers.
    X_new = np.zeros_like(X)
    for i in range(X.shape[0]):
        X_new[i] = mms().fit_transform(X[i])
    #
    #"""
    X_new2 = X_new.copy()
    if is_training_data:
        list_of_transformers = list()
    #
    for i in range(X.shape[1]):
        if is_training_data:
            tr = ss()
            tr.fit(X_new[:, i])
            list_of_transformers.append(tr)
        else:
            tr = list_of_transformers[i]
        X_new2[:, i] = tr.transform(X_new[:, i])
    return X_new2, list_of_transformers
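
# A hedged usage sketch: fit the per-timestep scalers on the training tensor, then reuse them on
# the test tensor (X_train and X_test are assumed 3D arrays of shape (nsamples, timesteps, features)).
X_train_scaled, transformers = timeseries_scaling(X_train, is_training_data=True)
X_test_scaled, _ = timeseries_scaling(X_test, is_training_data=False,
                                      list_of_transformers=transformers)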
Example #10
def featureMat(data):
    # Kurtosis
    kurtosis_out = kurtosis(data)
    # print("kurtosis", kurtosis_out)

    # LAGE
    lage_out = lage(data)
    # print("LAGE: ", LAGE_out)

    # Entropy
    entropy_out = entropy(data)
    # print("Entropy:", entropy_out)

    # FFT Top5 features
    fft5_out = mfft(data)
    # print("FFT Top5", fft5_out)

    # FeatureMatrix
    featureMAT = np.hstack((fft5_out, kurtosis_out, lage_out, entropy_out))

    featureMAT = ss().fit_transform(featureMAT.real)

    return featureMAT
Example #11
def predict(request):
    data = UserReport.objects.filter(user=request.user).latest('date_time')
    print(data.age)
    reportData = np.array([
        data.age, data.sex, data.cp, data.restbps, data.chol, data.fbs,
        data.ecg, data.heart_rate, data.ex_in_angina,
        data.st_depression_in_exercise, data.peak_st_segment,
        data.vessels_by_flourosopy, data.thal
    ])
    reportData = reportData.reshape(1, -1)
    sc = ss()
    trained_data = sc.fit_transform(HdpsConfig.trained_data)
    reportData = sc.transform(reportData)
    result = HdpsConfig.classifier.predict(reportData)
    if data.sex == 1:
        sex = 'MALE'
    else:
        sex = 'FEMALE'
    if data.ecg == 0:
        ecg_val = 'Normal'
    elif data.ecg == 1:
        ecg_val = 'ST-T Wave Abnormality'
    else:
        ecg_val = 'Left Ventricular Hyperthrophy'

    try:
        profile = Profile.objects.get(user=request.user)
    except Profile.DoesNotExist:
        profile = None
    return render(
        request, 'prediction_report.html', {
            'report_data': data,
            'result': result,
            'sex': sex,
            'profile': profile,
            'ecg_val': ecg_val
        })
Example #12
    col = np.genfromtxt(full_path, delimiter=',')
    col = sorted(col)
    interact = np.zeros((np.size(X, axis=0), (np.size(col)*(np.size(col)-1))/2))
    for i in range(0, np.size(col)-1):
        for j in range(i+1, np.size(col)):
            interact[:, c] = X[:, col[i]]*X[:, col[j]]
            c += 1

    X = np.append(X, np.square(X), axis=1)  # add the squares
    X = np.append(X, interact, axis=1)
    print 'Completed Full Model'


if do_noise:
    from sklearn.preprocessing import MinMaxScaler as ss
    stan = ss(feature_range=(-1, 1))

    x_norm = np.random.randn(np.size(X, axis=0), 1)
    X = np.column_stack((X, x_norm))
    X = stan.fit_transform(X)

# build train test validate sets
# seed(41)
j = 0
coef = np.zeros((np.size(X, axis=1), num_runs))
print result_title

while j < num_runs:

    trn_x, trn_y, val_x, val_y, tst_x, tst_y = tvt(X, Y)
Example #13
import plotly.tools as tls
tls.set_credentials_file(username='******',
                         api_key='JHK7zCU6FCcYgg6LcjaO')

import matplotlib.pyplot as plt
from scipy.signal import correlate

d_c = read_clean_dataset(summary=True)  #Clean data set
c_d = read_corrupted_dataset(summary=True)  #corrupted data set

#Initially import all necessary modules and functions. Please note that this
#api key for plotly is from an account I made specifically for this project
#so you can access the generated plots at username: engleberry,
#password: Droice1212. There should be only 2 graphs in the account to view for PCA.

std = ss().fit_transform(d_c[0])  #Standardization

#Covariance matrix from standardized data.
cov_mat = np.cov(std.T)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

#Correlation matrix from raw data.
cor_mat = np.corrcoef(d_c[0].T)
eig_vals2, eig_vecs2 = np.linalg.eig(cor_mat)

#Correlation matrix from standardized data.
cor_mat2 = np.corrcoef(std.T)
eig_vals3, eig_vecs3 = np.linalg.eig(cor_mat2)

#SVD could also be used to find the eigenvectors.
#eig_vec4,s,v = np.linalg.svd(std.T)
Example #14
X_test.shape         # (35014, 30)
y_train.shape        # (65025,)
y_test.shape         # (35014,)


################# CC. Create pipeline #################
#### Pipe using XGBoost


# 5 Pipeline steps
# steps: List of (name, transform) tuples
#       (implementing fit/transform) that are
#       chained, in the order in which they
#       are chained, with the last object an
#       estimator.
steps_xg = [('sts', ss() ),
            ('pca', PCA()),
            ('xg',  XGBClassifier(silent = False,
                                  n_jobs=2)        # Specify other parameters here
            )
            ]

# 5.1  Instantiate Pipeline object
pipe_xg = Pipeline(steps_xg)

# 5.2 Another way to create pipeline:
#     Not used below
pipe_xg1 = make_pipeline (ss(),
                          PCA(),
                          XGBClassifier(silent = False,
                                        n_jobs=2)
                          )
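
# A hedged continuation: fit the named-steps pipeline from 5.1 and score it on the held-out
# split (X_train/X_test/y_train/y_test as shaped at the top of this example).
pipe_xg.fit(X_train, y_train)
print(pipe_xg.score(X_test, y_test))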
Example #15
    def fit(self, x):  # Fit the encoder/scaler
        self.n = x.shape[0]
        self.p = x.shape[1]
        dt1 = pd.Series([type(x.iloc[0][kk]).__name__ for kk in range(self.p)])
        dt2 = x.dtypes.astype(str).reset_index(drop=True)
        self.dt = pd.Series(
            np.where(
                dt1.isin(['int64', 'float64'])
                & dt2.isin(['int64', 'float64']), 'float', 'str'))
        if not all(self.dt.values == 'float'):
            self.dt[~(self.dt.values == 'float')] = \
                np.where(x.loc[:, ~(self.dt.values == 'float')].apply(lambda x: x.str.contains('\\|', na=False).any()),
                 'lst',self.dt[~(self.dt.values == 'float')])
        self.cn = np.array(x.columns)
        stopifnot(all(self.dt.isin(['float', 'lst', 'str'])))
        self.cidx = np.where(self.dt == 'str')[0]
        self.nidx = np.where(self.dt == 'float')[0]
        self.tidx = np.where(self.dt == 'lst')[0]
        stopifnot(
            all(
                np.sort(reduce(np.union1d, [self.cidx, self.nidx, self.tidx]))
                == np.arange(self.p)))
        self.iter = {'cenc': True, 'nenc': True, 'tenc': True}
        self.all_enc = {}

        #############################################################
        # --- Encoder (i): Categorical/ordinal integer features --- #

        if len(self.cidx) > 0:
            self.cenc = ohe(sparse=self.sparse,
                            dtype=self.dtype,
                            handle_unknown='ignore',
                            drop=None)
            self.cenc.categories_ = [
                np.unique(x.iloc[:, kk]) for kk in self.cidx
            ]
            self.cmode = [x.iloc[:, kk].mode()[0] for kk in self.cidx]
            cmode_idx = np.array([
                np.where(vec == mm)[0][0]
                for vec, mm in zip(self.cenc.categories_, self.cmode)
            ])
            cum_idx = np.append([0],
                                np.cumsum(
                                    [len(z) for z in self.cenc.categories_]))
            self.cenc.drop_idx = []
            self.cenc.drop_idx_ = None
            self.cenc.p = cum_idx.max() - len(
                self.cenc.drop_idx
            )  # How many features after dropping most common
            self.cenc.cn = list(
                np.delete(self.cenc.get_feature_names(self.cn[self.cidx]),
                          self.cenc.drop_idx))
            self.all_enc['cenc'] = self.cenc
        else:
            self.iter['cenc'] = False

        ###############################################
        # --- Encoder (ii): Continuous numerical ---- #

        if len(self.nidx) > 0:
            if self.quantize:
                u_nidx = np.array(
                    [len(x.iloc[:, kk].unique()) for kk in self.nidx])
                self.nidx1 = self.nidx[u_nidx > 31]  # quantize
                self.nidx2 = self.nidx[u_nidx <= 31]  # one-hot-encode
                self.nenc = {'enc': {}, 'cn': {}}
                if len(self.nidx1) > 0:
                    self.nenc1 = KD(n_bins=self.nbins, strategy='quantile')
                    if not self.sparse:
                        self.nenc1.encode = 'onehot-dense'
                    self.nenc1.fit(x.iloc[:, self.nidx1])
                    self.nenc1.cn = ljoin([
                        cn + '_q' + pd.Series(qq).astype(str)
                        for cn, qq in zip(self.cn[self.nidx1], [
                            np.arange(len(z) - 1) + 1
                            for z in self.nenc1.bin_edges_
                        ])
                    ])
                    self.nenc['enc']['nenc1'] = self.nenc1
                    self.nenc['cn']['nenc1'] = self.nenc1.cn
                if len(self.nidx2) > 0:
                    self.nenc2 = ohe(sparse=self.sparse,
                                     handle_unknown='ignore',
                                     drop=None)
                    self.nenc2.fit(x.iloc[:, self.nidx2])
                    self.nenc2.cn = self.nenc2.get_feature_names(
                        self.cn[self.nidx2])
                    self.nenc['enc']['nenc2'] = self.nenc2
                    self.nenc['cn']['nenc2'] = self.nenc2.cn
                self.nenc['cn'] = ljoin(list(self.nenc['cn'].values()))
                self.all_enc['nenc'] = self.nenc
            else:
                self.nenc = ss(copy=False)
                self.nenc.mean_ = x.iloc[:, self.nidx].mean(axis=0).values
                self.nenc.scale_ = x.iloc[:, self.nidx].std(axis=0).values
                self.nenc.n_features_in_ = self.nidx.shape[0]
                self.nenc.p = self.nidx.shape[0]
                self.nenc.cn = list(self.cn[self.nidx])
                self.all_enc['nenc'] = self.nenc
        else:
            self.iter['nenc'] = False

        ################################################
        # --- Encoder (iii): Tokenize text blocks ---- #

        if len(self.tidx) > 0:
            self.tenc = dict(
                zip(self.cn[self.tidx], [
                    cv(tokenizer=lambda x: tok_fun(x),
                       lowercase=False,
                       token_pattern=None,
                       binary=True) for z in range(self.tidx.shape[0])
                ]))
            self.tenc = {'cv': self.tenc}
            for kk, jj in enumerate(self.cn[self.tidx]):
                self.tenc['cv'][jj].fit(x.loc[:, jj].astype('U'))
            self.tenc['p'] = sum(
                [len(z.vocabulary_) for z in self.tenc['cv'].values()])
            self.tenc['cn'] = ljoin([
                l + '_' + pd.Series(list(z.vocabulary_.keys())) for z, l in
                zip(self.tenc['cv'].values(), self.tenc['cv'].keys())
            ])
            self.all_enc['tenc'] = self.tenc
        else:
            self.iter['tenc'] = False

        # Store all transforms in a dictionary so we can iterate over self.iter
        self.enc_transform = {
            'cenc': self.cenc_transform,
            'nenc': self.nenc_transform,
            'tenc': self.tenc_transform
        }
        # Get the valid categories
        self.tt = np.array(list(self.iter.keys()))[np.where(
            list(self.iter.values()))[0]]
        # Get full feature names
        cn = []
        for ee in self.tt:
            if hasattr(self.all_enc[ee], 'cn'):
                cn.append(self.all_enc[ee].cn)
            else:
                cn.append(self.all_enc[ee]['cn'])
        cn = ljoin(cn)
        self.cn_transform = cn
Example #16
from sklearn.preprocessing import LabelEncoder as lbe, OneHotEncoder as ode
# Input column
labencoder_country = lbe()
ind_features[:, 0] = labencoder_country.fit_transform(ind_features[:, 0])

hotencoder = ode(categorical_features=[0])
ind_features = hotencoder.fit_transform(ind_features).toarray()

# Output column
labencoder_output = lbe()
output_label = labencoder_output.fit_transform(output_label)

# Split the data into training set and test set =======================================
from sklearn.model_selection import train_test_split as tts
ind_train_set, ind_test_set, out_train_set, out_test_set = tts(ind_features,
                                                               output_label,
                                                               test_size=0.3)

# Put all the real-valued features on the same scale.
# Whether to scale the dummy variables is debatable and largely depends on context.
# This time, they will be scaled as well.
"""
Normalization:
	new_value = (old_value - min)/(max - min)
Standardization:
	new_value = (old_value - mean)/(standard_deviation)
"""
from sklearn.preprocessing import StandardScaler as ss
datascaler = ss()
ind_train_set = datascaler.fit_transform(ind_train_set)
ind_test_set = datascaler.transform(ind_test_set)
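
# A sketch contrasting the two formulas quoted above on toy data (illustrative only):
# MinMaxScaler implements the normalization, StandardScaler the standardization used here.
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
demo = np.array([[1.0], [2.0], [3.0], [4.0]])
print(MinMaxScaler().fit_transform(demo).ravel())    # (x - min)/(max - min) -> values in [0, 1]
print(StandardScaler().fit_transform(demo).ravel())  # (x - mean)/std -> zero mean, unit variance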
Example #17
df.loc[2] = [3500, 2, 2, 1, 0, 'Skinny']
df.loc[3] = [1400, 0, 1, 0, 3, 'Skinny']
df.loc[4] = [1600, 1, 0, 2, 0, 'Normal']
df.loc[5] = [3200, 1, 2, 1, 1, 'Fat']
df.loc[6] = [1750, 1, 0, 0, 1, 'Skinny']
df.loc[7] = [1600, 1, 0, 0, 0, 'Skinny']
print(df)
# Split feature vectors and labels
x = df[['Calory', 'Breakfast', 'Lunch', 'Dinner', 'Excercise']]
y = df[['Body Shape']]
print(y)
# The mean of 'Calory' is much larger than the means of the other 4 columns, so we normalise the features
# using StandardScaler (zero mean, unit variance)

from sklearn.preprocessing import StandardScaler as ss
x_std = ss().fit_transform(x)
print(x_std)
# Covariance matrix of features

# Features are columns from x_std

import numpy as np
features = x_std.T
covariance_matrix = np.cov(features)
print(covariance_matrix)

#PCA relies on the covariance/correlation structure: directions with larger variance carry more information
#Eigenvectors of the covariance matrix give those directions (the principal components)

# .T means transpose (features as rows for np.cov)
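
# A sketch of the eigendecomposition the comments above describe (names from this example):
# eigenvectors of the covariance matrix are the principal directions, eigenvalues their variances.
eig_vals, eig_vecs = np.linalg.eigh(covariance_matrix)  # eigh: for symmetric matrices
order = np.argsort(eig_vals)[::-1]                      # largest variance first
projected = x_std.dot(eig_vecs[:, order[:2]])           # project onto the top-2 principal components
print(eig_vals[order] / eig_vals.sum())                 # fraction of variance per component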

Example #18
# 9.1 Which columns are numerical and which categorical?
num_columns = X.select_dtypes(include = ['float64','int64']).columns
num_columns

cat_columns = X.select_dtypes(include = ['object']).columns
cat_columns



# 10. Start creating transformation objects
# 10.1 Tuple for categorical columns
cat = ("cattrans", ohe(), cat_columns)
# 10.2 tuple for numeric columns
num = ("numtrans", ss() , num_columns)
# 10.3 Instantiate column transformer object
colTrans = ct([num,cat])

# 10.4 Fit and transform
X_trans = colTrans.fit_transform(X)
X_trans.shape              # 19100 X 19


## 11.0 Label encoding
#  11.1  Map labels to 1 and 0
y = y.map({"continue" : 1, "drop" : 0})
y.head()


Example #19
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
print("total :\n",missing_data)
"""

#take some values as training and predict output of some test cases
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0)

print("x:\n", x)

print("x_train before scaling:\n", x_train)

print("x_test before scaling:\n", x_test)

#scale the features so they are comparable in magnitude
SS = ss()
x_train = SS.fit_transform(x_train)
x_test = SS.transform(x_test)

print("x_train after sclaing:\n", x_train)

print("x_test after scaling:\n", x_test)

print("y:\n", y)

print("y_train:\n", y_train)

print("y_test:\n", y_test)

X_set, y_set = x_train, y_train
X1, X2 = np.meshgrid(
Example #20
# 5.3
X.columns

# 5.4
y=bc.loc[:,'diagnosis']
# OR
y=bc.iloc[:, 0]


# 5.5 y has two unique values
y.unique()  # Number of unique values in y

# 6 Center and scale
#  Initialize the centering/scaling object
scaler=ss()           
# 6.1 Use the object to create model
model=scaler.fit(X)
# 6.2 And now transform data
data_trans=model.transform(X)

# 6.3 Check
data_trans.shape
data_trans.mean()
type(data_trans)

#### 7. PCA now
pca=PCA()  # PCA object first. Instantiate the class

# 7.1 Get PCA model now       
pca_model=pca.fit(data_trans)
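
# 7.2 A hedged continuation: inspect how much variance each principal component explains
#     (the 0.95 threshold is an illustrative assumption, not from the original)
var_cum = pca_model.explained_variance_ratio_.cumsum()
n_keep = (var_cum < 0.95).sum() + 1     # components needed to reach ~95% explained variance
var_cum, n_keep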
Example #21
# import the dataset
dataset = pd.read_csv('data/Data.csv')

X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# replace missing data in X using mean of the whole column
imputer = im(missing_values='NaN', strategy='mean',
                            axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# encode categorical data
labelencode_X = le()
X[:, 0] = labelencode_X.fit_transform(X[:, 0])
# dummy encoding the data
ohotencode = ohe(categorical_features=[0])
X = ohotencode.fit_transform(X).toarray()

labelencode_Y = le()
y = labelencode_Y.fit_transform(y)

# splitting the data into train and test set
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2,
                                       random_state=0)

# feature scaling
standardscale_X = ss()
X_train = standardscale_X.fit_transform(X_train)
X_test = standardscale_X.transform(X_test)
Example #22
 def scal(self):
     ss1 = ss()
     self.xtr = ss1.fit_transform(self.xtr)
     self.xte = ss1.transform(self.xte)
     return self.xtr, self.xte
Example #23
def delt(beta, vect):
    # use optimization to find the minimum positive delta with constraints
    # (1-d)||b||_2^2 <= ||Xb||_2^2 <= (1+d)||b||_2^2
    return np.abs(np.sum(np.square(np.dot(vect, beta))) - 1)


# get the data
data = np.genfromtxt(csv_path, delimiter=",")
# remove rows that have NaN values (not ideal but IDGAF yet)
data = data[~np.isnan(data).any(axis=1)]  # from stack overflow: https://bit.ly/1QhfcmZ
Y = np.array([x[1] - 1 for x in data])  # y values in the second column
X = np.array([x[2:] for x in data])
del x
del data
stan = ss()
x_norm = norm(stan.fit_transform(X.astype("float")), axis=0)
n_feat = np.size(X, axis=1)
n_row = np.size(X, axis=0)
del X
del Y

# test instances
if test_runs:
    x_norm = norm(stan.fit_transform(np.random.normal(size=(n_row, n_feat))), axis=0)

results = np.zeros((num_runs, max_s))
d_quant = np.zeros((max_s, 4))

while run and b <= max_s:
    print b
Example #24
#print(df2.head())

df2 = pd.get_dummies(df2)
#print(df2.columns)
df2 = df2.drop(columns=["pay_schedule_semi-monthly"])

response = df2["e_signed"]
users = df2["entry_id"]
df2 = df2.drop(columns=["e_signed", "entry_id"])

#splitting data
x_train, x_test, y_train, y_test = train_test_split(df2,
                                                    response,
                                                    test_size=0.3,
                                                    random_state=0)
sc_x = ss()
x_train2 = pd.DataFrame(sc_x.fit_transform(x_train))
x_test2 = pd.DataFrame(sc_x.transform(x_test))  # reuse the scaler fitted on the training set
x_train2.columns = x_train.columns
x_test2.columns = x_test.columns
#print(x_train2)
x_train = x_train2
x_test = x_test2

#Model
#Logistice Regression
LR = LogisticRegression(random_state=0, penalty="l1", solver="liblinear")  # l1 needs a solver that supports it
LR.fit(x_train, y_train)

y_pred = LR.predict(x_test)
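
# A hedged continuation: evaluate the fitted logistic regression on the held-out split
# (the metrics import is an assumption; x_test, y_test and y_pred come from the snippet above).
from sklearn.metrics import accuracy_score, confusion_matrix
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))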
    

Example #25
if __name__ == '__main__':

    iris = datasets.load_iris()
    X = iris.data[:,[2,3]]
    y = iris.target
    
    #splitting the data into test (30%) and training (70%) sets using tts
    X_train,X_test,y_train, y_test = \
            tts(X,y,test_size=0.3, random_state=0)    


    #Standardising the feature (feature scaling) using ss 
    sc =ss()
    #Use fit to estimate the sample mean and standard deviation of each feature
    #dimension from the training data
    sc.fit(X_train)
    #transform is used to standardize the training data (TrDS) and test data (TsDS)
    #Note: the same scaling parameters are used for both
    X_train_std = sc.transform(X_train)
    X_test_std  = sc.transform(X_test)


    #max_iter:- number of epochs (passes over the TrDS)
    #eta0:- learning rate
    #random_state:- reproducibility of the initial shuffling of the TrDS after each epoch
    ppn = Perceptron(max_iter=40, eta0=0.1, random_state=0)
    #training using fit 
    ppn.fit(X_train_std,y_train)
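
    # A hedged continuation: evaluate the trained perceptron on the standardized test split
    # (the accuracy_score import is an assumption; the other names come from the snippet above)
    from sklearn.metrics import accuracy_score
    y_pred = ppn.predict(X_test_std)
    print('Misclassified samples: %d' % (y_test != y_pred).sum())
    print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))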
Example #26
def delt(beta, vect):
    # use optimization to find the minimum positive delta with constraints
    # (1-d)||b||_2^2 <= ||Xb||_2^2 <= (1+d)||b||_2^2
    return np.abs(np.sum(np.square(np.dot(vect, beta)))-1)


# get the data
data = np.genfromtxt(csv_path, delimiter=',')
# remove rows that have NaN values (not ideal but IDGAF yet)
data = data[~np.isnan(data).any(axis=1)]  # from stack overflow: https://bit.ly/1QhfcmZ
Y = np.array([x[1]-1 for x in data])  # y values in the second column
X = np.array([x[2:] for x in data])
del x
del data
stan = ss()
x_norm = norm(stan.fit_transform(X.astype('float')), axis=0)
n_feat = np.size(X, axis=1)
n_row = np.size(X, axis=0)
del X
del Y

# test instances
if test_runs:
    x_norm = norm(stan.fit_transform(np.random.normal(size=(n_row,  n_feat))), axis=0)

results = np.zeros((num_runs, max_s))
d_quant = np.zeros((max_s, 4))

while run and b <= max_s:
    print b
Example #27
    col = np.genfromtxt(full_path, delimiter=',')
    col = sorted(col)
    interact = np.zeros(
        (np.size(X, axis=0), (np.size(col) * (np.size(col) - 1)) / 2))
    for i in range(0, np.size(col) - 1):
        for j in range(i + 1, np.size(col)):
            interact[:, c] = X[:, col[i]] * X[:, col[j]]
            c += 1

    X = np.append(X, np.square(X), axis=1)  # add the squares
    X = np.append(X, interact, axis=1)
    print 'Completed Full Model'

if do_noise:
    from sklearn.preprocessing import MinMaxScaler as ss
    stan = ss(feature_range=(-1, 1))

    x_norm = np.random.randn(np.size(X, axis=0), 1)
    X = np.column_stack((X, x_norm))
    X = stan.fit_transform(X)

# build train test validate sets
# seed(41)
j = 0
coef = np.zeros((np.size(X, axis=1), num_runs))
print result_title

while j < num_runs:

    trn_x, trn_y, val_x, val_y, tst_x, tst_y = tvt(X, Y)
Example #28
)

from sklearn.model_selection import train_test_split as tts

factors = csv[["EstimatedSalary", "Age"]]

print(max(csv[["EstimatedSalary"]].values))

purchased = csv[["Purchased"]]

factor_training, factor_testing, purchased_training, purchased_testing = tts(
    factors, purchased, test_size=0.25, random_state=0)

from sklearn.preprocessing import StandardScaler as ss

scaler = ss()
factor_training_score = scaler.fit_transform(factor_training)
factor_testing_score = scaler.transform(factor_testing)  # reuse the scaler fitted on the training data

from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=0)

classifier.fit(factor_training_score, purchased_training)

predictedPurchase = classifier.predict(factor_testing_score)

from sklearn.metrics import accuracy_score

accuracy_score(purchased_testing, predictedPurchase)

import numpy as np
Example #29
from sklearn.preprocessing import StandardScaler as ss

df = pd.read_csv('Social_Network_Ads.csv')

#sns.scatterplot(df['EstimatedSalary'],df['Purchased'])
df.drop('User ID', inplace=True, axis=1)
#print(df.info())
gen = pd.get_dummies(df['Gender'], drop_first=True)
df.drop('Gender', inplace=True, axis=1)
#print(gen.head())
dff = pd.concat([df, gen], axis=1)
#print(dff.info())
x = dff.drop('Purchased', axis=1)
y = dff['Purchased']
print(y.head())

sss = ss()
xx = sss.fit_transform(x)

xtrain, xtest, ytrain, ytest = train_test_split(xx,
                                                y,
                                                test_size=0.3,
                                                random_state=101)

cm = knc(n_neighbors=3)
cm.fit(xtrain, ytrain)
pdata = cm.predict(xtest)

creport = cr(ytest, pdata)

print(creport)
Example #30
#Preprocessing

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

from sklearn.preprocessing import StandardScaler as ss

sc = ss()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

###############################################################################################################################################
print("GuassianNB")

from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
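
# A hedged continuation suggested by the import above: summarize the GaussianNB predictions
cm = confusion_matrix(y_test, y_pred)
print(cm)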
Example #31
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import Imputer, OneHotEncoder
from sklearn.preprocessing import LabelEncoder as le
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler as ss
from sklearn.linear_model import LinearRegression as lr
from sklearn.preprocessing import PolynomialFeatures as pf
from sklearn.svm import SVR
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
dataset = pd.read_csv("Position_Salaries.csv")
X = dataset.iloc[:, 1:2].values
y = dataset.iloc[:, 2:3].values
sc_X = ss()
sc_y = ss()
X = sc_X.fit_transform(X)
y = sc_y.fit_transform(y)
regressor = SVR(kernel="rbf")
regressor.fit(X, y)
y_pred = regressor.predict(sc_X.transform([[6.5]]))   # scale the query the same way as X
y_pred = sc_y.inverse_transform(y_pred.reshape(-1, 1))

plt.scatter(X, y, color="blue")
plt.plot(X, regressor.predict(X), color="green")
plt.title("支持向量回归")
plt.xlabel("级别")
plt.ylabel("工资")
plt.show()
X_grid = np.arange(min(X), max(X), 0.01)
Example #32
#Logistic Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
dataset=pd.read_csv('Social_Network_Ads.csv')
x=dataset.iloc[:,[2,3]].values
y=dataset.iloc[:,4].values
from sklearn.model_selection import train_test_split as tts
xTrain,xTest,yTrain,yTest=tts(x,y,test_size=0.25,random_state=0)
from sklearn.preprocessing import StandardScaler as ss
scale=ss()
xTrain=scale.fit_transform(xTrain)
xTest=scale.transform(xTest)
from sklearn.linear_model import LogisticRegression as lr
classifier=lr(random_state=0)
classifier.fit(xTrain,yTrain)
yPred=classifier.predict(xTest)
from sklearn.metrics import confusion_matrix as cm
cm=cm(yTest,yPred)
from matplotlib.colors import ListedColormap
X_set, y_set = xTrain, yTrain
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i), label = j)
plt.title('Logistic Regression (Training set)')
Example #33
fig3 = plt.figure(figsize=(12,12))
ax3 = fig3.add_subplot(1,4,1)
sns.boxenplot(x='diagnosis',y='symmetry_mean',data=df)
ax3 = fig3.add_subplot(1,4,2)
sns.boxenplot(x='diagnosis',y='fractal_dimension_mean',data=df)

# Selecting the Columns 
X = df.loc[:, 'radius_mean' : 'fractal_dimension_worst']
X.isnull().sum()
X.head()
X.shape

# Scale the Numeric data

scaleit = ss()
s = scaleit.fit_transform(X)   # X is the same 'radius_mean':'fractal_dimension_worst' slice selected above


pca = PCA()
principleComp = pca.fit_transform(s)   # fit PCA on the scaled data computed above
principleComp.shape
pca.explained_variance_ratio_
X = pca.explained_variance_ratio_.cumsum()
X

# Plotting the Distplot graph
sns.distplot(X, bins=5)

X = principleComp[:,0:11]