Example #1
File: part2.py  Project: rbaxter1/CS7641
 def best_ica_wine(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     ica = FastICA(n_components=X_train_scl.shape[1])
     X_train_transformed = ica.fit_transform(X_train_scl, y_train)
     X_test_transformed = ica.transform(X_test_scl)
     
     ## keep the two components with the highest kurtosis (train and test are ranked separately)
     kurt = kurtosis(X_train_transformed)
     i = kurt.argsort()[::-1]
     X_train_transformed_sorted = X_train_transformed[:, i]
     X_train_transformed = X_train_transformed_sorted[:,0:2]
     
     kurt = kurtosis(X_test_transformed)
     i = kurt.argsort()[::-1]
     X_test_transformed_sorted = X_test_transformed[:, i]
     X_test_transformed = X_test_transformed_sorted[:,0:2]
     
     # save
     filename = './' + self.save_dir + '/wine_ica_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_ica_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_ica_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_ica_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #2
File: part2.py  Project: rbaxter1/CS7641
    def ica_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
        scl = RobustScaler()
        X_train_scl = scl.fit_transform(X_train)
        X_test_scl = scl.transform(X_test)
        
        ##
        ## ICA
        ##
        ica = FastICA(n_components=X_train_scl.shape[1])
        X_ica = ica.fit_transform(X_train_scl)
        
        ##
        ## Plots
        ##
        ph = plot_helper()

        kurt = kurtosis(X_ica)
        print(kurt)
        
        title = 'Kurtosis (FastICA) for ' + data_set_name
        name = data_set_name.lower() + '_ica_kurt'
        filename = './' + self.out_dir + '/' + name + '.png'
        
        ph.plot_simple_bar(np.arange(1, len(kurt)+1, 1),
                           kurt,
                           np.arange(1, len(kurt)+1, 1).astype('str'),
                           'Feature Index',
                           'Kurtosis',
                           title,
                           filename)
Example #3
File: part2.py  Project: rbaxter1/CS7641
 def best_rp_nba(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_nba_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
     X_train_transformed = rp.fit_transform(X_train_scl, y_train)
     X_test_transformed = rp.transform(X_test_scl)
     
     ## keep the two components with the highest kurtosis (train and test are ranked separately)
     kurt = kurtosis(X_train_transformed)
     i = kurt.argsort()[::-1]
     X_train_transformed_sorted = X_train_transformed[:, i]
     X_train_transformed = X_train_transformed_sorted[:,0:2]
     
     kurt = kurtosis(X_test_transformed)
     i = kurt.argsort()[::-1]
     X_test_transformed_sorted = X_test_transformed[:, i]
     X_test_transformed = X_test_transformed_sorted[:,0:2]
     
     # save
     filename = './' + self.save_dir + '/nba_rp_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #4
def processing(df):
    dummies_df = pd.get_dummies(df["City Group"])

    def add_CG(name):
        return "CG_" + name

    dummies_df = dummies_df.rename(columns=add_CG)
    # print dummies_df.head()
    df = pd.concat([df, dummies_df.iloc[:, 0]], axis=1)

    dummies_df = pd.get_dummies(df["Type"])

    def add_Type(name):
        return "Type_" + name

    dummies_df = dummies_df.rename(columns=add_Type)
    df = pd.concat([df, dummies_df.iloc[:, 0:3]], axis=1)

    # try to put in age as a column
    def add_Age(string):
        age = datetime.datetime.now() - datetime.datetime.strptime(string, "%m/%d/%Y")
        return age.days

    df["Age"] = df["Open Date"].map(add_Age)
    df = df.drop(["Id", "Open Date", "City", "City Group", "Type", "revenue"], axis=1)
    # scaler = StandardScaler().fit(df)
    scaler = RobustScaler().fit(df)
    df = scaler.transform(df)

    # print df.head()
    return df
Example #5
def num_scaler(d_num,t_num):
    scl = RobustScaler()
    scl.fit(d_num)
    d_num = scl.transform(d_num)
    t_num = scl.transform(t_num)
    
    return d_num, t_num
Example #6
File: tpot.py  Project: vsolano/tpot
    def _robust_scaler(self, input_df):
        """Uses Scikit-learn's RobustScaler to scale the features using statistics that are robust to outliers

        Parameters
        ----------
        input_df: pandas.DataFrame {n_samples, n_features+['class', 'group', 'guess']}
            Input DataFrame to scale

        Returns
        -------
        scaled_df: pandas.DataFrame {n_samples, n_features + ['guess', 'group', 'class']}
            Returns a DataFrame containing the scaled features

        """
        training_features = input_df.loc[input_df['group'] == 'training'].drop(['class', 'group', 'guess'], axis=1)

        if len(training_features.columns.values) == 0:
            return input_df.copy()

        # The scaler must be fit on only the training data
        scaler = RobustScaler()
        scaler.fit(training_features.values.astype(np.float64))
        scaled_features = scaler.transform(input_df.drop(['class', 'group', 'guess'], axis=1).values.astype(np.float64))

        for col_num, column in enumerate(input_df.drop(['class', 'group', 'guess'], axis=1).columns.values):
            input_df.loc[:, column] = scaled_features[:, col_num]

        return input_df.copy()
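The point worth copying from this snippet is that the RobustScaler statistics come only from the 'training' rows before the whole frame is transformed. A minimal standalone sketch of the same pattern, using hypothetical train_df / test_df DataFrames that are not part of TPOT:

import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler

# hypothetical stand-ins for the training / non-training rows
train_df = pd.DataFrame({'f1': [1.0, 2.0, 3.0, 100.0], 'f2': [0.1, 0.2, 0.3, 0.4]})
test_df = pd.DataFrame({'f1': [2.5, 4.0], 'f2': [0.25, 0.5]})

scaler = RobustScaler()
scaler.fit(train_df.values.astype(np.float64))   # statistics from training rows only
train_scaled = pd.DataFrame(scaler.transform(train_df.values.astype(np.float64)),
                            columns=train_df.columns)
test_scaled = pd.DataFrame(scaler.transform(test_df.values.astype(np.float64)),
                           columns=test_df.columns)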
Example #7
File: part5.py  Project: rbaxter1/CS7641
 def nn_wine_orig(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     self.part4.nn_analysis(X_train_scl, X_test_scl, y_train, y_test, 'Wine', 'Neural Network Original')
Example #8
File: part2.py  Project: rbaxter1/CS7641
 def lda_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     ##
     ## Plots
     ##
     ph = plot_helper()
     
     scores = []
     train_scores = []
     rng = range(1, X_train_scl.shape[1]+1)
     for i in rng:
         lda = LinearDiscriminantAnalysis(n_components=i)
         cv = KFold(n_splits=3, shuffle=True)
         
         # cross validation
         cv_scores = []
         for train, test in cv.split(X_train_scl):
             lda.fit(X_train_scl[train], y_train[train])
             score = lda.score(X_train_scl[test], y_train[test])
             cv_scores.append(score)
         
         mean_score = np.mean(cv_scores)
         scores.append(mean_score)
         
         # train score
         lda = LinearDiscriminantAnalysis(n_components=i)
         lda.fit(X_train_scl, y_train)
         train_score = lda.score(X_train_scl, y_train)
         train_scores.append(train_score)
         
         print(i, mean_score)
         
     ##
     ## Score Plot
     ##
     title = 'Score Summary Plot (LDA) for ' + data_set_name
     name = data_set_name.lower() + '_lda_score'
     filename = './' + self.out_dir + '/' + name + '.png'
                 
     ph.plot_series(rng,
                    [scores, train_scores],
                    [None, None],
                    ['cross validation score', 'training score'],
                    cm.viridis(np.linspace(0, 1, 2)),
                    ['o', '*'],
                    title,
                    'n_components',
                    'Score',
                    filename)
Example #9
def test_robustscaler_vs_sklearn():
    # Compare msmbuilder.preprocessing.RobustScaler
    # with sklearn.preprocessing.RobustScaler

    robustscalerr = RobustScalerR()
    robustscalerr.fit(np.concatenate(trajs))

    robustscaler = RobustScaler()
    robustscaler.fit(trajs)

    y_ref1 = robustscalerr.transform(trajs[0])
    y1 = robustscaler.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Example #10
File: part3.py  Project: rbaxter1/CS7641
 def best_lda_cluster_wine(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data_lda_best()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     ##
     ## K-Means
     ##
     km = KMeans(n_clusters=4, algorithm='full')
     X_train_transformed = km.fit_transform(X_train_scl)
     X_test_transformed = km.transform(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/wine_kmeans_lda_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_kmeans_lda_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
     
     ##
     ## GMM
     ##
     gmm = GaussianMixture(n_components=4, covariance_type='full')
     # GaussianMixture has no transform(); use the soft cluster
     # responsibilities (predict_proba) as the transformed features
     gmm.fit(X_train_scl)
     X_train_transformed = gmm.predict_proba(X_train_scl)
     X_test_transformed = gmm.predict_proba(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/wine_gmm_lda_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_gmm_lda_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #11
File: part2.py  Project: rbaxter1/CS7641
 def best_lda_nba(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_nba_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     lda = LinearDiscriminantAnalysis(n_components=2)
     X_train_transformed = lda.fit_transform(X_train_scl, y_train)
     X_test_transformed = lda.transform(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/nba_lda_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_lda_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_lda_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_lda_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #12
File: part2.py  Project: rbaxter1/CS7641
 def best_pca_wine(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_wine_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     pca = PCA(n_components=3)
     X_train_transformed = pca.fit_transform(X_train_scl, y_train)
     X_test_transformed = pca.transform(X_test_scl)
     
     # save
     filename = './' + self.save_dir + '/wine_pca_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_pca_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_pca_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/wine_pca_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #13
File: part2.py  Project: rbaxter1/CS7641
    def pca_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
        scl = RobustScaler()
        X_train_scl = scl.fit_transform(X_train)
        X_test_scl = scl.transform(X_test)
        
        ##
        ## PCA
        ##
        pca = PCA(n_components=X_train_scl.shape[1], svd_solver='full')
        X_pca = pca.fit_transform(X_train_scl)
        
        ##
        ## Plots
        ##
        ph = plot_helper()
        
        ##
        ## Explained Variance Plot
        ##
        title = 'Explained Variance (PCA) for ' + data_set_name
        name = data_set_name.lower() + '_pca_evar_err'
        filename = './' + self.out_dir + '/' + name + '.png'        
        self.plot_explained_variance(pca, title, filename)

        ##
        ## Reconstruction Error
        ##
        all_mses, rng = self.reconstruction_error(X_train_scl, PCA)
        
        title = 'Reconstruction Error (PCA) for ' + data_set_name
        name = data_set_name.lower() + '_pca_rec_err'
        filename = './' + self.out_dir + '/' + name + '.png'
        ph.plot_series(rng,
                    [all_mses.mean(0)],
                    [all_mses.std(0)],
                    ['mse'],
                    ['red'],
                    ['o'],
                    title,
                    'Number of Features',
                    'Mean Squared Error',
                    filename)
        
        
        ##
        ## Manually compute eigenvalues
        ## 
        cov_mat = np.cov(X_train_scl.T)
        eigen_values, eigen_vectors = np.linalg.eig(cov_mat)
        print(eigen_values)
        sorted_eigen_values = sorted(eigen_values, reverse=True)

        title = 'Eigen Values (PCA) for ' + data_set_name
        name = data_set_name.lower() + '_pca_eigen'
        filename = './' + self.out_dir + '/' + name + '.png'
        
        ph.plot_simple_bar(np.arange(1, len(sorted_eigen_values)+1, 1),
                           sorted_eigen_values,
                           np.arange(1, len(sorted_eigen_values)+1, 1).astype('str'),
                           'Principal Components',
                           'Eigenvalue',
                           title,
                           filename)
        
        ## TODO Factor this out to new method
        ##
        ## Scatter
        ##
        '''
Example #14
def train(filename_train,
          filename_model,
          n_events_train=-1,
          simple=False,
          n_features=7,
          n_hidden=30,
          n_epochs=5,
          batch_size=64,
          step_size=0.01,
          decay=0.7,
          random_state=1):
    # Initialization
    gated = not simple
    logging.info("Calling with...")
    logging.info("\tfilename_train = %s" % filename_train)
    logging.info("\tfilename_model = %s" % filename_model)
    logging.info("\tn_events_train = %d" % n_events_train)
    logging.info("\tgated = %s" % gated)
    logging.info("\tn_features = %d" % n_features)
    logging.info("\tn_hidden = %d" % n_hidden)
    logging.info("\tn_epochs = %d" % n_epochs)
    logging.info("\tbatch_size = %d" % batch_size)
    logging.info("\tstep_size = %f" % step_size)
    logging.info("\tdecay = %f" % decay)
    logging.info("\trandom_state = %d" % random_state)
    rng = check_random_state(random_state)

    # Make data
    logging.info("Loading data...")

    fd = open(filename_train, "rb")
    X, y = pickle.load(fd, encoding='latin1')
    fd.close()
    y = np.array(y)

    if n_events_train > 0:
        indices = check_random_state(123).permutation(len(X))[:n_events_train]
        X = [X[i] for i in indices]
        y = y[indices]

    logging.info("\tfilename = %s" % filename_train)
    logging.info("\tX size = %d" % len(X))
    logging.info("\ty size = %d" % len(y))

    # Preprocessing
    logging.info("Preprocessing...")
    X = [extract(permute_by_pt(rewrite_content(jet))) for jet in X]
    tf = RobustScaler().fit(np.vstack([jet["content"] for jet in X]))

    for jet in X:
        jet["content"] = tf.transform(jet["content"])

    # Split into train+validation
    logging.info("Splitting into train and validation...")

    X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=5000,
                                                          random_state=rng)

    # Training
    logging.info("Training...")

    if gated:
        predict = grnn_predict_gated
        init = grnn_init_gated
    else:
        predict = grnn_predict_simple
        init = grnn_init_simple

    trained_params = init(n_features, n_hidden, random_state=rng)
    n_batches = int(np.ceil(len(X_train) / batch_size))
    best_score = [-np.inf]  # yuck, but works
    best_params = [trained_params]

    def loss(X, y, params):
        y_pred = predict(params, X)
        l = log_loss(y, y_pred).mean()
        return l

    def objective(params, iteration):
        rng = check_random_state(iteration % n_batches)
        start = rng.randint(len(X_train) - batch_size)
        idx = slice(start, start + batch_size)
        return loss(X_train[idx], y_train[idx], params)

    def callback(params, iteration, gradient):
        if iteration % 25 == 0:
            roc_auc = roc_auc_score(y_valid, predict(params, X_valid))

            if roc_auc > best_score[0]:
                best_score[0] = roc_auc
                best_params[0] = copy.deepcopy(params)

                fd = open(filename_model, "wb")
                pickle.dump(best_params[0], fd)
                fd.close()

            logging.info(
                "%5d\t~loss(train)=%.4f\tloss(valid)=%.4f"
                "\troc_auc(valid)=%.4f\tbest_roc_auc(valid)=%.4f" %
                (iteration, loss(X_train[:5000], y_train[:5000], params),
                 loss(X_valid, y_valid, params), roc_auc, best_score[0]))

    for i in range(n_epochs):
        logging.info("epoch = %d" % i)
        logging.info("step_size = %.4f" % step_size)

        trained_params = adam(ag.grad(objective),
                              trained_params,
                              step_size=step_size,
                              num_iters=1 * n_batches,
                              callback=callback)
        step_size = step_size * decay
Example #15
y_total = df.iloc[:, -1:].values
x_total = df.iloc[:, :-1].values
y_test = y_total[-test_size:, :]
x_test = x_total[-test_size:, :]
y_train = y_total[:-val_size - test_size, :]
x_train = x_total[:-val_size - test_size, :]
y_val = y_total[-val_size - test_size - 1:-test_size, :]
x_val = x_total[-val_size - test_size - 1:-test_size, :]
n_samples = x_train.shape[0]
m = len(y_train)

scalerX = RobustScaler(quantile_range=(10, 90))
scalerY = RobustScaler(quantile_range=(10, 90))
x_train = scalerX.fit_transform(x_train)
y_train = scalerY.fit_transform(y_train)
x_val = scalerX.transform(x_val)
y_val = scalerY.transform(y_val)
x_test = scalerX.transform(x_test)
y_test = scalerY.transform(y_test)
tempo = time.time()
epochs = 200
learning_rate = 0.01
batch_size = m

# random seed
os.environ['PYTHONHASHSEED'] = '0'
seed = 123456
if seed is not None:
    np.random.seed(seed)
    rn.seed(seed)
    tf.set_random_seed(seed)
Example #16
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sgmcmc_ssm.models.gauss_hmm import GaussHMMSampler
from tqdm import tqdm

np.random.seed(12345)

# Load and Scale Data
from scipy.io import loadmat
ion_data = loadmat('data/alamethicin.mat')

from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
observations = scaler.fit_transform(ion_data['originaldata'][1095:-3000])
filtered_observations = scaler.transform(ion_data['filtereddata'])
T = len(observations)

# Plot Data
fig, ax = plt.subplots(1, 1)
ax.plot(np.arange(T)[::50], observations[::50], '-', label='scaled data')
ax.plot(np.arange(T)[::50],
        filtered_observations[::50],
        '-',
        label='scaled filtered data')
ax.set_title('Scaled Ion Data')
ax.set_xlabel('Time')
ax.set_ylabel('Voltage (Scaled)')
ax.legend()

# Process all
Example #17
# Drop the encrypted phone number (LineNumber), and the Call category (As labeled by data team)
athena = athena.drop(['LineNumber', 'CallCategory'], axis=1)

# Split into subgroups, as training on the entire dataset breaks my computer
group = np.array_split(athena, 4)

# Iterate through each group
for i in range(len(group)):
    print('======= GROUP {} ======'.format(i))
    subdata = group[i]

    ## Scale the data with RobustScaler (centres on the median, scales by the IQR)
    print('Scaling Data')
    scaler = RobustScaler().fit(athena)
    subdata = pd.DataFrame(scaler.transform(subdata),
                           columns=subdata.columns, index=subdata.index)

    ## Reduce data for clustering
    print('Reducing dimensions')
    model = umap.UMAP(n_neighbors=20, min_dist=0.15, metric='braycurtis')
    data_2d = model.fit_transform(subdata)

    print('Clustering Data')
    cluster = DBSCAN(eps=3, min_samples=2).fit(subdata)

    print('Configuring data to clusters')
    subdata['PCA1'] = data_2d[:, 0]
    subdata['PCA2'] = data_2d[:, 1]
    cluster.labels_[cluster.labels_ > 0] = 1
    subdata['cluster'] = cluster.labels_
Example #18
numsamples = 50000

# Set a random seed for reproducibility
randomseed = 5

# Separate data into training and testing data
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(X_fSelect, y, 
                                                                                 range(len(y)),
                                                                                 train_size=numsamples,
                                                                                 random_state=randomseed)

# Create input data scaler based only on training set
scaler_X = RobustScaler()
scaler_X = scaler_X.fit(X_train)

X_train_scaled = scaler_X.transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# Create the SVM model
clf = svm.SVC(kernel='rbf',C=0.1,gamma=0.01,class_weight={1:50},probability=True)

clf.fit(X_train_scaled,y_train)



## Step 5: SVM evaluation

We use a variety of evaluation metrics to gauge model performance, but emphasize the True Skill Statistic (TSS) here because it is insensitive to the class-imbalance ratio [[Bloomfield et al., 2012](http://iopscience.iop.org/article/10.1088/2041-8205/747/2/L41/meta "Bloomfield - TSS")]. All metrics are computed from the entries of the contingency (confusion) matrix. For the scintillation/no-scintillation classification problem the matrix is shown at https://github.com/rmcgranaghan/machine-learning-with-GNSS-data/blob/master/confusion_matrix_schematic.png.
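A minimal sketch of how the TSS could be computed from those confusion-matrix entries, assuming the fitted clf and the scaled test arrays from the cell above:

from sklearn.metrics import confusion_matrix

# TSS = TP/(TP+FN) - FP/(FP+TN); assumes binary labels and the
# clf / X_test_scaled / y_test objects defined in the previous cell
y_pred = clf.predict(X_test_scaled)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
tss = tp / (tp + fn) - fp / (fp + tn)
print('TSS = %.3f' % tss)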


Example #19
    else:
        break

for ds_i, ds in enumerate(DS):
    if input(f'Run {DSNAMES[ds_i]}? >') == 'y':
        X, y = ds
        if scaler == 'rs':
            scale = RobustScaler().fit(X)
        elif scaler == 'ss':
            scale = StandardScaler().fit(X)
        elif scaler == 'qt':
            scale = QuantileTransformer(n_quantiles=np.min([1000, X.shape[0]]),
                                        output_distribution='uniform').fit(X)
        else:
            raise ValueError('Improper scaling method chosen')
        X = scale.transform(X)
        Path(os.path.join(OUTFILE, DSNAMES[ds_i])).mkdir(parents=True,
                                                         exist_ok=True)
        outpath = os.path.join(OUTFILE, DSNAMES[ds_i])
        print(f'Running {DSNAMES[ds_i]}\n')

        # Part 1 - Cluster data
        print('Running Clustering')
        if not (os.path.isfile(os.path.join(outpath, 'KM_est.pkl'))
                and os.path.isfile(os.path.join(outpath, 'EM_est.pkl'))):
            KM_est, EM_est = handle_clusters(X, outpath)
            handle_cluster_visualization(KM_est, X, y, outpath)
            handle_cluster_visualization(EM_est, X, y, outpath)
            with open(os.path.join(outpath, 'KM_est.pkl'), 'wb') as kmpk:
                pickle.dump(KM_est, kmpk, -1)
            with open(os.path.join(outpath, 'EM_est.pkl'), 'wb') as empk:
Example #20
#values that prediction is based on
x = csv[["AirTemp", "Press", "UMR"]]

#values to be predicted
y = csv[["NO", "NO2", "O3", "PM10"]]

#return four marix: two for learn and two for test
x_learn, x_test, y_learn, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

#transformer for transforming the values
transformer = RobustScaler().fit(x_learn)
#scalar type of the x_learn matrix
x_learn_scalar = transformer.transform(x_learn)
#scalar type of the x_test matrix
x_test_scalar = transformer.transform(x_test)

model = LinearRegression(fit_intercept=True,
                         normalize=True).fit(x_learn_scalar, y_learn)
#returns coefficient of determination
determ_coef = model.score(x_test_scalar, y_test)
#returns the intercept for each value
intercept = model.intercept_
#returns the slope for each value
slope = model.coef_

print("Coefficient of determination: ", determ_coef)
print("Intercept: ", intercept)
print("Slope: ", slope)
Example #21
labels = train_df['y']

train_df = train_df.drop('y',axis =1 )

#%%
all_data = pd.concat([train_df,test_df],axis=0,ignore_index=True)
#%%
all_data["galaxy"] = all_data["galaxy"].astype('category')
all_data["galaxy"] = all_data["galaxy"].cat.codes

#%%

all_data_without_year_name = all_data.drop(['galactic year','galaxy'],axis=1)
#%%
scaler = RobustScaler().fit(all_data_without_year_name)
all_data_without_year_name_scaled = scaler.transform(all_data_without_year_name)
#%%
year_name = all_data[['galactic year','galaxy']]
all_data_without_year_name_scaled_df = pd.DataFrame(all_data_without_year_name_scaled,columns=all_data_without_year_name.columns)
#%%
all_data_scaled = pd.concat([year_name,all_data_without_year_name_scaled_df],axis=1,sort=False)
# all_data_scaled['galactic year'] =all_data_scaled['galactic year'] - all_data_scaled['galactic year'][0]
#%%
#all_data_scaled = all_data_scaled.fillna(0) 
#%%
X_train = all_data_scaled[0:len(train_df)]
X_test = all_data_scaled[len(train_df):]

# %%
Non_data_col = ['galaxy','y']
predictors = [x for x in all_data_scaled.columns if x not in Non_data_col]
Example #22
#%% Prepare train and test sets for the model
train_set = traintest_set[:train_len]
test_set = traintest_set[train_len:]

train_set = train_set.drop('Id', axis=1)
test_set = test_set.drop('Id', axis=1)

X = train_set.drop('SalePrice', axis=1)
y = train_set['SalePrice']

test_set = test_set.drop('SalePrice', axis=1)

sc = RobustScaler()
X = sc.fit_transform(X)
test_set = sc.transform(test_set)

#%% Build the model
model = Lasso(alpha=.001, random_state=1)
model.fit(X, y)

#%% Kaggle submission
pred = model.predict(test_set)
preds = np.exp(pred)

print(model.score(X, y))

output = pd.DataFrame({'Id': test2.Id, 'SalePrice': preds})
output.to_csv('submission.csv', index=False)

output.head()
Example #23
loan_default1=train['loan_default']
df_training=pd.concat([training_data,loan_default1],axis=1)
print(df_training.columns)
# NOTE: for training use df_training, for testing use test_data
x_train= df_training.drop(['loan_default'],axis=1)
y_train=df_training['loan_default']
print(df_training.dtypes)

print(x_train.shape)
#y_prediction=pd.DataFrame(y_prediction, columns=["loan_default"])
#print(y_prediction.tail())
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaler.fit(x_train)
#scaler.fit(y_train)
xscale=scaler.transform(x_train)
#yscale=scaler.transform(y)
scaler.fit(test_data)
test_scaled=scaler.transform(test_data)




#y_prediction.to_csv("C:/Users/hp/Desktop/lt/submitl22.csv")
####################MAKING TEST SET TO SAME TYPE###########
#test= pd.read_csv("C:/Users/hp/Desktop/lt/test_bqCt9Pv.csv")
################################################USING RANDOM FOREST ###########################
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow
Example #24
#rescales the data set so that all feature values fall in the range [0, 1]
#If there are large outliers, the bulk of the inlier values is compressed
#into a very narrow sub-range, so this scaler is sensitive to outliers.
scaler2 = MinMaxScaler()
scaler2.fit(X)
X2 = scaler2.transform(X)
df2 = pd.DataFrame(data=X2, columns=column_names)
print(df2.describe())
sns.jointplot(x='MedInc', y='AveOccup', data=df2, xlim=[0,1], ylim=[0,0.005])  #Data scaled but outliers still exist

#3 RobustScaler
# the centering and scaling statistics of this scaler are based on percentiles 
#and are therefore not influenced by a small number of very large marginal outliers.
scaler3 = RobustScaler()
scaler3.fit(X)
X3 = scaler3.transform(X)
df3 = pd.DataFrame(data=X3, columns=column_names)
print(df3.describe())
sns.jointplot(x='MedInc', y='AveOccup', data=df3, xlim=[-2,3], ylim = [-2,3]) #Range -2 to 3


#4 PowerTransformer
# applies a power transformation to each feature to make the data more Gaussian-like
scaler4 = PowerTransformer()
scaler4.fit(X)
X4 = scaler4.transform(X)
df4 = pd.DataFrame(data=X4, columns=column_names)
print(df4.describe())
sns.jointplot(x='MedInc', y='AveOccup', data=df4) #

#5 QuantileTransformer
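# (sketch continuing the pattern above; QuantileTransformer comes from
#  sklearn.preprocessing, and X, column_names, pd, sns are assumed to be
#  the same objects used in the preceding lines)
# maps each feature onto a uniform distribution via its empirical quantiles,
# which also squashes outliers into the [0, 1] range
scaler5 = QuantileTransformer(output_distribution='uniform')
scaler5.fit(X)
X5 = scaler5.transform(X)
df5 = pd.DataFrame(data=X5, columns=column_names)
print(df5.describe())
sns.jointplot(x='MedInc', y='AveOccup', data=df5, xlim=[0, 1], ylim=[0, 1])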
Example #25
def train(filename_train,filename_valid,filename_model,n_train=1200000,n_valid=400000,n_features=7,
                n_hidden=40,n_epochs=18,batch_size=128,step_size=0.005,decay=0.9):
   
    logging.info("Calling with...")
    logging.info("\tfilename_train = %s" % filename_train)
    logging.info("\tfilename_valid = %s" % filename_valid)
    logging.info("\tfilename_model = %s" % filename_model)
    logging.info("\tn_train = %d" % n_train)
    logging.info("\tn_valid = %d" % n_valid)
    logging.info("\tn_features = %d" % n_features)
    logging.info("\tn_hidden = %d" % n_hidden)
    logging.info("\tn_epochs = %d" % n_epochs)
    logging.info("\tbatch_size = %d" % batch_size)
    logging.info("\tstep_size = %f" % step_size)
    logging.info("\tdecay = %f" % decay)
    ####################### Reading the train data #################################
    logging.info("Loading train data")
    
    fd = open(filename_train, "rb")
    X, y = pickle.load(fd,encoding='latin-1')
    fd.close()
    y = np.array(y)
    
    indices = torch.randperm(len(X)).numpy()[:n_train]
    X = [X[i] for i in indices]
    y = y[indices]
    
    print("\tfilename = %s" % filename_train)
    print("\tX size = %d" % len(X))
    print("\ty size = %d" % len(y))



    # Preprocessing  # feature scaling
    logging.info("Preprocessing the train data")
    X = [extract(pt_order(rewrite_content(jet))) for jet in X]
    transfer_feature= RobustScaler().fit(np.vstack([jet["content"] for jet in X]))
    for jet in X:
        jet["content"] = transfer_feature.transform(jet["content"])

    X_train=X
    y_train=y

    '''----------------------------------------------------------------------- '''
    logging.info("Loading validation data")
    
    fd = open(filename_valid, "rb")
    X, y = pickle.load(fd,encoding='latin-1')
    fd.close()
    y = np.array(y)
    
    indices = torch.randperm(len(X)).numpy()[:n_valid]
    X = [X[i] for i in indices]
    y = y[indices]

    print("\tfilename = %s" % filename_valid)
    print("\tX size = %d" % len(X))
    print("\ty size = %d" % len(y))
    logging.info("Preprocessing the train data")
    X = [extract(pt_order(rewrite_content(jet))) for jet in X]
    for jet in X:
        jet["content"] = transfer_feature.transform(jet["content"])
    X_valid=X
    y_valid=y

###########################################Define MODEL ##############################

    logging.info("Initializing model...")
    model = Predict(n_features,n_hidden)
    if torch.cuda.is_available():
       logging.warning("Moving model to GPU")
       model.cuda()
       logging.warning("Moved model to GPU")

###########################OPTIMIZER AND LOSS ##########################################
    logging.info("Building optimizer...")
    optimizer = Adam(model.parameters(), lr=step_size)
    scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=decay)
    
    n_batches = int(len(X_train) // batch_size)
    best_score = [-np.inf]  
    best_model_state_dict = copy.deepcopy(model.state_dict())  # intial parameters of model
    
        
        
###############################VALIDATION OF DATA ########################################
    def callback(epoch, iteration, model):
        
        if iteration % n_batches == 0:
            model.eval()
            offset = 0; train_loss = []; valid_loss = []
            yy, yy_pred, accuracy_train, accuracy_valid = [], [],[],[]
            for i in range(len(X_valid) // batch_size):
                idx = slice(offset, offset+batch_size)
                Xt, yt = X_train[idx], y_train[idx]
                X_var = wrap_X(Xt); y_var = wrap(yt)
                tl = unwrap(loss(model(X_var), y_var)); train_loss.append(tl)
                y_pred_train = model(X_var)
                y = unwrap(y_var); y_pred_train = unwrap(y_pred_train)
                X = unwrap_X(X_var)

                Xv, yv = X_valid[idx], y_valid[idx]
                X_var = wrap_X(Xv); y_var = wrap(yv)
                y_pred = model(X_var)
                vl = unwrap(loss(y_pred, y_var)); valid_loss.append(vl)
                Xv = unwrap_X(X_var); yv = unwrap(y_var); y_pred = unwrap(y_pred)
                yy.append(yv); yy_pred.append(y_pred)
                y_pred=np.column_stack(y_pred).ravel()
                accuracy_valid.append(np.sum(np.rint(y_pred)==yv)/float(len(yv)))
                offset+=batch_size
        
            train_loss = np.mean(np.array(train_loss))
            valid_loss = np.mean(np.array(valid_loss))
            accuracy_valid=np.mean(np.array(accuracy_valid))
            print("accuracy_valid:",accuracy_valid)
            print("train_loss:",train_loss)
            roc_auc = roc_auc_score(np.column_stack(yy).ravel(), np.column_stack(yy_pred).ravel())
            print("roc_auc:",roc_auc)
            if roc_auc > best_score[0]:
               best_score[0]=roc_auc
               best_model_state_dict[0] = copy.deepcopy(model.state_dict())
               with open(filename_model, 'wb') as f:
                    torch.save(best_model_state_dict[0], f)
            scheduler.step(valid_loss)
            model.train()

 ###############################TRAINING ########################################
    logging.warning("Training the data")
    iteration=1
    for i in range(n_epochs):
        print("epoch = %d" % i)
        print("step_size = %.4f" % step_size)
        t0 = time.time()
        for _ in range(n_batches): ## mini batch
            iteration += 1
            model.train()
            optimizer.zero_grad()
            start = torch.round(torch.rand(1) * (len(X_train) - batch_size)).numpy()[0].astype(np.int32)
            idx = slice(start, start+batch_size)
            X, y = X_train[idx], y_train[idx]
            X_var = wrap_X(X); y_var = wrap(y) ## wrap_X, wrap moves to GPU
            l = loss(model(X_var), y_var)
            l.backward()
            optimizer.step()
            X = unwrap_X(X_var); y = unwrap(y_var) ## unwrap_X, unwrap moves to GPU
            callback(i, iteration, model)
            t1 = time.time() ###
        print(f'Epoch took {t1-t0} seconds')
        scheduler.step()
        step_size = step_size * decay
Example #26
File: kit.py  Project: kingjr/jr-tools
def least_square_reference(
    inst, empty_room=None, max_times_samples=2000, bad_channels=None, scaler=None, mrk=None, elp=None, hsp=None
):
    """
    Fits and applies Least Square projection of the reference channels
    (potentially from an empty room) and removes the corresponding component
    from the recordings of a subject.

    Parameters
    ----------
        inst : Raw | str
            Raw instance or path to raw data.
        empty_room : str | None
            Path to raw data acquired in empty room.
        max_times_samples : int
            Number of time sample to use for pinv. Defautls to 2000
        bad_channels : list | array, shape (n_chans) of strings
            Lists bad channels
        scaler : function | None
            Scaler functions to normalize data. Defaults to
            sklearn.preprocessing.RobustScaler.

    Returns
    -------
        inst : Raw

    adapted from Adeen Flinker 6/2013 (<*****@*****.**>) LSdenoise.m

    Main EHN
        - Automatically detects channel types.
        - Allows flexible scaler; Robust by default.
        - The data is projected back in Tesla.
        - Allows memory control.
    TODO:
        - Allow other kind of MNE-Python inst
        - Allow baseline selection (pre-stim instead of empty room)
        - Clean up memory
        - Allow fancy solver (l1, etc)
    """
    from scipy.linalg import pinv
    from mne.io import read_raw_kit
    from mne.io import _BaseRaw

    # Least square can be fitted on empty room or on subject's data
    if empty_room is None:
        if not isinstance(inst, _BaseRaw):
            raw = read_raw_kit(inst, preload=True)
        else:
            raw = inst
    else:
        if not isinstance(empty_room, _BaseRaw):
            raw = read_raw_kit(empty_room, preload=True)
        else:
            raw = empty_room

    # Parameters
    n_chans, n_times = raw._data.shape
    chan_info = raw.info["chs"]

    # KIT: axial gradiometers (equiv to mag)
    ch_mag = np.where([ch["coil_type"] == 6001 for ch in chan_info])[0]
    # KIT: ref magnetometer
    ch_ref = np.where([ch["coil_type"] == 6002 for ch in chan_info])[0]
    # Other channels
    ch_misc = np.where([ch["coil_type"] not in [6001, 6002] for ch in chan_info])[0]
    # Bad channel
    ch_bad = np.empty(0)
    if (bad_channels is not None) and len(bad_channels):
        if np.all([isinstance(ch, int) for ch in bad_channels]):
            bad_channels = np.array(bad_channels)
        elif np.all([isinstance(ch, str) for ch in bad_channels]):
            bad_channels = [ii for ii, ch in enumerate(raw.ch_names) if ch in bad_channels]
        else:
            raise ValueError("bad_channels needs array of int or array of str")
    else:
        bad_channels = []
    default_bad_channels = [ii for ii, ch in enumerate(raw.ch_names) if ch in raw.info["bads"]]
    bad_channels = np.array(default_bad_channels + bad_channels, int)

    print("bad channels:", [raw.ch_names[bad] for bad in bad_channels])
    # To avoid memory error, let's subsample across time
    sel_times = slice(0, n_times, int(np.ceil(n_times // max_times_samples)))

    # Whiten data
    if scaler is None:
        from sklearn.preprocessing import RobustScaler

        scaler = RobustScaler()
    data_bsl = scaler.fit_transform(raw._data.T)

    # Fit Least Square coefficients on baseline data
    empty_sensors = data_bsl[:, ch_mag]
    if len(ch_bad):
        empty_sensors[:, ch_bad] = 0  # remove bad channels
    coefs = np.dot(pinv(data_bsl[sel_times, ch_ref]), empty_sensors[sel_times, :])
    empty_sensors, data_bsl = None, None  # clear memory

    # Apply correction on subject data
    if empty_room is not None:
        del raw
        raw = read_raw_kit(inst, preload=True)

    data_subject = scaler.transform(raw._data.T)
    subject_sensors = data_subject[:, ch_mag] - np.dot(data_subject[:, ch_ref], coefs)

    # Remove bad channels
    if len(ch_bad):
        subject_sensors[:, ch_bad] = 0

    # Reproject baseline
    new_ref = np.dot(subject_sensors, pinv(coefs))

    # Un-whiten data to get physical units back
    data = np.concatenate((subject_sensors, new_ref, raw._data[ch_misc, :].T), axis=1)
    data = scaler.inverse_transform(data)

    # Output
    raw._data = data.T
    return raw
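A hypothetical call, with placeholder KIT file names and bad-channel labels, might look like this:

# hypothetical usage; the .sqd paths and the bad-channel name are placeholders
raw_clean = least_square_reference('subject_run01.sqd',
                                   empty_room='empty_room.sqd',
                                   bad_channels=['MEG 087'],
                                   max_times_samples=2000)
raw_clean.save('subject_run01_denoised_raw.fif', overwrite=True)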
Example #27
# In[ ]:

scaler = RobustScaler()

# In[ ]:

n_train = train.shape[0]

X = data_pipe[:n_train]
test_X = data_pipe[n_train:]
y = train.SalePrice

X_scaled = scaler.fit(X).transform(X)
y_log = np.log(train.SalePrice)
test_X_scaled = scaler.transform(test_X)

# ## Feature Selection

# + __I have to confess, the feature engineering above is not enough, so we need more.__
# + __Combining different features is usually a good way, but we have no idea what features should we choose. Luckily there are some models that can provide feature selection, here I use Lasso, but you are free to choose Ridge, RandomForest or GradientBoostingTree.__

# In[ ]:

lasso = Lasso(alpha=0.001)
lasso.fit(X_scaled, y_log)

# In[ ]:

FI_lasso = pd.DataFrame({"Feature Importance": lasso.coef_},
                        index=data_pipe.columns)
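One way to act on these coefficients (a sketch that reuses the X_scaled, test_X_scaled and data_pipe objects defined above) is to keep only the features to which Lasso assigned a non-zero coefficient:

# keep only the features Lasso did not shrink to zero
selected = FI_lasso.index[FI_lasso["Feature Importance"] != 0]
X_selected = pd.DataFrame(X_scaled, columns=data_pipe.columns)[selected]
test_X_selected = pd.DataFrame(test_X_scaled, columns=data_pipe.columns)[selected]
print(len(selected), "features kept out of", X_scaled.shape[1])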
Example #28
 def scale_data_robust(self):
     scaler = RobustScaler().fit(self.X_train)
     self.X_train = scaler.transform(self.X_train)
     self.X_validation = scaler.transform(self.X_validation)
Example #29
                         "Balanced Accuracy", "MSE", "r2", "spearmanr"
                     ])

for split in np.arange(numsplits):
    print("Evaluating fold " + str(split))
    train_index = kfolds["fold_" + str(split)]["train"]
    test_index = kfolds["fold_" + str(split)]["test"]

    X_train, X_test = features_nosurv.iloc[train_index], features_nosurv.iloc[
        test_index]
    y_train, y_test = surv_days[train_index], surv_days[test_index]

    # scale the target with RobustScaler (median/IQR)
    qtfm = RobustScaler()
    y_train = np.squeeze(qtfm.fit_transform(y_train.values.reshape(-1, 1)))
    y_test = np.squeeze(qtfm.transform(y_test.values.reshape(-1, 1)))
    # y_train, y_test = surv_classes[train_index], surv_classes[test_index]

    # for every split, perform feature selection
    for sel_name, sel in zip(selectornames_short, selectors):
        print('#####')
        print(sel_name)
        print('#####')

        if sel_name == "CHSQ":
            # shift X values to be non-negative for chsq feature selection
            X_train_tmp = X_train + np.abs(X_train.min())
            selscore = sel(X_train_tmp, y_train)
            selidx = np.argsort(selscore)[::-1]
            selidx = selidx[0:numfeat]
            selscore = selscore[selidx]
Example #30
var_dums = pd.get_dummies(all_data["Variety"])
all_data = all_data.drop(columns="Variety")
all_data = pd.concat([all_data, var_dums], axis=1)

all_data = all_data.drop(columns="Site ID")
all_data = all_data.dropna()
all_data = all_data[all_data["Assessment Score"] != '*']

#split features and target
Y = all_data["Assessment Score"]
X = all_data.drop(columns="Assessment Score")

#scale features
from sklearn.preprocessing import RobustScaler
transformer = RobustScaler().fit(X)
X = transformer.transform(X)

Y = np.array(Y)
Y[Y == ''] = 0.0
Y = Y.astype(float)

#make dense network model
import neural_net

NeuralNet = neural_net.NeuralNet

#crop_score_model = NeuralNet(X, Y, 6, 256, "r", 20)

#check accuracy
from sklearn.metrics import mean_squared_error
'''
Example #31

# split x, y
x = train.loc[:, 'rho':'990_dst']
test = test.loc[:, 'rho':'990_dst']

y = train.loc[:, 'hhb':'na']

# split train, test
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size=0.9,
                                                    random_state=0)

# scalling
scaler = RobustScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

test = scaler.transform(test)

# search model parameters
parameters = { 'n_estimators': [310, 350, 390], 'max_depth': [4, 5, 6],
               'learning_rate': [0.06, 0.11], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
               'colsample_bylevel': [0.6, 0.7, 0.8] }

# name_ls ( y columns == class 4 values)
name_ls = ['hhb','hbo2','ca','na']

# final predict values (submit DataFrame)
tmp_dic = dict()

# xgb model feature importance
Example #32
# Z-score standardization (StandardScaler)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
regressor = KNeighborsRegressor()
regressor.fit(X_train_scaled, Y_train)
Y_est = regressor.predict(X_test_scaled)
print("MAE=", mean_squared_error(Y_test, Y_est))

# In[13]:

# Robust scaling (RobustScaler)
scaler2 = RobustScaler()
X_train_scaled = scaler2.fit_transform(X_train)
X_test_scaled = scaler2.transform(X_test)
regressor = KNeighborsRegressor()
regressor.fit(X_train_scaled, Y_train)
Y_est = regressor.predict(X_test_scaled)
print("MAE=", mean_squared_error(Y_test, Y_est))

# In[14]:

# Apply a non-linear correction to a specific feature
non_linear_feat = 5
X_train_new_feat = np.sqrt(X_train[:, non_linear_feat])
X_test_new_feat = np.sqrt(X_test[:, non_linear_feat])

X_train_new_feat.shape = (X_train_new_feat.shape[0], 1)
X_train_extended = np.hstack([X_train, X_train_new_feat])
Example #33
                                                            random_state=6)
ca_x, ca_x_test, ca_y, ca_y_test = train_test_split(ca_x,
                                                    ca_y,
                                                    test_size=0.1,
                                                    random_state=6)
na_x, na_x_test, na_y, na_y_test = train_test_split(na_x,
                                                    na_y,
                                                    test_size=0.1,
                                                    random_state=6)

# scalling
scaler = RobustScaler()
# scaler = MinMaxScaler()

hhb_x = scaler.fit_transform(hhb_x)
hhb_x_test = scaler.transform(hhb_x_test)
x_pred_hhb = scaler.transform(x_pred_hhb)

hbo2_x = scaler.fit_transform(hbo2_x)
hbo2_x_test = scaler.transform(hbo2_x_test)
x_pred_hbo2 = scaler.transform(x_pred_hbo2)

ca_x = scaler.fit_transform(ca_x)
ca_x_test = scaler.transform(ca_x_test)
x_pred_ca = scaler.transform(x_pred_ca)

na_x = scaler.fit_transform(na_x)
na_x_test = scaler.transform(na_x_test)
x_pred_na = scaler.transform(x_pred_na)

# modelling
Example #34
df2_test = df2_test[df2_test['activity'] != 'r2.Dress']

#Separating the label from the data
Y = df2['activity']
np.unique(Y)
Y = label_encoder.fit_transform(Y)
Y = Y.reshape(Y.shape[0], 1)
np.unique(Y)

df2.shape
X = df2[df2.columns[:-1]]
X.shape

#Scaling the data
transformer = RobustScaler().fit(X)
X = transformer.transform(X)
X
###validation split
X, valX, Y, valY = train_test_split(X, Y, test_size=0.2, random_state=0)
X.shape

model = keras.Sequential([
    keras.layers.Dense(1000, activation='relu', input_dim=32),
    keras.layers.Dense(800, activation='relu'),
    keras.layers.Dense(640, activation='relu'),
    keras.layers.Dense(580, activation='relu'),
    keras.layers.Dense(330, activation='relu'),
    keras.layers.Dense(250, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(42, activation='softmax')
])
Example #35
def main(args):
    out_file_name = "results.log"

    if args.classify:
        # Cast to list to keep it all in memory
        train = list(csv.reader(open(args.train_file, 'r')))
        test = list(csv.reader(open(args.test_file, 'r')))

        x_train = np.array(train[1:], dtype=float)
        
        x_test = np.array(test[1:], dtype=float)
        
        train_labels_file = open(args.train_labels)
        y_train = np.array([int(x.strip()) for x in train_labels_file.readlines()])

        test_labels_file = open(args.test_labels)
        y_test = np.array([int(x.strip()) for x in test_labels_file.readlines()])
        train_labels_file.close()
        test_labels_file.close()

        if args.sampling_technique:
            print "Attempting to use sampling technique: " + args.sampling_technique
            if np.isnan(args.sampling_ratio):
                print "Unable to use sampling technique. Ratio is NaN."
            else:
                x_train, y_train = __get_sample_transformed_examples(args.sampling_technique,
                                                                     x_train, y_train,
                                                                     args.sampling_ratio)

        if args.scale:
            scaler = RobustScaler()
            x_train = scaler.fit_transform(x_train)
            x_test = scaler.transform(x_test)
        for classifier in args.classifiers:
            model = __get_classifier_model(classifier, args)
            print "Using classifier " + classifier
            print "Fitting data to model"
            if args.grid_search:
                print "Applying parameter tuning to model"
                if classifier == LOG_REG:
                    parameters = {'loss':('log','hinge'), 'penalty':('l2', 'l1'), 'shuffle':[True], 'n_iter':[5], 'n_jobs':[-1], 'random_state':[179]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == SVM:
                    parameters = {'kernel':('rbf', 'poly'), 'cache_size':[8096], 'random_state':[17]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == ADA_BOOST:
                    parameters = {'n_estimators':[300], 'random_state':[13]}
                    model = grid_search.GridSearchCV(model, parameters, scoring=roc_auc_score, verbose=2)
                elif classifier == RF:
                    parameters = {'criterion':('gini', 'entropy'), 'n_jobs':[-1], 'n_estimators':[300], 'random_state':[17]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == GRADIENT_BOOST:
                    parameters = {'n_estimators':[300], 'random_state':[17]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == EXTRA_TREES:
                    parameters = {'n_estimators':[300], 'random_state':[17], 'n_jobs':[-1], 'criterion':('gini', 'entropy'), 'max_features':('log2', 40, 0.4), 'max_features':[40, 0.4], 'bootstrap':[True, False], 'bootstrap_features':[True, False]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == BAGGING:
                    parameters = {'n_estimators':[300], 'random_state':[17], 'max_samples': [.4, 30],'max_features':[40, 0.4], 'bootstrap':[True, False], 'bootstrap_features':[True, False], 'n_jobs':[-1]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                print "Best params: " + str(model.best_params_)
                    
            clf = model.fit(x_train, y_train)
            print "Parameters used in model:"
            #print clf.get_params(deep=False)
            if args.select_best:
                # Unable to use BaggingClassifier with SelectFromModel
                if classifier != BAGGING:
                    print "Selecting best features"
                    sfm = SelectFromModel(clf, prefit=True)
                    x_train = sfm.transform(x_train)
                    x_test = sfm.transform(x_test)
                    clf = model.fit(x_train, y_train)
            __print_and_log_results(clf, classifier, x_train, x_test, y_test,
                                    out_file_name, args)

    elif args.cross_validate:
        # Cast to list to keep it all in memory
        labels_file = open(args.labels)
        labels = np.array([int(x.strip()) for x in labels_file.readlines()])
        labels_file.close()
        data_file = open(args.data_file, 'r')
        data = list(csv.reader(data_file))
        data_file.close()
        examples = np.array(data[1:], dtype=float)
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(examples, labels, test_size=0.1)

        if args.sampling_technique:
            print "Attempting to use sampling technique: " + args.sampling_technique
            if np.isnan(args.sampling_ratio):
                print "Unable to use sampling technique. Ratio is NaN."
            else:
                X_train, y_train = __get_sample_transformed_examples(args.sampling_technique,
                                                                     X_train, y_train,
                                                                     args.sampling_ratio)
        if args.scale:
            scaler = StandardScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)
        for classifier in args.classifiers:
            print "Using classifier " + classifier
            model = __get_classifier_model(classifier, args)
            print "Fitting model"
            if args.grid_search:
                print "Applying parameter tuning to model"
                if classifier == LOG_REG:
                    parameters = {'loss':('log','hinge'), 'penalty':('l2', 'l1'), 'shuffle':[True], 'n_iter':[5], 'n_jobs':[-1], 'random_state':[179]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == SVM:
                    parameters = {'kernel':('rbf', 'poly'), 'cache_size':[8096], 'random_state':[17]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == ADA_BOOST:
                    parameters = {'n_estimators':[300], 'random_state':[13]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == RF:
                    parameters = {'criterion':('gini', 'entropy'), 'n_jobs':[-1], 'n_estimators':[300], 'random_state':[17]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == GRADIENT_BOOST:
                    parameters = {'n_estimators':[300], 'random_state':[17]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == EXTRA_TREES:
                    parameters = {'n_estimators':[300], 'random_state':[17], 'n_jobs':[-1], 'criterion':('gini', 'entropy'), 'max_features':('log2', 40, 0.4), 'max_features':[40, 0.4], 'bootstrap':[True, False], 'bootstrap_features':[True, False]}
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
                elif classifier == BAGGING:
                    #parameters = {'n_estimators' : [400], 'random_state' : [17],
                    #              'max_samples' : np.arange(0.5, 0.9, 0.1),
                    #              'max_features' : np.arange(0.5, 0.9, 0.1),
                    #              'bootstrap':[False], 'bootstrap_features':[False], 'n_jobs':[-1]}
                    parameters = {"base_estimator__criterion" : ["gini", "entropy"],
                                  "base_estimator__splitter" : ["best", "random"],
                                  "base_estimator__max_depth" : [10, 15, 20, 25], 
                                  "base_estimator__class_weight" : ['balanced'],
                                  "base_estimator__max_features" : ['auto', 'log2']
                                  }
                    model = grid_search.GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)
            clf = model.fit(X_train, y_train)
            if args.grid_search:
                print "Best params: " + str(model.best_params_)
            if args.select_best:
                if classifier != BAGGING:
                    print "Selecting best features"
                    sfm = SelectFromModel(clf, prefit = True)
                    X_train = sfm.transform(X_train)
                    X_test = sfm.transform(X_test)
                    clf = model.fit(X_train, y_train)
            print "Evaluating results"
            __print_and_log_results(clf, classifier, X_train, X_test, y_test,
                                    out_file_name, args)
    elif args.kfold:
        # Cast to list to keep it all in memory
        data_file = open(args.data_file, 'r')
        data = list(csv.reader(data_file))
        data_file.close()
        labels_file = open(args.labels)
        labels = np.array([int(x.strip()) for x in labels_file.readlines()])
        labels_file.close()
        X = np.array(data[1:], dtype=float)
        kf = KFold(len(X), n_folds=10, shuffle=True, random_state=42)
        for train, test in kf:
            print "kfold loop iterate"
            X_train, X_test, y_train, y_test = X[train], X[test], labels[train], labels[test]

            if args.sampling_technique:
                print "Attempting to use sampling technique: " + args.sampling_technique
                if np.isnan(args.sampling_ratio):  # NaN never compares equal to itself, so use np.isnan
                    print "Unable to use sampling technique. Ratio is NaN."
                else:
                    X_train, y_train = __get_sample_transformed_examples(args.sampling_technique,
                                                                     X_train, y_train,
                                                                     args.sampling_ratio)
            if args.scale:
                scaler = StandardScaler().fit(X_train)
                X_train = scaler.transform(X_train)
                X_test = scaler.transform(X_test)

            for classifier in args.classifiers:
                print "Using classifier " + classifier
                model = __get_classifier_model(classifier, args)
                print "Fitting model"
                clf = model.fit(X_train, y_train)
                if args.select_best:
                    if classifier != BAGGING:
                        sfm = SelectFromModel(clf, prefit = True)
                        X_train = sfm.transform(X_train)
                        X_test = sfm.transform(X_test)
                        clf = model.fit(X_train, y_train)
                print "Evaluating results"
                __print_and_log_results(clf, classifier, X_train, X_test, y_test,
                                        out_file_name, args)
        print "kfold loop done"
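The snippet above targets the pre-0.18 scikit-learn API (`grid_search.GridSearchCV`, `KFold(len(X), n_folds=10)`). A minimal sketch of the equivalent calls in current scikit-learn, for reference only and not part of the original script:

from sklearn.model_selection import GridSearchCV, KFold

# grid search: same idea, the module moved to sklearn.model_selection
model = GridSearchCV(model, parameters, scoring='roc_auc', verbose=2)

# k-fold: indices now come from kf.split(X) instead of iterating the object directly
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for train_idx, test_idx in kf.split(X):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]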
示例#36
0
def main():

    protocol = 'modeller_fast'

    with open('descriptors-avg.json', 'r') as fp:
        alldata = json.load(fp)

    desc = set()
    for cpx, cpxdata in alldata.items():
        desc |= set(cpxdata[protocol].keys())
    desc = [
        d for d in desc if not d.startswith('>') and d != 'NRES'
        and d != 'AGBNP' and d != 'GBMV_POL' and d != 'SOAP-Protein-OD'
    ]
    print('Number of descriptors:', len(desc))
    cpxs = [c for c in alldata.keys() if c.startswith('FY')]

    data = np.zeros((len(cpxs), len(desc)), dtype=float)
    for i, d in enumerate(desc):
        for j, c in enumerate(cpxs):
            data[j][i] = alldata[c][protocol][d]

    scaler = RobustScaler().fit(data)
    X = scaler.transform(data)
    X = X.transpose()

    # Histograms
    numd = len(desc)
    ncols = 10
    nrows = math.ceil(numd / ncols)
    plt.figure(figsize=(3 * ncols, 3 * nrows - 0.5))
    plt.subplots_adjust(hspace=0.4, wspace=0.3)
    for n, d in enumerate(desc):
        plt.subplot(nrows, ncols, n + 1)
        plt.title(d)
        plt.hist(X[n], bins='auto')
    plt.savefig('fig/histograms.png', bbox_inches='tight', dpi=300)
    plt.clf()

    # Dendrogram
    method = 'complete'  # complete or average seem better
    Z = linkage(X, method=method, metric='correlation', optimal_ordering=True)
    fig = plt.figure(figsize=(6, 10))
    dn = dendrogram(Z, orientation='right', labels=desc)
    plt.savefig('fig/dendogram-%s.png' % (method),
                bbox_inches='tight',
                dpi=300)
    plt.clf()

    # Reorder based on dendrogram
    labels = list(reversed(dn['ivl']))
    ndx = [desc.index(l) for l in labels]
    X = X[ndx, :]

    # Cross-correlation matrix
    size = len(desc)
    mtx = np.ones((size, size), dtype=float)
    for i in range(size):
        for j in range(i + 1, size):
            rp = stats.pearsonr(X[i], X[j])[0]
            rs = stats.spearmanr(X[i], X[j])[0]
            mtx[i][j] = rp
            mtx[j][i] = rs
    plot_crosscorr(mtx, labels, 'fig/crosscorr-%s.png' % (method))

    # Explained_variance
    pca = sklearn.decomposition.PCA().fit(X.transpose())
    nc = pca.n_components_
    cumul = np.zeros(nc)
    for i in range(nc):
        cumul[i] = pca.explained_variance_ratio_[i]
        if (i > 0): cumul[i] += cumul[i - 1]
    cut = 25
    plt.figure(figsize=(6.4, 4.8))
    plt.plot(range(1, cut + 1),
             cumul[:cut],
             'ro-',
             label='Cumulative explained variance')
    plt.bar(range(1, cut + 1),
            pca.explained_variance_ratio_[:cut],
            label='Explained variance ratio')
    plt.ylim(-0.05, 1.05)
    plt.xlabel('Number of components')
    plt.ylabel('Explained Variance')
    plt.xticks(range(1, cut + 1))
    plt.legend()
    plt.savefig('fig/explained_variance.png', bbox_inches='tight', dpi=300)
    plt.clf()
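The cumulative explained-variance loop above can be collapsed into a single NumPy call; a small equivalent sketch using the same fitted `pca` object:

import numpy as np

# identical to the loop above: running sum of the per-component ratios
cumul = np.cumsum(pca.explained_variance_ratio_)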
X1 = np.random.multivariate_normal(mean=mu1, cov=Cov, size=n_datapoints)
X2 = np.random.multivariate_normal(mean=mu2, cov=Cov, size=n_datapoints)
Y_test = np.hstack([[-1]*n_datapoints, [1]*n_datapoints])
X_test = np.vstack([X1, X2])

X_train[0, 0] = -1000  # a fairly large outlier


# Scale data
standard_scaler = StandardScaler()
Xtr_s = standard_scaler.fit_transform(X_train)
Xte_s = standard_scaler.transform(X_test)

robust_scaler = RobustScaler()
Xtr_r = robust_scaler.fit_transform(X_train)
Xte_r = robust_scaler.transform(X_test)


# Plot data
fig, ax = plt.subplots(1, 3, figsize=(12, 4))
ax[0].scatter(X_train[:, 0], X_train[:, 1],
              color=np.where(Y_train > 0, 'r', 'b'))
ax[1].scatter(Xtr_s[:, 0], Xtr_s[:, 1], color=np.where(Y_train > 0, 'r', 'b'))
ax[2].scatter(Xtr_r[:, 0], Xtr_r[:, 1], color=np.where(Y_train > 0, 'r', 'b'))
ax[0].set_title("Unscaled data")
ax[1].set_title("After standard scaling (zoomed in)")
ax[2].set_title("After robust scaling (zoomed in)")
# for the scaled data, we zoom in to the data center (outlier can't be seen!)
for a in ax[1:]:
    a.set_xlim(-3, 3)
    a.set_ylim(-3, 3)
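To see numerically why the robust scaler copes better with the injected outlier, one can compare the fitted statistics of the two scalers (attribute names as in scikit-learn; the exact numbers depend on the random draw above):

# the -1000 outlier drags the mean and inflates the std used by StandardScaler,
# while the median/IQR used by RobustScaler barely move
print("StandardScaler: mean =", standard_scaler.mean_, " scale =", standard_scaler.scale_)
print("RobustScaler:   center =", robust_scaler.center_, " scale =", robust_scaler.scale_)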
示例#38
0
print('is_anomaly_test_count',total_rows)

# Remove the is_anomaly column from the train and test data, since this is semi-supervised learning
del train['is_anomaly']
del test['is_anomaly']

print(train.shape, test.shape)

#-------------------------
# Step 5 Scaling of data
#-------------------------
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaler = scaler.fit(train[['value']])

train['value'] = scaler.transform(train[['value']])
test['value'] = scaler.transform(test[['value']])

#-----------------------------------
# Step 6 - Prepare Input for LSTM
#-----------------------------------
# We'll split the data into sub-sequences, reshaping the input into the form expected by
# the LSTM autoencoder

def create_dataset(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        v = X.iloc[i:(i + time_steps)].values
        Xs.append(v)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)
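A minimal usage sketch for `create_dataset`, assuming the `train`/`test` frames prepared above; the 30-step window is an illustrative choice, not taken from the original:

TIME_STEPS = 30  # illustrative window length

X_train_seq, y_train_seq = create_dataset(train[['value']], train['value'], TIME_STEPS)
X_test_seq, y_test_seq = create_dataset(test[['value']], test['value'], TIME_STEPS)
print(X_train_seq.shape)  # (n_windows, TIME_STEPS, 1), ready for an LSTM autoencoder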
devtest='./exp/ivectors_semeval_devtest_NGMM_2048_W_2_DIM_200/feats.txt'
dev='./exp/ivectors_semeval_dev_NGMM_2048_W_2_DIM_200/feats.txt'
train='./exp/ivectors_semeval_train_NGMM_2048_W_2_DIM_200/feats.txt'



trainy,trainx=imdb_bag_of_word_libs.loadFeatsText(train)
trainy=imdb_bag_of_word_libs.kaldiID_2_LB(trainy)
evaly,evalx=imdb_bag_of_word_libs.loadFeatsText(dev)
evaly=imdb_bag_of_word_libs.kaldiID_2_LB(evaly)

evaly2,evalx2=imdb_bag_of_word_libs.loadFeatsText(devtest)
evaly2=imdb_bag_of_word_libs.kaldiID_2_LB(evaly2)


robust_scaler = RobustScaler()
trainx=robust_scaler.fit_transform(trainx)
evalx=robust_scaler.transform(evalx)

clf= LinearDiscriminantAnalysis() #
clf.fit(trainx,trainy)
predictValue=clf.predict(evalx)

print semeval2016_libs.scoreSameOrder(predictValue,configure.SCORE_REF_DEV)

evalx2=robust_scaler.transform(evalx2)
predictValue=clf.predict(evalx2)


print semeval2016_libs.scoreSameOrder(predictValue,configure.SCORE_REF_DEVTEST)
示例#40
0
文件: part1.py 项目: rbaxter1/CS7641
 def gmm_analysis(self, X_train, X_test, y_train, y_test, data_set_name, max_clusters, analysis_name='GMM'):
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     em_bic = []
     em_aic = []
     em_completeness_score = []
     em_homogeneity_score = []
     em_measure_score = []
     em_adjusted_rand_score = []
     em_adjusted_mutual_info_score = []
     
     cluster_range = np.arange(2, max_clusters+1, 1)
     for k in cluster_range:
         print('K Clusters: ', k)
         
         ##
         ## Expectation Maximization
         ##
         em = GaussianMixture(n_components=k, covariance_type='full')
         em.fit(X_train_scl)
         em_pred = em.predict(X_train_scl)
         
         em_bic.append(em.bic(X_train_scl))
         em_aic.append(em.aic(X_train_scl))        
     
         # metrics
         y_train_score = y_train.reshape(y_train.shape[0],)
         
         em_homogeneity_score.append(homogeneity_score(y_train_score, em_pred))
         em_completeness_score.append(completeness_score(y_train_score, em_pred))
         em_measure_score.append(v_measure_score(y_train_score, em_pred))
         em_adjusted_rand_score.append(adjusted_rand_score(y_train_score, em_pred))
         em_adjusted_mutual_info_score.append(adjusted_mutual_info_score(y_train_score, em_pred))
         
     
     ##
     ## Plots
     ##
     ph = plot_helper()
     
     ##
     ## BIC/AIC Plot
     ##
     title = 'Information Criterion Plot (' + analysis_name + ') for ' + data_set_name
     name = data_set_name.lower() + '_' + analysis_name.lower() + '_ic'
     filename = './' + self.out_dir + '/' + name + '.png'
     
     ph.plot_series(cluster_range,
                 [em_bic, em_aic],
                 [None, None],
                 ['bic', 'aic'],
                 cm.viridis(np.linspace(0, 1, 2)),
                 ['o', '*'],
                 title,
                 'Number of Clusters',
                 'Information Criterion',
                 filename)
     
     ##
     ## Score Plot
     ##
     title = 'Score Summary Plot (' + analysis_name + ') for ' + data_set_name
     name = data_set_name.lower() + '_' + analysis_name.lower() + '_score'
     filename = './' + self.out_dir + '/' + name + '.png'
                 
     ph.plot_series(cluster_range,
                 [em_homogeneity_score, em_completeness_score, em_measure_score, em_adjusted_rand_score, em_adjusted_mutual_info_score],
                 [None, None, None, None, None, None],
                 ['homogeneity', 'completeness', 'measure', 'adjusted_rand', 'adjusted_mutual_info'],
                 cm.viridis(np.linspace(0, 1, 5)),
                 ['o', '^', 'v', '>', '<', '1'],
                 title,
                 'Number of Clusters',
                 'Score',
                 filename)
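As a complement to the information-criterion plot, the preferred number of components can be read off directly from the lists built in the loop; a one-line sketch:

best_k = cluster_range[np.argmin(em_bic)]  # the lowest BIC marks the preferred model size
print('Best k by BIC:', best_k)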
示例#41
0
X_to_predict = X_to_predict.values
Y_labels = Y_labels.values

#Normalized
#transformer = Normalizer().fit(X_train_test)
#X_train_test = transformer.transform(X_train_test)
#X_to_predict = transformer.transform(X_to_predict)

#Scaling
#transformer = PowerTransformer().fit(X_train_test)
#X_train_test = transformer.transform(X_train_test)
#X_to_predict = transformer.transform(X_to_predict)

#Scaling
transformer = RobustScaler().fit(X_train_test)
X_train_test = transformer.transform(X_train_test)
X_to_predict = transformer.transform(X_to_predict)

#PCA
#pca_model = PCA(n_components=10, svd_solver='full')
#X_train_test = pca_model.fit_transform(X_train_test, Y_labels)
#X_to_predict = pca_model.transform(X_to_predict)


#Select best features
#selecter = SelectKBest(chi2, k=6)
#X_train_test = selecter.fit_transform(X_train_test, Y_labels)
#X_to_predict = selecter.transform(X_to_predict)


gettingDistributionOfDatas()
示例#42
0
    print(cv_results['test_score'])
    print("mean of CV scores:")
    print(mean(cv_results['test_score']))

    print("cross_validation scores:", file=res)
    print(cv_results['test_score'], file=res)
    print("mean of CV scores:", file=res)
    print(mean(cv_results['test_score']), file=res)

    # TEST

    test_set = pd.read_csv("X_test.csv")
    x_test = test_set.drop('id', axis=1)
    # missing values
    x_test_filled = imputer.transform(x_test)
    x_test = pd.DataFrame(x_test_filled)
    # scaling
    x_test_scaled = scaler.transform(x_test)
    cols = list(x_test.columns.values)
    x_test = pd.DataFrame(data=x_test_scaled, columns=cols)
    # feature selection
    x_test = pd.DataFrame(data=x_test, columns=new_features)
    # prediction
    y_test = reg.predict(x_test)
    Id = test_set['id']
    df = pd.DataFrame(Id)
    df.insert(1, "y", y_test)
    df.to_csv(('solution1.csv_' + str(n_estimators) + 'estimators_' +
               str(max_iter) + 'max_iter'),
              index=False)
示例#43
0
test.drop('Id', axis=1, inplace=True)

x = train.drop('SalePrice', axis=1)  #Drop Target feature from train.
y = train['SalePrice']
test = test.drop('SalePrice', axis=1)

#known outliers (some from author notes and some from notebook guides)
outliers = [30, 88, 462, 631, 1322]
x = x.drop(x.index[outliers])
y = y.drop(y.index[outliers])

x = x.drop('MSSubClass_150', axis=1)
test = test.drop('MSSubClass_150', axis=1)

#RobustScaler centers and scales the data using statistics that are robust to outliers.
sc = RobustScaler()
x = sc.fit_transform(x)
test = sc.transform(test)

#Train
model = Lasso(alpha=0.0005, random_state=1)  #other alphas were tried too.
model.fit(x, y)

#Predict
pred = model.predict(test)
predFinal = np.exp(pred)  #Revert the log.

#Data export
output = pd.DataFrame({'Id': test2.Id, 'SalePrice': predFinal})
output.to_csv('submission.csv', index=False)
output.head()
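The `np.exp(pred)` step only makes sense if the target was log-transformed before this excerpt; a sketch of that assumed earlier step (not shown in the original):

# assumed earlier preprocessing: SalePrice was log-transformed, which is why the
# prediction is reverted with np.exp above
train['SalePrice'] = np.log(train['SalePrice'])  # np.log1p / np.expm1 is another common pairing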
示例#44
0
文件: part1.py 项目: rbaxter1/CS7641
 def kmeans_analysis(self, X_train, X_test, y_train, y_test, data_set_name, max_clusters, analysis_name='K-Means'):
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     km_inertias = []
     km_completeness_score = []
     km_homogeneity_score = []
     km_measure_score = []
     km_adjusted_rand_score = []
     km_adjusted_mutual_info_score = []
     
     cluster_range = np.arange(2, max_clusters+1, 1)
     for k in cluster_range:
         print('K Clusters: ', k)
         ##
         ## KMeans
         ##
         km = KMeans(n_clusters=k, algorithm='full', n_jobs=-1)
         km.fit(X_train_scl)
         
         # inertia is the sum of squared distances from each point to its cluster center
         km_inertias.append(km.inertia_)
         
         # metrics
         y_train_score = y_train.reshape(y_train.shape[0],)
         
         km_homogeneity_score.append(homogeneity_score(y_train_score, km.labels_))
         km_completeness_score.append(completeness_score(y_train_score, km.labels_))
         km_measure_score.append(v_measure_score(y_train_score, km.labels_))
         km_adjusted_rand_score.append(adjusted_rand_score(y_train_score, km.labels_))
         km_adjusted_mutual_info_score.append(adjusted_mutual_info_score(y_train_score, km.labels_))
         
         ##
         ## Silhouette Plot
         ##
         title = 'Silhouette Plot (' + analysis_name + ', k=' + str(k) + ') for ' + data_set_name
         name = data_set_name.lower() + '_' + analysis_name.lower() + '_silhouette_' + str(k)
         filename = './' + self.out_dir + '/' + name + '.png'
         
         self.silhouette_plot(X_train_scl, km.labels_, title, filename)
         
     ##
     ## Plots
     ##
     ph = plot_helper()
     
     ##
     ## Elbow Plot
     ##
     title = 'Elbow Plot (' + analysis_name + ') for ' + data_set_name
     name = data_set_name.lower() + '_' + analysis_name.lower() + '_elbow'
     filename = './' + self.out_dir + '/' + name + '.png'
     
     # line to help visualize the elbow
     lin = ph.extended_line_from_first_two_points(km_inertias, 0, 2)
     
     ph.plot_series(cluster_range,
                 [km_inertias, lin],
                 [None, None],
                 ['inertia', 'projected'],
                 cm.viridis(np.linspace(0, 1, 2)),
                 ['o', ''],
                 title,
                 'Number of Clusters',
                 'Inertia',
                 filename)
     
     ##
     ## Score Plot
     ##
     title = 'Score Summary Plot (' + analysis_name + ') for ' + data_set_name
     name = data_set_name.lower() + '_' + analysis_name.lower() + '_score'
     filename = './' + self.out_dir + '/' + name + '.png'
                 
     ph.plot_series(cluster_range,
                 [km_homogeneity_score, km_completeness_score, km_measure_score, km_adjusted_rand_score, km_adjusted_mutual_info_score],
                 [None, None, None, None, None, None],
                 ['homogeneity', 'completeness', 'measure', 'adjusted_rand', 'adjusted_mutual_info'],
                 cm.viridis(np.linspace(0, 1, 5)),
                 ['o', '^', 'v', '>', '<', '1'],
                 title,
                 'Number of Clusters',
                 'Score',
                 filename)
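Alongside the per-k silhouette plots, the mean silhouette score gives a single number per k that is easy to compare; a small sketch of what could be added inside the loop above:

from sklearn.metrics import silhouette_score

# one value per k, computed on the same scaled training data and cluster labels as the plot
sil = silhouette_score(X_train_scl, km.labels_)
print('k =', k, 'mean silhouette =', sil)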
示例#45
0
        
train.to_csv(path_or_buf= filepath + "/trainfinal.csv", index=False)
test.to_csv(path_or_buf= filepath + "/testfinal.csv", index=False)
print("Exported")
train = []
test = []

#Obtaining the columns required for training the model
train = pd.read_csv(filepath + "/trainfinal.csv")
test = pd.read_csv(filepath + "/testfinal.csv")
cols = [c for c in train.columns if c not in ['is_churn','msno']]

#Pre-processing the file with Robust Scaler
scaler = RobustScaler()
scaler.fit(train[cols])
train_x = scaler.transform(train[cols])
test_x = scaler.transform(test[cols])
train_y = train['is_churn']
print("Pre-processing completed")

#Training Random Forest Classifier
model = RandomForestClassifier(n_estimators = 50)
model.fit(train_x,train_y)
print("Training Completed")

#Predicting the test data with the trained model
predictions = model.predict(test_x)

#Exporting the msno and predicted values to a csv file
submission = pd.DataFrame()
submission['msno'] = test['msno']
示例#46
0
y = eeg_dataset[['class']].values.ravel()

# Split the data

x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=.7,
                                                    test_size=.3,
                                                    random_state=25)

# Feature scaling
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()

scaler.fit(x_train)  # fit the scaler on the training split
x_train = scaler.transform(x_train)

x_test = scaler.transform(x_test)

# Model architecture
max_features = 512

model = Sequential()
model.add(Embedding(max_features, output_dim=64))
model.add(LSTM(64))
model.add(Dropout(0.8))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
    """
    Method to generate box plot
    :param data: Pandas dataframe to be plotted
    """
    assert data is not None
    data2 = pd.melt(data, id_vars='Label')
    sns.boxplot(x='variable',
                y='value',
                hue='Label',
                vert=False,
                data=data2,
                showfliers=False)
    plt.savefig('Figures/Boxplot.png')  # save before show(), otherwise the saved figure may be blank
    plt.show()


if __name__ == "__main__":
    train_data, train_weights, train_labels, test_data, *ret = import_from_csv(
        path='Datasets', drop_labels=False)

    # subsample data to 10%
    frac_train_data = train_data.sample(frac=0.1)

    # Normalize data
    rs = RobustScaler()
    rs = rs.fit(train_data.iloc[:, :-1])
    train_data.iloc[:, :-1] = rs.transform(train_data.iloc[:, :-1])

    box_plot_data(data=train_data)
    print("plot complete")
             'i_SN_3', 'log_i_err_SN_3', 'z_SN_3', 'log_z_err_SN_3',
             'y_SN_3', 'log_y_err_SN_3']

feat_SN_4 = ['g_SN_4', 'log_g_err_SN_4', 'r_SN_4', 'log_r_err_SN_4',
             'i_SN_4', 'log_i_err_SN_4', 'z_SN_4', 'log_z_err_SN_4',
             'y_SN_4', 'log_y_err_SN_4']

feat_SN_5 = ['g_SN_5', 'log_g_err_SN_5', 'r_SN_5', 'log_r_err_SN_5',
             'i_SN_5', 'log_i_err_SN_5', 'z_SN_5', 'log_z_err_SN_5',
             'y_SN_5', 'log_y_err_SN_5']

###  training features with robust scaler ###
X_train = RS.fit_transform(df_train[feat_train])

### validation features in different noise levels ###
X_valid_SN_1 = RS.transform(df_valid[feat_SN_1])
X_valid_SN_2 = RS.transform(df_valid[feat_SN_2])
X_valid_SN_3 = RS.transform(df_valid[feat_SN_3])
X_valid_SN_4 = RS.transform(df_valid[feat_SN_4])
X_valid_SN_5 = RS.transform(df_valid[feat_SN_5])

### The targets that we wish to learn ###
Y_train = df_train['redshift']
Y_valid = df_valid['redshift']

### Some scaling of the target between 0 and 1 ###
### so we can model it with a beta function ###
### given that the Beta function is not defined ###
### at 0 or 1, I've come up with this ugly hack ###
max_train_Y = Y_train.max() + 0.00001
min_train_Y = Y_train.min() - 0.00001
# --------------
# Scaling features to lie between a given minimum and maximum value, often between 0 and 1
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

print("\nMinMaxScaler:" "\n=============" "\nX_train:", X_train)
print('\nX_test:', X_test)

# --------------
#  ROBUSTSCALER |
# --------------
# This removes the median and scales the data according to the quantile range
robust_scaler = RobustScaler()
X_train = robust_scaler.fit_transform(X_train)
X_test = robust_scaler.transform(X_test)

print("\nRobustScaler:" "\n=============" "\nX_train:", X_train)
print('\nX_test:', X_test)

# --------------
#  NORMALIZER   |
# --------------
# Normalize samples individually to unit norm
# Each sample (each row of the data matrix) with at least one non-zero component is rescaled
# independently of other samples so that its norm (l1 or l2) equals 1
normalizer_scaler = Normalizer()
X_train = normalizer_scaler.fit_transform(X_train)
X_test = normalizer_scaler.transform(X_test)

print("\nNormalizer:" "\n===========" "\nX_train:", X_train)
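A quick sanity check on the Normalizer output is to look at the row norms of the transformed data; with the default norm='l2', every non-zero sample should come out with norm 1:

import numpy as np

# each row of the normalized matrix should have unit L2 norm
print(np.linalg.norm(X_train, axis=1))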
print 'done in',time.time()-ts,len(x),len(y)

y=imdb_bag_of_word_libs.kaldiID_2_LB(y)
print y[0],x[0]


x=np.array(x)
y=np.array(y)



trainx,trainy=x,y

robust_scaler = RobustScaler()
trainx=robust_scaler.fit_transform(trainx)
evalx=robust_scaler.transform(testx)
clf= LinearDiscriminantAnalysis()
clf.fit(trainx,trainy)
predictValue=clf.predict(evalx)

sdict=dict()
ptrue=list()
for id,score in zip(testy,predictValue):
    sdict[id]=score
    #print id,score
    truevalue=int(id.split('_')[2])
    if truevalue>=5:
        ptrue.append('1')
    else:
        ptrue.append('0')
示例#51
0
from sklearn.preprocessing import RobustScaler

f_columns = [
    '2_prev', '3_prev', '4_prev', '5_prev', '6_prev', '7_prev', '8_prev',
    '9_prev', '10_prev', '11_prev', '12_prev', 'MONTH', 'HOUR', 'WEEKDAY',
    'WEEKEND', 'Demand Forecast', 'SPOT Market Volume', 'Wind Forecast',
    'RoR Forecast', 'Yuk Tahmin Planı (MWh)', 'Market Clearing Price'
]

f_transformer = RobustScaler()
cnt_transformer = RobustScaler()

f_transformer = f_transformer.fit(train[f_columns].to_numpy())
cnt_transformer = cnt_transformer.fit(train[['NetOrder']])

train.loc[:, f_columns] = f_transformer.transform(train[f_columns].to_numpy())
train['NetOrder'] = cnt_transformer.transform(train[['NetOrder']])

test.loc[:, f_columns] = f_transformer.transform(test[f_columns].to_numpy())
test['NetOrder'] = cnt_transformer.transform(test[['NetOrder']])


def create_dataset(X, y, time_steps=1):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        v = X.iloc[i:(i + time_steps)].values
        Xs.append(v)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs).astype(np.float32), np.array(ys).astype(np.float32)

示例#52
0
def get_evoked_feats(f_list,
                     stim_chan,
                     sig_chan,
                     pre_win=1.,
                     post_win=1.5,
                     thresh=3,
                     t_thresh=0.1):

    all_evoked_burst = None
    IBI = []
    all_evoked_onset = []
    all_prev_onset = []
    stim_lockout_s = 1.

    for f in f_list:
        dat = pyabf.ABF(f)
        stim_id = abf.get_channel_id_by_label(dat, stim_chan)
        sig_id = abf.get_channel_id_by_label(dat, sig_chan)
        sr = dat.dataRate

        scl = RobustScaler()
        Y_cat = cat_sweeps(dat, sig_chan).T.ravel()
        scl.fit(Y_cat[:, np.newaxis])

        for ii in range(dat.sweepCount):
            dat.setSweep(ii, stim_id)
            stim_samp = rlab_signal.binary_onsets(dat.sweepY, 4.)[0]
            dat.setSweep(ii, sig_id)
            #             if sr == 10000:
            #                 print('Downsampling')
            #                 y = dat.sweepY
            #                 y = scipy.signal.decimate(y, 10)
            #                 sr = sr / 10
            #             else:
            #                 y = dat.sweepY
            y = dat.sweepY
            stim_lockout = int(stim_lockout_s * sr)
            yscl = scl.transform(y[:, np.newaxis]).ravel()
            yscl_NN = yscl - np.min(yscl)
            onsets, offsets = burst.detect_burst(yscl,
                                                 sr,
                                                 thresh=thresh,
                                                 t_thresh=t_thresh)
            # onsets, offsets = burst.rm_endpoint_bursts(yscl, onsets, offsets, pre_win * sr, post_win * sr)

            # Get the threshold crossing time of the bursts that happened within a time window of the evoked
            #Used to get the evoked burst shape
            try:
                evoked_onset_idx = np.where(
                    onsets > (stim_samp - int(pre_win / 9. * sr)))[0][0]
                next_onset_idx = evoked_onset_idx + 1
                prev_onset_idx = evoked_onset_idx - 1
                evoked_onset = onsets[evoked_onset_idx]
            except:
                IBI.append(np.nan)
                all_prev_onset.append(np.nan)
                all_evoked_onset.append(np.nan)
                evoked_burst = np.empty([int(pre_win * sr + post_win * sr), 1
                                         ]) * np.nan
                if all_evoked_burst is None:
                    all_evoked_burst = evoked_burst
                else:
                    all_evoked_burst = np.concatenate(
                        [all_evoked_burst, evoked_burst], axis=1)

                continue
                # evoked_burst = np.empty([int(pre_win * sr + post_win * sr), 1]) * np.nan

            if next_onset_idx > len(onsets) - 1:
                next_onset = np.nan
            else:
                next_onset = onsets[next_onset_idx]

            if prev_onset_idx < 0:
                prev_onset = np.nan
            else:
                prev_onset = onsets[prev_onset_idx]

            # Get the threshold crossing of the second burst after stim (good for IBI)

            if evoked_onset < int(stim_samp + stim_lockout):
                evoked_burst = burst.get_aligned_bursts(
                    yscl_NN, [evoked_onset], int(pre_win * sr),
                    int(post_win * sr))
                IBI.append(next_onset - evoked_onset)
                all_evoked_onset.append(evoked_onset)
                all_prev_onset.append(prev_onset)
            else:
                IBI.append(np.nan)
                all_prev_onset.append(np.nan)
                all_evoked_onset.append(np.nan)
                evoked_burst = np.empty([int(pre_win * sr + post_win * sr), 1
                                         ]) * np.nan

            if all_evoked_burst is None:
                all_evoked_burst = evoked_burst
            else:
                all_evoked_burst = np.concatenate(
                    [all_evoked_burst, evoked_burst], axis=1)
    evoked_onset = np.array(all_evoked_onset) / sr
    prev_onset = np.array(all_prev_onset) / sr
    IBI = np.array(IBI) / sr

    return (all_evoked_burst, evoked_onset, prev_onset, IBI)
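A minimal call sketch for `get_evoked_feats`; the file names and channel labels below are placeholders, not values from the original:

# hypothetical inputs -- substitute real ABF files and channel labels
files = ['sweep01.abf', 'sweep02.abf']
bursts, evoked_onset, prev_onset, ibi = get_evoked_feats(
    files, stim_chan='Stim', sig_chan='Vm', pre_win=1., post_win=1.5)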
示例#53
0
class Learned(Model):

    def __init__(self, *args, scale=False, center=False, **kwargs):
        """
        A machine learned model.  Beyond :class:`revscoring.Model`, a
        "Learned" model also implements
        :func:`~revscoring.scoring.models.Learned.fit` and
        :func:`~revscoring.scoring.models.Learned.cross_validate`.
        """
        super().__init__(*args, **kwargs)
        self.trained = None
        if scale or center:
            self.scaler = RobustScaler(with_centering=center,
                                       with_scaling=scale)
        else:
            self.scaler = None

        self.params.update({
            'scale': scale,
            'center': center
        })

    def train(self, values_labels):
        """
        Fits the model using labeled data by learning its shape.

        :Parameters:
            values_labels : [( `<feature_values>`, `<label>` )]
                an iterable of labeled data where <values_labels> is an ordered
                collection of predictive values that correspond to the
                :class:`revscoring.Feature` s provided to the constructor
        """
        raise NotImplementedError()

    def fit_scaler_and_transform(self, fv_vectors):
        """
        Fits the internal scaler to labeled data.

        :Parameters:
            fv_vectors : `iterable` (( `<feature_values>`, `<label>` ))
                an iterable of labeled data where <values_labels> is an ordered
                collection of predictive values that correspond to the
                `Feature` s provided to the constructor

        :Returns:
            A dictionary of model statistics.
        """
        if self.scaler is not None:
            return self.scaler.fit_transform(fv_vectors)
        else:
            return fv_vectors

    def apply_scaling(self, fv_vector):
        if self.scaler is not None:
            if not hasattr(self.scaler, "center_") and \
               not hasattr(self.scaler, "scale_"):
                raise RuntimeError("Cannot scale a vector before " +
                                   "training the scaler")
            fv_vector = self.scaler.transform([fv_vector])[0]

        return fv_vector

    def _clean_copy(self):
        raise NotImplementedError()

    def cross_validate(self, values_labels, folds=10, processes=1):
        """
        Trains and tests the model against folds of labeled data.

        :Parameters:
            values_labels : [( `<feature_values>`, `<label>` )]
                an iterable of labeled data where <values_labels> is an ordered
                collection of predictive values that correspond to the
                `Feature` s provided to the constructor
            folds : `int`
                the number of folds to split the labeled data into
            processes : `int`
                When set to 1, cross-validation will run in the parent thread.
                When set to 2 or greater, a :class:`multiprocessing.Pool` will
                be created.
        """
        folds_i = KFold(n_splits=folds, shuffle=True,
                        random_state=0)
        if processes == 1:
            mapper = map
        else:
            pool = Pool(processes=processes or cpu_count())
            mapper = pool.map
        results = mapper(self._cross_score,
                         ((i, [values_labels[i] for i in train_i],
                           [values_labels[i] for i in test_i])
                          for i, (train_i, test_i) in enumerate(
                              folds_i.split(values_labels))))
        agg_score_labels = []
        for score_labels in results:
            agg_score_labels.extend(score_labels)

        self.info['statistics'].fit(agg_score_labels)

        return self.info['statistics']

    def _cross_score(self, i_train_test):
        i, train_set, test_set = i_train_test
        logger.info("Performing cross-validation {0}...".format(i + 1))
        model = self._clean_copy()
        logger.debug("Training cross-validation for {0}...".format(i + 1))
        model.train(train_set)
        logger.debug("Scoring cross-validation for {0}...".format(i + 1))
        feature_values, labels = map(list, zip(*test_set))
        docs = model.score_many(feature_values)
        return list(zip(docs, labels))