Example #1
def loadData(path="../data/",k=5,log='add',pca_n=0,SEED=34):
	from pandas import DataFrame, read_csv
	from numpy import log as ln
	from sklearn.cross_validation import KFold
	from sklearn.preprocessing import LabelEncoder
	from sklearn.preprocessing import StandardScaler
	train = read_csv(path+"train.csv")
	test = read_csv(path+"test.csv")
	id = test.id
	target = train.target
	encoder = LabelEncoder()
	target_nnet = encoder.fit_transform(target).astype('int32')
	feat_names = [x for x in train.columns if x.startswith('feat')]
	train = train[feat_names].astype(float)
	test = test[feat_names]
	if log == 'add':
		for v in train.columns:
			train[v+'_log'] = ln(train[v]+1)
			test[v+'_log'] = ln(test[v]+1)
	elif log == 'replace':
		for v in train.columns:
			train[v] = ln(train[v]+1)
			test[v] = ln(test[v]+1)      
	if pca_n > 0:
		from sklearn.decomposition import PCA
		pca = PCA(pca_n)
		train = pca.fit_transform(train)
		test = pca.transform(test)
	scaler = StandardScaler()
	scaler.fit(train)
	train = DataFrame(scaler.transform(train),columns=['feat_'+str(x) for x in range(train.shape[1])])
	test = DataFrame(scaler.transform(test),columns=['feat_'+str(x) for x in range(train.shape[1])])
	cv = KFold(len(train), n_folds=k, shuffle=True, random_state=SEED)
	return train, test, target, target_nnet, id, cv, encoder
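Note that `sklearn.cross_validation` was removed in scikit-learn 0.20. A minimal sketch of the equivalent fold setup on current releases, assuming the rest of `loadData` stays unchanged, could be:

from sklearn.model_selection import KFold
kf = KFold(n_splits=k, shuffle=True, random_state=SEED)
cv = list(kf.split(train))  # list of (train_index, test_index) pairs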
Example #2
def lr_with_scale2():
    """
    Submission: lr_with_scale2_0704_03.csv
    E_val:
    E_in: 0.878996
    E_out: 0.8768131004917349
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    clf = LogisticRegressionCV(Cs=50, cv=5, scoring='roc_auc', n_jobs=-1,
                               class_weight='auto')
    clf.fit(X_scaled, y)
    logger.debug('Best C: %f', clf.C_[0])
    logger.debug('Cs: %s', clf.Cs_)
    logger.debug('Grid scores: %s', clf.scores_)
    logger.debug('Ein: %f', Util.auc_score(clf, X_scaled, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('lr', clf)]), 'lr_with_scale2_0704_03')
Example #3
class FeaturePreProcesser():
    def __init__(self):
        pass

    def fit(self,X):
        self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        self.imputer.fit(X)
        X = self.imputer.transform(X)

        self.std_scaler = StandardScaler()
        self.std_scaler.fit(X)

    def fit_transform(self, X):
        self.imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
        self.imputer.fit(X)
        X = self.imputer.transform(X)

        self.std_scaler = StandardScaler()
        self.std_scaler.fit(X)
        X = self.std_scaler.transform(X)

        return X
    def transform(self, X):
        X = self.imputer.transform(X)
        X = self.std_scaler.transform(X)
        return X
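`Imputer` was removed from `sklearn.preprocessing` in later scikit-learn releases. A rough modern equivalent of this preprocessor, sketched with `SimpleImputer` chained to `StandardScaler` in a `Pipeline`, might look like:

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# mean-imputation followed by standardization
preprocessor = Pipeline([
    ("impute", SimpleImputer(missing_values=np.nan, strategy="mean")),
    ("scale", StandardScaler()),
])
# X_clean = preprocessor.fit_transform(X); later: preprocessor.transform(X_new)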
Example #4
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                         n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)

    def fit(self, X, y):
        y = y.ravel()
        n_samples, n_lags, n_lats, n_lons = X.shape
        self.scaler.fit(X[:, -1].reshape(n_samples, -1))
        X = X.reshape(n_lags * n_samples, -1)
        connectivity = grid_to_graph(n_lats, n_lons)
        self.agglo.connectivity = connectivity
        X = self.scaler.transform(X)
        X = self.agglo.fit_transform(X)
        X = X.reshape(n_samples, -1)
        self.clf.fit(X, y)

    def predict(self, X):
        n_samples, n_lags, n_lats, n_lons = X.shape
        X = X.reshape(n_lags * n_samples, -1)
        X = self.scaler.transform(X)
        X = self.agglo.transform(X)
        X = X.reshape(n_samples, -1)
        return self.clf.predict(X)
Example #5
def rf2():
    """
    Submission: rf2_0704_04.csv
    3000 trees
    E_val: 0.871431
    E_in: 0.999998
    E_out:
    30000 trees
    E_val:
    E_in:
    E_out:
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import RandomForestClassifier

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rf = RandomForestClassifier(n_estimators=30000, oob_score=True, n_jobs=-1,
                                class_weight='auto', max_features='log2')
    rf.fit(X_scaled, y)

    logger.debug('Eval(oob): %f', rf.oob_score_)
    logger.debug('Ein: %f', Util.auc_score(rf, X_scaled, y))

    IO.cache(rf, Path.of_cache('rf.RandomForestClassifier.log2.pkl'))
    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('rf', rf)]), 'rf2_0704_04')
Example #6
def knn(x_train, y_train, x_valid):
    x_train=np.log(x_train+1)
    x_valid=np.log(x_valid+1)

    where_are_nan = np.isnan(x_train)
    where_are_inf = np.isinf(x_train)
    x_train[where_are_nan] = 0
    x_train[where_are_inf] = 0
    where_are_nan = np.isnan(x_valid)
    where_are_inf = np.isinf(x_valid)
    x_valid[where_are_nan] = 0
    x_valid[where_are_inf] = 0

    scale=StandardScaler()
    scale.fit(x_train)
    x_train=scale.transform(x_train)
    x_valid=scale.transform(x_valid)

    #pca = PCA(n_components=10)
    #pca.fit(x_train)
    #x_train = pca.transform(x_train)
    #x_valid = pca.transform(x_valid)

    kneighbors=KNeighborsClassifier(n_neighbors=200,n_jobs=-1)
    knn_train, knn_test = stacking(kneighbors, x_train, y_train, x_valid, "knn")
    return knn_train, knn_test, "knn"
Example #7
def test_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
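This test targets an older scikit-learn; current releases reject 1-D input to `fit`/`transform`, so a single feature has to be passed as a column vector. A minimal sketch of the reshaped call, under that assumption:

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.random.RandomState(0).randn(5).reshape(-1, 1)  # shape (5, 1): one feature, five samples
X_scaled = StandardScaler().fit_transform(X)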
Example #8
def process(discrete, cont):
  # Create discrete and continuous data matrices
  discrete_X = np.array(discrete)
  cont_X = np.array(cont)

  # Impute discrete values
  imp = Imputer(strategy='most_frequent')
  discrete_X = imp.fit_transform(discrete_X)

  # Impute continuous values
  imp_c = Imputer(strategy='mean')
  cont_X = imp_c.fit_transform(cont_X)

  # Discrete basis representation
  enc = OneHotEncoder()
  enc.fit(discrete_X)
  discrete_X = enc.transform(discrete_X).toarray()

  # Continuous scaling
  scaler = StandardScaler()
  scaler.fit(cont_X)
  cont_X = scaler.transform(cont_X)

  # Merge to one array
  X = np.concatenate((discrete_X, cont_X), axis=1)
  return X
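On recent scikit-learn versions the same discrete/continuous handling is usually expressed with `ColumnTransformer` over a single input matrix. A minimal sketch under that assumption (`discrete_cols` and `cont_cols` are hypothetical column-index lists, not names from the original code):

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

preprocess = ColumnTransformer([
    ("discrete", Pipeline([("impute", SimpleImputer(strategy="most_frequent")),
                           ("onehot", OneHotEncoder(handle_unknown="ignore"))]), discrete_cols),
    ("cont", Pipeline([("impute", SimpleImputer(strategy="mean")),
                       ("scale", StandardScaler())]), cont_cols),
])
# X = preprocess.fit_transform(full_matrix)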
Example #9
def load_data_csv(datafile):
    """
    Loads data from given CSV file. The first line in the given CSV file is expected to be the names of the columns.
    :param datafile: path of the file
    :return: a NumPy array containing a data point in each row
    """

    # File format for CSV file. For example, setting _X_COLUMN to 'x' means that x coordinates of geographical location
    # will be at the column named 'x' in the CSV file.
    # This will be useful later when we start adding more features.
    _COLUMN_X = 'x'
    _COLUMN_Y = 'y'
    _COLUMN_W = 'color'

    data = pd.read_csv(datafile)

    # Normalize
    scaler = StandardScaler()
    scaler.fit(data[[_COLUMN_X, _COLUMN_Y]])
    data[[_COLUMN_X, _COLUMN_Y]] = scaler.transform(data[[_COLUMN_X, _COLUMN_Y]])

    data_coords = data[[_COLUMN_X, _COLUMN_Y]].values
    data_words = [[e] for e in data[[_COLUMN_W]].values.flatten().tolist()]

    data = {"coordinates": data_coords, "words": data_words}

    return sparsify_data(data, None, None), scaler  # None for both params since SVD is not used
Example #10
def prepare_data():
    # prepare data
    from sklearn import datasets
    iris = datasets.load_iris()
    X = iris.data[:, [2, 3]]
    y = iris.target
    print('Class labels:', np.unique(y))
    print(X.shape, y.shape)
    
    # split train and test
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
    print(X_train.shape, X_test.shape)
    
    print('Labels counts in y:', np.bincount(y))
    print('Labels counts in y_train:', np.bincount(y_train))
    print('Labels counts in y_test:', np.bincount(y_test))
    
    # scaler
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    sc.fit(X_train)  # mean + sd of train data
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)
    return X_train_std, X_test_std, y_train, y_test
Example #11
def load_data_csv_advanced(datafile):
    """
    Loads data from given CSV file. The first line in the given CSV file is expected to be the names of the columns.
    :param datafile: path of the file
    :return: a NumPy array containing a data point in each row
    """

    # File format for CSV file. For example, setting _X_COLUMN to 'x' means that x coordinates of geographical location
    # will be at the column named 'x' in the CSV file.
    _COLUMN_X = 'x'
    _COLUMN_Y = 'y'

    data = pd.read_csv(datafile)

    # Normalize
    scaler = StandardScaler()
    scaler.fit(data[[_COLUMN_X, _COLUMN_Y]])
    data[[_COLUMN_X, _COLUMN_Y]] = scaler.transform(data[[_COLUMN_X, _COLUMN_Y]])

    #  Get feature vector names by removing "x" and "y"
    feature_vector_names = data.columns.difference([_COLUMN_X, _COLUMN_Y])
    data_coords = data[[_COLUMN_X, _COLUMN_Y]].values

    result = {"coordinates": data_coords}

    for feature in feature_vector_names:
        data_words = [[e.strip() for e in venue_data.split(",")] for venue_data in data[feature].values.flatten().tolist()]

        result[feature] = data_words

    return sparsify_data(result, None, None), scaler  # None for both params since SVD is not used
Example #12
File: model.py Project: nikizh/msc-project
class GPR(object):
    def __init__(self, X, y, kernel=None):
        self.X = X
        self.y = y

        self._noise_variance = 0.00001
        self._kernel = kernel
        self._scaler = StandardScaler(with_std=False)
        self._scaler.fit(self.y)
        self.y = self._scaler.transform(self.y)

        assert self._kernel is not None

    @property
    def noise_variance(self):
        return self._noise_variance

    @noise_variance.setter
    def noise_variance(self, value):
        self._noise_variance = value

    def predict(self, X_test):
        assert isinstance(self._kernel, Kern)

        K = self._kernel.K(self.X)
        K_star = self._kernel.K(self.X, X_test)
        K_star_star = self._kernel.K(X_test)

        L = np.linalg.cholesky(K + self._noise_variance * np.eye(len(K)))
        Lk = np.linalg.solve(L, K_star)
        mu = np.dot(Lk.T, np.linalg.solve(L, self.y))
        s2 = np.diag(K_star_star) - np.sum(Lk ** 2, axis=0) + self._noise_variance

        return mu + self._scaler.mean_, s2
Example #13
def get_features_and_labels(frame):
    '''
    Transforms and scales the input data and returns numpy arrays for
    training and testing inputs and targets.
    '''

    # Convert values to floats
    arr = np.array(frame, dtype=np.float)

    # Normalize the entire data set
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    arr = MinMaxScaler().fit_transform(arr)

    # Use the last column as the target value
    X, y = arr[:, :-1], arr[:, -1]
  
    # Use 50% of the data for training, but we will test against the
    # entire set
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    # Keep the full set as the evaluation set, per the note above
    X_test, y_test = X, y

    # Normalize the attribute values to mean=0 and variance=1
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    
    # Fit the scaler based on the training data, then apply the same
    # scaling to both training and test sets.
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Return the training and test sets
    return X_train, X_test, y_train, y_test
Example #14
def dbscan_outliers(df):
    """
    Find outliers (noise points) using DBSCAN.

    Parameters
    ----------
    df: A pandas.DataFrame

    Returns
    -------
    A tuple of (a sklearn.DBSCAN instance, a pandas.DataFrame)
    """

    scaler = StandardScaler()
    scaler.fit(df)
    scaled = scaler.transform(df)

    dbs = DBSCAN()

    db = dbs.fit(scaled)
    labels = db.labels_

    # DBSCAN marks noise points with the label -1
    df_o = df.loc[labels == -1]

    return db, df_o
Example #15
def sgd(X, y, weight, X_test=False):
    from sklearn.linear_model import SGDRegressor
    from sklearn import cross_validation
    from sklearn.metrics import confusion_matrix
    from sklearn.preprocessing import StandardScaler

    #X_train, X_test, y_train, y_test, weight_train, weight_test = cross_validation.train_test_split(
    #        X, y, weight, test_size=0.2, random_state=0)
    clf = SGDRegressor(loss="huber", n_iter=100, penalty="l1")
    #clf = LogisticRegression( max_iter=100)

    X_train = X
    y_train = y

    scaler = StandardScaler(with_mean=False)
    scaler.fit(X_train)  # Don't cheat - fit only on training data
    X_train = scaler.transform(X_train)

    X_test = scaler.transform(X_test)  # apply same transformation to test data

    clf.fit(X_train, y_train, sample_weight=weight)

    print(clf.score(X_train,y_train,weight))

    y_pred = clf.predict(X_test)
    
    from sklearn.externals import joblib
    import scipy.io as sio
    joblib.dump(clf, 'models/sgd_.pkl') 
    sio.savemat('predict_y_forward.mat', {'y':y_pred})
Example #16
def sgc_test(X, y, weight):
    from sklearn.linear_model import SGDClassifier
    from sklearn import cross_validation
    from sklearn.metrics import confusion_matrix
    from sklearn.preprocessing import StandardScaler

    for i in range(0,1):
        X_train, X_test, y_train, y_test, weight_train, weight_test = cross_validation.train_test_split(
            X, y, weight, test_size=0.2, random_state=0)
        clf = SGDClassifier(loss="hinge", n_iter=100, n_jobs=-1, penalty="l2")
        #clf = LogisticRegression( max_iter=100)

        scaler = StandardScaler(with_mean=False)
        scaler.fit(X_train)  # Don't cheat - fit only on training data
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)  # apply same transformation to test data

        clf.fit(X_train, y_train, sample_weight=weight_train)

        y_pred = clf.predict(X_train)
        #print(confusion_matrix(y_train, y_pred))
        print(clf.score(X_train,y_train,weight_train))

        y_pred = clf.predict(X_test)

        #print(confusion_matrix(y_test, y_pred))
        print(clf.score(X_test,y_test,weight_test))
Example #17
def standardize(x_data):
	print 'Started standardizing of the data'
	sc = StandardScaler()
	sc.fit(x_data)	
	x_std = sc.transform(x_data)        
	print 'Finished standardizing of the data'
	return x_std
Example #18
    def standard_scaler(self, rates):

        import fx2.fx_config as fxconf

        config = fxconf.FxConfig()
        SAVE_FLAG = config.get_scale_save_flag()
        SAVE_FILENAME = config.get_scale_save_filename()

        if SAVE_FLAG:
            fin = open(SAVE_FILENAME, "w")

        for i in range(len(rates)):
            ratesNp = np.array(rates[i])
            reshaped = ratesNp.reshape(-1, 1)  # reshape to match StandardScaler's expected input format
            scaler = StandardScaler()
            scaler.fit(reshaped)

            if SAVE_FLAG:
                self.__save_scaler_value(fin, scaler,i)

            dataStd = scaler.transform(reshaped)

            for j in range(0,len(rates[i])):
                rates[i][j] = dataStd[j][0]

        del ratesNp
        del reshaped
        gc.collect()

        if SAVE_FLAG:
            fin.close()
Example #19
def analyse_stock(X, Y):

	poly_degree = 3

	# print X.shape, Y.shape

	# Scale features and target with separate scalers (Y assumed to be a 1-D NumPy array)
	x_scaler = StandardScaler()
	X = x_scaler.fit_transform(X)
	y_scaler = StandardScaler()
	Y = y_scaler.fit_transform(Y.reshape(-1, 1)).ravel()
	# print X[0]
	


	X_train, X_test, y_train, y_test = cross_validation.train_test_split( \
	X, Y, test_size=0.2, random_state=0)

	stock1_model_pipeline = Pipeline([('poly', PolynomialFeatures(degree=poly_degree)),
		('linear', LinearRegression(fit_intercept=False))])

	stock1_model = stock1_model_pipeline.fit(X_train, y_train)

	print stock1_model.score(X_test, y_test)
	# print stock1_model.predict(X_test[0])

	# Fit the Ridge variant before scoring it
	stock2_model_pipeline = Pipeline([('poly', PolynomialFeatures(degree=poly_degree)),
		('linear', Ridge(alpha=.1))])
	stock2_model = stock2_model_pipeline.fit(X_train, y_train)
	print stock2_model.score(X_test, y_test)
Example #20
def plot_lr_regularization():
    iris = datasets.load_iris()
    X = iris.data[:, [2, 3]]
    y = iris.target

    X_train, _, y_train, _ = train_test_split(
        X,
        y,
        test_size=0.3,
        random_state=0,
    )

    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)

    weights = []
    params = []
    for c in np.logspace(-5, 4, num=10):
        lr = LogisticRegression(C=c, random_state=0)
        lr.fit(X_train_std, y_train)
        weights.append(lr.coef_[1])
        params.append(c)
    weights = np.array(weights)
    plt.plot(params, weights[:, 0], label='petal length')
    plt.plot(params, weights[:, 1], linestyle='--', label='petal width')
    plt.ylabel('weight coefficient')
    plt.xlabel('C')
    plt.legend(loc='upper left')
    plt.xscale('log')
    plt.show()
Example #21
def main(use_idf=False, random_state=None, std=False, n_jobs=-1, verbose=2):
    wc_idf_map = None
    if use_idf:
        # ingredients inverse document frequencies
        wc_components = build_tfidf_wc(verbose=(verbose > 0))
        wc_idf = wc_components['model'].idf_
        wc_idf_words = wc_components['model'].get_feature_names()
        wc_idf_map = dict(zip(wc_idf_words, wc_idf))
    # word2vec recipe feature vectors
    wc_components = build_word2vec_wc(feature_vec_size=120, avg=True, idf=wc_idf_map, verbose=(verbose > 0))
    y_train = wc_components['train']['df']['cuisine_code'].as_matrix()
    X_train = wc_components['train']['features_matrix']
    # standardize features aka mean ~ 0, std ~ 1
    if std:
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
    # random forest supervised classifier
    time_0 = time.time()
    clf = RandomForestClassifier(n_estimators=100, max_depth=None,
        n_jobs=n_jobs, random_state=random_state, verbose=verbose)
    # perform cross validation
    cv_n_fold = 8
    print 'cross validating %s ways...' % cv_n_fold
    scores_cv = cross_val_score(clf, X_train, y_train, cv=cv_n_fold, n_jobs=-1)
    print 'accuracy: %0.5f (+/- %0.5f)' % (scores_cv.mean(), scores_cv.std() * 2)
    time_1 = time.time()
    elapsed_time = time_1 - time_0
    print 'cross validation took %.3f seconds' % elapsed_time
Example #22
File: models.py Project: mcbada/daBus
def kfolds_cv(estimator, X, y):
    num_folds = 10
    kf = KFold(len(X), n_folds=num_folds, shuffle=True)

    yhat_train = np.zeros(len(y), dtype = y.dtype)
    yhat_test  = np.zeros(len(y), dtype = y.dtype)
    train_err  = []
    test_err   = []

    for train_idx, test_idx in kf:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        # Scale the data
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled  = scaler.transform(X_test)
        # fit the estimator (estimator.__class__.__name__)
        estimator  = estimator.fit(X_train_scaled, y_train)
        yhat_train = estimator.predict(X_train_scaled)
        yhat_test  = estimator.predict(X_test_scaled)
        # store train and test error
        train_err.append( rmsle(y_train, yhat_train) )
        test_err.append(  rmsle(y_test,  yhat_test) )

    return {"Model Name":(estimator.__class__.__name__),
            "Err Train": np.mean(train_err),
            "Err Test": np.mean(test_err)}
Example #23
def data_processing(train,test,features):
    # train['StreetNo'] = train['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
    # test['StreetNo'] = test['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
    # train['Address'] = train['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
    # test['Address'] = test['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
    # train['hour'] = train['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
    # test['hour'] = test['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
    # train['dark'] = train['Dates'].apply(lambda x: 1 if (len(x) > 4 and int(x[11:13]) >= 18 and int(x[11:13]) < 6) else 0)
    # test['dark'] = test['Dates'].apply(lambda x: 1 if (len(x) > 4 and int(x[11:13]) >= 18 and int(x[11:13]) < 6) else 0)
    # features += ['hour','dark','StreetNo']

    print("Filling NAs")
    # print(train.mode())
    train = train.fillna(train.median().iloc[0])
    test = test.fillna(test.median().iloc[0])
    print("Label Encoder")
    le=LabelEncoder()
    for col in features:
        le.fit(list(train[col])+list(test[col]))
        train[col]=le.transform(train[col])
        test[col]=le.transform(test[col])

    le.fit(list(train[target]))
    train[target]=le.transform(train[target])

    print("Standard Scalaer")
    scaler=StandardScaler()
    for col in features:
        scaler.fit(list(train[col]))
        train[col]=scaler.transform(train[col])
        test[col]=scaler.transform(test[col])

    return train,test,features
Example #24
def load_data(dataset, scale=False):
    ''' Loads the dataset

    :type dataset: string
    :param dataset: The folder in ../data/ containing the training/testing numpy arrays
    '''

    print '... loading data'
    path = "../data/" + dataset + "/"
    
    #training set
    trainingData = numpy.load(path + "training.data.npy") 
    trainingIndices = numpy.load(path + "training.indices.npy")
    trainingIndptr = numpy.load(path + "training.indptr.npy")
    training_y = numpy.load(path + "training.labels.npy")
    training_X = scipy.sparse.csr_matrix((trainingData, trainingIndices, trainingIndptr))

    #testing set
    testingData = numpy.load(path + "testing.data.npy") 
    testingIndices = numpy.load(path + "testing.indices.npy")
    testingIndptr = numpy.load(path + "testing.indptr.npy")
    testing_y = numpy.load(path + "testing.labels.npy")
    testing_X = scipy.sparse.csr_matrix((testingData, testingIndices, testingIndptr))

    #scale the data 
    if scale:
        print "..training scaler"
        scaler = StandardScaler(with_mean=False)
        scaler.fit(training_X)
        print "..scaling features"
        training_X = scaler.transform(training_X)
        testing_X = scaler.transform(testing_X)
    
    return [(training_X, training_y),(testing_X, testing_y)]
Example #25
def get_norm_nFoldData(trainXY, testXY):
    trainX = trainXY[:,:-1]
    trainY = trainXY[:,-1]
    testX = testXY[:,:-1]
    testY = testXY[:,-1]

    #standardise only x values, not labels; fit on train and reuse the same scaler for test
    scaler = StandardScaler()
    scaler.fit(trainX)
    trainX = scaler.transform(trainX)
    testX = scaler.transform(testX)

    trainY = trainY.reshape((trainY.shape[0],1))
    testY = testY.reshape((testY.shape[0],1))
    train_X_Y = np.concatenate((trainX,trainY),axis=1)
    test_X_Y = np.concatenate((testX,testY),axis=1)

    folds_tr = []
    folds_te = []
    nfolds = 5
    for i in range(nfolds):
        xp = int(train_X_Y.shape[0]*.8)
        np.random.shuffle(train_X_Y)
        folds_tr.append(train_X_Y[:xp,:])
        folds_te.append(train_X_Y[xp:,:])
    return folds_tr, folds_te
Example #26
def svc_appr():
    """
    Best params: {'C': 0.022139881953014046}

    Submission:
    E_val:
    E_in:
    E_out:
    """
    from sklearn.svm import LinearSVC
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.grid_search import RandomizedSearchCV
    from scipy.stats import expon

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    svc = LinearSVC(dual=False, class_weight='auto')
    rs = RandomizedSearchCV(svc, n_iter=50, scoring='roc_auc', n_jobs=-1,
                            cv=StratifiedKFold(y, 5), verbose=2,
                            param_distributions={'C': expon()})
    rs.fit(X_scaled, y)

    logger.debug('Got best SVC.')
    logger.debug('Best params: %s', rs.best_params_)
    logger.debug('Grid scores:')
    for i, grid_score in enumerate(rs.grid_scores_):
        print('\t%s' % grid_score)
    logger.debug('Best score (E_val): %s', rs.best_score_)
    logger.debug('E_in: %f', Util.auc_score(rs, X_scaled, y))
Example #27
def lr_with_scale3():
    """
    Check the performance of normalizing TEST SET.

    Submission: lr_with_scale3_0707_04.csv
    E_val:
    E_in: 0.879233
    E_out: 0.8770121701777971

    Submission: lr_with_scale3_0712_01.csv
    E_val:
    E_in:
    E_out:
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import cross_val_score
    from sklearn.pipeline import Pipeline
    import numpy as np

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(np.r_[X, dataset.load_test()])
    X_scaled = raw_scaler.transform(X)

    clf = LogisticRegression(C=0.03, class_weight='auto')
    clf.fit(X_scaled, y)

    logger.debug('E_in: %f', Util.auc_score(clf, X_scaled, y))
    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('lr', clf)]), 'lr_with_scale3_0712_01')

    scores = cross_val_score(clf, X_scaled, y, scoring='roc_auc', n_jobs=-1)
    logger.debug('E_val: %f <- %s', np.average(scores), scores)
Example #28
def bagging_lr():
    """
    Submission: bagging_lr_0707_02.csv
    E_val:
    E_in:
    E_out:
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import BaggingClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    bag = BaggingClassifier(LogisticRegression(class_weight='auto'),
                            n_estimators=3000, oob_score=True, n_jobs=-1,
                            verbose=2)
    bag.fit(X_scaled, y)

    logger.debug('E_val (oob): %f', bag.oob_score_)
    logger.debug('E_in: %f', Util.auc_score(bag, X_scaled, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('bag', bag)]), 'bagging_lr_0707_02')
Example #29
def ada_boost_dt():
    """
    Submission: ada_boost_dt_0707_03.csv
    E_val: 0.854350
    E_in: 0.889561
    E_out: 0.8832315976033993
    """
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.cross_validation import cross_val_score
    from sklearn.pipeline import Pipeline

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    ab = AdaBoostClassifier(n_estimators=300)

    scores = cross_val_score(ab, X_scaled, y, cv=5, n_jobs=-1)
    logger.debug('CV: %s', scores)
    logger.debug('E_val: %f', sum(scores) / len(scores))

    ab.fit(X_scaled, y)

    logger.debug('E_in: %f', Util.auc_score(ab, X_scaled, y))

    IO.dump_submission(Pipeline([('scale_raw', raw_scaler),
                                 ('ab', ab)]), 'ada_boost_dt_0707_03')
Example #30
    def __init__(self):
        """
        Constructs a SimulateData object.
        """

        # Read the simulated data.
        simulated = pd.read_csv("simulated.csv", index_col=0)
        predictors = np.asarray(simulated)[:, 0:-1]
        responses = np.asarray(simulated)[:, -1]

        # Divide the simulated data into training and test sets.
        predictors_training, predictors_test,\
        self.responses_training, self.responses_test =\
            train_test_split(predictors, responses, test_size=0.33)

        # Standardize the predictors, both training and test.
        scaler = StandardScaler()
        scaler.fit(predictors_training)
        self.predictors_training = scaler.transform(predictors_training)
        self.predictors_test = scaler.transform(predictors_test)

        # Keep track of the number of samples in the training and test sets,
        # and also the number of features.
        self.training_sample_count = len(self.responses_training)
        self.test_sample_count = len(self.responses_test)
        self.feature_count = np.size(predictors, 1)
        return None
Example #31
1. For the first component:
> From cc.T[0] we can see that PC1 appears to correlate most strongly with the Arts feature, with a correlation value of 0.537

2. For the second component:
> From cc.T[1] we can see that PC2 appears to correlate most strongly with the HlthCare feature, with a correlation value of 0.1939
"""



"""<h2>4f. PCA with standardizing</h2>"""

data = pd.read_csv('places.txt',delim_whitespace=True,na_values='?')
table = data[['Climate', 'HousingCost', 'HlthCare', 'Crime', 'Transp', 'Educ', 'Arts','Recreat', 'Econ']]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(table)
mean = scaler.mean_
table = scaler.transform(table)

mean = np.mean(table,axis = 0)
print(mean)

std = np.std(table,axis = 0)
print(std)

from sklearn.decomposition import PCA
pca = PCA()
pca.fit(table)
print(pca.components_.shape)

pa1 = pca.components_[0]
Example #32
    return (X, Y)


#train data
data = np.genfromtxt('output/train/train.csv', delimiter=';')
(X_train, Y_train) = clean(data, ncases)

#test data
data = np.genfromtxt('output/test/test.csv', delimiter=';')
(X_test, Y_test) = clean(data, ncases)
del data

#preprocess
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#define the method
from sklearn.neural_network import MLPClassifier
layers = [9]
activation = 'relu'
alpha = 0.001
type_rate = 'adaptive'
rate = 0.1
momentum = 0.09
max_iter = 2000
model = MLPClassifier(
    solver='sgd',
    hidden_layer_sizes=layers,
Example #33
def train_NN(X,Y,target_names):
    print('Neural Network')
    #split the dataset into training set and testing set
    X[np.isnan(X)] = 0
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.33, random_state=0)
    print('training set')
    print(X_train.shape)
    
    #preprocessing the data
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    num_training_sample = len(X_train)
    best_hidden_layers_list,best_hidden_layers_tuple = grid_search(X_train, X_test, Y_train, Y_test,num_training_sample)
    
    nn_clf = MLPClassifier(alpha=1e-5,
                     hidden_layer_sizes=best_hidden_layers_tuple, random_state=1)
    
    #fit the training data to the model
    nn_clf.fit(X_train,Y_train)
    Y_pred = nn_clf.predict(X_test)

    #common standard to compare across models
    print('f1')
    f1_clf = f1_score(Y_test, Y_pred, average='samples')
    print(f1_clf)
    print('classification report')
    print(classification_report(Y_test,Y_pred))

    ##save model
    f_nn = open('nn_clf.pkl',"wb+")
    pickle.dump(nn_clf, f_nn)
    f_nn.close()

    f_nn_sc = open('nn_scaler.pkl',"wb+")
    pickle.dump(scaler, f_nn_sc)
    f_nn_sc.close()

    '''
                  precision    recall  f1-score   support

               0       0.00      0.00      0.00        28
               1       1.00      0.12      0.22        16
               2       0.00      0.00      0.00        39
               3       0.00      0.00      0.00        31
               4       0.00      0.00      0.00        27
               5       0.60      0.52      0.56        29
               6       0.00      0.00      0.00        23
               7       0.00      0.00      0.00        19

       micro avg       0.63      0.08      0.14       212
       macro avg       0.20      0.08      0.10       212
    weighted avg       0.16      0.08      0.09       212
     samples avg       0.00      0.00      0.00       212

     ['Case', 'Model', 'PowerDissipation', 'StorageTemperature', 'ThermalResistance', 'Type', 'Voltage', 'Weigth']
    '''
    
    return nn_clf, f1_clf
Example #34
from income_data import X, y, X_train, X_test, y_train, y_test

######
# Run Grid Search to find optimal components
######
# import packages
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from plot_learning_curve import drawLearningCurve

# Scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)
X_toTransform = X_train_std
y_train = y_train
y_test = y_test

# Define the classifier
# svm = SVC(random_state=1)
# parameters = {'kernel':(['linear'])
#              ,'C':[10]
#              ,'gamma':([0.1])
#              }
# clf = GridSearchCV(svm, param_grid=parameters, cv=3)
# N_FEATURES_OPTIONS = [2]
Example #35
import numpy as np
import pandas as pd
import copy
from ann2 import Net
from replicate import replicate_data
from sklearn.preprocessing import StandardScaler
from train import train

# Load training and testing data as pd dataframe
training_data = pd.read_excel('Data3/reduced_training_data.xlsx')
testing_data = pd.read_excel('Data3/test_data.xlsx')

# Standardise training and testing data
scaler_train = StandardScaler()
scaler_test = StandardScaler()

scaler_train.fit(training_data)
scaler_test.fit(testing_data)

training_data = scaler_train.transform(training_data)
testing_data = scaler_test.transform(testing_data)

# Convert training data to pd dataframe
columns = "BC NC LP LI".split()
training_data = pd.DataFrame(data=training_data, index=None, columns=columns)

# Replicate the training data
replicated_data1 = replicate_data(training_data, 10, 0.03)
replicated_data2 = replicate_data(training_data, 10, 0.05)

training_data = training_data.append(replicated_data1,
                                     ignore_index=True,
                                     sort=False)
Example #36
def predict(data, learn_range):
    x, y = [], []
    for start in range(len(data[0]) - learn_range):
        group_data = pd.DataFrame()
        
        for i in reversed(range(len(data))):
            add_data = pd.DataFrame(data[i])[start : start + learn_range]
            group_data = pd.concat((group_data, add_data), axis=0)
            
        group_data = group_data.as_matrix()

        if group_data[-1] < data[0][start + learn_range]:
            y.append(1)
        else:
            y.append(0)
            
        x.append([e for i in group_data for e in i])

    x_train, x_test, y_train, y_test \
    = train_test_split(x, y, test_size=0.3, random_state=0, shuffle=False)
    
    select = SelectFromModel(RandomForestClassifier(), threshold="median").fit(x_train, y_train)
    X_train_selected = select.transform(x_train)
    X_test_selected = select.transform(x_test) 
       
    algorithm = RandomForestClassifier
    
    predict_result = [] 
    predict_result2 = []        
    if algorithm == tree.DecisionTreeClassifier or algorithm == RandomForestClassifier:
        clf = algorithm(random_state=0)
        clf.fit(x_train, y_train)
        predict_result = clf.predict(x_test)
        
        clf2 = algorithm(random_state=0)
        clf2.fit(X_train_selected, y_train)
        predict_result2 = clf2.predict(X_test_selected)
        
    if algorithm == SVC or algorithm == xgb.XGBClassifier:
        sc = StandardScaler()
        sc.fit(x_train)
        x_train_std = sc.transform(x_train)
        x_test_std = sc.transform(x_test)
        clf = algorithm(random_state=0)
        clf.fit(x_train_std, y_train)
        predict_result = clf.predict(x_test_std) 
        
    total = []
    for i in range(len(y_test)):        
        before = data[0][i + learn_range + len(y_train) - 1]
        after = data[0][i + learn_range + len(y_train)]
        if predict_result[i] == 1:
            total.append(after - before)
        else:
            total.append(before - after)
    
    count_0 = float(y.count(0))                 
    count_1 = float(y.count(1))                            
    high = max([count_0, count_1]) / (count_0 + count_1)
    
    print accuracy_score(predict_result, y_test), accuracy_score(predict_result2, y_test)
    #print clf.feature_importances_
    #print clf2.feature_importances_
    print select.get_support()
    
    return (count_0, count_1, high,
            accuracy_score(predict_result, y_test),
            sum(total),
            clf.feature_importances_ ,
            accuracy_score(predict_result2, y_test))
Example #37
    def preprocess_data(self, prefix, normalize=True, load_adj_dir = None, use_random_walks = True, load_walks=False, num_walk = 50, walk_len = 5, supervised=True, train_all_edge=False):

        G = self.G
        if G == None:
            raise Exception("Data hasn't been load")

        print("Loaded data.. now preprocessing..")

        # Categorize train, val and test nodes
        # Using id_maps.keys to control the node index
        self.nodes_ids = np.array([n for n in G.node.keys()])

        # if not train_all_edge and 0:
        #     self.train_nodes_ids = np.array([n for n in self.nodes_ids if not G.node[n]['val'] and not G.node[n]['test']])
        #     self.val_nodes_ids = np.array([n for n in self.nodes_ids if G.node[n]['val']])
        #     self.test_nodes_ids = np.array([n for n in self.nodes_ids if G.node[n]['test']])
        # else:
        self.train_nodes_ids = np.array([n for n in self.nodes_ids])
        self.val_nodes_ids = np.array([n for n in self.nodes_ids])
        self.test_nodes_ids = np.array([n for n in self.nodes_ids])

        self.nodes = np.array([self.id_map[n] for n in self.nodes_ids])
        self.train_nodes = np.array([self.id_map[n] for n in self.train_nodes_ids])
        self.val_nodes = np.array([self.id_map[n] for n in self.val_nodes_ids])
        self.test_nodes = np.array([self.id_map[n] for n in self.test_nodes_ids])

        ## Make sure the graph has edge train_removed annotations
        ## (some datasets might already have this..)
        for edge in G.edges():
            # if (G.node[edge[0]]['val'] or G.node[edge[1]]['val'] or
            #     G.node[edge[0]]['test'] or G.node[edge[1]]['test']):
            #     G[edge[0]][edge[1]]['train_removed'] = True
            # else:
            G[edge[0]][edge[1]]['train_removed'] = False

        #Remove isolated train nodes after remove "train_remove" edge from train graph
        # and val nodes and test nodes from original graph
        if not train_all_edge:
            self.remove_isolated_node()

        #Construct train_deg and deg, deg[i] is degree of node that have idx i, train_deg consider "train_remove" edge
        if not train_all_edge:
            self.construct_train_val_deg()
        else:
            self.construct_all_deg()

        #Construct train_adj and adj, adj is matrix of Uniformly samples neighbors of nodes
        if load_adj_dir is not None:
            self.train_adj = np.load(load_adj_dir + "train_adj.npy")
            self.adj = np.load(load_adj_dir + "adj.npy")
        else:
            if not train_all_edge:
                self.construct_train_val_adj()
            else:
                self.construct_all_adj()

        if normalize and not self.feats is None:
            from sklearn.preprocessing import StandardScaler
            # import pdb
            # pdb.set_trace()
            train_feats = self.feats[self.train_nodes]
            scaler = StandardScaler()
            scaler.fit(train_feats)
            self.feats = scaler.transform(self.feats)

        if not supervised:
            if use_random_walks:
                if load_walks and os.path.exists(prefix + "-walks.txt"):
                    walks = []
                    with open(prefix + "-walks.txt") as fp:
                        for line in fp:
                            walks.append(map(self.conversion, line.split()))
                    self.walks = walks
                    if len(walks) == 0:
                        raise Exception("Empty walks file at {0}".format(prefix + "-walks.txt"))
                else:
                    if load_walks:
                        print("Walks file not exist, run random walk with num_walk {0} and len_walk {1}".format(num_walk, walk_len))
                    else:
                        print("Run random walk with num_walk {0} and len_walk {1}".format(num_walk, walk_len))
                    self.walks = self.run_random_walks(out_file = self.prefix + "-walks.txt", num_walks = num_walk, walk_len = walk_len)

                print("Total walks edge: {0}".format(len(self.walks)))

            if not train_all_edge:
                self.construct_train_val_edge()
            else:
                self.construct_train_all_edge()

        print("Preprocessing finished, graph info:")
        print(nx.info(G))
Example #38
def validate():
    """
    run KFOLD method for regression 
    """
    #defining directories    
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/merraLRValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    
    #cd to the lagged predictors directory
    os.chdir(dir_in)
    
    
    x = 475
    y = 476
    
    #empty dataframe for model validation
    df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse'])
    
    #looping through 
    for tg in range(x,y):
        
        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)
        
        ##########################################
        #check if this tg is already taken care of
        ##########################################
        os.chdir(dir_out)
        if os.path.isfile(tg_name):
            return "file already analyzed!"
        
        
        os.chdir(dir_in)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis = 1)

        #standardize predictor data
        dat = pred.iloc[:,1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis = 1)
        
    
        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index, axis = 0, inplace = True)
        surge.reset_index(inplace = True)
        surge.drop('index', axis = 1, inplace = True)
        
        
        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns = ['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis = 1)
    
        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:,:2], on='date', how='right')
        pred_surge.sort_values(by = 'date', inplace = True)
        
        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis =1)]
        pred_surge.drop(row_nan.index, axis = 0, inplace = True)
        pred_surge.reset_index(inplace = True)
        pred_surge.drop('index', axis = 1, inplace = True)
        
        
        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-'*80)
            print("Predictors and Surge don't overlap")
            print('-'*80)
            continue
        
     
        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])
        
        #prepare data for training/testing
        X = pred_surge.iloc[:,1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis = 1, inplace = True)
        
        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)
        
        #apply 10 fold cross validation
        kf = KFold(n_splits=10, shuffle=True, random_state=29)
        
        metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train, y_test = y['surge'][train_index], y['surge'][test_index]
            
            #train regression model
            lm = LinearRegression()
            lm.fit(X_train, y_train)
            
            #predictions
            predictions = lm.predict(X_test)
            # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #                       pd.DataFrame(np.array(y_test))], \
            #                      axis = 1)
            # pred_obs.columns = ['pred', 'obs']
            # combo = pd.concat([combo, pred_obs], axis = 0)    
            
            #evaluation matrix - check p value
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
            
        
        #number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\
                             pred_surge['date'][0]).days/365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1] #number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)
        
        print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' -  avg_rmse (m) = ', \
              np.mean(metric_rmse), '\n')
        
        #original size and pca size of matrix added
        new_df = pd.DataFrame([tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse']
        df = pd.concat([df, new_df], axis = 0)
        
        
        #save df as cs - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)
        
        #cd to dir_in
        os.chdir(dir_in)
Example #39
File: train.py Project: fmidev/trains
def main():
    """
    Main program
    """
    local_device_protos = device_lib.list_local_devices()
    logging.info(
        [x.name for x in local_device_protos if x.device_type == 'GPU'])

    bq = _bq.BQHandler()
    io = _io.IO(gs_bucket=options.gs_bucket)
    viz = _viz.Viz()

    starttime, endtime = io.get_dates(options)
    #save_path = options.save_path+'/'+options.config_name

    logging.info('Using dataset {} and time range {} - {}'.format(
        options.feature_dataset, starttime.strftime('%Y-%m-%d'),
        endtime.strftime('%Y-%m-%d')))

    all_param_names = options.label_params + options.feature_params + options.meta_params
    aggs = io.get_aggs_from_param_names(options.feature_params)

    logging.info('Reading data...')
    bq.set_params(starttime,
                  endtime,
                  batch_size=2500000,
                  loc_col='trainstation',
                  project=options.project,
                  dataset=options.feature_dataset,
                  table=options.feature_table,
                  parameters=all_param_names,
                  only_winters=options.only_winters)

    data = bq.get_rows()

    data = io.filter_train_type(labels_df=data,
                                train_types=options.train_types,
                                sum_types=True,
                                train_type_column='train_type',
                                location_column='trainstation',
                                time_column='time',
                                sum_columns=['train_count', 'delay'],
                                aggs=aggs)

    if options.y_avg_hours is not None:
        data = io.calc_running_delay_avg(data, options.y_avg_hours)

    if options.y_avg:
        data = io.calc_delay_avg(data)

    data.sort_values(by=['time', 'trainstation'], inplace=True)

    if options.normalize:
        logging.info('Normalizing data...')
        xscaler = StandardScaler()
        yscaler = StandardScaler()

        non_scaled_data = data.loc[:, options.meta_params]
        labels = data.loc[:, options.label_params].astype(
            np.float32).values.reshape((-1, 1))

        yscaler.fit(labels)
        scaled_labels = pd.DataFrame(yscaler.transform(labels),
                                     columns=['delay'])
        scaled_features = pd.DataFrame(xscaler.fit_transform(
            data.loc[:, options.feature_params].astype(np.float32)),
                                       columns=options.feature_params)

        data = pd.concat([non_scaled_data, scaled_features, scaled_labels],
                         axis=1)

    if options.pca:
        logging.info('Doing PCA analyzis for the data...')
        ipca = IncrementalPCA(n_components=options.pca_components,
                              whiten=options.whiten,
                              copy=False)

        non_processed_data = data.loc[:, options.meta_params +
                                      options.label_params]
        processed_data = data.loc[:, options.feature_params]
        ipca.fit(processed_data)
        processed_features = pd.DataFrame(ipca.transform(processed_data))

        # concatenate the PCA-transformed features, not the raw ones
        data = pd.concat([non_processed_data, processed_features], axis=1)

        fname = options.output_path + '/ipca_explained_variance.png'
        viz.explained_variance(ipca, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)

    data_train, data_test = train_test_split(data, test_size=0.33)
    X_test, y_test = io.extract_batch(data_test,
                                      options.time_steps,
                                      batch_size=None,
                                      pad_strategy=options.pad_strategy,
                                      quantile=options.quantile,
                                      label_params=options.label_params,
                                      feature_params=options.feature_params)

    # Define model
    batch_size = io.get_batch_size(data_train,
                                   options.pad_strategy,
                                   quantile=options.quantile)
    logging.info('Batch size: {}'.format(batch_size))
    model = LSTM.LSTM(options.time_steps,
                      len(options.feature_params),
                      1,
                      options.n_hidden,
                      options.lr,
                      options.p_drop,
                      batch_size=batch_size)

    # Initialization
    rmses, mses, maes, steps, train_mse = [], [], [], [], []
    saver = tf.train.Saver()
    sess = tf.Session()
    init = tf.global_variables_initializer()
    sess.run(init)
    summary_writer = tf.summary.FileWriter(options.log_dir,
                                           graph=tf.get_default_graph())

    #tf.summary.scalar('Training MSE', model.loss)
    tf.summary.scalar('Validation_MSE', model.mse)
    tf.summary.scalar('Validation_RMSE', model.rmse)
    tf.summary.scalar('Validation_MAE', model.mae)
    tf.summary.histogram('y_pred_hist', model.y_pred)
    merged_summary_op = tf.summary.merge_all()
    train_summary_op = tf.summary.scalar('Training_MSE', model.loss)

    train_step = 0
    start = 0
    while True:
        # If slow is set, go forward one time step at time,
        # else proceed whole batch size
        if options.slow:
            X_train, y_train = io.extract_batch(
                data_train,
                options.time_steps,
                start=start,
                pad_strategy=options.pad_strategy,
                quantile=options.quantile,
                label_params=options.label_params,
                feature_params=options.feature_params)
        else:
            X_train, y_train = io.extract_batch(
                data_train,
                options.time_steps,
                train_step,
                pad_strategy=options.pad_strategy,
                quantile=options.quantile,
                label_params=options.label_params,
                feature_params=options.feature_params)

        if (len(X_train) < options.time_steps):
            break

        if options.cv:
            logging.info('Doing random search for hyper parameters...')

            param_grid = {
                "C": [0.001, 0.01, 0.1, 1, 10],
                "epsilon": [0.01, 0.1, 0.5],
                "kernel": ['rbf', 'linear', 'poly', 'sigmoid', 'precomputed'],
                "degree": [2, 3, 4],
                "shrinking": [True, False],
                "gamma": [0.001, 0.01, 0.1],
                "coef0": [0, 0.1, 1]
            }

            random_search = RandomizedSearchCV(model,
                                               param_distributions=param_grid,
                                               n_iter=int(
                                                   options.n_iter_search),
                                               n_jobs=-1)

            random_search.fit(X_train, y_train)
            logging.info("RandomizedSearchCV done.")
            fname = options.output_path + '/random_search_cv_results.txt'
            report_cv_results(random_search.cv_results_, fname)
            io._upload_to_bucket(filename=fname, ext_filename=fname)
            sys.exit()
        else:
            if train_step == 0:
                logging.info('Training...')

            feed_dict = {model.X: X_train, model.y: y_train}
            _, loss, train_summary = sess.run(
                [model.train_op, model.loss, train_summary_op],
                feed_dict=feed_dict)

            summary_writer.add_summary(train_summary, train_step * batch_size)

        # Metrics
        feed_dict = {model.X: X_test, model.y: y_test}
        #model.cell_init_state: state}

        val_loss, rmse, mse, mae, y_pred, summary = sess.run(
            [
                model.loss, model.rmse, model.mse, model.mae, model.y_pred,
                merged_summary_op
            ],
            feed_dict=feed_dict)

        train_mse.append(loss)
        mses.append(mse)
        rmses.append(rmse)
        maes.append(mae)
        steps.append(train_step)

        summary_writer.add_summary(summary, train_step * batch_size)
        if train_step % 50 == 0:
            logging.info("Step {}:".format(train_step))
            logging.info("Training loss: {:.4f}".format(loss))
            logging.info("Validation MSE: {:.4f}".format(val_loss))
            logging.info('Validation RMSE: {}'.format(rmse))
            logging.info('Validation MAE: {}'.format(mae))
            logging.info('................')
            saver.save(sess, options.save_file)

        train_step += 1
        start += 1
        # <-- while True:

    saver.save(sess, options.save_file)
    if options.normalize:
        fname = options.save_path + '/yscaler.pkl'
        io.save_scikit_model(yscaler, fname, fname)
    io._upload_dir_to_bucket(options.save_path, options.save_path)

    try:
        fname = options.output_path + '/learning_over_time.png'
        metrics = [{
            'metrics': [{
                'values': mses,
                'label': 'Validation MSE'
            }, {
                'values': train_mse,
                'label': 'Train MSE'
            }],
            'y_label':
            'MSE'
        }, {
            'metrics': [{
                'values': rmses,
                'label': 'Validation RMSE'
            }],
            'y_label': 'RMSE'
        }, {
            'metrics': [{
                'values': maes,
                'label': 'Validation MAE'
            }],
            'y_label': 'MAE'
        }]
        viz.plot_learning(metrics, fname)
        io._upload_to_bucket(filename=fname, ext_filename=fname)
    except Exception as e:
        logging.error(e)

    error_data = {
        'steps': steps,
        'mse': mses,
        'rmse': rmses,
        'mae': maes,
        'train_mse': train_mse
    }
    fname = '{}/training_time_validation_errors.csv'.format(
        options.output_path)
    io.write_csv(error_data, filename=fname, ext_filename=fname)


# ** Create a StandardScaler() object called scaler.**

# In[13]:

scaler = StandardScaler()


# ** Fit scaler to the features.**

# In[14]:

data_features = data.drop('TARGET CLASS', axis=1)
scaler.fit(data_features)


# **Use the .transform() method to transform the features to a scaled version.**

# In[17]:

scaled_features = scaler.transform(data.drop('TARGET CLASS', axis=1))


# **Convert the scaled features to a dataframe and check the head of this dataframe to make sure the scaling worked.**

# In[20]:

data_feat = pd.DataFrame(scaled_features, columns=data.columns[0:-1])
data_feat.head()
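
# (Editor's aside, not part of the original notebook: the separate fit/transform steps above can
# be collapsed with fit_transform(); a minimal sketch assuming the same `data` frame and imports.)

scaled_features_alt = StandardScaler().fit_transform(data.drop('TARGET CLASS', axis=1))
data_feat_alt = pd.DataFrame(scaled_features_alt, columns=data.columns[:-1])
data_feat_alt.head()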
예제 #41
0
파일: pbar.py 프로젝트: mtgilmore/pbar
class Model(object):
    def __init__(self):
        self.features = []
        # self.model = KNeighborsRegressor(n_neighbors=3, p=2)
        # self.model = LinearRegression()
        self.model = RandomForestRegressor(n_estimators=300)
        #self.model = AdaBoostRegressor(n_estimators=200)
        self.imp = SimpleImputer(missing_values=np.nan,
                                 strategy='constant',
                                 fill_value=0)
        self.scaler = StandardScaler()
        self.trained = False

    @staticmethod
    def get_labeled_logs(
            dataset: List[ProgramLog]) -> List[Tuple[ProgramLog, float]]:
        # Logic here is a bit messy -- basically just want MAX_LOG_GRANULARITY entries at most for one log
        items = []
        for log in dataset:
            total_time = log.duration()
            calls_per_entry = max(len(log.calls) // MAX_LOG_GRANULARITY, 1)

            acc = []
            for syscall in log.calls:
                acc.append(syscall)
                if len(acc) > 1 and len(acc) % calls_per_entry == 0:
                    new_log = ProgramLog(log.cmd, acc)
                    items.append(
                        (new_log, new_log.duration() / total_time))
        print("labeled log count", len(items))
        return items

    def update_features(self, logs: List[ProgramLog]) -> ():
        features = set(self.features)
        for log in logs:
            features.update(log.to_feature_map().keys())
        self.features = list(sorted(features))

    def extract_features(self, log: ProgramLog) -> List[Any]:
        vec = []
        feature_map = log.to_feature_map()
        for feature in self.features:
            vec.append(feature_map.get(feature, nan))
        return vec

    def train(self, cmd: List[str], dataset: List[ProgramLog]):
        print("Generating trimmed dataset...")
        # Create a trimmed dataset of commands that prefix match -- getting as specific as possible
        trimmed_dataset = dataset
        i = 0
        while len(cmd) > i:
            candidate = list(
                filter(lambda log: i < len(log.cmd) and log.cmd[i] == cmd[i],
                       dataset))
            if len(candidate) == 0:
                break
            else:
                trimmed_dataset = candidate
                i += 1

        print("Generating labeled logs...")
        trimmed_dataset = Model.get_labeled_logs(trimmed_dataset)
        self.update_features([log for log, label in trimmed_dataset])

        if len(trimmed_dataset) > 0:
            x = []
            y = []
            print("Extracting features from labeled logs...")
            for log, label in trimmed_dataset:
                x.append(self.extract_features(log))
                y.append(label)

            x = np.array(x)
            y = np.array(y)

            print("Preprocessing data...")
            self.imp.fit(x, y)
            x = self.imp.transform(x)
            self.scaler.fit(x, y)
            x = self.scaler.transform(x)

            print("Fitting model...")
            self.model.fit(x, y)
            print("model training accuracy = ",
                  self.model.score(x, y) * 100, "%")
            self.trained = True

    def predict_completion(self, log: ProgramLog) -> float:
        if not self.trained:
            return 1.
        features = [self.extract_features(log)]
        features = self.imp.transform(features)
        features = self.scaler.transform(features)

        return float(self.model.predict(features)[0])

    def check_accuracy(self, labeled_logs: List[Tuple[ProgramLog, float]]):
        if not self.trained:
            return 1.

        x = [self.extract_features(log) for log, _ in labeled_logs]
        x = self.imp.transform(x)
        x = self.scaler.transform(x)

        y = [label for _, label in labeled_logs]
        return self.model.score(x, y)
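
# (Editor's sketch, not from the original project: the impute -> scale -> regress chain used by
# Model above can be expressed as a single scikit-learn Pipeline, so the three steps are always
# fitted and applied together. The data below is synthetic, purely for illustration.)
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

x_demo = np.array([[1.0, np.nan, 3.0], [2.0, 0.5, np.nan], [0.0, 1.5, 2.0], [4.0, 2.5, 1.0]])
y_demo = np.array([0.1, 0.4, 0.6, 0.9])

progress_model = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)),
    ('scale', StandardScaler()),
    ('rf', RandomForestRegressor(n_estimators=300)),
])
progress_model.fit(x_demo, y_demo)
print(progress_model.predict(x_demo[:1]))  # predicted completion fraction for one feature vector
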
df = pd.read_csv(filepath + os.sep + "iris.data", skiprows=0, header=None)
# target variable
y = df.iloc[:, 4].map(class_label).values
# print(df.head())
# sys.exit()
# feature matrix
print('*' * 30)
first_feature = int(input('Please enter first feature >> '))
second_feature = int(input('Please enter second feature >> '))
print('*' * 30)
X = df.iloc[:, [first_feature, second_feature]]
# standardization of the feature matrix
std_sc = StandardScaler(copy=True, with_mean=True, with_std=True)
# X_new = std_sc.fit_transform(X)
# Compute the mean and std to be used for later scaling
std_sc.fit(X)
# Perform standardization by centering and scaling and return X
X_std = std_sc.transform(X)  # standardized feature matrix

# random splitting of train and test data
# splitting date for training and test
X_train, X_test, y_train, y_test = train_test_split(X_std,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=1,
                                                    shuffle=True,
                                                    stratify=y)
# support vector machine classification
svm = SVC(C=1,
          kernel='rbf',
          degree=3)  # remaining arguments were cut off in the source

unscaled_inputs.columns.values
#columns_to_scale = ['Month of absence', 'Day of the week', 'Seasons',
#      'Transportation expense', 'Distance from Residence to Work',
#     'Service time', 'Age', 'Work load Average/day ', 'Hit target',
#     'Disciplinary failure', 'Son', 'Social drinker',
#    'Social smoker', 'Pet', 'Weight', 'Height', 'Body mass index']

columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']
columns_to_scale = [
    x for x in unscaled_inputs.columns.values if x not in columns_to_omit
]

absent_scaler = CustomScaler(columns_to_scale)
absent_scaler.fit(unscaled_inputs)
scaled_input = absent_scaler.transform(unscaled_inputs)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(scaled_input,
                                                    targets,
                                                    train_size=0.8,
                                                    random_state=20)

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

classifier.score(X_train, y_train)

y_pred = classifier.predict(X_test)
예제 #44
0
파일: HW1.py 프로젝트: X-Jian/-5-5-9
for errormean3 in errorall:
    var = var + ((errormean3 - errormean)**2)
var = var / (78 - 1)
deviation = math.sqrt(var)

x0, x1, lx2, mdatax2 = dispose('wine_train.csv', 0, 1)
trainingx2 = []
trainingx2.append(x0)
trainingx2.append(x1)
trainingx2 = np.array(trainingx2)
trainingx2 = trainingx2.T
mdatax2 = np.array(mdatax2)
lx2 = np.array(lx2)

std2 = StandardScaler()
train_std2 = std2.fit_transform(trainingx2)  # fit and transform in one step (the extra fit() call was redundant)

y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, l2, mdata2 = dispose2(
    'wine_train.csv', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
training2 = []
training2.append(y0)
training2.append(y1)
training2.append(y2)
training2.append(y3)
training2.append(y4)
training2.append(y5)
training2.append(y6)
training2.append(y7)
training2.append(y8)
training2.append(y9)
예제 #45
0
label_encoded_df = all_df[label_cols].apply(label_encoder)
#apply the label_encoder function (turns the categories into numeric codes)

numerical_df = pd.DataFrame(scaler.fit_transform(all_df[numerical_cols]), columns=numerical_cols)
# preprocess the numerical columns with StandardScaler

target_df = all_df[TARGET]



# axis=0 aggregates column-wise, axis=1 row-wise (here the frames are joined side by side)
all_df = pd.concat([numerical_df, label_encoded_df, onehot_encoded_df, target_df], axis=1)

all_df_scaled = all_df.drop([TARGET], axis = 1).copy()

scaler.fit(all_df.drop([TARGET], axis = 1))
all_df_scaled = scaler.transform(all_df_scaled)
# When transforming test data with a Scaler that was fit() on the training set,
# do not fit() again on the test data -- reuse that same Scaler and call transform() only.

all_df_scaled = pd.DataFrame(all_df_scaled, columns=all_df.drop([TARGET], axis = 1).columns)

X = all_df_scaled
y = all_df[TARGET]
print (f'X:{X.shape} y: {y.shape}')
# X:(200000, 21) y: (200000,)

X_train, X_test, y_train, y_test = train_test_split(
       X, y, test_size = 0.20, random_state = RANDOM_SEED)

test = all_df_scaled[len(train):]
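
# (Editor's sketch illustrating the rule in the comment above: fit the scaler on the training
# split only, then reuse it -- transform() without refitting -- on the test split. The names
# reuse the split created just above.)
from sklearn.preprocessing import StandardScaler

split_scaler = StandardScaler()
X_train_scaled = split_scaler.fit_transform(X_train)   # fit + transform on the training data
X_test_scaled = split_scaler.transform(X_test)         # transform only; never refit on the test data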
def reconstructRF():
    """
    run KFOLD method for random forest regression 
    """
    #import packages
    import os
    import numpy as np
    import pandas as pd
    #from sklearn import metrics
    #from scipy import stats
    #import seaborn as sns
    #import matplotlib.pyplot as plt
    #from sklearn.model_selection import KFold
    from datetime import datetime
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    #defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/rfReconstruction"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    # #load KFOLD result csv file
    # os.chdir('F:\\06_eraint_results\\sonstig')
    # kf_dat = pd.read_csv('eraint_randForest_kfold.csv')
    # #edit the tg names to be usable later on
    # editName = lambda x: x.split('.csv')[0]
    # kf_dat['tg'] = pd.DataFrame(list(map(editName, kf_dat['tg'])), columns= ['tg'])

    #cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 39
    y = 40

    #looping through
    for tg in range(x, y):

        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        #standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index,
                   axis=0,
                   inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])),
                                  columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]],
                              axis=1)

        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized,
                              surge_new.iloc[:, :2],
                              on='date',
                              how='right')
        pred_surge.sort_values(by='date', inplace=True)

        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue


        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])

        #prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        #apply PCA
        #get the number of PCs used during validation
        # pc_num = kf_dat.loc[kf_dat['tg'] == tg_name]['num_95pcs']
        pca = PCA(0.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        {  # #apply 10 fold cross validation
            # kf = KFold(n_splits=10, random_state=29)

            # metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
            # for train_index, test_index in kf.split(X):
            #     X_train, X_test = X_pca[train_index], X_pca[test_index]
            #     y_train, y_test = y['surge'][train_index], y['surge'][test_index]

            #     #train regression model
            #     rf = RandomForestRegressor(n_estimator = 50, min_samples_leaf = 1)
            #     lm.fit(X_train, y_train)

            #     #predictions
            #     predictions = lm.predict(X_test)
            #     # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #     #                       pd.DataFrame(np.array(y_test))], \
            #     #                      axis = 1)
            #     # pred_obs.columns = ['pred', 'obs']
            #     # combo = pd.concat([combo, pred_obs], axis = 0)

            #     #evaluation matrix - check p value
            #     if stats.pearsonr(y_test, predictions)[1] >= 0.05:
            #         print("insignificant correlation!")
            #         continue
            #     else:
            #         #print(stats.pearsonr(y_test, predictions))
            #         metric_corr.append(stats.pearsonr(y_test, predictions)[0])
            #         #print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
            #         metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))

            # #number of years used to train/test model
            # num_years = np.ceil((pred_surge['date'][pred_surge.shape[0]-1] -\
            #                       pred_surge['date'][0]).days/365)
        }

        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1]  #number of principal components
        # corr = np.mean(metric_corr)
        # rmse = np.mean(metric_rmse)

        # print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',\
        #       np.mean(metric_corr), ' -  avg_rmse (m) = ', \
        #       np.mean(metric_rmse), '\n')

        #%%
        #surge reconstruction
        pred_for_recon = pred[~pred.isna().any(axis=1)]
        pred_for_recon = pred_for_recon.reset_index().drop('index', axis=1)

        #standardize predictor data
        dat = pred_for_recon.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat(
            [pred_for_recon['date'], dat_standardized], axis=1)

        X_recon = pred_standardized.iloc[:, 1:]

        #apply PCA
        pca = PCA(num_pc)  #use the same number of PCs used for training
        pca.fit(X_recon)
        X_pca_recon = pca.transform(X_recon)

        #%%
        #model preparation
        #defining the rf model with number of trees and minimum leaves
        rf = RandomForestRegressor(n_estimators=50, min_samples_leaf=1, \
                                   random_state = 29)
        rf.fit(X_pca, y)

        #get prediction interval
        def pred_ints(model, X_pca_recon, percentile=95):
            """
            function to construct prediction interval
            taking into account the result of each 
            regression tree
            """
            err_down = []
            err_up = []
            preds = []

            for pred in model.estimators_:
                preds.append(pred.predict(X_pca_recon))
            preds = np.vstack(preds).T
            err_down = np.percentile(preds, (100 - percentile)/2., axis = 1, \
                                     keepdims = True)
            err_up = np.percentile(preds, 100 - (100 - percentile)/2., axis =1, \
                                   keepdims = True)

            return err_down.reshape(-1), err_up.reshape(-1)

        #compute 95% prediction intervals
        err_down, err_up = pred_ints(rf, X_pca_recon, percentile=95)
        #reconstructed surge goes here
        truth = rf.predict(X_pca_recon)

        correct = 0.
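        # (Editor's note) the loop below reports how often the forest-mean prediction falls
        # inside its own per-tree percentile band -- a sanity check on the intervals, not a
        # coverage estimate against observed surge.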
        for i, val in enumerate(truth):
            if err_down[i] <= val <= err_up[i]:
                correct += 1
        print(correct * 100 / len(truth), '\n')

        #final dataframe
        final_dat = pd.concat([pred_standardized['date'], \
                               pd.DataFrame([truth, err_down, err_up]).T], axis = 1)
        final_dat['lon'] = longitude
        final_dat['lat'] = latitude
        final_dat.columns = ['date', 'surge_reconstructed', 'pred_int_lower',\
                             'pred_int_upper', 'lon', 'lat']

        {  #plot - optional
            # time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
            # final_dat['date'] = pd.DataFrame(list(map(time_stamp, final_dat['date'])), columns = ['date'])
            # surge['date'] = pd.DataFrame(list(map(time_stamp, surge['date'])), columns = ['date'])
            # sns.set_context('notebook', font_scale = 2)
            # plt.figure()
            # plt.plot(final_dat['date'], final_dat['mean'], color = 'green')
            # plt.scatter(surge['date'], surge['surge'], color = 'blue')
            #prediction intervals
            # plt.plot(final_dat['date'], final_dat['obs_ci_lower'], color = 'red',  linestyle = "--", lw = 0.8)
            # plt.plot(final_dat['date'], final_dat['obs_ci_upper'], color = 'red',  linestyle = "--", lw = 0.8)
            #confidence intervals
            # plt.plot(final_dat['date'], final_dat['mean_ci_upper'], color = 'black',  linestyle = "--", lw = 0.8)
            # plt.plot(final_dat['date'], final_dat['mean_ci_lower'], color = 'black',  linestyle = "--", lw = 0.8)
        }

        #save df as cs - in case of interruption
        os.chdir(dir_out)
        final_dat.to_csv(tg_name)

        #cd to dir_in
        os.chdir(dir_in)
예제 #47
0
y = generate_random_points(n=100, p=10)
x = generate_random_points(n=100, p=10)


#supress scikit future warnings
def warn(*args, **kwargs):
    pass


import warnings
warnings.warn = warn

from numpy import mean, std
from sklearn.preprocessing import StandardScaler, scale
scaler = StandardScaler()
scaler.fit(x)
x_transformed = scaler.transform(x)
x_scaled = scale(x)
y_scaled = scale(y)
# print(x_scaled)
# print(x_transformed)
#print(mean(x))
#print(mean(x_scaled))
#print(mean(x_transformed))
#print(std(x))
#print(std(x_scaled))
#print(std(x_transformed))

#print(mean(x_scaled))
#print(std(x_scaled))
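
# (Editor's check: scale(x) and StandardScaler().fit(x).transform(x) perform the same
# standardization, which the assertion below makes explicit.)
import numpy as np
assert np.allclose(x_scaled, x_transformed)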
print("LASSO OF unstandardized :")
#print(clf.coef_)
print("\n", pretty_print_linear(clf.coef_))
print("Training score of LASSO with alpha {} is {} \n".format(
    alpha, clf.score(X_all_train, y_all_train)))
print("Testing score of LASSO with alpha {} is {} \n".format(
    alpha, clf.score(X_all_test, y_all_test)))
print("clf != 0 : ", sum(clf.coef_ != 0))
#

# #
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(dataset)
dataset = scaler.transform(dataset)
dataset = pd.DataFrame(
    dataset,
    columns=[
        'Age', 'Number of sexual partners', 'First sexual intercourse',
        'Num of pregnancies', 'Smokes', 'Smokes (years)',
        'Smokes (packs/year)', 'Hormonal Contraceptives',
        'Hormonal Contraceptives (years)', 'IUD', 'IUD (years)', 'STDs',
        'STDs (number)', 'STDs:condylomatosis', 'STDs:vaginal condylomatosis',
        'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis',
        'STDs:pelvic inflammatory disease', 'STDs:genital herpes',
        'STDs:molluscum contagiosum', 'STDs:HIV', 'STDs:Hepatitis B',
        'STDs:HPV', 'STDs: Number of diagnosis', 'Dx:Cancer', 'Dx:CIN',
        'Dx:HPV', 'Dx', 'Hinselmann', 'Schiller', 'Citology', 'Biopsy'
    ])
예제 #49
0
# Split the dataset into training and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

# Check the label counts

print("Labels y:", np.bincount(y))
print("Labels y_train:", np.bincount(y_train))
print("Labels y_test:", np.bincount(y_test))

# Feature scaling

print("// Training started //")
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
print("// Training finished //")

# Perceptron

ppn = Perceptron(max_iter=40, eta0=0.01, random_state=1)
ppn.fit(X_train_std, y_train)
y_pred = ppn.predict(X_test_std)

print("Misclassified samples: %d" % (y_test != y_pred).sum())

# Accuracy

print("Accuracy: %.2f" % accuracy_score(y_test, y_pred))
예제 #50
0
파일: pointwise.py 프로젝트: kkonevets/pat
from sklearn.preprocessing import StandardScaler
import matplotlib.pylab as plt
import seaborn as sns

train_val = ftrs_df['q'].unique()
t_ixs, v_ixs = train_test_split(train_val, test_size=0.2, random_state=SEED)
x_train = data.loc[ftrs_df['q'].isin(t_ixs)]
y_train = ftrs_df.loc[ftrs_df['q'].isin(t_ixs), 'rank']
x_val = data.loc[ftrs_df['q'].isin(v_ixs)]
y_val = ftrs_df.loc[ftrs_df['q'].isin(v_ixs), 'rank']

x_train, y_train = shuffle(x_train, y_train, random_state=SEED)
x_val, y_val = shuffle(x_val, y_val, random_state=SEED)

scaler = StandardScaler()
print(scaler.fit(x_train))

x_train = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns)
x_val = pd.DataFrame(scaler.transform(x_val), columns=x_val.columns)

plt.scatter(x_val['bm25'], x_val['tfidf_gs'], c=y_val, alpha=0.1)
plt.show()

from sklearn import linear_model

model = linear_model.LogisticRegression(C=1)
model.fit(x_train, y_train)
probs = model.predict_proba(x_val)[:, 0]

ones = probs[np.where(y_val == 1)]
twoes = probs[np.where(y_val == 2)]
예제 #51
0
x = wine.drop("quality", axis=1)

# remap the y labels --- (*2)
newlist = []
for v in list(y):
    if v <= 4:
        newlist += [0]
    elif v <= 7:
        newlist += [1]
    else:
        newlist += [2]
y = newlist

from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Train the model
model = RandomForestClassifier(n_estimators=800, max_features='log2', n_jobs=4)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)

# Evaluate
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))
print("accuracy =", accuracy_score(y_test, y_pred))
print("Score :", score)
#└ Here model.score and accuracy_score give the same number, because this is a classification model and its predictions are class labels compared directly against the test labels.
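# (Editor's check of the note above: for scikit-learn classifiers, .score() is accuracy.)
from sklearn.metrics import accuracy_score
assert abs(score - accuracy_score(y_test, y_pred)) < 1e-12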

# In[25]:


from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 33)


# In[26]:


from sklearn.preprocessing import StandardScaler
num_values1=data.select_dtypes(['float64','int64']).columns
scaler = StandardScaler()
scaler.fit(data[num_values1])
data[num_values1]=scaler.transform(data[num_values1])


# In[27]:


from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train,y_train)
y_pred_lr = lr.predict(X_test)


# In[28]:

def reconstruct():
    """
    run KFOLD method for regression 
    """
    #import packages
    import os
    import pandas as pd
    import statsmodels.api as sm
    from datetime import datetime
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    #defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/mlrReconstruction"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    #cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 173
    y = 174

    #looping through
    for tg in range(x, y):

        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        #standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index,
                   axis=0,
                   inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])),
                                  columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]],
                              axis=1)

        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized,
                              surge_new.iloc[:, :2],
                              on='date',
                              how='right')
        pred_surge.sort_values(by='date', inplace=True)

        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue


        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])

        #prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        {
            # #apply 10 fold cross validation
            # kf = KFold(n_splits=10, random_state=29)

            # metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
            # for train_index, test_index in kf.split(X):
            #     X_train, X_test = X_pca[train_index], X_pca[test_index]
            #     y_train, y_test = y['surge'][train_index], y['surge'][test_index]

            #     #train regression model
            #     lm = LinearRegression()
            #     lm.fit(X_train, y_train)

            #     #predictions
            #     predictions = lm.predict(X_test)
            #     # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #     #                       pd.DataFrame(np.array(y_test))], \
            #     #                      axis = 1)
            #     # pred_obs.columns = ['pred', 'obs']
            #     # combo = pd.concat([combo, pred_obs], axis = 0)

            #     #evaluation matrix - check p value
            #     if stats.pearsonr(y_test, predictions)[1] >= 0.05:
            #         print("insignificant correlation!")
            #         continue
            #     else:
            #         #print(stats.pearsonr(y_test, predictions))
            #         metric_corr.append(stats.pearsonr(y_test, predictions)[0])
            #         #print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
            #         metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))

            # # #number of years used to train/test model
            # num_years = np.ceil((pred_surge['date'][pred_surge.shape[0]-1] -\
            #                       pred_surge['date'][0]).days/365)
            # longitude = surge['lon'][0]
            # latitude = surge['lat'][0]
            # num_pc = X_pca.shape[1] #number of principal components
            # corr = np.mean(metric_corr)
            # rmse = np.mean(metric_rmse)

            # print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',\
            #       np.mean(metric_corr), ' -  avg_rmse (m) = ', \
            #       np.mean(metric_rmse), '\n')
        }

        num_pc = X_pca.shape[1]  #number of principal components
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]

        #surge reconstruction
        pred_for_recon = pred[~pred.isna().any(axis=1)]
        pred_for_recon = pred_for_recon.reset_index().drop('index', axis=1)

        #standardize predictor data
        dat = pred_for_recon.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat(
            [pred_for_recon['date'], dat_standardized], axis=1)

        X_recon = pred_standardized.iloc[:, 1:]

        #apply PCA
        pca = PCA(num_pc)  #use the same number of PCs used for training
        pca.fit(X_recon)
        X_pca_recon = pca.transform(X_recon)

        #model preparation
        #first train model using observed surge and corresponding predictors
        X_pca = sm.add_constant(X_pca)
        est = sm.OLS(y['surge'], X_pca).fit()

        #predict with X_recon and get 95% prediction interval
        X_pca_recon = sm.add_constant(X_pca_recon)
        predictions = est.get_prediction(X_pca_recon).summary_frame(alpha=0.05)

        #drop confidence interval and mean_se columns
        predictions.drop(['mean_se', 'mean_ci_lower','mean_ci_upper'], \
                         axis = 1, inplace = True)

        #final dataframe
        final_dat = pd.concat([pred_standardized['date'], predictions], axis=1)
        final_dat['lon'] = longitude
        final_dat['lat'] = latitude
        final_dat.columns = ['date', 'surge_reconstructed', 'pred_int_lower',\
                             'pred_int_upper', 'lon', 'lat']

        {
            # plot - optional
            # time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
            # final_dat['date'] = pd.DataFrame(list(map(time_stamp, final_dat['date'])), columns = ['date'])
            # surge['date'] = pd.DataFrame(list(map(time_stamp, surge['date'])), columns = ['date'])
            # sns.set_context('notebook', font_scale = 2)
            # plt.figure()
            # plt.plot(final_dat['date'], final_dat['mean'], color = 'green')
            # plt.scatter(surge['date'], surge['surge'], color = 'blue')
            # prediction intervals
            # plt.plot(final_dat['date'], final_dat['obs_ci_lower'], color = 'red',  linestyle = "--", lw = 0.8)
            # plt.plot(final_dat['date'], final_dat['obs_ci_upper'], color = 'red',  linestyle = "--", lw = 0.8)
            # confidence intervals
            # plt.plot(final_dat['date'], final_dat['mean_ci_upper'], color = 'black',  linestyle = "--", lw = 0.8)
            # plt.plot(final_dat['date'], final_dat['mean_ci_lower'], color = 'black',  linestyle = "--", lw = 0.8)
        }

        #save df as cs - in case of interruption
        os.chdir(dir_out)
        final_dat.to_csv(tg_name)
예제 #54
0
파일: Annex.py 프로젝트: CangelosiQ/Defi
def get_data_raw(scale,
                 add_dummies,
                 var_dummies,
                 TrainTestSplit=True,
                 sz_test=0.3,
                 impute_method='drop',
                 convert_month2int=False,
                 date_method='drop'):
    print('We are addressing your request.')
    listdir('./../data_meteo/')
    list_files = np.empty(36, dtype='|U12')
    i = 0
    for fichier in listdir('./../data_meteo/'):
        if 'train' in fichier:
            list_files[i] = fichier
            i = i + 1

    df = pd.DataFrame()
    for file in list_files:
        df = pd.concat([df, open_and_transform(file)])

    df = df.sort_values(by=['ech', 'date'], ascending=True)
    print('Data has been imported. Size:', df.shape)

    if convert_month2int:
        df = convert_month_to_int(df)
        print('Months converted to int.')

    if add_dummies:
        df_dummies = pd.get_dummies(df[var_dummies])
        df = pd.concat([df, df_dummies], axis=1)
        df = df.drop(var_dummies, axis=1)
        print('Dummies added.')

    if date_method == 'drop':
        df = df.drop(['date'], axis=1)
        print('Date dropped.')

    if impute_method == 'drop':
        N_before = df.shape[0]
        df = df.dropna(axis=0)
        N_after = df.shape[0]
        print("%d data points deleted. %0.2f %s" %
              (N_before - N_after, (N_before - N_after) / N_before * 100, '%'))

    if TrainTestSplit:
        Y = df['tH2_obs']
        X = df
        X = X.drop(['tH2_obs'], axis=1)  ## !!! Date?
        X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                            Y,
                                                            test_size=sz_test,
                                                            random_state=11)
        print('Train size: %d, Test size: %d' %
              (X_train.shape[0], X_test.shape[0]))

    if scale:
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        # Apply the same transformation to the test set
        X_test = scaler.transform(X_test)
        print('Data scaled')

    return X_train, X_test, Y_train, Y_test, scaler
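
# (Hypothetical usage of get_data_raw; 'mois' is a placeholder dummy-variable name, not taken
# from the source, and the call expects the ./../data_meteo/ files to be present.)
# X_train, X_test, Y_train, Y_test, scaler = get_data_raw(
#     scale=True, add_dummies=True, var_dummies=['mois'],
#     TrainTestSplit=True, sz_test=0.3, impute_method='drop',
#     convert_month2int=False, date_method='drop')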
예제 #55
0
파일: KNN.py 프로젝트: alidrd/Auction_data
# #isolation forest
# clf = IsolationForest(random_state=0, contamination="auto").fit(X_Y)
# inliers = clf.predict(X_Y)
# # covariance
# cov = EllipticEnvelope(random_state=0, contamination=0.2).fit(X_Y)
# inliers = cov.predict(X_Y)
print('Number of inliers:', str(len(inliers[inliers == 1])))

# keeping only the inliers (all variables)
X = country_data.loc[:, columns_to_consider].values.reshape(-1, len(columns_to_consider))
Y = country_data.loc[:, ['Price', 'Demand']].values.reshape(-1, 2)
X = X[inliers == 1, :]
Y = Y[inliers == 1, :]
# scaling X and Y
scaler_Y = StandardScaler()
scaler_Y.fit(Y)
Y_scaled = scaler_Y.transform(Y)
scaler_X = StandardScaler()
scaler_X.fit(X)
X_scaled = scaler_X.transform(X)
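# (Editor's note) the next two lines inflate the first scaled column so that it carries more
# weight in the KNN distance used by KNeighborsRegressor below.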
FR_RD_scaler = 2
X_scaled[:, 0] = FR_RD_scaler * X_scaled[:, 0]
# finding KNN design
neigh = KNeighborsRegressor()
param_grid = [{'n_neighbors': [5, 15, 52, 168], 'weights': ['uniform']}]
clf = GridSearchCV(neigh, param_grid, scoring='r2', cv=10, refit=True)  # scoring='neg_mean_squared_error'
clf.fit(X_scaled, Y_scaled)

# loading all data points (not just the inliers) -------------------------------
X = country_data.loc[:, columns_to_consider].values.reshape(-1, len(
    columns_to_consider))  # comment to ignore outliers in the predictions
예제 #56
0
    print("running", data_dir)

    if data_dir == "feat":
        print("Using only features..")
        feats = np.load(dataset_dir + "/dolphins-feats.npy")
        ## Logistic gets thrown off by big counts, so log transform num comments and score
        feats[:, 0] = np.log(feats[:, 0] + 1.0)
        feats[:, 1] = np.log(feats[:, 1] - min(np.min(feats[:, 1]), -1))
        feat_id_map = json.load(open(dataset_dir + "/dolphins-id_map.json"))
        feat_id_map = {int(id): val for id, val in feat_id_map.items()}  # .items() for Python 3
        train_feats = feats[[feat_id_map[id] for id in train_ids]]
        test_feats = feats[[feat_id_map[id] for id in test_ids]]
        print("Running regression..")
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        scaler.fit(train_feats)
        train_feats = scaler.transform(train_feats)
        test_feats = scaler.transform(test_feats)
        run_regression(train_feats, train_labels, test_feats, test_labels)
    else:
        embeds = np.load(data_dir + "/val.npy")
        id_map = {}
        with open(data_dir + "/val.txt") as fp:
            for i, line in enumerate(fp):
                id_map[int(line.strip())] = i
        train_embeds = embeds[[id_map[id] for id in train_ids]]
        test_embeds = embeds[[id_map[id] for id in test_ids]]

        print("Running regression..")
        run_regression(train_embeds, train_labels, test_embeds, test_labels)
        plt.scatter(x_test[:,0], x_test[:,1], facecolors='none', edgecolors='k', linewidth=1, marker='o', s=80, label='testSet')
         
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.legend(loc=2)
    plt.title(title)
    plt.show()

if __name__=='__main__':
    iris=datasets.load_iris()
    x=iris.data[:,[2,3]]
    y=iris.target
    
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=0)
    
    sc = StandardScaler()
    sc.fit(x_train) # calculate the mean and standard deviation of x_train
    x_train_std=sc.transform(x_train) # standardize x_train
    x_test_std=sc.transform(x_test) # standardize x_test
    
    
    ml=SVC(kernel='linear', C=10.0, gamma=0.10, random_state=0)
    ml.fit(x_train_std, y_train)
    y_pred = ml.predict(x_test_std)
    
    print('total test set : %d, total error : %d' %(len(y_test), (y_test != y_pred).sum()))
    print('accuracy : %.2f' %accuracy_score(y_test, y_pred))
    
    x_total = np.vstack((x_train_std, x_test_std)) # stack vertically
    y_total = np.hstack((y_train, y_test)) # stack horizontally
    plot_decision_region(x=x_total, y=y_total, classifier=ml, title='scikit-learn SVM RBF')
예제 #58
0
                        na_values='?',
                        engine="python").dropna()

X, Xt = train_data[columns[::-1]], test_data[columns[::-1]]
y = [-1 if s == '<=50K' else 1 for s in train_data["income"]]
yt = [-1 if s == '<=50K.' else 1 for s in test_data["income"]]

demographic_groups(X)
vq_demographic_groups(X)

# numerical columns : standardize
numcols = [
    'age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'
]
ss = StandardScaler()
ss.fit(X[numcols])
Xnum, Xtnum = ss.transform(X[numcols]), ss.transform(Xt[numcols])

# categorical columns: apply 1-hot-encoding
catcols = [
    'workClass', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'native-country'
]
enc = OneHotEncoder()
enc.fit(X[catcols])
Xcat, Xtcat = enc.transform(X[catcols]).toarray(), enc.transform(
    Xt[catcols]).toarray()
X, Xt = np.concatenate((Xnum, Xcat), axis=1), np.concatenate((Xtnum, Xtcat),
                                                             axis=1)

pca = PCA(n_components=10)
예제 #59
0
#Encode string columns as integer labels
le=LabelEncoder()
for col in convt_columns:
	le.fit(X[col].astype(str))
	X[col]=le.transform(X[col].astype(str))

# Filling of empty values
X=X.fillna(round(X.mean(),2))
Y=Y.fillna(round(Y.mean(),2))

#####################################################################################################
#Splitting data into train and test 
Train_X,Test_X,Train_Y,Test_Y=train_test_split(X,Y,test_size=0.20,random_state=30)
#Transorming data for SVM model
scaler=StandardScaler()
Train_X=scaler.fit_transform(Train_X)
Test_X=scaler.transform(Test_X)  # reuse the scaler fitted on the training data; never refit on the test set

#Training the model 
from sklearn.svm import SVR
model=SVR(kernel='rbf')
model.fit(Train_X,Train_Y)
pred=model.predict(Test_X)
print("Model: SVM \n")
print("Score :",round(model.score(Test_X,Test_Y),4))
print("Mean absolute error :",round(metrics.mean_absolute_error(Test_Y,pred),2))
print("_____________________________\n")
#####################################################################################################
# Uploading test dataset
df=pd.read_excel('Test_dataset.xlsx')
예제 #60
0
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)
dataset=MultiColumnLabelEncoder(columns = ['day','week','weather','holiday','Special','meal type']).fit_transform(dataset)
test_set_att=MultiColumnLabelEncoder(columns = ['day','week','weather','holiday','Special','meal type']).fit_transform(test_set_att)

scaler = StandardScaler()
print(scaler.fit(dataset))
dataset = scaler.transform(dataset)
print(dataset)
test_set_att = scaler.transform(test_set_att)

lin_reg = LinearRegression()
lin_reg.fit(dataset,dataset_label)
dataset_prediction=lin_reg.predict(test_set_att)
lin_mse = mean_squared_error(test_set_label,dataset_prediction)
lin_rmse = np.sqrt(lin_mse)
print(lin_rmse)


r2 = lin_reg.score(test_set_att, test_set_label)  # score() expects (X, y_true) and returns R^2 for a regressor
print(r2*100,'%')