def remove_outliers(image,mask):
	# apply the mask to the image to isolate the bee (foreground) region
	im = cv2.bitwise_and(image,image,mask=mask);
	ldp_image,_,_ = ldp.ldp(im);
	test_Y = ldp_image.reshape((ldp_image.shape[0] * ldp_image.shape[1], ldp_image.shape[2]));
	test_rgb = im.reshape((im.shape[0] * im.shape[1], im.shape[2]));
	test = np.concatenate((test_rgb,test_Y),axis=1);  # keep the same column order (RGB, LDP) as the training features below
	mask_not = cv2.bitwise_not(mask);
	ret1, mask_not = cv2.threshold (mask_not,np.mean(mask_not), 255, cv2.THRESH_BINARY);		
	im = cv2.bitwise_and(image,image,mask=mask_not);
	ldp_image,_,_ = ldp.ldp(im);	
	data_ldp = ldp_image.reshape((ldp_image.shape[0] * ldp_image.shape[1], ldp_image.shape[2]));
	data_rgb = im.reshape((im.shape[0] * im.shape[1], im.shape[2]));
	data = np.concatenate((data_rgb,data_ldp),axis=1);
	data = data[np.any(data!=0,axis=1)];	
	print(data.shape);
	data = data.astype('float64');		
	data = preprocessing.normalize(data,axis=0);
	ss = StandardScaler();	
	data = ss.fit_transform(data);
	clf = svm.OneClassSVM(nu=0.8, kernel="rbf", gamma=0.1)
	clf.fit(data);
	test = test.astype('float64');		
	test = preprocessing.normalize(test,axis=0);	
	print(test.shape);
	test = ss.transform(test);  # reuse the scaler fitted on the background pixels
	test = clf.predict(test);
	test = test.reshape((image.shape[0] , image.shape[1]));
	test[test==-1] = 0;
	test[test==1] = 255;
	test = test.astype('uint8');
	im = cv2.bitwise_and(image,image,mask=test);	
	im = cv2.bitwise_and(im,im,mask=mask);	
	#print test[:,0],test[:,1];	
	return(im,test);  
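A minimal usage sketch for the routine above, assuming the custom `ldp` module plus the cv2/numpy/sklearn imports used inside it are available; the file names are hypothetical:

import cv2

image = cv2.imread('bee_frame.png')                       # hypothetical input frame
mask = cv2.imread('bee_mask.png', cv2.IMREAD_GRAYSCALE)   # hypothetical binary mask
cleaned, keep_mask = remove_outliers(image, mask)
cv2.imwrite('bee_cleaned.png', cleaned)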
Example #2
class LinearXGB(ClippedMixin):
    trained = set()
    cache = {}

    def __init__(self, params, num_rounds):
        self.params = params
        self.scaler = StandardScaler(with_mean=False)
        self.num_rounds = num_rounds

    def fit(self, dense, svd, sparse, y):
        X_train = np.hstack((dense, svd))
        #X_train = hstack((X_train, sparse))
        train_hash = hash(str(X_train))
        if train_hash not in self.trained:
            X_scaled = self.scaler.fit_transform(X_train)
            X_scaled = normalize(X_scaled)
            dtrain = xgb.DMatrix(X_scaled, label=y)
            watchlist = [(dtrain, 'train')]
            self.bst = xgb.train(self.params, dtrain, self.num_rounds)#, watchlist)
            self.trained.add(train_hash)

    def predict(self, dense, svd, sparse):
        X_test = np.hstack((dense, svd))
        #X_test = hstack((X_test, sparse))
        test_hash = hash(str(X_test))
        if test_hash not in self.cache:
            #X_scaled = X_test
            X_scaled = self.scaler.fit_transform(X_test)
            X_scaled = normalize(X_scaled)
            dtest = xgb.DMatrix(X_scaled)
            #dtest = xgb.DMatrix(X_test)
            y_pred = self.bst.predict(dtest)
            self.cache[test_hash] = y_pred
        return self.cache[test_hash]
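A hedged usage sketch for LinearXGB, assuming `ClippedMixin` and `xgb` come from the surrounding project; the parameter values and array shapes are illustrative only (the `sparse` block is ignored because it is commented out in fit/predict):

import numpy as np

params = {'objective': 'reg:linear', 'eta': 0.1, 'max_depth': 6}   # illustrative settings
model = LinearXGB(params, num_rounds=200)

dense = np.random.rand(100, 20)
svd = np.random.rand(100, 10)
y = np.random.rand(100)

model.fit(dense, svd, sparse=None, y=y)
preds = model.predict(dense, svd, sparse=None)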
Example #3
def DBScan_Flux(phots, ycenters, xcenters, dbsClean=0, useTheForce=False):
    """Class methods are similar to regular functions.

    Note:
        Do not include the `self` parameter in the ``Args`` section.

    Args:
        param1: The first parameter.
        param2: The second parameter.

    Returns:
        True if successful, False otherwise.

    """
    
    dbsPhots    = DBSCAN()#n_jobs=-1)
    stdScaler   = StandardScaler()
    
    phots       = np.copy(phots.ravel())
    phots[~np.isfinite(phots)] = np.median(phots[np.isfinite(phots)])
    
    featuresNow = np.transpose([stdScaler.fit_transform(ycenters[:,None]).ravel(), \
                                stdScaler.fit_transform(xcenters[:,None]).ravel(), \
                                stdScaler.fit_transform(phots[:,None]).ravel()   ] )
    
    # print(featuresNow.shape)
    dbsPhotsPred= dbsPhots.fit_predict(featuresNow)
    
    return dbsPhotsPred == dbsClean
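A short usage sketch with synthetic photometry, assuming `phots`, `ycenters`, and `xcenters` are equally sized 1-D arrays; points that DBSCAN assigns to cluster `dbsClean` come back as True:

import numpy as np

n = 1000
ycenters = np.random.normal(15.0, 0.1, n)
xcenters = np.random.normal(15.0, 0.1, n)
phots = np.random.normal(1.0, 0.01, n)

clean_mask = DBScan_Flux(phots, ycenters, xcenters)
phots_clean = phots[clean_mask]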
Example #4
def prep_X_y(df, constant=False, split=True):
	cols_to_exclude = ['venue_state', 'venue_name', 'venue_country', 'venue_address', 'ticket_types', 'email_domain', 'description', 'previous_payouts', 'payee_name', 'org_name', 'org_desc', 'object_id', 'name', 'acct_type', 'country', 'listed', 'currency', 'payout_type', 'channels']

	if constant:
		df['const'] = 1

	X = df.drop(cols_to_exclude + ['fraud'], axis=1).values
	y = df['fraud'].values

	print 'columns used:\n', df.drop(cols_to_exclude + ['fraud'], axis=1).columns

	if split:
		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

		scaler = StandardScaler()
		X_train = scaler.fit_transform(X_train)
		X_test = scaler.transform(X_test)

		X_smoted, y_smoted = smote(X_train, y_train, target=.5)
		return X_smoted, X_test, y_smoted, y_test
	else:
		scaler = StandardScaler()
		X = scaler.fit_transform(X)
		X_smoted, y_smoted = smote(X, y, target=.5)
		return X_smoted, y_smoted
def logisticRegression():
    data = loadtxtAndcsv_data("data1.txt", ",", np.float64)
    X = data[:,0:-1]
    y = data[:,-1]

    # Split into training and test sets
    x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

    # Standardize features
    scaler = StandardScaler()
    # scaler.fit(x_train)
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)  # reuse the statistics fitted on the training set

    # Logistic regression
    model = LogisticRegression()
    model.fit(x_train,y_train)

    # Predict
    predict = model.predict(x_test)
    right = sum(predict == y_test)

    predict = np.hstack((predict.reshape(-1,1),y_test.reshape(-1,1)))   # stack predictions and ground truth side by side for easier inspection
    print(predict)
    print('Test set accuracy: %f%%'%(right*100.0/predict.shape[0]))          # accuracy on the test set
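The manual accuracy computation above could equally use the estimator's built-in scorer; a sketch with the same local names, intended as an alternative ending for the function:

    accuracy = model.score(x_test, y_test)      # fraction of correctly classified samples
    print('Test set accuracy: %f%%' % (accuracy * 100.0))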
Example #6
    def test_same_fit_transform(self):
        X, X_rdd = self.make_dense_rdd()

        local = StandardScaler()
        dist = SparkStandardScaler()

        X_trans = local.fit_transform(X)
        X_rdd_trans = dist.fit_transform(X_rdd).toarray()
        X_converted = dist.to_scikit().transform(X)

        assert_array_almost_equal(X_trans, X_rdd_trans)
        assert_array_almost_equal(X_trans, X_converted)

        local = StandardScaler(with_mean=False)
        dist = SparkStandardScaler(with_mean=False)

        X_trans = local.fit_transform(X)
        X_rdd_trans = dist.fit_transform(X_rdd).toarray()
        X_converted = dist.to_scikit().transform(X)

        assert_array_almost_equal(X_trans, X_rdd_trans)
        assert_array_almost_equal(X_trans, X_converted)

        local = StandardScaler(with_std=False)
        dist = SparkStandardScaler(with_std=False)

        X_trans = local.fit_transform(X)
        X_rdd_trans = dist.fit_transform(X_rdd).toarray()
        X_converted = dist.to_scikit().transform(X)

        assert_array_almost_equal(X_trans, X_rdd_trans)
        assert_array_almost_equal(X_trans, X_converted)
Example #7
def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False):
    """Generate a regression dataset with the given parameters."""
    if verbose:
        print("generating dataset...")

    X, y, coef = make_regression(n_samples=n_train + n_test,
                                 n_features=n_features, noise=noise, coef=True)

    random_seed = 13
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=n_train, random_state=random_seed)
    X_train, y_train = shuffle(X_train, y_train, random_state=random_seed)

    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)

    y_scaler = StandardScaler()
    y_train = y_scaler.fit_transform(y_train[:, None])[:, 0]
    y_test = y_scaler.transform(y_test[:, None])[:, 0]

    gc.collect()
    if verbose:
        print("ok")
    return X_train, y_train, X_test, y_test
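Example call, using only the arguments the signature defines:

X_train, y_train, X_test, y_test = generate_dataset(
    n_train=1000, n_test=200, n_features=20, noise=0.1, verbose=True)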
Example #8
class TrainValidSplitter(object):
    def __init__(self, standardize=True, few=False):
        self.standardize = standardize
        self.few = few
        self.standa = None

    def __call__(self, X, y, net):
        strati = StratifiedShuffleSplit(y = y, n_iter = 1, test_size = 0.2, random_state = 1234)
        
        train_indices, valid_indices = next(iter(strati))
        
        if self.standardize:
            self.standa = StandardScaler()
            if self.few:
                X_train = np.hstack((self.standa.fit_transform(X[train_indices,:23]), X[train_indices,23:]))
                X_valid = np.hstack((self.standa.transform(X[valid_indices,:23]), X[valid_indices,23:]))
            else:
                X_train = self.standa.fit_transform(X[train_indices])
                X_valid = self.standa.transform(X[valid_indices])
        else:
            X_train, X_valid = X[train_indices], X[valid_indices]
        
        y_train, y_valid = y[train_indices], y[valid_indices]
        
        return X_train, X_valid, y_train, y_valid
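A usage sketch with synthetic data; the `net` argument is not used by the splitter itself, so `None` can be passed (the old StratifiedShuffleSplit API used inside the class is assumed to be available):

import numpy as np

X = np.random.rand(500, 30).astype(np.float32)
y = np.random.randint(0, 2, 500)

splitter = TrainValidSplitter(standardize=True)
X_train, X_valid, y_train, y_valid = splitter(X, y, net=None)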
Example #9
def _transform_data():
    from solaris.run import load_data
    from solaris.models import LocalModel

    data = load_data()
    X = data['X_train']
    y = data['y_train']

    # no shuffle - past-future split
    offset = int(X.shape[0] * 0.5)
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]

    print('_' * 80)
    print('transforming data')
    print('')
    tf = LocalModel(None)
    print('transforming train')
    X_train, y_train = tf.transform(X_train, y_train)
    print('transforming test')
    X_test, y_test = tf.transform(X_test, y_test)
    print('fin')

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    scaler = StandardScaler()
    y_train = scaler.fit_transform(y_train)
    y_test = scaler.transform(y_test)

    data = {'X_train': X_train, 'X_test': X_test,
            'y_train': y_train, 'y_test': y_test}
    joblib.dump(data, 'data/dbndata.pkl')
Example #10
def batchScaling(in_root="raw", out_root="data", with_mean=True, with_std=True):

    Xy_files = filter(lambda x:x.endswith(".Xy.npz"), os.listdir(in_root))
    # Xy_files = ["image_rgb_gist.Xy.npz"]

    for Xy_file in Xy_files:

        in_path = os.path.join( in_root, Xy_file )
        out_path = os.path.join( out_root, Xy_file )

        print '> load %s' % ( in_path )

        data = np.load( in_path )
        
        ## detect sparse or dense
        _sparse = (len(data['X'].shape) == 0)

        print '> scaling'
        if _sparse:
            ## Cannot center sparse matrices: pass `with_mean=False` instead.
            print '>> Sparse matrix detected. Use with_mean=False'
            scaler = StandardScaler(with_mean=False, with_std=with_std)
            X = scaler.fit_transform( data['X'].all() )
        else:
            scaler = StandardScaler(with_mean=with_mean, with_std=with_std)
            X = scaler.fit_transform( data['X'] )

        
        print '> compressing and dumping to %s' % (out_path)
        np.savez_compressed(out_path, X=X, y=data['y'])

        print '='*50
def data_fr(novel_num):
    #if csv_file(novel, novel_num) is True:
    nn = str(novel_num)
    df_novel = pd.read_csv('novel_'+nn+'list_1.csv', header=None)
    try: 
        df_novel['wrd_length'] = df_novel[0].apply(wrd_lengths)
        df_novel['total_char'] = [sum(l) for l in df_novel['wrd_length']]
        df_novel['syl_count'] = df_novel[0].apply(syl_count)
        df_novel['syl_sum'] = [sum(l) for l in df_novel['syl_count']]
        df_novel['sentiment'] = df_novel[0].apply(detect_sentiment)
        #create csv for word to syl to improve syl function
        d = {}
        for l in df_novel[0]:
            sent = TextBlob(l)
            for x in sent.words:
                w = CountSyllables(x)
                d[x] = w
        with open('novel_'+nn+'list_1_syl.csv', 'wb') as f:
            writer = csv.writer(f)
            for row in d.iteritems():
                writer.writerow(row) 
        #create cluster columns
        df_cluster = df_novel.drop('wrd_length', 1)
        df_cluster = df_cluster.drop('syl_count', 1)
        X = df_cluster.drop(0, axis = 1)
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        km = KMeans(n_clusters=20, random_state=1)
        km.fit(X_scaled)
        df_cluster_20 = df_cluster.copy()
        df_cluster_20['cluster'] = km.labels_
        df_novel['cluster_20'] = df_cluster_20['cluster']
        #Create cluster 3
        df_cluster_3 = df_cluster.copy()
        X = df_cluster_3.drop(0, axis=1)
        X_scaled = scaler.fit_transform(X)
        km = KMeans(n_clusters = 3, random_state=1)
        km.fit(X_scaled)
        df_cluster_3['cluster'] = km.labels_
        df_novel['cluster_3_syl'] = df_cluster_3['cluster']
        #create cluster 3 no syl
        df_cluster_3no_syl = df_cluster.copy()
        X = df_cluster_3no_syl.drop(0, axis=1)
        X_scaled = scaler.fit_transform(X)
        km = KMeans(n_clusters=3, random_state=1)
        km.fit(X_scaled)
        df_cluster_3no_syl['cluster'] = km.labels_
        df_novel['cluster_3no_syl'] = df_cluster_3no_syl['cluster']
        #Create 5 clusters
        df_cluster_5 = df_cluster.copy()
        X = df_cluster_5.drop(0, axis=1)
        X_scaled = scaler.fit_transform(X)
        km = KMeans(n_clusters=5, random_state=1)
        km.fit(X_scaled)
        df_cluster_5['cluster'] = km.labels_
        df_novel['cluster_5'] = df_cluster_5['cluster']
        df_novel.to_csv('novel_'+nn+'list_1.csv', index=False)
    except:
        rejects_3.append(novel_num)
Example #12
def correlation_matching(I_tr, T_tr, I_te, T_te, n_comps):
    """ Learns correlation matching (CM) over I_tr and T_tr
        and applies it to I_tr, T_tr, I_te, T_te
        
        
        Parameters
        ----------
        
        I_tr: np.ndarray [shape=(n_tr, d_I)]
            image data matrix for training
        
        T_tr: np.ndarray [shape=(n_tr, d_T)]
            text data matrix for training
        
        I_te: np.ndarray [shape=(n_te, d_I)]
            image data matrix for testing
        
        T_te: np.ndarray [shape=(n_te, d_T)]
            text data matrix for testing
        
        n_comps: int > 0 [scalar]
            number of canonical components to use
            
        Returns
        -------
        
        I_tr_cca : np.ndarray [shape=(n_tr, n_comps)]
            image data matrix represented in correlation space

        T_tr_cca : np.ndarray [shape=(n_tr, n_comps)]
            text data matrix represented in correlation space

        I_te_cca : np.ndarray [shape=(n_te, n_comps)]
            image data matrix represented in correlation space

        T_te_cca : np.ndarray [shape=(n_te, n_comps)]
            text data matrix represented in correlation space

        """


    # scale image and text data
    I_scaler = StandardScaler()
    I_tr = I_scaler.fit_transform(I_tr)
    I_te = I_scaler.transform(I_te)

    T_scaler = StandardScaler()
    T_tr = T_scaler.fit_transform(T_tr)
    T_te = T_scaler.transform(T_te)

    cca = PLSCanonical(n_components=n_comps, scale=False)
    cca.fit(I_tr, T_tr)

    I_tr_cca, T_tr_cca = cca.transform(I_tr, T_tr)
    I_te_cca, T_te_cca = cca.transform(I_te, T_te)

    return I_tr_cca, T_tr_cca, I_te_cca, T_te_cca
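A self-contained call sketch with random matrices matching the documented shapes (PLSCanonical and StandardScaler are assumed to be imported in this module, as the function body requires):

import numpy as np

n_tr, n_te, d_I, d_T = 200, 50, 128, 64
I_tr, T_tr = np.random.rand(n_tr, d_I), np.random.rand(n_tr, d_T)
I_te, T_te = np.random.rand(n_te, d_I), np.random.rand(n_te, d_T)

I_tr_cca, T_tr_cca, I_te_cca, T_te_cca = correlation_matching(
    I_tr, T_tr, I_te, T_te, n_comps=10)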
    def train_test(self, X, y, X_test):
        """
        """
        sss = StratifiedShuffleSplit(y, 1, test_size=0.5)    
        for train_id, valid_id in sss:
            X0, X1 = X[train_id], X[valid_id]
            y0, y1 = y[train_id], y[valid_id]  
            
        #First half
        
        w0 = np.zeros(len(y0))
        for i in range(len(w0)):
            w0[i] = self.w[int(y0[i])]
        xg0_train = DMatrix(X0, label=y0, weight=w0)  
        xg0_test = DMatrix(X1, label=y1)   
        xgt_test = DMatrix(X_test)
        bst0 = my_train_xgboost(self.param, xg0_train, self.num_round)
        y0_pred = bst0.predict(xg0_test).reshape(X1.shape[0], 9)
        yt_pred = bst0.predict(xgt_test).reshape(X_test.shape[0], 9)
        
        #Calibrated RF
        rf = RandomForestClassifier(n_estimators=600, criterion='gini', 
                class_weight='auto', max_features='auto')
        cal = CalibratedClassifierCV(rf, method='isotonic', cv=3)
        cal.fit(X0, y0)
        y0_cal = cal.predict_proba(X1)
        yt_cal = cal.predict_proba(X_test)
        
        #Second half
        ss = StandardScaler()
        y0_pred = ss.fit_transform(y0_pred)
        yt_pred = ss.fit_transform(yt_pred)
        y0_cal = ss.fit_transform(y0_cal)
        yt_cal = ss.fit_transform(yt_cal)
        X1 = np.hstack((X1, y0_pred, y0_cal))
        X_test = np.hstack((X_test, yt_pred, yt_cal))  
        w1 = np.zeros(len(y1))
        
#        self.param['eta'] = 0.01
        self.num_round = 450

        for i in range(len(w1)):
            w1[i] = self.w[int(y1[i])]
        xg1_train = DMatrix(X1, label=y1, weight=w1)    
        xg_test= DMatrix(X_test)
        bst1 = my_train_xgboost(self.param, xg1_train, self.num_round)
        y_pred = bst1.predict(xg_test).reshape(X_test.shape[0], 9)
        
        return y_pred






                    
        
Example #14
def stack_features(params):

    """
    Get local features for all training images together
    """

    # Init detector and extractor
    detector, extractor = init_detect_extract(params)

    # Read image names
    with open(
        os.path.join(params["root"], params["root_save"], params["image_lists"], params["split"] + ".txt"), "r"
    ) as f:
        image_list = f.readlines()

    X = []
    for image_name in image_list:

        # Read image
        im = cv2.imread(
            os.path.join(params["root"], params["database"], params["split"], "images", image_name.rstrip())
        )

        # Resize image
        im = resize_image(params, im)

        feats = image_local_features(im, detector, extractor)
        # Stack all local descriptors together

        if feats is not None:
            if len(X) == 0:

                X = feats
            else:
                X = np.vstack((X, feats))

    if params["normalize_feats"]:
        X = normalize(X)

    if params["whiten"]:

        pca = PCA(whiten=True)
        X = pca.fit_transform(X)  # keep the whitened descriptors

    else:
        pca = None

    # Scale data to 0 mean and unit variance
    if params["scale"]:

        scaler = StandardScaler()

        X = scaler.fit_transform(X)  # keep the scaled descriptors
    else:
        scaler = None

    return X, pca, scaler
Example #15
def main():

    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data',
            header = None,
            sep = '\s+')
    df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM',
            'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B',
            'LSTAT', 'MEDV']
    print(df.head())

    # Select a subset of the features and plot the correlation between features
    cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
    sns.pairplot(df[cols], size=2.5);
    plt.title('Correlations between 5 features')
    plt.show()

    # Plot a heatmap of the same subset of features
    cm = np.corrcoef(df[cols].values.T)
    sns.set(font_scale=2.5)
    hm = sns.heatmap(cm,
            cbar = True,
            annot = True,
            square = True,
            fmt = '.2f',
            annot_kws = {'size': 15},
            yticklabels = cols,
            xticklabels = cols)
    plt.show()

    X = df[['RM']].values
    y = df['MEDV'].values

    sc_x = StandardScaler()
    sc_y = StandardScaler()

    X_std = sc_x.fit_transform(X)
    y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten()
    
    lr = LinearRegressionGD()
    lr.fit(X_std, y_std)

    plt.plot(range(1, lr.n_iter + 1), lr.cost_)
    plt.ylabel('SSE')
    plt.xlabel('Epoch')
    plt.show()

    lin_regplot(X_std, y_std, lr)
    plt.xlabel('Average number of rooms [RM] (standardized)')
    plt.ylabel('Price in $1000\'s [MEDV] (standardized)')
    plt.show()
    
    # Example prediction for a house with 5 rooms
    num_rooms_std = sc_x.transform([[5.0]])
    price_std = lr.predict(num_rooms_std)
    print("Price in $1000's: %.3f" % \
            sc_y.inverse_transform(price_std))
    def train_validate(self, X_train, y_train, X_valid, y_valid):
        """
        """
        sss = StratifiedShuffleSplit(y_train, 1, test_size=0.5)    
        for train_id, valid_id in sss:
            X0_train, X1_train = X_train[train_id], X_train[valid_id]
            y0_train, y1_train = y_train[train_id], y_train[valid_id]  
            
        #First half
       
        w0_train = np.zeros(len(y0_train))
        for i in range(len(w0_train)):
            w0_train[i] = self.w[int(y0_train[i])]
        xg0_train = DMatrix(X0_train, label=y0_train, weight=w0_train)  
        xg0_valid = DMatrix(X1_train, label=y1_train)   
        xgv_valid = DMatrix(X_valid, label=y_valid)
        watchlist = [(xg0_train,'train'), (xg0_valid, 'validation0')]
        
#        bst0 = train(self.param, xg0_train, self.num_round, watchlist)
        bst0 = my_train_xgboost(self.param, xg0_train, self.num_round, watchlist)
        y0_pred = bst0.predict(xg0_valid).reshape(X1_train.shape[0], 9)
        yv_pred = bst0.predict(xgv_valid).reshape(X_valid.shape[0], 9)
        
        #Calibrated RF
        rf = RandomForestClassifier(n_estimators=600, criterion='gini', 
                                    class_weight='auto', max_features='auto')
        cal = CalibratedClassifierCV(rf, method='isotonic', cv=3)        
        cal.fit(X0_train, y0_train)
        y0_cal = cal.predict_proba(X1_train)
        yv_cal = cal.predict_proba(X_valid)
        
        #Second half
        ss = StandardScaler()
        y0_pred = ss.fit_transform(y0_pred)
        yv_pred = ss.fit_transform(yv_pred)
        y0_cal = ss.fit_transform(y0_cal)
        yv_cal = ss.fit_transform(yv_cal)
        X1_train = np.hstack((X1_train, y0_pred, y0_cal))
        X_valid = np.hstack((X_valid, yv_pred, yv_cal))        
        w1_train = np.zeros(len(y1_train))
        
#        self.param['eta'] = 0.05
        self.num_round = 450

        for i in range(len(w1_train)):
            w1_train[i] = self.w[int(y1_train[i])]
        xg1_train = DMatrix(X1_train, label=y1_train, weight=w1_train)    
        xg_valid = DMatrix(X_valid, label=y_valid)
        watchlist = [(xg1_train,'train'), (xg_valid, 'validation')]
        
#        bst1 = train(self.param, xg1_train, self.num_round, watchlist)
        bst1 = my_train_xgboost(self.param, xg1_train, self.num_round, watchlist)
        y_pred = bst1.predict(xg_valid).reshape(X_valid.shape[0], 9)

#        pdb.set_trace()
        return y_pred
def perform_scaling (features, scaling = 'standard') :
	if (scaling == 'standard') :
		print ("Performing standard scaling")
		scaler = StandardScaler()
	else :
		print ("Performing min-max scaling")
		scaler = MinMaxScaler()

	features = scaler.fit_transform(features)
	print ("Completed %s Scaler fit!" %(scaling))
	return features
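Note that the function above fits the scaler on whatever it is given; when separate training and test sets exist, the usual scikit-learn pattern is to fit on the training portion only and reuse the learned statistics. A small sketch of that pattern:

import numpy as np
from sklearn.preprocessing import StandardScaler

train_features = np.random.rand(100, 5)
test_features = np.random.rand(20, 5)

scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_features)   # fit on training data only
test_scaled = scaler.transform(test_features)         # reuse the fitted mean/std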
Example #18
def main():


    if REDUCE_SIZE:
        TEST_OUTPUT_DATA_FILE=os.path.join(OUTPUT_DATA_PATH, 'test_RS.csv')
        TRAIN_OUTPUT_DATA_FILE=os.path.join(OUTPUT_DATA_PATH, 'train_RS.csv')
    else:
        TEST_OUTPUT_DATA_FILE=os.path.join(OUTPUT_DATA_PATH, 'test_FS.csv')
        TRAIN_OUTPUT_DATA_FILE=os.path.join(OUTPUT_DATA_PATH, 'train_FS.csv')

    #
    #  Process Training Data
    #

    training_data = processs_image_data(TRAINING_FILE, reduce_size = REDUCE_SIZE, is_test_data = False)
    column_names = list(training_data.columns)

    #
    #  Scale (z-score) features; save the fitted scaler to reuse on the test data
    #

    y = training_data['label']
    X = training_data.drop('label', axis=1)
    scalar = StandardScaler().fit(X)
    X = scalar.transform(X)  # the scaler is already fitted on the line above

    scaled_data = np.column_stack((y, X))
    scaled_training_data = pd.DataFrame(data=scaled_data, columns=column_names)



    scaled_training_data.to_csv(TRAIN_OUTPUT_DATA_FILE, index=False)
    print('Samples: %d, attributes: %d' %(scaled_training_data.shape[0],
        scaled_training_data.shape[1]))
    print('Training Data saved to %s' % (TRAIN_OUTPUT_DATA_FILE))

    #
    #   Process Test Data
    #

    test_data = processs_image_data(TEST_FILE, reduce_size = REDUCE_SIZE, is_test_data = True)
    column_names = list(test_data.columns)

    #
    #  Apply scaling transform
    #

    scaled_data = scalar.transform(test_data)  # apply the scaler fitted on the training data
    scaled_test_data = pd.DataFrame(data=scaled_data, columns=column_names)

    scaled_test_data.to_csv(TEST_OUTPUT_DATA_FILE, index=False)
    print('Samples: %d, attributes: %d' %(scaled_test_data.shape[0],
        scaled_test_data.shape[1]))
    print('Test Data saved to %s' % (TEST_OUTPUT_DATA_FILE))
Example #19
def fit_svm(train_y, train_x, test_x, c=None, gamma=None):
    """
    Returns a DataFrame of svm results, containing
    prediction strain labels and printing the best model. The
    model's parameters will be tuned by cross validation, and
    accepts user-defined parameters.
    Parameters
    ----------
    train_y: pandas.Series
             labels of classification results, which are predicted strains.
    train_x: pandas.DataFrame
             features used to predict strains in training set
    test_x: pandas.DataFrame
            features used to predict strains in testing set
    c: list, optional
       tuning parameter of svm, which is penalty parameter of the error term
    gamma: list, optional
           tuning parameter of svm, which is kernel coefficient
    Returns
    ----------
    svm results: pandas.DataFrame
        Prediction strain labels
    """
    # input validation
    if c is not None:
        if not isinstance(c, list):
            raise TypeError("c should be a list")
    if gamma is not None:
        if not isinstance(gamma, list):
            raise TypeError("gamma should be a list")
    # creat svm model
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    Cs = c
    Gammas = gamma
    if c is None:
        Cs = list(np.logspace(-6, -1, 10))
    if gamma is None:
        Gammas = list(np.linspace(0.0001, 0.15, 10))
    svc = svm.SVC()
    clf = GridSearchCV(estimator=svc, param_grid=dict(C=Cs, gamma=Gammas),
                       n_jobs=-1)
    clf.fit(train_x, train_y)
    clf = clf.best_estimator_
    # fit the best model
    clf.fit(train_x, train_y)
    # predict the testing data and convert to data frame
    prediction = clf.predict(scaler.transform(test_x))
    prediction = pd.DataFrame(prediction)
    prediction.columns = ['predict_strain']
    print('The best SVM Model is:')
    print(clf)
    return prediction
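Usage sketch, assuming `train_x`/`test_x` are feature DataFrames and `train_y` a label Series as described in the docstring (random data here, so the output is meaningless):

import numpy as np
import pandas as pd

train_x = pd.DataFrame(np.random.rand(60, 4))
train_y = pd.Series(np.random.choice(['strainA', 'strainB'], 60))
test_x = pd.DataFrame(np.random.rand(15, 4))

prediction = fit_svm(train_y, train_x, test_x, c=[0.1, 1, 10], gamma=[0.01, 0.1])
print(prediction.head())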
def artificial_linear(ldsvm, X, y):
    params_dist_svm = {
            "C"        : [1, 2**2, 5, 2**3, 2**4],
            "c"        : [5, 10, 12, 16, 18],
            "max_iter" : [400],
            "step"     : [10]
            }


    params_svm = {
            "C"        : [0.1, 0.3, 0.5, 1, 2, 3, 6, 2**3],
            "max_iter" : [400],
            "penalty"  : ['l1'],
            "dual"     : [False]
            }

    local_risk, central_risk, ldsvm_risk = get_risks(ldsvm, params_svm, params_dist_svm, X, y)

    print(">-------------Best Risks from Grid Search---------------------<")
    print("Risk Local         --> ", local_risk)
    print("Risk LDSVM         --> ", ldsvm_risk)
    print("Risk Central       --> ", central_risk)

    gs      = GridSearchCV(LinearSVC(), params_svm)
    scaler  = StandardScaler()

    ldsvm.network.split_data(X, y, stratified=False)
    local_data    = str(datas_path) + "/data_0.csv"
    local_class   = str(datas_path) + "/class_0.csv"
    X_local       = pd.read_csv(local_data).values
    y_local       = pd.read_csv(local_class).values.T[0]
    X_local_scale = scaler.fit_transform(X_local)
    X_scale       = scaler.fit_transform(X)

    params_dist_best = ldsvm.grid_search(X, y, params_dist_svm, stratified=False)
    gs.fit(X_local, y_local)
    params_local_best = gs.best_params_
    gs.fit(X, y)
    params_central_best = gs.best_params_

    ldsvm.set_params(**params_dist_best)
    ldsvm.fit(X_scale, y, stratified=False)
    local_model   = LinearSVC(**params_local_best).fit(X_local_scale, y_local)
    central_model = LinearSVC(**params_central_best).fit(X_scale, y)

    print(">-------------Best Parameters for Whole data Set--------------<")
    print("Parameters Local   --> ", params_local_best)
    print("Parameters LDSVM   --> ", params_dist_best)
    print("Parameters Central -->", params_central_best)

    analysis.plot_planes(X, y, local_model, central_model, ldsvm)
    analysis.plot_dispersion(ldsvm)
def make_scaler(subject):
    
    raw = []
    fnames =  glob('../data/subj%d_series[1-7]_data.csv' % (subject))
    for fname in fnames:
        data, _ = prepare_data_train(fname)
        raw.append(data)
    X = pd.concat(raw)
    X = np.asarray(X.astype(float))
    
    scaler = StandardScaler()
    scaler.fit(X)  # only the fitted statistics are needed; the transformed copy was being discarded
    return scaler
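Usage sketch; the per-subject series CSVs and `prepare_data_train` come from the surrounding project, so this is illustrative only:

scaler = make_scaler(1)
# apply the learned statistics to a new recording with the same column layout:
# X_new_scaled = scaler.transform(X_new)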
Example #22
File: run_model.py  Project: t36li/FINRA
def logistic_regression(x_train,y_train,x_test,penalty='L2', regularization=1.0, do_CV=False):
	from sklearn.linear_model import LogisticRegression
	from sklearn.cross_validation import KFold

	### Standardize variables before regression ###
	from sklearn.preprocessing import StandardScaler
	ss=StandardScaler()
	x_train=ss.fit_transform(x_train)
	x_test=ss.transform(x_test)

	lr=LogisticRegression()	
	
	if penalty=='L1':
		lr = LogisticRegression(penalty='l1')
		filename="Lasso_submission.csv"
	else:
		lr = LogisticRegression(penalty='l2')
		filename="Ridge_submission.csv"
	
	if do_CV:
		Cs=np.logspace(-1.5, 1.5, 10)
		cv_list=list()

		### Fit lasso to various choices of regularization parameter C to select optimal C
		for c in Cs:
			lr.C = c
			print 'Running K-fold CV with lambda = %.5f' % (1.0/c)
			cv_scores=udf.cross_val_score_proba(x_train,y_train,5,lr)
			cv_list.append(np.mean(cv_scores))

		print 'Best lambda based on Cross-Validation...'
		max_score=np.max(cv_list)
		max_lambda=Cs[cv_list.index(max_score)]
		print 1.0/max_lambda, max_score
	else:
		print 'Making prediction with optimal lambda....'
		lr.C=1.0/regularization
		lr.fit(x_train,y_train)
		y_pred=lr.predict_proba(x_test)[:,1]

		print 'Coefficients of the regression:'
		print lr.coef_

		print 'Writing submission file....'
		with open(filename,'wb') as testfile:
			w=csv.writer(testfile)
			w.writerow(('Id','Probability'))
			for i in range(len(y_pred)):
				w.writerow(((i+1),y_pred[i]))
		testfile.close()
		print 'File written to disk...'
Example #23
File: tslib.py  Project: kingishb/tslib
    def _create_features(self):
        standard_scaler = StandardScaler()

        # throw away 1st point
        train = self.train_features.diff().iloc[1:]
        test = self.test_features.diff().iloc[1:]

        scaled_train = pd.DataFrame(index=train.index, data=standard_scaler.fit_transform(train.values))

        scaled_test = pd.DataFrame(index=test.index, data=standard_scaler.transform(test.values))

        self.normalized_differenced_train_features = scaled_train
        self.normalized_differenced_test_features = scaled_test
        return
Example #24
def main():
    data = np.genfromtxt('housing.csv', delimiter=',')

    data = np.hstack((np.ones((data.shape[0], 1)), data))

    # indexes = np.random.permutation(data.shape[0])

    # data = data[indexes, :].astype(float)

    c = 400

    train_x = data[:-1, :c].T
    train_y = data[-1, :c]

    sc_x = StandardScaler()
    sc_y = StandardScaler()
    X_std = sc_x.fit_transform(train_x)
    y_std = sc_y.fit_transform(train_y.reshape(-1, 1))

    m, n = train_x.shape
    train_y = train_y.reshape(m, 1)

    test_x = data[:-1, c + 1:].T
    test_y = data[-1, c + 1:]
    test_y = test_y.reshape(test_y.shape[0], 1)

    theta = np.random.random(n).reshape(n, 1)

    res = fmin_cg(linear_regression, theta, fprime=gradient, args=(X_std, y_std, m, n), maxiter=200, disp=True)

    Theta = res
    print('Theta: %s' % str(Theta))

    actual_prices = y_std
    predicted_prices = X_std.dot(Theta.T).reshape(train_x.shape[0], 1)

    train_rms = math.sqrt(np.power(predicted_prices - actual_prices, 2).mean())
    print('RMS training error: %f' % (train_rms))

    test_x_std = sc_x.transform(test_x)
    test_y_std = sc_y.transform(test_y)
    actual_prices = test_y_std
    predicted_prices = test_x_std.dot(Theta.T).reshape(test_x_std.shape[0], 1)

    test_rms = math.sqrt(np.power(predicted_prices - actual_prices, 2).mean())
    print('RMS testing error: %f' % (test_rms))

    plot(actual_prices, predicted_prices)
Example #25
    def transformTestData(self, train_data, test_data):
        #Select the right features for both training and testing data
        X_train, y_train = self.__selectRelevantFeatures(train_data)
        X_test, y_test = self.__selectRelevantFeatures(test_data)

        #Transform categorical variables into integer labels
        martial_le = LabelEncoder()
        occupation_le = LabelEncoder()
        relationship_le = LabelEncoder()
        race_le = LabelEncoder()
        sex_le = LabelEncoder()
        transformers = [martial_le, occupation_le, relationship_le, race_le, sex_le]

        for i in range(len(transformers)):
            X_train[:,i] = transformers[i].fit_transform(X_train[:,i])
            X_test[:,i] = transformers[i].transform(X_test[:,i])

        #Dummy code categorical variables
        dummy_code = OneHotEncoder(categorical_features = range(5))
        X_train = dummy_code.fit_transform(X_train).toarray()
        X_test = dummy_code.transform(X_test).toarray()

        #Normalize all features
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        #Encode y
        class_le = LabelEncoder()
        y_train = class_le.fit_transform(y_train)
        y_test = class_le.transform(y_test)
        #print class_le.transform(["<=50K", ">50K"])

        return X_train, X_test, y_train, y_test
Example #26
def buildTreeRegressor(predictorColumns, structurestable = 'structures.csv',  targetcolumn = 'c_a', md = None):
    """
    Build a random-forest regressor to predict a structure feature from compositional data. Returns the model trained on all data, a mean absolute error score (averaged over cross-validation), and a table of true vs. predicted values.
    """
    df = pd.read_csv(structurestable)
    df = df.dropna()
    if('fracNobleGas' in df.columns):
        df = df[df['fracNobleGas'] <= 0]
    
    s = StandardScaler()
    
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = df[targetcolumn].values

    rfr = RandomForestRegressor(max_depth = md)
    acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error)))

    X_train, X_test, y_train, y_test = train_test_split(X,y)
    rfr.fit(X_train,y_train)
    y_predict = rfr.predict(X_test)
    
    t = pd.DataFrame({'True':y_test, 'Predicted':y_predict})
    
    rfr.fit(X, y)

    return rfr, t, round(acc,2)
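A hedged call sketch; the predictor column names are placeholders for whatever the structures table actually contains:

predictors = ['meanAtomicWeight', 'meanElectronegativity']   # hypothetical column names
model, results_table, score = buildTreeRegressor(predictors,
                                                 structurestable='structures.csv',
                                                 targetcolumn='c_a')
print(results_table.head(), score)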
Example #27
def buildTreeClassifier(predictorColumns, structurestable = 'structures.csv',  targetcolumn = 'pointGroup', md = None):
    """
    Build a random-forest classifier to predict a structure feature from compositional data. Returns the model trained on all data, a confusion matrix from a held-out split, an average cross-validation accuracy score, and the fitted label encoder.
    """
    df = pd.read_csv(structurestable)
    df = df.dropna()
    if('fracNobleGas' in df.columns):
        df = df[df['fracNobleGas'] <= 0]
    
    s = StandardScaler()
    le = LabelEncoder()
    
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = le.fit_transform(df[targetcolumn].values)

    rfc = RandomForestClassifier(max_depth = md)
    acc = mean(cross_val_score(rfc, X, y))

    X_train, X_test, y_train, y_test = train_test_split(X,y)
    rfc.fit(X_train,y_train)
    y_predict = rfc.predict(X_test)
    cm = confusion_matrix(y_test, y_predict)
    
    cm = pd.DataFrame(cm, columns=le.classes_, index=le.classes_)

    rfc.fit(X, y)

    return rfc, cm, round(acc,2), le
Example #28
def buildCoordinationTreeRegressor(predictorColumns, element, coordinationDir = 'coordination/', md = None):
    """
    Build a coordination predictor for a given element from compositional structure data of structures containing that element. Will return a model trained on all data, a mean_absolute_error score, and a table of true vs. predicted values
    """
    try:
        df = pd.read_csv(coordinationDir + element + '.csv')
    except Exception:
        print 'No data for ' + element
        return None, None, None
    df = df.dropna()
    if('fracNobleGas' in df.columns):
        df = df[df['fracNobleGas'] <= 0]
    
    if(len(df) < 4):
        print 'Not enough data for ' + element
        return None, None, None
    s = StandardScaler()
    
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = df['avgCoordination'].values

    rfr = RandomForestRegressor(max_depth = md)
    acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error)))

    X_train, X_test, y_train, y_test = train_test_split(X,y)
    rfr.fit(X_train,y_train)
    y_predict = rfr.predict(X_test)
    
    t = pd.DataFrame({'True':y_test, 'Predicted':y_predict})
    
    rfr.fit(X, y)

    return rfr, t, round(acc,2)
Example #29
def train_and_test(train_books, test_books, train, scale=True):
    X_train, y_train, cands_train, features = get_pair_data(train_books, True)
    X_test, y_test, cands_test, features = get_pair_data(test_books)

    scaler = None
    if scale:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    print sum(y_train)*0.1/len(y_train)
    print 'Start training'
    print X_train.shape
    clf = train(X_train, y_train)
    print 'Done training'
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)

    '''
    # print performance for training books
    print "--------------Traning data-------------"
    train_perf = evaluate_books(clf, train_books, scaler, evaluate_pair)

   # print performance for testing books
    print "\n"
    print "--------------Testing data-------------"
    test_perf = evaluate_books(clf, test_books, scaler, evaluate_pair)
    '''
    print 'Train Non-unique Precision:', precision(y_train_pred, y_train), 'Non-unique Recall:', recall(y_train_pred, y_train)
    print 'Test Non-unique Precision:', precision(y_test_pred, y_test), 'Recall:', recall(y_test_pred, y_test)
    return clf, scaler, X_train, y_train, X_test, y_test
Example #30
def main():
    
    t0 = time.time() # start time

    # output files path
    TRAINX_OUTPUT = "../../New_Features/train_x_processed.csv"
    TEST_X_OUTPUT = "../../New_Features/test__x_processed.csv"
    # input files path
    TRAIN_FILE_X1 = "../../ML_final_project/sample_train_x.csv"
    TRAIN_FILE_X2 = "../../ML_final_project/log_train.csv"
    TEST__FILE_X1 = "../../ML_final_project/sample_test_x.csv"
    TEST__FILE_X2 = "../../ML_final_project/log_test.csv"
    # load files
    TRAIN_DATA_X1 = np.loadtxt(TRAIN_FILE_X1, delimiter=',', skiprows=1, usecols=(range(1, 18)))
    TEST__DATA_X1 = np.loadtxt(TEST__FILE_X1, delimiter=',', skiprows=1, usecols=(range(1, 18)))
    TRAIN_DATA_X2 = logFileTimeCount(np.loadtxt(TRAIN_FILE_X2, delimiter=',', skiprows=1, dtype=object))
    TEST__DATA_X2 = logFileTimeCount(np.loadtxt(TEST__FILE_X2, delimiter=',', skiprows=1, dtype=object))
    # combine files
    TRAIN_DATA_X0 = np.column_stack((TRAIN_DATA_X1, TRAIN_DATA_X2))
    TEST__DATA_X0 = np.column_stack((TEST__DATA_X1, TEST__DATA_X2))
    # data preprocessing
    scaler = StandardScaler()
    TRAIN_DATA_X = scaler.fit_transform(TRAIN_DATA_X0)
    TEST__DATA_X = scaler.transform(TEST__DATA_X0)
    # output processed files
    outputXFile(TRAINX_OUTPUT, TRAIN_DATA_X)
    outputXFile(TEST_X_OUTPUT, TEST__DATA_X)

    t1 = time.time() # end time
    print "...This task costs " + str(t1 - t0) + " seconds."
Example #31
def scalar_transform(x):
    #print(x)
    scaler = StandardScaler()
    #scaler.fit(x)
    return scaler.fit_transform([x])[0]
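Fitting a StandardScaler on a single sample, as above, zero-centres that sample to all zeros, because the sample is its own mean. If the intent is to standardize one sample against statistics learned from the whole dataset, a sketch of the usual alternative:

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.random.rand(200, 8)           # full data matrix (illustrative)
scaler = StandardScaler().fit(X)     # learn per-column mean and std once

def scalar_transform_fitted(x):
    # standardize a single sample with the statistics learned from the full matrix
    return scaler.transform(np.asarray(x).reshape(1, -1))[0]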
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
# eval
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error


raw_boston = datasets.load_boston()

X = raw_boston.data
y = raw_boston.target

X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state = 42) 

std_scale = StandardScaler()
X_tn_std = std_scale.fit_transform(X_tn)
X_te_std = std_scale.transform(X_te)

clf_lr = LinearRegression()
clf_lr.fit(X_tn_std, y_tn)
print('clf_lr.coef_ : ', clf_lr.coef_)
print('clf_lr.intercept_ : ', clf_lr.intercept_)

clf_ridge = Ridge(alpha = 1)
clf_ridge.fit(X_tn_std, y_tn)
print('clf_ridge.coef_ : ', clf_ridge.coef_)
print('clf_ridge.intercept_ : ', clf_ridge.intercept_)

clf_lasso = Lasso(alpha = 0.01)
clf_lasso.fit(X_tn_std, y_tn)
print('clf_lasso.coef_ : ', clf_lasso.coef_)
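`r2_score` and `mean_squared_error` are imported above but not used in the visible excerpt; a plausible evaluation step on the held-out split:

pred_lasso = clf_lasso.predict(X_te_std)
print('R2  : ', r2_score(y_te, pred_lasso))
print('MSE : ', mean_squared_error(y_te, pred_lasso))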
Example #33
y = dataset.iloc[:, 560].values
y=y-1
y_train = keras.utils.to_categorical(y)

X = dataset.iloc[:, 0:559]
y = dataset.iloc[:, 560]


XT = dataset.iloc[:, 0:559].values
yT = dataset.iloc[:, 560].values
yT=yT-1
y_test = keras.utils.to_categorical(yT)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(X)
x_test = sc.transform(XT)  # reuse the scaler fitted on the training features

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD

model = Sequential()
model.add(Dense(64, activation='relu', input_dim=559))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(6, activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
                          'req_method', 'req_dir', 'req_http_header',
                          'status_code', 'bytes_trans'
                      ])
#we only need the IP Address & Status Code
dataset = dataset[['IP', 'status_code']]
#modifying dataset by aggregating count of status code against IP Address
dataset = dataset.groupby(
    ['IP',
     'status_code']).status_code.agg('count').to_frame('Total').reset_index()
#Insert an index column; without it pandas raises "Shape of passed values is (13, 2), indices imply (13, 3)"
dataset.insert(0, 'IndexNo', range(len(dataset)))
#Drop the IP column; the index number stands in for the IP address so that it can be scaled
train_data = dataset.drop(['IP'], axis=1)

sc = StandardScaler()
scaled_data = sc.fit_transform(train_data)
#Three clusters are used here; a small odd number helps avoid points falling exactly between two clusters,
#but the right cluster count depends entirely on the data
model = KMeans(n_clusters=3)
pred = model.fit_predict(scaled_data)
#here IP_Scaled is really the scaled IndexNo, because the IP address itself is a string
pred_ds = pd.DataFrame(
    scaled_data, columns=['IP_Scaled', 'status_code_Scaled', 'Total_Scaled'])
pred_ds['Cluster'] = pred
ds = pd.concat([dataset, pred_ds], axis=1, sort=False)
#Here we are creating graph of Request per IP vs Count
Graph = pxp.scatter(ds,
                    'Total',
                    'IP',
                    'Cluster',
                    hover_data=['status_code'],
             acc_val.append(acc1/5)



 #print the best parameters
 loc=acc_val.index(max(acc_val))
 print(max(acc_val))
 print(parameters[loc])


#training all the training data and write the parameters of model to file
smote=SMOTE(k_neighbors=5)
X_train,y_train=smote.fit_resample(TCGA_data,TCGA_label)

std=StandardScaler()
X_train=std.fit_transform(X_train)

#training model
ga,ma_num,mi_num=parameters[loc][0],parameters[loc][1],parameters[loc][2]
clf_xg=XGBClassifier(gamma=ga,max_depth=ma_num,min_child_weight=mi_num,
                     learning_rate=0.4,booster='gbtree',n_jobs=-1)
clf_xg.fit(X_train,y_train)


import pickle
with open("./model_file/TCGA_clf_xg_all.pickle",'wb') as f:
    pickle.dump(clf_xg,f)


 #standardize the testing data
 TCGA_test=pd.read_csv(loc_path+"TCGA_dataset/gene_sel_data/test.csv",header=None)
Example #36
#df_rune = pd.DataFrame(df_rune)#,index=df_rune[:,0])
print(df_rune)

# separate training and test data (70, 30)
#X, y = df_rune.iloc[:, 1:].values, df_rune.iloc[:, 0].values
X, y = df_rune[:, 1:], df_rune[:, 0]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    stratify=y,
                                                    random_state=0)

# standardize the features
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

cov_mat = np.cov(X_train_std.T)
eigen_vals, eigen_vecs = np.linalg.eig(cov_mat)
#print('\nEigenvalues {}'.format(eigen_vals))

# with cumsum we can calculate the cumulative sum of expained variances
tot = sum(eigen_vals)
var_exp = [(i / tot) for i in sorted(eigen_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

# Make a list of (eigenvalue, eigenvector) tuples
eigen_pairs = [(np.abs(eigen_vals[i]), eigen_vecs[:, i])
               for i in range(len(eigen_vals))]
# eigen_pairs
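A common next step (not shown in this excerpt) is to build a projection matrix from the two leading eigenvectors and map the standardized data into that 2-D subspace; a sketch using the names defined above:

eigen_pairs.sort(key=lambda k: k[0], reverse=True)
w = np.hstack((eigen_pairs[0][1][:, np.newaxis],
               eigen_pairs[1][1][:, np.newaxis]))   # d x 2 projection matrix
X_train_pca = X_train_std.dot(w)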
Example #37
forest_clf.fit(X_train, y_train)
forest_clf.predict([some_digit])

# In[61]:

forest_clf.predict_proba([some_digit])

# In[62]:

cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

# In[63]:

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

# In[64]:

y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx

# In[65]:


def plot_confusion_matrix(matrix):
    """If you prefer color and a colorbar"""
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
Example #38
import numpy as np
import matplotlib.pyplot as plt

dataset = pd.read_csv("Social_Network_Ads.csv")
features = dataset.iloc[:, [2, 3]].values
labels = dataset.iloc[:, 4].values

#SPLITTING
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.25, random_state=0)

#FEATURE SCALING
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
features_train = sc.fit_transform(features_train)
features_test = sc.transform(features_test)

#FITTING LOGISTIC REGRESSION INTO TRAINING SET
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(features_train, labels_train)

#PREDICTING THE RESULTS
labels_pred = classifier.predict(features_test)

#MAKING THE CONFUSION MATRIX
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(labels_test, labels_pred)

#VISUALISING TRAINING DATA SET
Example #39
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

numeros = skdata.load_digits()
target = numeros['target']
imagenes = numeros['images']
n_imagenes = len(target)
data = imagenes.reshape((n_imagenes, -1)) 

x_train, x_test, y_train, y_test = train_test_split(data, target, train_size=0.5)
# everything other than class 1 is relabeled as 0
y_train[y_train!=1]=0
y_test[y_test!=1]=0

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


cov = np.cov(x_train.T)
valores, vectores = np.linalg.eig(cov)

# the eigenvalues/eigenvectors can come out complex due to limited numerical precision, so cast them to real
valores = np.real(valores)
vectores = np.real(vectores)

# reorder from largest to smallest eigenvalue
ii = np.argsort(-valores)
valores = valores[ii]
vectores = vectores[:,ii]
Example #40
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting classifier to the Training set
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)
# Create your classifier here

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
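The matrix construction itself is cut off here; following the identical earlier snippets, the next line would presumably be:

cm = confusion_matrix(y_test, y_pred)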
# Dataset for decision function visualization: we only keep the first two
# features in X and sub-sample the dataset to keep only 2 classes and
# make it a binary classification problem.

X_2d = X[:, :2]
X_2d = X_2d[y > 0]
y_2d = y[y > 0]
y_2d -= 1

# It is usually a good idea to scale the data for SVM training.
# We are cheating a bit in this example in scaling all of the data,
# instead of fitting the transformation on the training set and
# just applying it on the test set.

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_2d = scaler.fit_transform(X_2d)

# #############################################################################
# Train classifiers
#
# For an initial search, a logarithmic grid with basis
# 10 is often helpful. Using a basis of 2, a finer
# tuning can be achieved but at a much higher cost.

C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X, y)
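A likely follow-up, mirroring the scikit-learn RBF-parameters example this appears to be adapted from, is to report the best setting found:

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))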
Example #42
def main():
    if os.path.exists(__TRAINED_DATA_SET):
        df = pd.read_csv(__TRAINED_DATA_SET)
    else:
        df = train()

    X = df.iloc[:, 1:].values
    y = df.iloc[:, 0].values

    # Encoding the Dependent Variable
    labelencoder_y = LabelEncoder()
    y = labelencoder_y.fit_transform(y)

    # Splitting the dataset into the Training set and Test set
    x_train, x_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=0)

    # Feature Scaling
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)

    lda = LDA(n_components=None)
    x_train = lda.fit_transform(x_train, y_train)
    x_test = lda.transform(x_test)
    explained_variance = lda.explained_variance_ratio_

    # Fitting Logistic Regression to the Training set
    classifier = LogisticRegression(random_state=0)
    classifier.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))

    # Fitting K-NN to the Training set
    classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    classifier.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))

    # Fitting SVM to the Training set
    classifier = SVC(kernel='linear', random_state=0)
    classifier.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))

    # Fitting Kernel SVM to the Training set
    classifier = SVC(kernel='rbf', random_state=0)
    classifier.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))

    # Fitting Naive Bayes to the Training set
    classifier = GaussianNB()
    classifier.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))

    # Fitting Decision Tree Classification to the Training set
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))

    # Fitting Random Forest Classification to the Training set
    classifier = RandomForestClassifier(n_estimators=10,
                                        criterion='entropy',
                                        random_state=0)
    classifier.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))

    parameters = [{
        'C': [1, 10, 100, 1000],
        'kernel': ['linear']
    }, {
        'C': [1, 10, 100, 1000],
        'kernel': ['rbf'],
        'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    }]
    grid_search = GridSearchCV(estimator=classifier,
                               param_grid=parameters,
                               scoring='accuracy',
                               cv=10,
                               n_jobs=-1)
    grid_search = grid_search.fit(x_train, y_train)
    best_accuracy = grid_search.best_score_
    best_parameters = grid_search.best_params_

    # Fitting Kernel SVM to the Training set
    classifier = SVC(kernel='rbf', random_state=0)
    classifier.fit(x_train, y_train)

    # Predicting the Test set results
    y_pred = classifier.predict(x_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / sum(sum(cm))
plt.style.use('seaborn-deep')
import matplotlib.cm
cmap = matplotlib.cm.get_cmap('plasma')

# Reading in data
ds = pd.read_csv("Social_Network_Ads.csv")
X = ds.iloc[:, [2,3]].values
y = ds.iloc[:,4].values

# Splitting and scaling 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X,y)
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# PCA
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=2, kernel="rbf")
X_train = kpca.fit_transform(X_train)
X_test = kpca.transform(X_test)

# Fitting logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
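`confusion_matrix` is imported above but never called in the visible excerpt; a plausible next step:

cm = confusion_matrix(y_test, y_pred)
print(cm)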
def transform(feature):
    df[feature] = le.fit_transform(df[feature])
    #print(df[feature])
    #print(le.classes_)


cat_df = df.select_dtypes(include='object')

for col in cat_df.columns:
    transform(col)

#for col in df.columns:
#print(col)

scaler = StandardScaler()
scaled_df = scaler.fit_transform(df.drop('Attrition', axis=1))
X = scaled_df
Y = df['Attrition'].values
Y = to_categorical(Y)
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=42)
#print('x_test=',x_test.shape)

np.random.seed(31)
rn.seed(31)
tf.set_random_seed(31)

model = Sequential()
model.add(Dense(input_dim=13, units=50, activation='relu'))
    time_diff = np.round(time_diff, 4)
    all_seq[u]['time_diff'] = time_diff.tolist()

    for c in df.columns[2:]:
        all_seq[u][c] = df[df['actor_id'] == u][c].values.tolist()

### Store to JSON
with open('make_sequence__observ_{}__labeled_{}_{}_{}_{}.json'.format(observ_daterange, label_daterange, try_date, version, desc), 'w') as fp:
    json.dump(all_seq, fp)


### Restore `clust_index`, `time_diff` to CSV
clust_collect = []
diff_collect = []
for u in tqdm(uid):
    for i in range(0, len(all_seq[u]['clust_index'])):
        clust_collect.append(all_seq[u]['clust_index'][i])
        diff_collect.append(all_seq[u]['time_diff'][i])
df = pd.concat([df, pd.DataFrame({'clust_index': clust_collect, 'time_diff': diff_collect})], axis=1)


### time_diff scaling
df['time_diff_scaled'] = sd.fit_transform(df['time_diff'].values.reshape(-1, 1))
for u in tqdm(uid):
    all_seq[u]['time_diff_scaled'] = df[df['actor_id'] == u]['time_diff_scaled'].values.tolist()

with open('make_sequence__observ_{}__labeled_{}_{}_{}_{}.json'.format(observ_daterange, label_daterange, try_date, version, desc), 'w') as fp:
    json.dump(all_seq, fp)

df.to_csv('featureGeneration__observ_{}__labeled_{}_{}_{}_{}.csv'.format(observ_daterange, label_daterange, try_date, version, desc), index=False)
예제 #46
0
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load data
print('reading train data...')
client_aggs = pd.read_csv('../input/groupby_client_aggs.csv')

ids = client_aggs['ClientId']
client_aggs['TotalUnits'] = np.log1p(client_aggs['TotalUnits'])
client_aggs['CostPerUnit'] = client_aggs['TotalPesos'] / client_aggs['TotalUnits']
client_aggs.drop(['TotalPesos', 'ClientId'], axis=1, inplace=True)
client_aggs.fillna(0, inplace=True)

scaler = StandardScaler()
client_aggs = scaler.fit_transform(client_aggs)

print("KMeans...\n")
clf1000 = KMeans(n_clusters=1000,
                 n_init=10,
                 max_iter=300,
                 tol=0.0001,
                 verbose=0,
                 random_state=1,
                 copy_x=True,
                 n_jobs=-1)
clf250 = KMeans(n_clusters=250,
                n_init=10,
                max_iter=300,
                tol=0.0001,
                verbose=0,
예제 #47
0
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), [0])],
                       remainder="passthrough")
X = np.array(ct.fit_transform(X))
# print(X);
#Encoding Dependent Variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = np.array(le.fit_transform(y))
# print(y);

#Splitting Dataset into Training Set & Test Set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)
# print(X_train);
# print(X_test);
# print(y_train);
# print(y_test);

#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:, 3:] = sc.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])
print(X_train)
print(X_test)
colnames.append('effective')
df.columns = colnames

#dropping rows that are empty, have NaN in 'effective', or are duplicated:
df = df.drop([21,22,28,30,50,68,105,106,110,122])

#dealing with NaN
df.iloc[:,:-1] = df.iloc[:,:-1].apply(lambda x: pd.factorize(x)[0])

X=df[['start_treat','doxy','ilads','buhner','cowden','liposomal','other_herbs','vitaminD','supp','oil','sugar-free','gluten-free','dairy-free','bioresonance','antimicrobial','oxygen','cannabis_oil','binaural','tobacco','alcohol','coffee','marijuana','other_stim','num_antibiotics','method_antibiotics']].values
y=df['effective'].values


from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)


import keras
from keras.utils.np_utils import to_categorical
y_binary = to_categorical(y)


'''
model = DecisionTreeRegressor(max_depth=10)
cross_val_score(model, X, y, cv=3, scoring='neg_mean_absolute_error')
'''

model = RandomForestRegressor(max_depth=15, n_estimators=25, n_jobs=8)

model.fit(X,y_binary)
예제 #49
0
df.head() 


# Remap the original class labels: 3 -> 0, and swap 1 <-> 2 (3 is reused as a temporary placeholder)
dataset['Class']=dataset['Class'].replace(3,0)
dataset['Class']=dataset['Class'].replace(1,3)
dataset['Class']=dataset['Class'].replace(2,1)
dataset['Class']=dataset['Class'].replace(3,2)


target = dataset['Class']    

df=df.iloc[0:177,[1,12]]


sc=StandardScaler()
df=sc.fit_transform(df)

pca = PCA(n_components=2)
pca_x=pca.fit_transform(df)
pca_df = pd.DataFrame(data=pca_x,columns=['comp1','comp2'])


KModel = KMeans(n_clusters=3,random_state=2)
KModel.fit_predict(pca_df)
KModel.labels_


colormap = np.array(['Red','Blue','Green'])


z = plt.scatter(pca_df.comp1,pca_df.comp2,c = colormap[KModel.labels_])
예제 #50
0
"""Checking for Quasi-Constant Features"""

occ = x.loc[x.promotion_last_5years == 0, 'promotion_last_5years'].count()
number_of_occ_per = occ/x.shape[0] * 100
print(str(number_of_occ_per) + '%')

occ = x.loc[x.Work_accident == 0, 'Work_accident'].count()
number_of_occ_per = occ/x.shape[0] * 100
print(str(number_of_occ_per) + '%')
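# A hedged alternative sketch: sklearn's VarianceThreshold flags low-variance
# (quasi-constant) features systematically. The 0.01 threshold is an assumption,
# and x is assumed to contain only numeric columns at this point.
from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold=0.01)
selector.fit(x)
quasi_constant = [col for col, keep in zip(x.columns, selector.get_support()) if not keep]
print('Quasi-constant features:', quasi_constant)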

"""Standard Scaling all the features to come under a common range."""

from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x = sc_x.fit_transform(x)

x

y

"""Inference : <br>
The Data is Imbalanced. So, we must use ensemble learning methods and cross validation to avoid overfitting.

# 8. Splitting into Train and Test Sets
"""

y.shape

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25)
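# A hedged sketch of the note above (imbalanced data -> ensemble methods + cross-validation).
# RandomForestClassifier, the fold count and the F1 scoring are assumptions, not the
# notebook's final choices.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

rf = RandomForestClassifier(n_estimators=100, random_state=0)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(rf, x_train, y_train, cv=cv, scoring='f1')
print('Cross-validated F1: %.3f +/- %.3f' % (scores.mean(), scores.std()))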
예제 #51
0
from sklearn.preprocessing.imputation import Imputer

imr = Imputer(missing_values='NaN', strategy='mean', axis=1)
imr = imr.fit(x_train)
imputed_data = imr.transform(x_train.values)
print('trasf')
print(imputed_data[:200])

imr = Imputer(missing_values='NaN', strategy='mean', axis=1)
imr = imr.fit(x_test)
imputed_data2 = imr.transform(x_test.values)
print('trasf')
print(imputed_data2[:200])

std = StandardScaler()
x_train_std = std.fit_transform(imputed_data)
x_test_std = std.transform(imputed_data2)  # reuse the scaler fitted on the training data instead of refitting

print(x_train_std)

print(imputed_data.shape)

#  neural network
# neural network model
model1 = Sequential()

model1.add(layers.Dense(50, input_dim=9, activation='relu'))
model1.add(layers.Dense(40, activation='relu'))
model1.add(layers.Dense(30, activation='relu'))
model1.add(layers.Dense(25, activation='relu'))
model1.add(layers.Dense(2, activation='sigmoid'))
예제 #52
0
# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
y = dataset.iloc[:, 4].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# (no target scaling: y_train holds class labels, which the classifier below uses directly)

#Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

#predicting test set results
y_pred = classifier.predict(X_test)

#confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
예제 #53
0
File: keras_test.py  Project: grjd/keras
DATA_DIR = "data"
AIRQUALITY_FILE = os.path.join(DATA_DIR, "AirQualityUCI.csv")
aqdf = pd.read_csv(AIRQUALITY_FILE, sep=";", decimal=",", header=0)
# remove unneeded columns: Date, Time and the two trailing unnamed columns
del aqdf["Date"]
del aqdf["Time"]
del aqdf["Unnamed: 15"]
del aqdf["Unnamed: 16"]
# fill NaNs with the mean value
aqdf = aqdf.fillna(aqdf.mean())
Xorig = aqdf.values  # .as_matrix() was removed in newer pandas

# scale the data
scaler = StandardScaler()
Xscaled = scaler.fit_transform(Xorig)
# store the mean and std so predictions can later be mapped back to the original scale (see the helper sketched below)
Xmeans = scaler.mean_
Xstds = scaler.scale_
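# Sketch of how the stored statistics are meant to be used later: a scaled benzene
# prediction can be mapped back to the original units with the mean and std of
# column 3 (the target column).
def unscale_benzene(y_scaled):
    # inverse of the StandardScaler transform for the target column only
    return y_scaled * Xstds[3] + Xmeans[3]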
# the target variable is the fourth column
y= Xscaled[:,3]
# delete the target variable from the input (training data)
X = np.delete(Xscaled, 3, axis=1)
# split the data into 70% training and 30% testing
train_size = int(0.7*X.shape[0])
Xtrain, Xtest, ytrain, ytest = X[0:train_size], X[train_size:],y[0:train_size], y[train_size:] 
# define the network: a 2-layer dense network that takes the 12 features and outputs a scaled prediction;
# the hidden layer has 8 neurons, glorot_uniform initialization, mse loss and the adam optimizer
readings = Input(shape=(12,))
x = Dense(8, activation="relu", kernel_initializer="glorot_uniform")(readings)
benzene = Dense(1, kernel_initializer="glorot_uniform")(x)
예제 #54
0
plt.rcParams['axes.unicode_minus'] = False

if __name__ == '__main__':
    """
    2020-4-17指导:
     基于现场46个指标的新数据(铁次),基于PCA的分值看一下规律
    """
    FILE = '铁次结果_5h滞后处理v3.0_tc.xlsx'
    N_COMPONENTS = 21  # number of principal components; must be set beforehand (None if undecided)

    # Read in the data
    input_df = pd.read_excel(FILE, index_col=0, sheet_name='46')

    # Standardize
    scaler = StandardScaler()
    scaled_np = scaler.fit_transform(input_df)
    df_scaled = pd.DataFrame(scaled_np,
                             index=input_df.index,
                             columns=input_df.columns)

    # PCA
    n = N_COMPONENTS
    pca = PCA(n)
    pca.fit(scaled_np)
    pca.explained_variance_ratio_.cumsum()  # cumulative explained variance; aim for roughly 0.9-0.95 (see the sketch below)
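    # Sketch: instead of fixing N_COMPONENTS by hand, the component count needed to
    # reach 95% cumulative explained variance could be derived from the ratios
    # (only meaningful if the components kept actually reach that threshold).
    cum_ratio = pca.explained_variance_ratio_.cumsum()
    if cum_ratio[-1] >= 0.95:
        n_95 = int((cum_ratio >= 0.95).argmax()) + 1
        print('Components needed for 95% explained variance:', n_95)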
    df_pca = pca.transform(scaled_np)

    # Score plot
    score = df_pca.dot(pca.explained_variance_.reshape(
        n, 1)) / pca.explained_variance_.sum()
    max_range = score.shape[0]
예제 #55
0
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Encode categorical varaibles
label_encoder_X = LabelEncoder()
#X[:,1] = label_encoder_X.fit_transform(X[:,1])
label_encoder_y = LabelEncoder()
#y = label_encoder_y.fit_transform(y)

# Scale Data

st_sc = StandardScaler()

print(st_sc.fit(X))  # fit the scaler (printing shows its parameters)

X = st_sc.transform(X)

#print(y)

# Split X and y into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.15,
                                                    random_state=0)

# Create Model
model = Sequential()

#model.add(Dense(20, input_dim = 10, activation = 'relu'))
#model.add(Dense(80, activation = 'relu'))
#model.add(Dense(130, activation = 'relu'))
예제 #56
0
                                                            random_state=7,
                                                            test_size=0.25)
# Split into training / validation sets
x_train, x_valid, y_train, y_valid = train_test_split(x_train_all,
                                                      y_train_all,
                                                      random_state=11)

print(x_train.shape, y_train.shape)  # inspect the sample shapes
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)

# Feature scaling (standardization)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

############################# Multi-input wide & deep model ###########################################
# Build the model
# multiple inputs
input_wide = keras.layers.Input(shape=[5])
input_deep = keras.layers.Input(shape=[6])
hidden1 = keras.layers.Dense(30, activation='relu')(input_deep)
hidden2 = keras.layers.Dense(30, activation='relu')(hidden1)
concat = keras.layers.concatenate([input_wide, hidden2])  # concatenate the wide input with the deep branch output
output = keras.layers.Dense(1)(concat)  # functional API call
model = keras.models.Model(inputs=[input_wide, input_deep], outputs=[output])
# the input features must be split into wide and deep parts before calling fit (see the sketch below)
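# A hedged sketch of that split (assuming 8 input features, as in the usual
# California-housing wide & deep example: wide = first 5 columns, deep = last 6 columns).
# The compile settings and epoch count are assumptions, not fixed by the source.
model.compile(loss='mse', optimizer='adam')
x_train_wide, x_train_deep = x_train_scaled[:, :5], x_train_scaled[:, 2:]
x_valid_wide, x_valid_deep = x_valid_scaled[:, :5], x_valid_scaled[:, 2:]
x_test_wide, x_test_deep = x_test_scaled[:, :5], x_test_scaled[:, 2:]
history = model.fit([x_train_wide, x_train_deep], y_train,
                    validation_data=([x_valid_wide, x_valid_deep], y_valid),
                    epochs=100)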
previsores[:, 8] = labelEncoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = labelEncoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 13] = labelEncoder_previsores.fit_transform(previsores[:, 13])

#There is an inefficiency in this solution, because these encoded variables are nominal
#In this case I cannot say, for example, that one race is better than another

# note: OneHotEncoder(categorical_features=...) is from older sklearn; newer versions use ColumnTransformer for this
onehotencoder = OneHotEncoder(categorical_features=[1, 3, 5, 6, 7, 8, 9, 13])
previsores = onehotencoder.fit_transform(previsores).toarray()

labelEncoder_classe = LabelEncoder()

classe = labelEncoder_classe.fit_transform(classe)

standardScaler = StandardScaler()
previsores = standardScaler.fit_transform(previsores)

###########################CREATING THE TEST SET###############################

from sklearn.model_selection import train_test_split
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.15, random_state=0)

from sklearn.linear_model import LogisticRegression
classificador = LogisticRegression()
classificador.fit(previsores_treinamento, classe_treinamento)

previsoes = classificador.predict(previsores_teste)

from sklearn.metrics import confusion_matrix, accuracy_score
예제 #58
0
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()

loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

data = genData(1, 50, 50, 1200, 1280, 1320, 0.004, .001)

dipData = data[0]
qFactorClassification = data[1]

x_train, x_test, y_train, y_test = train_test_split(dipData, qFactorClassification, test_size = 0.01, random_state = 0)

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

start = time.time()
qFactorPrediction = loaded_model.predict(x_test)
end = time.time()

print('Prediction time: {:.4f} s'.format(end - start))

plt.plot(y_test, color = 'red', label = 'Theoretical Q-Factor', marker = '.')
plt.plot(qFactorPrediction, color = 'blue', label = 'Predicted Q-Factor', marker = '.')
plt.title('Model Prediction')
plt.legend()
plt.show()
예제 #59
0
lin_reg2.fit(X_poly, Y)

# the p-values will be examined
print("Polynomial Reg OLS:")
model2 = sm.OLS(lin_reg2.predict(poly_reg.fit_transform(X)),X)
print(model2.fit().summary())
print("Polynomial R-square value:")
print(r2_score(Y , lin_reg2.predict(poly_reg.fit_transform(X))))



# SVR 
from sklearn.preprocessing import StandardScaler
sc1 = StandardScaler()
sc2 = StandardScaler()
x_olcekli = sc1.fit_transform(X)
y_olcekli = sc2.fit_transform(Y)

from sklearn.svm import SVR
svr_reg = SVR(kernel = 'rbf')
svr_reg.fit(x_olcekli, y_olcekli)
print("SVR OLS:")
model3 = sm.OLS(svr_reg.predict(x_olcekli),x_olcekli)
print(model3.fit().summary())
print("SVR R-square value:")
print(r2_score(Y , svr_reg.predict(x_olcekli)))



# Decision Tree
from sklearn.tree import DecisionTreeRegressor
예제 #60
0
def main(model_name, params):

    ## Read data
    print("Reading features...")
    train_df = pd.read_csv(features_directory + model_name + "_features.csv",
                           delimiter=',')
    ids = train_df.id.tolist()
    train_df = train_df.drop(["id"], axis=1)

    # Sanity checks: check if ids are from 0 to N
    if [x for x in range(len(ids))] != ids:
        print("ERROR: Indices are not ordered!")
        sys.exit()

    ## Normalize features
    print("\nNormalization...")
    scaler = StandardScaler()
    train_array = scaler.fit_transform(train_df)

    ## Make train data
    print("\nMaking datasets...")
    triplet_file = "../datasets/train_triplets.txt"

    ## Make train and validation triplets making sure there are no common images in the train and validation triplets
    triplets_train, triplets_validation, triplets_test = make_train_validation_test_triplets_list(
        triplet_file)
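    # Illustration only (assumption): one way to keep the image sets disjoint is to
    # split the image ids first and keep only triplets whose images all fall in the
    # same split; the project's make_train_validation_test_triplets_list (defined
    # elsewhere) may do this differently.
    def split_triplets_disjoint(triplets, train_frac=0.8):
        images = sorted({img for t in triplets for img in t})
        train_images = set(images[:int(train_frac * len(images))])
        train_t = [t for t in triplets if all(img in train_images for img in t)]
        valid_t = [t for t in triplets if all(img not in train_images for img in t)]
        return train_t, valid_t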

    data_train_1 = make_triplets(train_array, triplets_train)
    data_validation_1 = make_triplets(train_array, triplets_validation)
    data_test_1 = make_triplets(train_array, triplets_test)

    data_train_0 = make_0_triplets(data_train_1)
    data_validation_0 = make_0_triplets(data_validation_1)
    data_test_0 = make_0_triplets(data_test_1)

    data_train_1_in, data_train_1_out, data_train_0_out, data_train_0_in = train_test_split(
        data_train_1, data_train_0, train_size=0.5)
    data_validation_1_in, data_validation_1_out, data_validation_0_out, data_validation_0_in = train_test_split(
        data_validation_1, data_validation_0, train_size=0.5)
    data_test_1_in, data_test_1_out, data_test_0_out, data_test_0_in = train_test_split(
        data_test_1, data_test_0, train_size=0.5)

    n1 = len(data_train_1_in)
    n0 = len(data_train_0_in)
    X_train = np.concatenate((data_train_1_in, data_train_0_in), axis=0)
    y_train = np.array(n1 * [1.] + n0 * [0.])
    y_2D_train = np.array(list(map(list, zip(y_train, not_(y_train)))))

    n1 = len(data_validation_1_in)
    n0 = len(data_validation_0_in)
    X_validation = np.concatenate((data_validation_1_in, data_validation_0_in),
                                  axis=0)
    y_validation = np.array(n1 * [1.] + n0 * [0.])
    y_2D_validation = np.array(
        list(map(list, zip(y_validation, not_(y_validation)))))

    n1 = len(data_test_1_in)
    n0 = len(data_test_0_in)
    X_test = np.concatenate((data_test_1_in, data_test_0_in), axis=0)
    y_test = np.array(n1 * [1.] + n0 * [0.])
    y_2D_test = np.array(list(map(list, zip(y_test, not_(y_test)))))

    ## Shuffle the datasets
    X_train, y_2D_train = shuffle(X_train, y_2D_train)
    X_validation, y_2D_validation = shuffle(X_validation, y_2D_validation)

    ## Make model
    model = create_model(
        np.shape(X_train)[2], params["n_units"], params["dropout"])

    print("Model summary:")
    print(model.summary())

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['acc'],
    )

    print("\nFitting...")
    history = model.fit(
        (X_train[:, 0], X_train[:, 1], X_train[:, 2]),
        y_2D_train,
        validation_data=((X_validation[:, 0], X_validation[:, 1],
                          X_validation[:, 2]), y_2D_validation),
        epochs=params["n_epochs"],
        batch_size=params["batch_size"])

    ## Prediction on the test dataset
    print("\nPredictions for the test sample...")
    y_pred_proba = model.predict((X_test[:, 0], X_test[:, 1], X_test[:, 2]))
    auc = roc_auc_score(y_2D_test[:, 0], y_pred_proba[:, 0])
    print("ROC AUC: %.2f" % auc)

    best_cut = 0.5
    y_pred = y_pred_proba[:, 0] >= best_cut

    print("Accuracy: %.3f" % (accuracy_score(y_2D_test[:, 0], y_pred)))

    ## Control plots
    # Loss
    #for variable in ("loss", "acc"):
    #    plt.figure()
    #    plot_var(variable, history)
    #    plt.savefig(variable + ".pdf")
    #    plt.close()

    ## Load test dataset
    print("\nPredictions for the test dataset...")
    triplet_file = "../datasets/test_triplets.txt"
    X_test = make_triplets_from_file(train_array, triplet_file)
    y_pred_test_proba = model.predict((X_test[:, 0], X_test[:, 1], X_test[:, 2]))
    y_pred_test = y_pred_test_proba[:, 0] >= best_cut

    np.savetxt("submit.txt", y_pred_test, fmt="%d")

    return