Пример #1
0
def run_lasso_on_input(df, target):
    """Fit a LassoLarsCV on a partial sample, then a RandomizedLasso on all rows.

    Steps, in order:
      1. Draw a sample of ``int(0.7 * df.shape[0])`` rows via
         ``sample_data_frame_return_x_y_column_name`` and scale it with
         ``scale_input_data``.
      2. Fit ``LassoLarsCV(cv=10)`` on that sample.
      3. Draw a sample of ``df.shape[0]`` rows the same way, scale it, and fit
         ``RandomizedLasso`` using the ``alphas_`` found in step 2.

    NOTE(review): the meaning of the literal ``True`` passed to the sampling
    helper, and of the second value returned by ``scale_input_data`` (ignored
    here), is not visible in this block -- confirm against those helpers.

    Args:
        df: object exposing ``.shape``; passed straight to the sampling helper
            (presumably a pandas DataFrame -- confirm against caller).
        target: forwarded unchanged to the sampling helper (presumably the
            name of the target column -- confirm).

    Returns:
        A ``(clf, column_list_for_sampled)`` tuple: the fitted RandomizedLasso
        and the third value returned by the full-size sampling call.
    """
    def announce(message):
        # Three-line progress banner. The parenthesised single-argument print
        # emits the same text under Python 2 and Python 3.
        rule = "#######################################"
        print(rule)
        print(message)
        print(rule)

    # Partial sample used only to choose the regularisation path.
    X_part, y_part, _ = sample_data_frame_return_x_y_column_name(
        df, True, target, int(0.7 * df.shape[0]))
    X_part, _ = scale_input_data(X_part)

    announce("Starting LARS CV")
    lars_cv = LassoLarsCV(cv=10).fit(X_part, y_part)
    announce("Done with LARS CV")

    # Full-size sample for the final randomized lasso fit.
    X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(
        df, True, target, df.shape[0])
    X, _ = scale_input_data(X)

    announce("Starting main lasso")
    clf = RandomizedLasso(alpha=lars_cv.alphas_,
                          random_state=12,
                          n_resampling=400,
                          normalize=True).fit(X, y)
    announce("Done with main lasso")

    return clf, column_list_for_sampled
Пример #2
0
def perform_orthogonal_variable_selection(X, y, column_list_for_sampled,
                                          non_linear, feature_dictionary):
    """Record, per feature, how the CV score changes once it is projected out.

    ``X`` is first scaled with ``scale_input_data``. A ``LassoCV(n_alphas=3,
    cv=3)`` is scored on the scaled data with 3-fold ``cross_val_score`` and
    the mean of those scores is the baseline. Then, for every column ``i``:

      * every other column ``j`` is replaced by
        ``get_orthogonal_vector(column_i, column_j)``, or, when
        ``non_linear > 0``, by ``orthogonal_vector_to_subspace(A, column_j)``
        with ``A = return_non_linear_transformation(column_i, poly=3,
        log=True, ...)``;
      * column ``i`` itself is left out of the transformed matrix;
      * the same estimator is cross-validated on the transformed matrix and
        ``baseline - mean(score)`` is appended to
        ``feature_dictionary[column_list_for_sampled[i]]``.

    NOTE(review): the helpers presumably return the component of ``column_j``
    orthogonal to ``column_i`` (or to the span of ``A``); that is inferred
    from their names only -- confirm against their definitions.

    Args:
        X: 2-D array; unpacked via ``X.shape`` and indexed as ``X[:, i]``
            after scaling.
        y: target values forwarded to ``cross_val_score``.
        column_list_for_sampled: feature labels, indexed by column position.
        non_linear: the non-linear branch is taken when ``non_linear > 0``.
        feature_dictionary: mapping from feature label to an object with
            ``.append``; every label must already be present (or the mapping
            must create entries on access). It is mutated in place.

    Returns:
        The same ``feature_dictionary`` object that was passed in.
    """
    X, _ = scale_input_data(X)

    # Baseline: the estimator scored on the untouched (scaled) design matrix.
    print("defining lasso model ")
    clf_baseline = LassoCV(n_alphas=3, cv=3)

    print("fitting lasso model with cross validation library")
    base_line_lasso = cross_validation.cross_val_score(clf_baseline,
                                                       X,
                                                       y,
                                                       cv=3,
                                                       n_jobs=1)

    print("done fitting lasso library, now computing baseline performance")
    baseline_performance = np.mean(base_line_lasso)
    print(baseline_performance)
    print("baseline performance printed above")

    rows, columns = X.shape

    print("starting iterative orthogonal transformation for each feature")

    for i in range(columns):
        feature_name_current_i = column_list_for_sampled[i]
        # %-formatting with a 1-tuple also copes with non-string labels
        # (ints, tuples), where plain concatenation raised TypeError.
        print("Currently on ----->> %s" % (feature_name_current_i,))
        current_column = X[:, i]

        use_subspace = non_linear > 0
        if use_subspace:
            A = return_non_linear_transformation(current_column,
                                                 poly=3,
                                                 log=True,
                                                 square_root=False,
                                                 exponential=False,
                                                 sin=False,
                                                 cos=False)

        # Column i is excluded from the analysis, so build the
        # (rows, columns - 1) matrix directly rather than zero-filling a
        # placeholder column and deleting it afterwards.
        transformed_X = np.zeros((rows, columns - 1))
        remaining = (j for j in range(columns) if j != i)
        for out_col, j in enumerate(remaining):
            if use_subspace:
                transformed_X[:, out_col] = orthogonal_vector_to_subspace(
                    A, X[:, j])
            else:
                transformed_X[:, out_col] = get_orthogonal_vector(
                    current_column, X[:, j])

        # Re-score the same estimator on the transformed features.
        lasso_scores_i = cross_validation.cross_val_score(clf_baseline,
                                                          transformed_X,
                                                          y,
                                                          cv=3,
                                                          n_jobs=1)
        current_feature_score = np.mean(lasso_scores_i)

        # Positive value: the score dropped once feature i was projected out.
        change_in_score = baseline_performance - current_feature_score
        feature_dictionary[feature_name_current_i].append(change_in_score)

    return feature_dictionary
Пример #3
0
def perform_orthogonal_variable_selection(X, y, column_list_for_sampled, non_linear, feature_dictionary):
    """Measure each feature's effect on the lasso CV score by projecting it out.

    The input ``X`` is scaled with ``scale_input_data``; a
    ``LassoCV(n_alphas=3, cv=3)`` is evaluated with 3-fold
    ``cross_val_score`` and the mean score is kept as the baseline. For each
    column index ``i`` a new matrix is built from all *other* columns, each
    passed through ``get_orthogonal_vector(column_i, column_j)`` -- or, if
    ``non_linear > 0``, through ``orthogonal_vector_to_subspace(A, column_j)``
    where ``A`` is ``return_non_linear_transformation(column_i, poly=3,
    log=True, ...)``. The estimator is cross-validated again on that matrix
    and ``baseline - mean(score)`` is appended to the entry of
    ``feature_dictionary`` keyed by the feature's label.

    NOTE(review): what the orthogonalisation helpers compute exactly is not
    visible here; their names suggest removal of the component along
    ``column_i`` (or along the columns of ``A``) -- confirm.

    Args:
        X: 2-D array; must support ``.shape`` unpacking into two values and
            ``X[:, i]`` indexing after scaling.
        y: target values handed to ``cross_val_score``.
        column_list_for_sampled: labels for the columns of ``X``, by position.
        non_linear: selects the non-linear branch when greater than zero.
        feature_dictionary: mapping label -> appendable container; each label
            must be resolvable by ``feature_dictionary[label]``. Mutated in
            place.

    Returns:
        ``feature_dictionary`` (the object passed in), with one value
        appended per feature.
    """
    X, _ = scale_input_data(X)

    # Baseline model on the original (scaled) features.
    print("defining lasso model ")
    clf_baseline = LassoCV(n_alphas=3, cv=3)

    print("fitting lasso model with cross validation library")
    base_line_lasso = cross_validation.cross_val_score(clf_baseline, X, y, cv=3, n_jobs=1)

    print("done fitting lasso library, now computing baseline performance")
    baseline_performance = np.mean(base_line_lasso)
    print(baseline_performance)
    print("baseline performance printed above")

    rows, columns = X.shape

    print("starting iterative orthogonal transformation for each feature")

    for i in range(columns):
        feature_name_current_i = column_list_for_sampled[i]
        # Formatting via a 1-tuple keeps the output unchanged for string
        # labels and avoids the TypeError concatenation raised for others.
        print("Currently on ----->> %s" % (feature_name_current_i,))
        current_column = X[:, i]

        use_subspace = non_linear > 0
        if use_subspace:
            A = return_non_linear_transformation(current_column, poly=3, log=True, square_root=False,
                                                 exponential=False, sin=False, cos=False)

        # Column i is dropped from the analysis: fill a (rows, columns - 1)
        # matrix directly instead of writing a zero column and deleting it.
        transformed_X = np.zeros((rows, columns - 1))
        out_col = 0
        for j in range(columns):
            if j == i:
                continue
            if use_subspace:
                transformed_X[:, out_col] = orthogonal_vector_to_subspace(A, X[:, j])
            else:
                transformed_X[:, out_col] = get_orthogonal_vector(current_column, X[:, j])
            out_col += 1

        # Score the same estimator on the transformed data set.
        lasso_scores_i = cross_validation.cross_val_score(clf_baseline, transformed_X, y, cv=3, n_jobs=1)
        current_feature_score = np.mean(lasso_scores_i)

        # Positive value means the score fell once feature i was projected out.
        change_in_score = baseline_performance - current_feature_score
        feature_dictionary[feature_name_current_i].append(change_in_score)

    return feature_dictionary