def SelectFwe_selector(data, target, sf):
    """Reduce ``data`` to the columns retained by a ``SelectFwe`` selector.

    The selector is built with ``sf`` as its score function and fitted on the
    raw values of ``data`` against the flattened values of ``target``.

    :param data: DataFrame of candidate features; its ``.values`` and
        ``.columns`` are read.
    :param target: pandas object holding the target; ``.values.ravel()`` is
        what gets passed to the selector.
    :param sf: Score function forwarded to ``SelectFwe`` as ``score_func``.
    :return: A new DataFrame holding the transformed data, with columns
        labelled by the names of the retained features.
    """
    fwe = SelectFwe(score_func=sf)
    reduced = fwe.fit_transform(data.values, target.values.ravel())

    # get_support(True) is used here as integer positions into the column
    # labels, so the surviving columns keep their original names.
    kept_positions = fwe.get_support(True)
    column_names = data.columns.values
    kept_names = [column_names[position] for position in kept_positions]

    return pd.DataFrame(reduced, columns=kept_names)
# Compare five scikit-learn feature selectors on the same regression data set.
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFdr
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectFwe
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression

dataset = pd.read_csv('regressionDataSet.csv')
# Column 0 is used as the target, every remaining column as a feature.
x = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values

# The univariate selectors default to f_classif, which scikit-learn documents
# as classification-only. The target here is fed to LinearRegression below,
# so score the features with f_regression instead.
# NOTE(review): assumes column 0 really is a continuous target - confirm.

# feature selector 1: keep the 5 best-scoring features
fs1 = SelectKBest(score_func=f_regression, k=5)
x_new1 = fs1.fit_transform(x, y)

# feature selector 2: false discovery rate control
fs2 = SelectFdr(score_func=f_regression)
x_new2 = fs2.fit_transform(x, y)

# feature selector 3: recursive feature elimination down to 5 features
estimator = LinearRegression()
# n_features_to_select must be passed by keyword in current scikit-learn.
fs3 = RFE(estimator, n_features_to_select=5)
x_new3 = fs3.fit_transform(x, y)

# feature selector 4: selection driven by the fitted model
fs4 = SelectFromModel(estimator)
x_new4 = fs4.fit_transform(x, y)

# feature selector 5: family-wise error rate control
fs5 = SelectFwe(score_func=f_regression)
x_new5 = fs5.fit_transform(x, y)
# Split the data, apply two univariate feature-selection passes fitted on the
# training portion only, then fit a linear SVM on the reduced training set.

# splitting training and test set (20% held out, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=0)

# Chi-Squared Analysis: keep the top 80% of features by chi2 score.
# The selector is fitted on the training data and re-used on the test data.
sel = SelectPercentile(chi2, percentile=80)
sel.fit(x_train, y_train)
x_train = sel.transform(x_train)
x_test = sel.transform(x_test)

# Univariate Feature Selection
# NOTE(review): alpha is documented as a p-value threshold; 150.0 is far
# outside the 0..1 range and may be rejected by recent scikit-learn
# releases - confirm the intended value.
fs = SelectFwe(alpha=150.0)
x_train = fs.fit_transform(x_train, y_train)
x_test = fs.transform(x_test)

# Classifier Fitting
# loss='l1' and class_weight='auto' are no longer accepted by LinearSVC;
# 'hinge' is the renamed loss and 'balanced' the replacement weighting.
# NOTE(review): 'balanced' scales the class weights differently from the old
# 'auto' heuristic, so C=10 may need re-validation.
clf = svm.LinearSVC(C=10, penalty='l2', loss='hinge', dual=True,
                    fit_intercept=False, class_weight='balanced')
clf.fit(x_train, y_train)

###############################################
# Printed Data Analysis
###############################################
def preprocess_dataset(X, y, features, exploration_results, fs_example=False):
    """
    Preprocess the data according to earlier performed exploration results
    with found issues. These issues are based on:
        - feature types,
        - feature dimensionality,
        - missing values,
        - output imbalance,
        - irrelevant features,
        - normalisation,
        - multicollinearity

    Since feature selection can be very dataset specific, it can also be
    removed from the preprocessing list.

    The steps run in this order: missing-value handling, removal of
    irrelevant features, normalisation, one-hot encoding of categorical
    features and, optionally, feature selection.

    :param X: A numpy matrix of the data. First axis corresponding to
        instances, second axis corresponding to features
    :param y: A numpy array of the output. The length of the array should
        correspond to the size of the first axis of X
    :param features: A numpy array of the feature names. The length of the
        array should correspond to the size of the second axis of X
    :param exploration_results: A dict with the results of the earlier
        exploration, corresponding to the aforementioned issues. The keys
        read here are 'mv', 'cca', 'aca', 'irrelevance',
        'irrelevant_features', 'norm_means', 'norm_stdev', 'stand', 'cat',
        'fs' and 'mc'
    :param fs_example: Whether also an example of feature selection should be
        done. Default: False
    :return: The preprocessed X, y and features
    """
    # Test the input to be according to the standards
    robustness_methods.check_input_arrays(X, y, features)

    # First change data for missing values
    if exploration_results['mv']:
        print("\nStarting missing value handling...")
        old_features = np.copy(features)
        if exploration_results['cca']:
            X, y = LDM.cca(X, y, missing_values='')
        elif exploration_results['aca']:
            X, features = LDM.aca(X, features, missing_values='')
        else:
            # Drop features above the removal fraction, then impute the rest.
            X, features = LDM.aca(X, features, missing_values='',
                                  removal_fraction=0.15)
            X = impute.mean_imputation(X, missing_values='')
        removed_features = _return_removed_features(features, old_features)
        print(
            "These features are removed due to having too many missing values: %s"
            % removed_features)

    if exploration_results['irrelevance'] > 0:
        print("\nRemoving irrelevant features...")
        # Remove irrelevant
        irr_feat_loc = exploration_results['irrelevant_features']
        old_features = np.copy(features)
        X = np.delete(X, irr_feat_loc, axis=1)
        features = np.delete(features, irr_feat_loc)
        removed_features = _return_removed_features(features, old_features)
        print("These features are removed due to having no information: %s" %
              removed_features)

    if exploration_results['norm_means'] or exploration_results['norm_stdev']:
        print("\nNormalising numeric features...")
        # Normalise or standardise values
        # NOTE(review): the return value is discarded, so this presumably
        # modifies X in place - confirm against the NS implementation.
        NS.normalise_numeric_features(X, exploration_results['stand'],
                                      exploration_results['norm_means'],
                                      exploration_results['norm_stdev'])

    # Then change categorical to numeric values
    if exploration_results['cat']:
        print("\nHot encoding categorical values...")
        X, features = HE.hot_encode_categorical_features(X, features)

    if exploration_results['fs'] and fs_example:
        print("\nDoing an example of feature selection...")

        # Feature selection if multicollinearity
        if exploration_results['mc']:
            # Remove multicollinearity
            feature_selector = WM.ForwardSelector(threshold=0.0001)

            # Order to have more relevant features first; the names are
            # reordered by descending score alongside X.
            feature_orderer = OM.FeatureOrderer(f_classif)
            X = feature_orderer.fit_transform(X, y)
            features = features[np.argsort(-feature_orderer.scores_)]
        else:
            feature_selector = SF(f_classif, alpha=0.05)

        # Transform data to feature_selection
        X = feature_selector.fit_transform(X, y)
        old_features = np.copy(features)
        features = features[feature_selector.get_support()]

        # Remove extra features as only 200 are needed.
        if features.shape[0] > 200:
            print(
                "Extra feature selection is done to reduce the number of features to 200..."
            )
            extra_feature_selector = SelectKBest(f_classif, k=200)
            X = extra_feature_selector.fit_transform(X, y)
            # The mask must come from the selector that was just fitted on
            # the already-reduced X; the first selector's mask refers to the
            # feature set before reduction.
            features = features[extra_feature_selector.get_support()]

        removed_features = _return_removed_features(features, old_features)
        print("These features are removed due to feature selection: %s" %
              removed_features)

    return X, y, features
# Compare a LinearSVC trained on all features with the same classifier
# refitted after SelectFwe feature selection. Uses Python 2 print statements.

# The selector is only configured here; it is fitted further down, after the
# baseline run.
# NOTE(review): alpha is documented as a p-value threshold; 700.0 is far
# outside the 0..1 range - confirm the intended value.
fs = SelectFwe(alpha=700.0)
print "Before", x_train.shape

# Baseline: fit on the unreduced training matrix and report on train and test.
clf = svm.LinearSVC(C=100, penalty="l2", dual=False)
clf.fit(x_train, y_train)
print "NO FEATURE SELECTION"
print "Training Accuracy"
# Raw decision-function values for the training samples.
print clf.decision_function(x_train)
print (classification_report(y_train, clf.predict(x_train), target_names=target_names))
print "Testing Accuracy"
print (classification_report(y_test, clf.predict(x_test), target_names=target_names))

# Reduce the training matrix with the selector and refit the same classifier.
# NOTE(review): x_test is not passed through fs.transform in the lines
# visible here - confirm that happens before any later prediction on it.
x_train = fs.fit_transform(x_train, y_train)
print "After", x_train.shape
clf.fit(x_train, y_train)

# The triple-quoted text below looks like disabled plotting code; its closing
# quotes are outside this excerpt, so it is left exactly as found.
""" w = clf.coef_ print w a = np.array(w[0].todense(), dtype=np.float) b = np.array(w[1].todense(), dtype=np.float) c = -100*a/b print a, b, c xx = np.linspace(-5, 5) yy = c * xx - clf.intercept_[0] / b
def run(graph=True):
    """Train the tweet classifier, classify the test data and export results.

    Reads the module-level names ``filename`` (labelled training data) and
    ``testdata`` (unlabelled rows to classify). The features go through a
    chi2 percentile filter and a SelectFwe filter, both fitted on the
    training data, before a linear SVM is fitted and applied to the test set.

    Rows predicted as label 0 are written to ``<date>_self.csv`` and rows
    predicted as label 1 to ``<date>_another_person.csv`` inside the
    ``classifications/`` directory. Other labels are not exported.

    :param graph: When true (the default, matching the previous hard-coded
        behaviour) the scatter, 3D and 2D plots are produced as well.
    """
    tweets_and_labels = parse_labeled_data(filename)

    # splitting training and test set
    y_train, x_test, x_train = get_x_y(tweets_and_labels, testdata)

    # Chi-Squared Analysis: keep the top 80% of features by chi2 score.
    sel = SelectPercentile(chi2, percentile=80)
    sel.fit(x_train, y_train)
    x_train = sel.transform(x_train)
    x_test = sel.transform(x_test)

    # Univariate Feature Selection
    # NOTE(review): alpha is documented as a p-value threshold; 150.0 is far
    # outside the 0..1 range - confirm the intended value.
    fs = SelectFwe(alpha=150.0)
    x_train = fs.fit_transform(x_train, y_train)
    x_test = fs.transform(x_test)

    # Classifier Fitting
    # loss='l1' and class_weight='auto' are no longer accepted by LinearSVC;
    # 'hinge' is the renamed loss and 'balanced' the replacement weighting.
    # NOTE(review): 'balanced' scales the class weights differently from the
    # old 'auto' heuristic, so C=10 may need re-validation.
    clf = svm.LinearSVC(C=10, penalty='l2', loss='hinge', dual=True,
                        fit_intercept=False, class_weight='balanced')
    clf.fit(x_train, y_train)
    returned = clf.predict(x_test)
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(returned)

    # Print relevant usernames & tweets to .csv file, one file per label.
    t = time.strftime("%d_%m_%Y")
    outputs = (
        (0, 'classifications/' + t + '_self.csv'),
        (1, 'classifications/' + t + '_another_person.csv'),
    )
    for label, path in outputs:
        with open(path, 'w+') as out:
            wr = csv.writer(out, quoting=csv.QUOTE_ALL)
            for i, val in enumerate(returned):
                if val == label:
                    # Columns are swapped relative to their order in testdata.
                    wr.writerow([testdata[i][1], testdata[i][0]])

    ########################################################################
    # Graphing of Data
    # Note, since there is no annotation for test data
    # This is a visual representation of output data, not model accuracy
    ########################################################################
    if graph:
        # Graph setup
        X, Y, Z, new_y = graph_setup(clf, x_test, returned)
        # graph Scatter Plot of training data
        graph_scatter(x_train, y_train)
        # Graph 3D Plot of test data
        graph_3d(X, Y, Z, new_y)
        # Graph 2-D Plot of test data
        graph_2d(X, Y, new_y)