def cv(theme, percentage, current_svm):
    """Run ``cross_validation`` for ``current_svm`` on a theme's known samples.

    Steps, in order:

    1. ``parse_theme(theme)`` yields the dataset and its feature list.
    2. ``split_dataset`` is called with that dataset and ``targets``; only
       the first two results (known dataset, known targets) are used.
    3. ``select_features(percentage, theme)`` picks the features, and
       ``SelectedFeatures.extract_data_from_selected_features`` builds the
       combined dataset from them.
    4. ``StandardizedData.split_and_standardize_dataset`` produces the
       scaled dataset and the targets that go with it.
    5. The selected features are printed and the scaled data is handed to
       ``cross_validation``.

    NOTE: ``targets`` and ``ids`` are *not* parameters; they are read from
    module scope, so both must be bound before this function is called
    (the ``__main__`` section of this file assigns them).

    Args:
        theme: Value forwarded to ``parse_theme`` and ``select_features``.
        percentage: Value forwarded to ``select_features``.
        current_svm: Object forwarded unchanged to ``cross_validation``.

    Returns:
        Whatever ``cross_validation`` returns.
    """
    dataset, features = parse_theme(theme)

    # The third element of the split is not used by this function.
    known_dataset, known_targets, _ = split_dataset(dataset, targets)
    known_targets = np.asarray(known_targets)

    # NOTE: a previous variant derived the features through
    # features_cross_validation(known_dataset, known_targets, features,
    # current_svm) followed by select_final_features_from_cv(cv_features,
    # percentage); the direct select_features() call below replaced it.
    selected_features = select_features(percentage, theme)

    sf = SelectedFeatures(known_dataset, known_targets, selected_features, features)
    combined_dataset = sf.extract_data_from_selected_features()

    std = StandardizedData(known_targets, combined_dataset)
    known_dataset_scaled, known_targets = std.split_and_standardize_dataset()

    # Parenthesised single-argument form: prints the same text under
    # Python 2 and stays valid syntax under Python 3.
    print('####### FEATURES ####### %d \n %s'
          % (len(selected_features), str(selected_features)))

    return cross_validation(np.array(known_dataset_scaled), known_targets, ids, current_svm)
return np.ma.masked_array(np.interp(value, x, y)) if __name__ == "__main__": spreadsheet = Spreadsheet(project_data_file) data = Data(spreadsheet) targets = data.targets ids = data.ids theme = raw_input("Theme.\n") percentage = float(raw_input("Percentage as float.\n")) [dataset, features] = parse_theme(theme) [known_dataset, known_targets, unk] = split_dataset(dataset, targets) known_targets = np.asarray(known_targets) selected_features = select_features(percentage, theme) sf = SelectedFeatures(known_dataset, known_targets, selected_features, features) dataset = sf.extract_data_from_selected_features() dataset = preprocessing.scale(dataset) C_range = np.arange(0.1, 9, 0.1) gamma_range = np.arange(0.1, 9, 0.1) param_grid = dict(gamma=gamma_range, C=C_range) # cv = StratifiedShuffleSplit(known_targets, random_state=42) cv = StratifiedKFold(known_targets, n_folds=10) grid = GridSearchCV(SVC(class_weight='auto'), param_grid=param_grid, cv=cv, scoring='f1') grid.fit(dataset, known_targets) print("The best parameters are %s with a score of %0.2f" % (grid.best_params_, grid.best_score_)) classifiers = []