def extract_features(extraction_method, index=None, percent_subset=100, classification=False):
    """Extract features and reduced labels for a selection of galaxies.

    A ``GalaxyData`` object is built with ``scale_features=False``; when
    ``index`` is given, it is restricted to that index before any data is
    fetched.

    Args:
        extraction_method: Extraction method handed to ``GalaxyData``
            (see ``feature_extraction``).
        index: Index of the galaxies to process. If None, no restriction
            is applied.
        percent_subset: Size (in percent) of the subset to return. With
            exactly 100 the full training data is requested; otherwise
            ``100 - percent_subset`` is passed to
            ``split_training_and_validation_data`` and only the first two
            of its four return values are kept.
        classification: Forwarded unchanged to ``get_reduced_solutions``.

    Returns:
        A tuple ``(X, y)`` with X being the features and y the labels after
        ``get_reduced_solutions`` has been applied.
    """
    galaxies = GalaxyData(extraction_method, scale_features=False)
    if index is not None:
        galaxies.set_restricted_universe(index)

    wants_everything = percent_subset == 100
    if wants_everything:
        features, labels = galaxies.get_training_data(competition=True)
    else:
        # Presumably the first argument is the percentage held out for
        # validation -- TODO confirm against GalaxyData.
        held_out_percent = 100 - percent_subset
        features, labels, _, _ = galaxies.split_training_and_validation_data(
            held_out_percent, competition=True)

    reduced_labels = get_reduced_solutions(labels, classification=classification)
    return (features, reduced_labels)
import random

import numpy as np
import pandas as pd
import SimpleCV as cv
from sklearn import svm
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

import evaluate
import feature_extraction
from galaxy_data import GalaxyData

# Load the raw training solutions, indexed by galaxy id.
solutions_raw = pd.read_csv("./input_data/training_solutions_rev1.csv", index_col="GalaxyID")

# Only the two Class1.x columns are used for the label selection below.
solutions = solutions_raw[["Class1.1", "Class1.2"]]

# Cells below the threshold are masked to NaN; rows where both columns are
# masked are then dropped, so every remaining galaxy has at least one of
# the two values at or above the threshold.
upper_threshold = 1
solutions = solutions[(solutions >= upper_threshold)]
solutions = solutions.dropna(how='all')

# Replace the remaining masked cells with 0. fillna is the vectorised
# equivalent of mapping a NaN check over every cell.
solutions = solutions.fillna(0)

# Extract raw features only for the galaxies that survived the filter.
data = GalaxyData(feature_extraction.raw)
data.set_restricted_universe(solutions.index)
(feature_vectors, _) = data.get_training_data()