Exemplo n.º 1
0
def extract_features(extraction_method, index=None, percent_subset=100, classification=False):
    """Build a feature matrix and reduced labels for a selection of galaxies.

    A ``GalaxyData`` object is created for ``extraction_method`` with feature
    scaling switched off, optionally restricted to ``index``, and then asked
    for its competition training data.

    Args:
        extraction_method: Extraction method handed to ``GalaxyData``
            (see ``feature_extraction``).
        index: Galaxy index to restrict processing to. When ``None`` no
            restriction is applied and all galaxies are used.
        percent_subset: Size (in percent) of the data to return. Exactly 100
            uses ``get_training_data``; any other value passes
            ``100 - percent_subset`` to ``split_training_and_validation_data``
            and keeps the first two items of its result.
        classification: Forwarded unchanged to ``get_reduced_solutions``.

    Returns:
        A tuple ``(X, y)``: the features and the labels after they have been
        passed through ``get_reduced_solutions``.
    """
    galaxies = GalaxyData(extraction_method, scale_features=False)
    if index is not None:
        galaxies.set_restricted_universe(index)

    wants_everything = percent_subset == 100
    if wants_everything:
        features, labels = galaxies.get_training_data(competition=True)
    else:
        # Only the first two of the four returned items are needed here.
        # NOTE(review): presumably those are the training part of the split
        # -- confirm against GalaxyData.
        remainder = 100 - percent_subset
        features, labels, _, _ = galaxies.split_training_and_validation_data(
            remainder, competition=True)

    reduced_labels = get_reduced_solutions(labels, classification=classification)
    return (features, reduced_labels)
Exemplo n.º 2
0
import random
import numpy as np
import pandas as pd
import SimpleCV as cv

from sklearn import svm
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

import evaluate
import feature_extraction
from galaxy_data import GalaxyData

# Load the training solutions, indexed by galaxy id, and keep only the two
# "Class1.x" answer columns.
solutions_raw = pd.read_csv("./input_data/training_solutions_rev1.csv", index_col="GalaxyID")
solutions = solutions_raw[["Class1.1", "Class1.2"]]

# NOTE: despite its name, this value is applied as a lower bound below
# (cells are kept when they are >= upper_threshold).
upper_threshold = 1

# Blank out (NaN) every cell below the threshold, drop galaxies where both
# columns were blanked, then turn the remaining NaN cells into 0.
solutions = solutions.where(solutions >= upper_threshold)
solutions = solutions.dropna(how='all')
# fillna replaces the NaN cells in one vectorized call instead of invoking a
# Python lambda per cell through applymap.
solutions = solutions.fillna(0)

# Extract raw features only for the galaxies that survived the filtering.
data = GalaxyData(feature_extraction.raw)
data.set_restricted_universe(solutions.index)
(feature_vectors, _) = data.get_training_data()