Example #1
0
def remove_cols_where(M, arguments):
    """Returns a structured array containing columns not adhering to a query

    Parameters
    ----------
    M : numpy.ndarray
        Structured array
    arguments : list of dict
        See module documentation

    Returns
    -------
    numpy.ndarray
        Structured array without specified columns

    """
    M = utils.check_sa(M)
    # NOTE(review): the validated result `args` is never used; the loop
    # below reads the raw `arguments` -- confirm this is intentional.
    args = __check_args_col_select(arguments)

    # A column is removed only if every selector in the query flags it.
    flagged = np.ones(len(M.dtype), dtype=bool)
    for spec in arguments:
        selector, vals = spec['func'], spec['vals']
        flagged = np.logical_and(flagged, selector(M, vals))
    doomed = [name for name, hit in zip(M.dtype.names, flagged) if hit]
    return remove_cols(M, doomed)
Example #2
0
def remove_cols_where(M, arguments):
    """Returns a structured array containing columns not adhering to a query

    Parameters
    ----------
    M : numpy.ndarray
        Structured array
    arguments : list of dict
        See module documentation

    Returns
    -------
    numpy.ndarray
        Structured array without specified columns

    """
    M = utils.check_sa(M)
    args = __check_args_col_select(arguments)

    # Each predicate narrows the removal mask; only columns matching every
    # predicate in the query are dropped.
    mask = np.ones(len(M.dtype), dtype=bool)
    for arg_set in arguments:
        mask = np.logical_and(mask, arg_set['func'](M, arg_set['vals']))
    return remove_cols(
        M,
        [col for col, remove_it in zip(M.dtype.names, mask) if remove_it])
Example #3
0
File: server.py  Project: digideskio/hylas
def run_csv(fin, uid_feature, label_feature, clfs=DBG_std_clfs):
    """Load a CSV, build an Experiment from it, and register the result."""
    raw = open_csv_as_sa(fin)
    y = raw[label_feature]
    # Everything except the label column becomes the feature matrix.
    features = remove_cols(raw, label_feature)
    register_exp(Experiment(features, y, clfs=clfs), uid_feature)
Example #4
0
    def test_remove_cols(self):
        """remove_cols drops a single column (str) or several (list)."""
        data = np.array(
            [(1, 'a', 1.0, datetime(2015, 12, 12)),
             (2, 'b', 2.0, datetime(2015, 12, 13))],
            dtype=[('int', int), ('str', 'O'), ('float', float),
                   ('dt', 'M8[us]')])

        cases = [
            ('dt',
             np.array([(1, 'a', 1.0), (2, 'b', 2.0)],
                      dtype=[('int', int), ('str', 'O'),
                             ('float', float)])),
            (['dt', 'float'],
             np.array([(1, 'a'), (2, 'b')],
                      dtype=[('int', int), ('str', 'O')])),
        ]
        for to_drop, expected in cases:
            self.assertTrue(
                np.array_equal(expected, utils.remove_cols(data, to_drop)))
Example #5
0
File: server.py  Project: kkhanh89/hylas
def run_csv(fin, uid_feature, label_feature, clfs=DBG_std_clfs):
    """Build an Experiment from a CSV file and register it for later use."""
    table = open_csv_as_sa(fin)
    target = table[label_feature]
    # Drop the label so it cannot leak into the feature matrix.
    predictors = remove_cols(table, label_feature)
    experiment = Experiment(predictors, target, clfs=clfs)
    register_exp(experiment, uid_feature)
Example #6
0
File: server.py  Project: jigyasu10/hylas
def run_csv(fin, uid_feature, label_feature):
    """Run a CSV through the debug classifier suite and register each model."""
    raw = open_csv_as_sa(fin)
    y = raw[label_feature]
    features = remove_cols(raw, label_feature)
    experiment = Experiment(features, y, clfs=DBG_std_clfs)
    experiment.run()
    last_experiments[current_user.id] = experiment
    clear_models(current_user.id)
    # Register every individual fitted run from every trial.
    for trial in experiment.trials:
        for subset in trial.runs:
            for run in subset:
                train_idx = run.train_indices
                test_idx = run.test_indices
                register_model(current_user.id, run.clf, dt.now(),
                               run.M[train_idx], run.M[test_idx],
                               run.labels[train_idx], run.labels[test_idx],
                               run.col_names, uid_feature)
Example #7
0
File: server.py  Project: jigyasu10/hylas
def run_csv(fin, uid_feature, label_feature):
    """Fit the debug classifiers on a CSV and register every resulting model."""
    sa = open_csv_as_sa(fin)
    labels = sa[label_feature]
    M = remove_cols(sa, label_feature)
    exp = Experiment(M, labels, clfs=DBG_std_clfs)
    exp.run()
    last_experiments[current_user.id] = exp
    clear_models(current_user.id)

    def _register(run):
        # One registration per fitted run; timestamped at call time.
        register_model(
            current_user.id,
            run.clf,
            dt.now(),
            run.M[run.train_indices],
            run.M[run.test_indices],
            run.labels[run.train_indices],
            run.labels[run.test_indices],
            run.col_names,
            uid_feature)

    for trial in exp.trials:
        for subset in trial.runs:
            for run in subset:
                _register(run)
Example #8
0
    def subset_over(
        self,
        label_col,
        interval_train_window_start,
        interval_train_window_size,
        interval_test_window_start,
        interval_test_window_size,
        interval_inc_value,
        interval_expanding=False,
        row_M_col_name=None,
        row_M_train_window_start=None,
        row_M_train_window_size=None,
        row_M_test_window_start=None,
        row_M_test_window_size=None,
        row_M_inc_value=None,
        row_M_expanding=False,
        clfs=None,
        feature_gen_lambda=None,
    ):
        """
        Generates ArrayGenerators according to some subsetting directive.

        There are two ways that we determine what the train and test sets are
        for each trial:

        1. The start time/stop time interval. This is the interval used to
           create features in the M-formatted matrix. Setting the start
           time/stop time of this interval is equivalent to passing values
           to set_interval.  Variables pertaining to this interval have the
           interval* prefix.

        2. The rows of the M matrix to select, based on the value of some
           column in the M matrix. Setting the start and end of this interval
           is equivalent to passing values to select_rows_in_M. Values
           pertaining to this set of rows have the row_M* prefix. Taking
           subsets over rows of M is optional, and it will only occur if
           row_M_col_name is not None

        Parameters
        ----------
        label_col : str
            The name of the column containing labels
        interval_train_window_start : number or datetime
            start of training interval
        interval_train_window_size : number or datetime
            (Initial) size of training interval
        interval_test_window_start : number or datetime
            start of testing interval
        interval_test_window_size : number or datetime
            size of testing interval
        interval_inc_value : datetime, timedelta, or number
            interval to increment train and test interval
        interval_expanding : boolean
            whether or not the training interval is expanding
        row_M_col_name : str or None
            If not None, the name of the feature which will be used to select
            different training and testing sets in addition to the interval

            If None, train and testing sets will use all rows given a
            particular time interval
        row_M_train_window_start : ? or None
            Start of train window for M rows. If None, uses
            interval_train_window_start
        row_M_train_window_size : ? or None
            (Initial) size of train window for M rows. If None, uses
            interval_train_window_size
        row_M_test_window_start : ? or None
            Start of test window for M rows. If None, uses
            interval_test_window_start
        row_M_test_window_size : ? or None
            size of test window for M rows. If None, uses
            interval_test_window_size
        row_M_inc_value : ? or None
            interval to increment train and test window for M rows. If None,
            uses interval_inc_value
        row_M_expanding : bool
            whether or not the training window for M rows is expanding
        clfs : list of dict or None
            classifiers and parameters to run with each train/test set. See
            documentation for diogenes.grid_search.experiment.Experiment.
            If None, defaults to [{'clf': RandomForestClassifier}]
        feature_gen_lambda : (np.ndarray, str, ?, ?, ?, ?) -> np.ndarray or None
            If not None, function to be applied to generated arrays before they
            are fit to classifiers. Must be a function of signature:

            f(M, test_or_train, interval_start, interval_end, row_M_start,
              row_M_end)

            Where:
            * M is the generated array,
            * test_or_train is 'test' if this is a test set or 'train' if it's
              a train set
            * interval_start and interval_end define the interval
            * row_M_start and row_M_end define the rows of M that are included

        Returns
        -------
        diogenes.grid_search.experiment.Experiment
            Experiment collecting train/test sets that have been run
        """
        # Resolve the default classifier list here rather than in the
        # signature to avoid a shared mutable default argument.
        if clfs is None:
            clfs = [{"clf": RandomForestClassifier}]

        # Any unspecified row_M_* bound falls back to its interval_* twin.
        if row_M_train_window_start is None:
            row_M_train_window_start = interval_train_window_start
        if row_M_train_window_size is None:
            row_M_train_window_size = interval_train_window_size
        if row_M_test_window_start is None:
            row_M_test_window_start = interval_test_window_start
        if row_M_test_window_size is None:
            row_M_test_window_size = interval_test_window_size
        if row_M_inc_value is None:
            row_M_inc_value = interval_inc_value

        conn = self.__conn
        col_specs = self.__col_specs
        table_name = self.__rg_table_name

        # Find the upper bound of the sweep: the latest stop time in the
        # table, and (if row subsetting is on) the largest value of the
        # subsetting feature.
        # NOTE(review): SQL is assembled with str.format. That is acceptable
        # for internally-configured identifiers but unsafe if col_specs or
        # row_M_col_name can ever come from untrusted input -- confirm.
        sql_get_max_interval_end = "SELECT MAX({}) FROM {}".format(
            col_specs["stop_time"], table_name)
        interval_end = conn.execute(sql_get_max_interval_end)[0][0]
        if row_M_col_name is not None:
            sql_get_max_col = ("SELECT MAX({}) FROM {} "
                               "WHERE {} = '{}'").format(
                col_specs["val"], table_name, col_specs["feature"],
                row_M_col_name)
            row_M_end = conn.execute(sql_get_max_col)[0][0]
        else:
            row_M_end = interval_end

        # One (clf, params, runs) directive per point in the parameter grid;
        # `runs` accumulates one Run per train/test window below.
        trial_directives = []
        for clf_params in clfs:
            clf = clf_params["clf"]
            all_clf_ps = clf_params.copy()
            del all_clf_ps["clf"]
            for param_dict in utils.transpose_dict_of_lists(all_clf_ps):
                trial_directives.append((clf, param_dict, []))

        current_interval_train_start = interval_train_window_start
        current_interval_train_end = interval_train_window_start + interval_train_window_size
        current_interval_test_start = interval_test_window_start
        current_interval_test_end = interval_test_window_start + interval_test_window_size
        current_row_M_train_start = row_M_train_window_start
        current_row_M_train_end = row_M_train_window_start + row_M_train_window_size
        current_row_M_test_start = row_M_test_window_start
        current_row_M_test_end = row_M_test_window_start + row_M_test_window_size
        # Slide (or expand) both windows until either test window runs off
        # the end of the available data.
        while current_interval_test_end <= interval_end and current_row_M_test_end <= row_M_end:
            ae_train = self.set_interval(current_interval_train_start, current_interval_train_end)
            ae_test = self.set_interval(current_interval_test_start, current_interval_test_end)
            if row_M_col_name is not None:
                ae_train = ae_train.select_rows_in_M(
                    "{col} >= {start} AND {col} <= {stop}".format(
                        col=row_M_col_name, start=current_row_M_train_start, stop=current_row_M_train_end
                    )
                )
                ae_test = ae_test.select_rows_in_M(
                    "{col} >= {start} AND {col} <= {stop}".format(
                        col=row_M_col_name, start=current_row_M_test_start, stop=current_row_M_test_end
                    )
                )
            # TODO this should actually run clfs and build an experiment
            # rather than doing this yield
            data_train = ae_train.emit_M()
            M_train = utils.remove_cols(data_train, label_col)
            y_train = data_train[label_col]
            data_test = ae_test.emit_M()
            M_test = utils.remove_cols(data_test, label_col)
            y_test = data_test[label_col]

            # Optional user hook: transform each generated array before fit.
            if feature_gen_lambda is not None:
                M_train = feature_gen_lambda(
                    M_train,
                    "train",
                    current_interval_train_start,
                    current_interval_train_end,
                    current_row_M_train_start,
                    current_row_M_train_end,
                )
                M_test = feature_gen_lambda(
                    M_test,
                    "test",
                    current_interval_test_start,
                    current_interval_test_end,
                    current_row_M_test_start,
                    current_row_M_test_end,
                )

            col_names = M_train.dtype.names
            # Classifiers need homogeneous ndarrays, not structured arrays.
            M_train_nd = utils.cast_np_sa_to_nd(M_train)
            M_test_nd = utils.cast_np_sa_to_nd(M_test)

            for clf, params, runs in trial_directives:
                clf_inst = clf(**params)
                clf_inst.fit(M_train_nd, y_train)
                runs.append(
                    exp.Run(
                        M_train_nd,
                        y_train,
                        col_names,
                        clf_inst,
                        None,
                        None,
                        col_names,
                        np.arange(len(col_names)),
                        {
                            "train_interval_start": current_interval_train_start,
                            "train_interval_end": current_interval_train_end,
                            "test_interval_start": current_interval_test_start,
                            "test_interval_end": current_interval_test_end,
                        },
                        {
                            "train_start": current_row_M_train_start,
                            "train_end": current_row_M_train_end,
                            "test_start": current_row_M_test_start,
                            "test_end": current_row_M_test_end,
                        },
                        M_test_nd,
                        y_test,
                    )
                )

            # Advance windows; an expanding window keeps its start fixed.
            if not interval_expanding:
                current_interval_train_start += interval_inc_value
            current_interval_train_end += interval_inc_value
            current_interval_test_start += interval_inc_value
            current_interval_test_end += interval_inc_value
            if not row_M_expanding:
                current_row_M_train_start += row_M_inc_value
            current_row_M_train_end += row_M_inc_value
            current_row_M_test_start += row_M_inc_value
            current_row_M_test_end += row_M_inc_value

        trials = [
            exp.Trial(None, None, None, clf, params, "Array Emitter", {}, "Array Emitter", {}, [runs])
            for clf, params, runs in trial_directives
        ]
        return exp.Experiment(None, None, clfs, [{"subset": "Array Emitter"}], [{"cv": "Array Emitter"}], trials)
Example #9
0
from diogenes.read import open_csv_url
from diogenes.display import (plot_correlation_scatter_plot,
                               plot_correlation_matrix, 
                               plot_kernel_density,
                               plot_box_plot)

from diogenes.grid_search import Experiment 
from diogenes.grid_search import std_clfs as std_clfs
from diogenes.utils import remove_cols


# Fetch the white-wine quality dataset as a structured array.
data = open_csv_url(
            'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv',  
            delimiter=';')
y = data['quality']
# Features are every column except the target.
M = remove_cols(data, 'quality')

# Binarize the target: True marks below-average quality.
# NOTE(review): `np` and `describe_cols` are not imported in this chunk --
# confirm the imports exist elsewhere in the file.
y = y < np.average(y)


# Debug toggle: flip to True to print per-column summaries (Python 2 print).
if False:
    for x in describe_cols(M):
        print x

# Debug toggle: flip to True to show exploratory plots.
if False:
   plot_correlation_scatter_plot(M) 
   plot_correlation_matrix(M)
   plot_kernel_density(M['f0']) #no designation of col name
   plot_box_plot(M['f0']) #no designation of col name

# Run the standard classifier suite over the prepared matrix.
exp = Experiment(M, y, clfs=std_clfs)
Example #10
0
                X_cols = filter(lambda i: regex.search(i), self._col_names)
                yield (np.arange(self._y.shape[0]), X_cols, {'max_grade' : max_grade})

    def __repr__(self):
        # NOTE(review): `grades` is not defined in this scope -- calling
        # repr() on an instance will raise NameError unless `grades` is a
        # module-level global; an instance attribute was likely intended.
        return 'SubsetSchool({})'.format(grades)

# Root directory of the pickled high-school scratch data.
DATA_PATH = '/home/zar1/hs-scratch/'

# Load the record array from disk (Python 2: cPickle, print statements).
# NOTE(review): pickle files are normally opened in binary mode ('rb');
# text mode happens to work on POSIX under Python 2 but is fragile.
fin = open(os.path.join(DATA_PATH, 'data_rec_array.pkl'))
print 'loading data'
M = cPickle.load(fin)
fin.close()
print 'data loaded'

# Split off the target and drop identifier columns from the features.
y = M['label']
M = remove_cols(M, ['label', 'student_id', 'index'])

print 'set up data'

# Impute: replace missing values with the NaN sentinel.
M = replace_missing_vals(M, 'constant', np.nan)
print 'imputed'


# Earliest cohort year; used as the start of the first training window.
min_year = min(M['cohort'])

# Single deterministic random-forest configuration.
clfs = [{'clf': RandomForestClassifier, 'random_state': [0]}]
csvs = []
train_start = min_year
train_window_size = 2
init_train_window_end = train_start + train_window_size - 1
for max_grade in xrange(9, 12):
Example #11
0
import sklearn.datasets

from diogenes.read import open_csv_url
from diogenes.display import (plot_correlation_scatter_plot,
                              plot_correlation_matrix, plot_kernel_density,
                              plot_box_plot)

from diogenes.grid_search import Experiment
from diogenes.grid_search import std_clfs as std_clfs
from diogenes.utils import remove_cols

# Fetch the white-wine quality dataset as a structured array.
data = open_csv_url(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv',
    delimiter=';')
y = data['quality']
# Features are every column except the target.
M = remove_cols(data, 'quality')

# Binarize the target: True marks below-average quality.
# NOTE(review): `np` and `describe_cols` are not imported in this chunk --
# confirm the imports exist elsewhere in the file.
y = y < np.average(y)

# Debug toggle: flip to True to print per-column summaries (Python 2 print).
if False:
    for x in describe_cols(M):
        print x

# Debug toggle: flip to True to show exploratory plots.
if False:
    plot_correlation_scatter_plot(M)
    plot_correlation_matrix(M)
    plot_kernel_density(M['f0'])  #no designation of col name
    plot_box_plot(M['f0'])  #no designation of col name

# Run the standard classifier suite and dump the results to CSV.
exp = Experiment(M, y, clfs=std_clfs)
exp.make_csv()