def test_featurize_time_series_default_errors():
    """Featurize wrapper should supply default errors when none are given."""
    n_channels = 3
    times, values, _ = sample_values(channels=n_channels)
    features_to_use = ['amplitude', 'std_err']
    meta_features = {}
    # Multichannel input, errors omitted.
    fset = featurize.featurize_time_series(times, values, None, features_to_use,
                                           meta_features, scheduler=dask.get)
    # Uneven channel lengths.
    times = [[times, times[0:-5], times[0:-10]]]
    values = [[values[0], values[1][0:-5], values[2][0:-10]]]
    fset = featurize.featurize_time_series(times, values, None, features_to_use,
                                           meta_features, scheduler=dask.get)
    # Single-channel input.
    times = times[0][0]
    values = values[0][0]
    fset = featurize.featurize_time_series(times, values, None, features_to_use,
                                           meta_features, scheduler=dask.get)
    assert ('amplitude', 0) in fset.columns
def test_featurize_time_series_default_times():
    """Featurize wrapper should supply default times when none are given."""
    n_channels = 3
    _, values, errors = sample_values(channels=n_channels)
    features_to_use = ['amplitude', 'std_err']
    meta_features = {}
    # Multichannel input, times omitted.
    fset = featurize.featurize_time_series(None, values, errors, features_to_use,
                                           meta_features, scheduler=get_sync)
    # Uneven channel lengths.
    values = [[values[0], values[1][0:-5], values[2][0:-10]]]
    errors = [[errors[0], errors[1][0:-5], errors[2][0:-10]]]
    fset = featurize.featurize_time_series(None, values, errors, features_to_use,
                                           meta_features, scheduler=get_sync)
    # Single-channel input.
    values = values[0][0]
    errors = errors[0][0]
    fset = featurize.featurize_time_series(None, values, errors, features_to_use,
                                           meta_features, scheduler=get_sync)
def test_ignore_exceptions():
    """A failing feature should yield NaNs unless raise_exceptions=True."""
    import cesium.features.graphs

    def _explode(x):
        raise ValueError()

    saved_entry = cesium.features.graphs.dask_feature_graph['mean']
    try:
        # Replace the 'mean' feature with one that always raises.
        cesium.features.graphs.dask_feature_graph['mean'] = (_explode, 't')
        t, m, e = sample_values()
        features_to_use = ['mean']
        with pytest.raises(ValueError):
            fset = featurize.featurize_time_series(t, m, e, features_to_use,
                                                   scheduler=dask.get,
                                                   raise_exceptions=True)
        fset = featurize.featurize_time_series(t, m, e, features_to_use,
                                               scheduler=dask.get,
                                               raise_exceptions=False)
        assert np.isnan(fset.values).all()
    finally:
        # Always restore the real feature graph entry.
        cesium.features.graphs.dask_feature_graph['mean'] = saved_entry
def test_featurize_time_series_pandas_metafeatures():
    """Metafeatures may be supplied as a pandas Series or DataFrame."""
    times, values, errors = sample_values()
    features_to_use = ['amplitude', 'std_err']
    meta_features = pd.Series({'meta1': 0.5})
    fset = featurize.featurize_time_series(times, values, errors, features_to_use,
                                           meta_features, scheduler=dask.get)
    npt.assert_allclose(fset['meta1'], 0.5)

    n_series = 5
    series = [sample_values() for _ in range(n_series)]
    times, values, errors = (list(part) for part in zip(*series))
    features_to_use = ['amplitude', 'std_err']
    meta_features = pd.DataFrame({'meta1': [0.5] * n_series,
                                  'meta2': [0.8] * n_series})
    fset = featurize.featurize_time_series(times, values, errors, features_to_use,
                                           meta_features, scheduler=dask.get)
    npt.assert_allclose(fset['meta1'], 0.5)
    npt.assert_allclose(fset['meta2'], 0.8)
def post(self):
    """Handle a prediction request.

    Decodes time-series data and a model ID from the request arguments,
    featurizes the series with the model's feature list, imputes missing
    feature values, and returns predictions (plus class probabilities when
    the estimator supports them).
    """
    # Request payload: raw time series, model reference, and optional
    # metafeatures / imputation options (JSON-encoded arguments).
    ts_data = json_decode(self.get_argument('ts_data'))
    model_id = json_decode(self.get_argument('modelID'))
    meta_feats = json_decode(self.get_argument('meta_features', 'null'))
    impute_kwargs = json_decode(self.get_argument('impute_kwargs', '{}'))
    model = Model.query.get(model_id)
    model_data = joblib.load(model.file_uri)
    # A fitted grid-search object wraps the real estimator; unwrap it.
    if hasattr(model_data, 'best_estimator_'):
        model_data = model_data.best_estimator_
    # Use exactly the features the model was trained on; swallow
    # per-feature errors (raise_exceptions=False) and impute them below.
    features_to_use = model.featureset.features_list
    fset = featurize.featurize_time_series(*ts_data, features_to_use=features_to_use, meta_features=meta_feats, raise_exceptions=False)
    fset = featurize.impute_featureset(fset, **impute_kwargs)
    fset.index = fset.index.astype(str)  # ensure JSON-encodable
    data = {'preds': model_data.predict(fset)}
    if hasattr(model_data, 'predict_proba'):
        data['pred_probs'] = pd.DataFrame(model_data.predict_proba(fset), index=fset.index, columns=model_data.classes_)
    else:
        data['pred_probs'] = []
    pred_info = Prediction.format_pred_data(fset, data)
    return self.success(pred_info)
def get_freq_features(N, train_series, times_list, flux_list, train_metadata,
                      subsetting_pos=None):
    """Featurize the first ``N`` light curves and fit multiband frequency models.

    Parameters
    ----------
    N : int
        Number of objects (from the selected subset) to process.
    train_series, train_metadata : pandas objects with an 'object_id' column.
    times_list, flux_list : per-object time/flux arrays, parallel lists.
    subsetting_pos : sequence of int, optional
        Positions to keep; ``None`` means all positions.

    Returns
    -------
    (feats, models) : cesium feature DataFrame (with 'object_pos' and
        'freq1_freq' columns added) and the fitted per-object models.
    """
    if subsetting_pos is None:
        # Treat "no subset" as "all positions" so the indexing below
        # (subsetting_pos[:N], .iloc[subsetting_pos]) stays valid; the
        # original raised TypeError when subsetting_pos was None.
        subsetting_pos = list(range(len(times_list)))
    # Build the membership set once instead of once per comprehension element.
    pos_set = set(subsetting_pos)
    subset_times_list = [v for i, v in enumerate(times_list) if i in pos_set]
    subset_flux_list = [v for i, v in enumerate(flux_list) if i in pos_set]
    feats = featurize.featurize_time_series(
        times=subset_times_list[:N],
        values=subset_flux_list[:N],
        features_to_use=[
            'skew', 'percent_beyond_1_std', 'percent_difference_flux_percentile'
        ],
        scheduler=None,
    )
    subset = train_series[train_series['object_id'].isin(
        train_metadata['object_id'].iloc[subsetting_pos].iloc[:N])]
    models = list(map(fit_multiband_freq, subset.groupby('object_id')))
    feats['object_pos'] = subsetting_pos[:N]
    feats['freq1_freq'] = [model.best_period for model in models]
    return feats, models
def test_featurize_time_series_single():
    """Featurize wrapper should handle a single time series."""
    times, values, errors = sample_values()
    features_to_use = ['amplitude', 'std_err']
    meta_features = {'meta1': 0.5}
    fset = featurize.featurize_time_series(times, values, errors, features_to_use,
                                           meta_features, scheduler=dask.get)
    assert fset['amplitude'].values.dtype == np.float64
def get_features(time_series):
    """Featurize a list of per-object light-curve frames.

    Each frame must have 'mjd', 'flux' and 'flux_err' columns; returns the
    resulting feature matrix as a numpy array.
    """
    times = [frame['mjd'].tolist() for frame in time_series]
    values = [frame['flux'].tolist() for frame in time_series]
    errors = [frame['flux_err'].tolist() for frame in time_series]
    fset = featurize_time_series(times, values, errors,
                                 features_to_use=FEATURES)
    return fset.values
def test_featurize_time_series_single_multichannel():
    """Featurize wrapper should handle a single multichannel time series."""
    n_channels = 3
    times, values, errors = sample_values(channels=n_channels)
    features_to_use = ['amplitude', 'std_err']
    meta_features = {'meta1': 0.5}
    fset = featurize.featurize_time_series(times, values, errors, features_to_use,
                                           meta_features, scheduler=dask.get)
    assert ('amplitude', 0) in fset.columns
    assert 'meta1' in fset.columns
def test_featurize_time_series_single():
    """Featurize wrapper should handle a single labeled time series."""
    times, values, errors = sample_values()
    features_to_use = ['amplitude', 'std_err']
    target = 'class1'
    meta_features = {'meta1': 0.5}
    fset = featurize.featurize_time_series(times, values, errors, features_to_use,
                                           target, meta_features,
                                           scheduler=get_sync)
    npt.assert_array_equal(sorted(fset.data_vars),
                           ['amplitude', 'meta1', 'std_err'])
    npt.assert_array_equal(fset.target.values, ['class1'])
def test_featurize_time_series_single():
    """Featurize wrapper should handle a single labeled time series."""
    times, values, errors = sample_time_series()
    features_to_use = ['amplitude', 'std_err']
    target = 'class1'
    meta_features = {'meta1': 0.5}
    fset = featurize.featurize_time_series(times, values, errors, features_to_use,
                                           target, meta_features,
                                           use_celery=False)
    npt.assert_array_equal(sorted(fset.data_vars),
                           ['amplitude', 'meta1', 'std_err'])
    npt.assert_array_equal(fset.target.values, ['class1'])
def test_featurize_time_series_pandas_metafeatures():
    """Metafeatures may be supplied as a pandas Series or DataFrame."""
    t, m, e = sample_values()
    features_to_use = ['amplitude', 'std_err']
    # Single series + Series of metafeatures.
    meta_features = pd.Series({'meta1': 0.5})
    fset = featurize.featurize_time_series(t, m, e, features_to_use,
                                           meta_features, scheduler=dask.get)
    npt.assert_allclose(fset['meta1'], 0.5)

    # Multiple series + DataFrame of metafeatures.
    n_series = 5
    samples = [sample_values() for _ in range(n_series)]
    times, values, errors = (list(group) for group in zip(*samples))
    features_to_use = ['amplitude', 'std_err']
    meta_features = pd.DataFrame({'meta1': [0.5] * n_series,
                                  'meta2': [0.8] * n_series})
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, meta_features,
                                           scheduler=dask.get)
    npt.assert_allclose(fset['meta1'], 0.5)
    npt.assert_allclose(fset['meta2'], 0.8)
def test_featurize_time_series_default_errors():
    """Featurize wrapper should supply default errors when none are given."""
    n_channels = 3
    times, values, _ = sample_time_series(channels=n_channels)
    features_to_use = ['amplitude', 'std_err']
    target = 'class1'
    meta_features = {}
    # Multichannel input, errors omitted.
    fset = featurize.featurize_time_series(times, values, None, features_to_use,
                                           target, meta_features,
                                           use_celery=False)
    npt.assert_array_equal(fset.channel, np.arange(n_channels))
    # Uneven channel lengths.
    times = [[times, times[0:-5], times[0:-10]]]
    values = [[values[0], values[1][0:-5], values[2][0:-10]]]
    fset = featurize.featurize_time_series(times, values, None, features_to_use,
                                           target, meta_features,
                                           use_celery=False)
    npt.assert_array_equal(fset.channel, np.arange(n_channels))
    # Single-channel input.
    times = times[0][0]
    values = values[0][0]
    fset = featurize.featurize_time_series(times, values, None, features_to_use,
                                           target, meta_features,
                                           use_celery=False)
    npt.assert_array_equal(fset.channel, [0])
def test_featurize_time_series_multiple():
    """Featurize wrapper should handle a list of time series."""
    n_series = 5
    samples = [sample_values() for _ in range(n_series)]
    times, values, errors = (list(group) for group in zip(*samples))
    features_to_use = ['amplitude', 'std_err']
    meta_features = [{'meta1': 0.5}] * n_series
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, meta_features,
                                           scheduler=dask.get)
    npt.assert_array_equal(sorted(fset.columns.get_level_values('feature')),
                           ['amplitude', 'meta1', 'std_err'])
def test_featurize_time_series_no_targets():
    """With targets=None the featureset should carry no 'target' variable."""
    t, m, e = sample_values()
    features_to_use = ['amplitude', 'std_err']
    # (Removed an unused `target = 'class1'` local — this test deliberately
    # passes targets=None.)
    meta_features = {'meta1': 0.5}
    fset = featurize.featurize_time_series(t, m, e, features_to_use,
                                           targets=None,
                                           meta_features=meta_features,
                                           scheduler=get_sync)
    npt.assert_array_equal(sorted(fset.data_vars),
                           ['amplitude', 'meta1', 'std_err'])
    assert ('target' not in fset)
def test_featurize_time_series_uneven_multichannel():
    """Featurize wrapper should handle uneven-length multichannel data."""
    n_channels = 3
    times, values, errors = sample_values(channels=n_channels)
    times = [[times, times[0:-5], times[0:-10]]]
    values = [[values[0], values[1][0:-5], values[2][0:-10]]]
    errors = [[errors[0], errors[1][0:-5], errors[2][0:-10]]]
    features_to_use = ['amplitude', 'std_err']
    meta_features = {'meta1': 0.5}
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, meta_features,
                                           scheduler=dask.get)
    assert ('amplitude', 0) in fset.columns
    assert 'meta1' in fset.columns
def test_featurize_time_series_default_times():
    """Featurize wrapper should supply default times when none are given."""
    n_channels = 3
    _, values, errors = sample_values(channels=n_channels)
    features_to_use = ['amplitude', 'std_err']
    target = 'class1'
    meta_features = {}
    # Multichannel input, times omitted.
    fset = featurize.featurize_time_series(None, values, errors, features_to_use,
                                           target, meta_features,
                                           scheduler=get_sync)
    npt.assert_array_equal(fset.channel, np.arange(n_channels))
    # Uneven channel lengths.
    values = [[values[0], values[1][0:-5], values[2][0:-10]]]
    errors = [[errors[0], errors[1][0:-5], errors[2][0:-10]]]
    fset = featurize.featurize_time_series(None, values, errors, features_to_use,
                                           target, meta_features,
                                           scheduler=get_sync)
    npt.assert_array_equal(fset.channel, np.arange(n_channels))
    # Single-channel input.
    values = values[0][0]
    errors = errors[0][0]
    fset = featurize.featurize_time_series(None, values, errors, features_to_use,
                                           target, meta_features,
                                           scheduler=get_sync)
    npt.assert_array_equal(fset.channel, [0])
def test_featurize_time_series_default_errors():
    """Featurize wrapper should supply default errors when none are given."""
    n_channels = 3
    times, values, _ = sample_values(channels=n_channels)
    features_to_use = ['amplitude', 'std_err']
    target = 'class1'
    meta_features = {}
    # Multichannel input, errors omitted.
    fset = featurize.featurize_time_series(times, values, None, features_to_use,
                                           target, meta_features,
                                           scheduler=get_sync)
    npt.assert_array_equal(fset.channel, np.arange(n_channels))
    # Uneven channel lengths.
    times = [[times, times[0:-5], times[0:-10]]]
    values = [[values[0], values[1][0:-5], values[2][0:-10]]]
    fset = featurize.featurize_time_series(times, values, None, features_to_use,
                                           target, meta_features,
                                           scheduler=get_sync)
    npt.assert_array_equal(fset.channel, np.arange(n_channels))
    # Single-channel input.
    times = times[0][0]
    values = values[0][0]
    fset = featurize.featurize_time_series(times, values, None, features_to_use,
                                           target, meta_features,
                                           scheduler=get_sync)
    npt.assert_array_equal(fset.channel, [0])
def test_featurize_time_series_multiple():
    """Featurize wrapper should handle a list of labeled time series."""
    n_series = 5
    samples = [sample_time_series() for _ in range(n_series)]
    times, values, errors = (list(group) for group in zip(*samples))
    features_to_use = ['amplitude', 'std_err']
    targets = np.array(['class1'] * n_series)
    meta_features = [{'meta1': 0.5}] * n_series
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, targets,
                                           meta_features, use_celery=False)
    npt.assert_array_equal(sorted(fset.data_vars),
                           ['amplitude', 'meta1', 'std_err'])
    npt.assert_array_equal(fset.target.values, ['class1'] * n_series)
def test_featurize_time_series_multiple_multichannel():
    """Featurize wrapper should handle multiple multichannel time series."""
    n_series = 5
    n_channels = 3
    samples = [sample_values(channels=n_channels) for _ in range(n_series)]
    times, values, errors = (list(group) for group in zip(*samples))
    features_to_use = ['amplitude', 'std_err']
    meta_features = {'meta1': 0.5}
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, meta_features,
                                           scheduler=dask.get)
    assert ('amplitude', 0) in fset.columns
    assert 'meta1' in fset.columns
def test_featurize_time_series_custom_functions():
    """Featurize wrapper should accept user-supplied feature functions."""
    n_channels = 3
    times, values, errors = sample_values(channels=n_channels)
    features_to_use = ['amplitude', 'std_err', 'test_f']
    meta_features = {'meta1': 0.5}
    # Constant-valued custom feature so the output is easy to check.
    custom_functions = {'test_f': lambda t, m, e: np.pi}
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, meta_features,
                                           custom_functions=custom_functions,
                                           scheduler=dask.get)
    npt.assert_array_equal(fset['test_f', 0], np.pi)
    assert ('amplitude', 0) in fset.columns
    assert 'meta1' in fset.columns
def test_featurize_time_series_multiple():
    """Featurize wrapper should handle a list of labeled time series."""
    n_series = 5
    samples = [sample_values() for _ in range(n_series)]
    times, values, errors = (list(group) for group in zip(*samples))
    features_to_use = ['amplitude', 'std_err']
    targets = np.array(['class1'] * n_series)
    meta_features = [{'meta1': 0.5}] * n_series
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, targets,
                                           meta_features, scheduler=get_sync)
    npt.assert_array_equal(sorted(fset.data_vars),
                           ['amplitude', 'meta1', 'std_err'])
    npt.assert_array_equal(fset.target.values, ['class1'] * n_series)
def test_featurize_time_series_custom_dask_graph():
    """Featurize wrapper should accept a custom dask feature graph."""
    n_channels = 3
    times, values, errors = sample_values(channels=n_channels)
    features_to_use = ['amplitude', 'std_err', 'test_f', 'test_meta']
    meta_features = {'meta1': 0.5}
    # Custom features defined in terms of another feature / a metafeature.
    custom_functions = {'test_f': (lambda x: x.min() - x.max(), 'amplitude'),
                        'test_meta': (lambda x: 2. * x, 'meta1')}
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, meta_features,
                                           custom_functions=custom_functions,
                                           scheduler=dask.get)
    assert ('amplitude', 0) in fset.columns
    assert ('test_f', 0) in fset.columns
    assert ('test_meta', 0) in fset.columns
def test_featurize_time_series_single_multichannel():
    """Featurize wrapper should handle a single multichannel time series."""
    n_channels = 3
    times, values, errors = sample_values(channels=n_channels)
    features_to_use = ['amplitude', 'std_err']
    target = 'class1'
    meta_features = {'meta1': 0.5}
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, target,
                                           meta_features, scheduler=get_sync)
    npt.assert_array_equal(sorted(fset.data_vars),
                           ['amplitude', 'meta1', 'std_err'])
    npt.assert_array_equal(fset.channel, np.arange(n_channels))
    npt.assert_array_equal(sorted(fset.amplitude.coords),
                           ['channel', 'name', 'target'])
    npt.assert_array_equal(fset.target.values, ['class1'])
def test_featurize_time_series_single_multichannel():
    """Featurize wrapper should handle a single multichannel time series."""
    n_channels = 3
    times, values, errors = sample_time_series(channels=n_channels)
    features_to_use = ['amplitude', 'std_err']
    target = 'class1'
    meta_features = {'meta1': 0.5}
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, target,
                                           meta_features, use_celery=False)
    npt.assert_array_equal(sorted(fset.data_vars),
                           ['amplitude', 'meta1', 'std_err'])
    npt.assert_array_equal(fset.channel, np.arange(n_channels))
    npt.assert_array_equal(sorted(fset.amplitude.coords),
                           ['channel', 'name', 'target'])
    npt.assert_array_equal(fset.target.values, ['class1'])
def encode(self, values_data, times=None):
    """
    Encode a column of time-series data into a feature tensor.

    :param values_data: a list of timeseries data
                        eg: ['91.0 92.0 93.0 94.0', '92.0 93.0 94.0 95.0' ...]
    :param times: (optional) a list of lists such that,
                  len(times[i])=len(values_data[i]) for all i in range(len(times))
    :return: a torch.floatTensor
    """
    features_to_use = self._features
    ret = []
    for i, values in enumerate(values_data):
        # Accept either a list of numbers or a whitespace-separated string.
        # (Was `type(values) == type([])`; isinstance is the idiomatic check.)
        if isinstance(values, list):
            values = [float(v) for v in values]
        else:
            values = [float(v) for v in values.split()]
        if times is None:
            # Default to evenly spaced integer timestamps 1..len(values).
            times_row = np.array(
                [float(j) for j in range(1, len(values) + 1)])
        else:
            times_row = np.array(
                [float(x) for x in times[i].split()])  # np.array(times[i])
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            row = featurize.featurize_time_series(
                times=times_row, values=np.array(values), errors=None,
                features_to_use=features_to_use)
        vector_row = []
        for col in features_to_use:
            val = list(row[col][0])[0]
            val1 = 0
            # NOTE(review): `0 in [..., False]` is True (0 == False), so a
            # feature value of exactly 0 is also treated as missing here;
            # behavior preserved as-is — confirm whether that is intended.
            if (val in ['nan', None, 'NaN', False]) \
                    or math.isnan(val) or math.isinf(val):
                val = 0
                val1 = 1
            if col in FEATURES_WITH_DEFAULT_NONE:
                vector_row += [val, val1]  # val1 is 1 if its null
            else:
                vector_row += [val]
        ret += [vector_row]
    ret_tensor = self._pytorch_wrapper(ret)
    return ret_tensor
def lcFreq(df_main):
    """Return the mean and std of the per-passband 'freq1_freq' feature.

    The input light-curve frame is sorted by 'mjd', split by 'passband',
    and each band is featurized with cesium's Lomb-Scargle frequency.

    :param df_main: DataFrame with 'mjd', 'passband', 'flux', 'flux_err'.
    :return: pd.Series with index ['freq', 'freq_std'].
    """
    df_main = df_main.sort_values('mjd').reset_index(drop=True)
    groups = df_main.groupby('passband')
    t_list = groups.apply(lambda gr: gr['mjd'].values).tolist()
    flx_list = groups.apply(lambda gr: gr['flux'].values).tolist()
    flxer_list = groups.apply(lambda gr: gr['flux_err'].values).tolist()
    feats = featurize.featurize_time_series(times=t_list,
                                            values=flx_list,
                                            errors=flxer_list,
                                            features_to_use=['freq1_freq'],
                                            scheduler=None)
    feats.columns = feats.columns.droplevel(1)
    # Compute the aggregates once (the original computed and discarded
    # mean/std, then recomputed them in the return expression).
    freq = feats['freq1_freq']
    return pd.Series([freq.mean(), freq.std()],
                     index=['freq', 'freq_std'])
def test_featurize_time_series_custom_script():
    """Featurize wrapper should run features from a custom script path."""
    n_channels = 3
    times, values, errors = sample_time_series(channels=n_channels)
    features_to_use = ['amplitude', 'std_err', 'f']
    target = 'class1'
    meta_features = {'meta1': 0.5}
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, target,
                                           meta_features,
                                           custom_script_path=CUSTOM_SCRIPT,
                                           use_celery=False)
    npt.assert_array_equal(sorted(fset.data_vars),
                           ['amplitude', 'f', 'meta1', 'std_err'])
    npt.assert_array_equal(fset.channel, np.arange(n_channels))
    npt.assert_array_equal(sorted(fset.amplitude.coords),
                           ['channel', 'name', 'target'])
    npt.assert_array_equal(fset.target.values, ['class1'])
def test_featurize_time_series_custom_dask_graph():
    """Featurize wrapper should accept a custom dask feature graph."""
    n_channels = 3
    times, values, errors = sample_time_series(channels=n_channels)
    features_to_use = ['amplitude', 'std_err', 'test_f']
    target = 'class1'
    meta_features = {'meta1': 0.5}
    # Custom feature defined in terms of another feature's output.
    custom_functions = {'test_f': (lambda x: x.min() - x.max(), 'amplitude')}
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, target,
                                           meta_features,
                                           custom_functions=custom_functions,
                                           use_celery=False)
    npt.assert_array_equal(sorted(fset.data_vars),
                           ['amplitude', 'meta1', 'std_err', 'test_f'])
    npt.assert_array_equal(fset.channel, np.arange(n_channels))
    npt.assert_array_equal(sorted(fset.amplitude.coords),
                           ['channel', 'name', 'target'])
    npt.assert_array_equal(fset.target.values, ['class1'])
def test_featurize_time_series_uneven_multichannel():
    """Featurize wrapper should handle uneven-length multichannel data."""
    n_channels = 3
    times, values, errors = sample_values(channels=n_channels)
    times = [[times, times[0:-5], times[0:-10]]]
    values = [[values[0], values[1][0:-5], values[2][0:-10]]]
    errors = [[errors[0], errors[1][0:-5], errors[2][0:-10]]]
    features_to_use = ['amplitude', 'std_err']
    target = 'class1'
    meta_features = {'meta1': 0.5}
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, target,
                                           meta_features, scheduler=get_sync)
    npt.assert_array_equal(sorted(fset.data_vars),
                           ['amplitude', 'meta1', 'std_err'])
    npt.assert_array_equal(fset.channel, np.arange(n_channels))
    npt.assert_array_equal(sorted(fset.amplitude.coords),
                           ['channel', 'name', 'target'])
    npt.assert_array_equal(fset.target.values, ['class1'])
def test_featurize_time_series_uneven_multichannel():
    """Featurize wrapper should handle uneven-length multichannel data."""
    n_channels = 3
    times, values, errors = sample_time_series(channels=n_channels)
    times = [[times, times[0:-5], times[0:-10]]]
    values = [[values[0], values[1][0:-5], values[2][0:-10]]]
    errors = [[errors[0], errors[1][0:-5], errors[2][0:-10]]]
    features_to_use = ['amplitude', 'std_err']
    target = 'class1'
    meta_features = {'meta1': 0.5}
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, target,
                                           meta_features, use_celery=False)
    npt.assert_array_equal(sorted(fset.data_vars),
                           ['amplitude', 'meta1', 'std_err'])
    npt.assert_array_equal(fset.channel, np.arange(n_channels))
    npt.assert_array_equal(sorted(fset.amplitude.coords),
                           ['channel', 'name', 'target'])
    npt.assert_array_equal(fset.target.values, ['class1'])
def test_featurize_time_series_custom_functions():
    """Featurize wrapper should accept user-supplied feature functions."""
    n_channels = 3
    times, values, errors = sample_values(channels=n_channels)
    features_to_use = ['amplitude', 'std_err', 'test_f']
    target = 'class1'
    meta_features = {'meta1': 0.5}
    # Constant-valued custom feature so the output is easy to check.
    custom_functions = {'test_f': lambda t, m, e: np.pi}
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, target,
                                           meta_features,
                                           custom_functions=custom_functions,
                                           scheduler=get_sync)
    npt.assert_array_equal(sorted(fset.data_vars),
                           ['amplitude', 'meta1', 'std_err', 'test_f'])
    npt.assert_array_equal(fset.channel, np.arange(n_channels))
    npt.assert_array_equal(sorted(fset.amplitude.coords),
                           ['channel', 'name', 'target'])
    npt.assert_array_equal(fset.test_f.values, np.pi)
    npt.assert_array_equal(fset.target.values, ['class1'])
def featurize_time_series(ts):
    """Extract a fixed set of cesium features from the raw series ``ts``.

    Times are taken to be the integer sample indices 0..len(ts)-1; returns
    the feature values as a numpy array.
    """
    from cesium import featurize
    features_to_use = ["amplitude", "percent_beyond_1_std", "maximum",
                       "max_slope", "median", "median_absolute_deviation",
                       "percent_close_to_median", "minimum", "skew", "std",
                       "weighted_average"]
    fset = featurize.featurize_time_series(times=np.arange(0, np.shape(ts)[0]),
                                           values=ts,
                                           errors=None,
                                           features_to_use=features_to_use)
    return fset.values
def test_featurize_time_series_multiple_multichannel():
    """Featurize wrapper should handle multiple multichannel time series."""
    n_series = 5
    n_channels = 3
    samples = [sample_values(channels=n_channels) for _ in range(n_series)]
    times, values, errors = (list(group) for group in zip(*samples))
    features_to_use = ['amplitude', 'std_err']
    targets = np.array(['class1', 'class1', 'class1', 'class2', 'class2'])
    meta_features = {'meta1': 0.5}
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, targets,
                                           meta_features, scheduler=get_sync)
    npt.assert_array_equal(sorted(fset.data_vars),
                           ['amplitude', 'meta1', 'std_err'])
    npt.assert_array_equal(fset.channel, np.arange(n_channels))
    npt.assert_array_equal(sorted(fset.amplitude.coords),
                           ['channel', 'name', 'target'])
    npt.assert_array_equal(fset.target.values, targets)
def printFrequency(dataTable):
    """Print Lomb-Scargle frequency features (implying the period) via cesium.

    Arguments:
        dataTable (array-like): the .tbl data file downloaded from the
            Caltech IRSA website.
    """
    # freq1_amplitude1: amplitude of the jth harmonic of the ith frequency
    # from a fitted Lomb-Scargle model.
    features_to_use = ["freq1_freq", "amplitude", "freq1_amplitude1"]
    fset_cesium = featurize.featurize_time_series(
        times=dataTable["obsmjd"],
        values=dataTable["mag_autocorr"],
        errors=dataTable["magerr_auto"],
        features_to_use=features_to_use)
    print(fset_cesium)
def test_featurize_time_series_multiple_multichannel():
    """Featurize wrapper should handle multiple multichannel time series."""
    n_series = 5
    n_channels = 3
    samples = [sample_time_series(channels=n_channels)
               for _ in range(n_series)]
    times, values, errors = (list(group) for group in zip(*samples))
    features_to_use = ['amplitude', 'std_err']
    targets = np.array(['class1', 'class1', 'class1', 'class2', 'class2'])
    meta_features = {'meta1': 0.5}
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, targets,
                                           meta_features, use_celery=False)
    npt.assert_array_equal(sorted(fset.data_vars),
                           ['amplitude', 'meta1', 'std_err'])
    npt.assert_array_equal(fset.channel, np.arange(n_channels))
    npt.assert_array_equal(sorted(fset.amplitude.coords),
                           ['channel', 'name', 'target'])
    npt.assert_array_equal(fset.target.values, targets)
def test_featurize_time_series_custom_dask_graph():
    """Featurize wrapper should accept a custom dask feature graph."""
    n_channels = 3
    times, values, errors = sample_values(channels=n_channels)
    features_to_use = ['amplitude', 'std_err', 'test_f']
    target = 'class1'
    meta_features = {'meta1': 0.5}
    # 'test_meta' is defined but not requested in features_to_use.
    custom_functions = {'test_f': (lambda x: x.min() - x.max(), 'amplitude'),
                        'test_meta': (lambda x: 2. * x, 'meta1')}
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, target,
                                           meta_features,
                                           custom_functions=custom_functions,
                                           scheduler=get_sync)
    npt.assert_array_equal(sorted(fset.data_vars),
                           ['amplitude', 'meta1', 'std_err', 'test_f'])
    npt.assert_array_equal(fset.channel, np.arange(n_channels))
    npt.assert_array_equal(sorted(fset.amplitude.coords),
                           ['channel', 'name', 'target'])
    npt.assert_array_equal(fset.target.values, ['class1'])
def get_freq_features(N, subsetting_pos=None):
    """Featurize the first ``N`` light curves from the module-level
    ``times_list``/``flux_list``, optionally restricted to ``subsetting_pos``,
    and record each object's original position.

    :param N: number of objects (from the selected subset) to featurize.
    :param subsetting_pos: sequence of positions to keep; ``None`` means all.
    :return: cesium feature DataFrame with an added 'object_pos' column.
    """
    if subsetting_pos is None:
        # Treat "no subset" as "all positions"; this also keeps
        # ``subsetting_pos[:N]`` below from raising TypeError on None
        # (as the original did).
        subsetting_pos = list(range(len(times_list)))
    # Build the membership set once instead of once per comprehension element.
    pos_set = set(subsetting_pos)
    subset_times_list = [v for i, v in enumerate(times_list) if i in pos_set]
    subset_flux_list = [v for i, v in enumerate(flux_list) if i in pos_set]
    feats = featurize.featurize_time_series(
        times=subset_times_list[:N],
        values=subset_flux_list[:N],
        features_to_use=[
            'freq1_freq', 'freq1_signif', 'freq1_amplitude1', 'skew',
            'percent_beyond_1_std', 'percent_difference_flux_percentile'
        ],
        scheduler=None)
    feats['object_pos'] = subsetting_pos[:N]
    return feats
def test_featurize_time_series_celery():
    """Test `featurize_time_series` with Celery.

    The actual featurization work is done by
    `featurize_tools.featurize_single_time_series`, which both the Celery and
    non-Celery paths call; given the other tests, we only need to check that
    the Celery task is configured properly.
    """
    times, values, errors = sample_time_series()
    features_to_use = ['amplitude', 'std_err', 'test_f']
    # This would ideally be a dummy lambda function, but celery can't
    # serialize one.
    from cesium.science_features import lomb_scargle_fast as lsf
    custom_functions = {'test_f': lsf.lomb_scargle_fast_period}
    target = 'class1'
    meta_features = {'meta1': 0.5}
    fset = featurize.featurize_time_series(times, values, errors,
                                           features_to_use, target,
                                           meta_features,
                                           custom_functions=custom_functions,
                                           use_celery=True)
    npt.assert_array_equal(sorted(fset.data_vars),
                           ['amplitude', 'meta1', 'std_err', 'test_f'])
    npt.assert_array_equal(fset.target.values, ['class1'])
def try_cesium(df):
    """Featurize the given EEG frame with a standard cesium feature set.

    :param df: mapping/DataFrame providing "times" and "measurements".
    """
    from cesium import featurize
    features_to_use = [
        "amplitude",
        "percent_beyond_1_std",
        "maximum",
        "max_slope",
        "median",
        "median_absolute_deviation",
        "percent_close_to_median",
        "minimum",
        "skew",
        "std",
        "weighted_average",
    ]
    # Bug fix: the original ignored its `df` parameter and read the module
    # global `eeg` instead.
    fset_cesium = featurize.featurize_time_series(
        times=df["times"],
        values=df["measurements"],
        errors=None,
        features_to_use=features_to_use,
    )
    print(fset_cesium.head())
# By default, the time series will featurized in parallel using the # ``dask.threaded`` scheduler; other approaches, including serial and # distributed approaches, can be implemented by passing in other ``dask`` # schedulers as the ``get`` argument to ``featurize_time_series``. # # .. |cesium.featurize| replace:: ``cesium.featurize`` # .. _cesium.featurize: http://cesium-ml.org/docs/api/cesium.featurize.html from cesium import featurize features_to_use = [ "amplitude", "percent_beyond_1_std", "maximum", "max_slope", "median", "median_absolute_deviation", "percent_close_to_median", "minimum", "skew", "std", "weighted_average" ] fset_cesium = featurize.featurize_time_series(times=eeg["times"], values=eeg["measurements"], errors=None, features_to_use=features_to_use) print(fset_cesium.head()) ############################################################################### # The output of ``featurize_time_series`` is a ``pandas.DataFrame`` which contains all # the feature information needed to train a machine learning model: feature # names are stored as column indices (as well as channel numbers, as we'll see # later for multi-channel data), and the time series index/class label are # stored as row indices. ############################################################################### # Custom feature functions # ~~~~~~~~~~~~~~~~~~~~~~~~ # Custom feature functions not built into ``cesium`` may be passed in using the # ``custom_functions`` keyword, either as a dictionary ``{feature_name: function}``, or as a
# .. _cesium.featurize: http://cesium-ml.org/docs/api/cesium.featurize.html from cesium import featurize features_to_use = ["amplitude", "percent_beyond_1_std", "maximum", "max_slope", "median", "median_absolute_deviation", "percent_close_to_median", "minimum", "skew", "std", "weighted_average"] fset_cesium = featurize.featurize_time_series(times=eeg["times"], values=eeg["measurements"], errors=None, features_to_use=features_to_use) print(fset_cesium.head()) ############################################################################### # The output of ``featurize_time_series`` is a ``pandas.DataFrame`` which contains all # the feature information needed to train a machine learning model: feature # names are stored as column indices (as well as channel numbers, as we'll see # later for multi-channel data), and the time series index/class label are # stored as row indices. ############################################################################### # Custom feature functions # ~~~~~~~~~~~~~~~~~~~~~~~~ # Custom feature functions not built into ``cesium`` may be passed in using the # ``custom_functions`` keyword, either as a dictionary ``{feature_name: function}``, or as a
def get_cesium_features(self, recompute=False):
    """Compute all relevant cesium features, per passband.

    Results are cached on ``self.cesium`` (a dict keyed by passband); the
    cached value is returned unless ``recompute`` is True.
    """
    # Return the cached result if present and recomputation wasn't requested.
    cesium = getattr(self, 'cesium', None)
    if cesium is not None:
        if not recompute:
            return cesium
    cesium = {}
    outlc = self.get_lc(recompute=recompute)
    for i, pb in enumerate(outlc):
        tlc = outlc.get(pb)
        # Per-passband light-curve tuple; only time and the renormalized
        # flux/error columns are used below.
        ttime, tFlux, tFluxErr, tFluxUnred, tFluxErrUnred, tFluxRenorm, tFluxErrRenorm, tphotflag, tzeropoint, tobsId = tlc
        # Keep only observations flagged as good photometry.
        photmask = tphotflag >= constants.GOOD_PHOTFLAG
        ttime = ttime[photmask]
        tFluxRenorm = tFluxRenorm[photmask]
        tFluxErrRenorm = tFluxErrRenorm[photmask]
        # Cesium feature names to compute for this passband.
        features_general = ['flux_percentile_ratio_mid20', 'flux_percentile_ratio_mid50', 'flux_percentile_ratio_mid65', 'flux_percentile_ratio_mid80', 'max_slope', 'maximum', 'median', 'median_absolute_deviation', 'percent_amplitude', 'period_fast', 'qso_log_chi2_qsonu', 'qso_log_chi2nuNULL_chi2nu', 'fold2P_slope_90percentile', 'freq1_amplitude1', 'freq1_amplitude2', 'freq1_amplitude3', 'freq1_amplitude4', 'freq1_freq', 'freq1_lambda', 'freq1_rel_phase2', 'freq1_rel_phase3', 'freq1_rel_phase4', 'freq1_signif', 'freq2_amplitude1', 'freq2_amplitude2', 'freq2_amplitude3', 'freq2_amplitude4', 'freq2_rel_phase2', 'freq2_rel_phase3', 'freq2_rel_phase4', 'freq3_amplitude1', 'freq3_amplitude2', 'freq3_amplitude3', 'freq3_amplitude4', 'freq3_rel_phase2', 'freq3_rel_phase3', 'freq3_rel_phase4', 'freq_amplitude_ratio_21', 'freq_amplitude_ratio_31', 'freq_n_alias', 'freq_signif_ratio_21', 'freq_signif_ratio_31', 'freq_varrat', 'freq_y_offset', 'medperc90_2p_p', 'p2p_scatter_pfold_over_mad', 'p2p_ssqr_diff_over_var', 'scatter_res_raw']
        if len(tFluxRenorm) <= 1:
            # Too few good points to featurize: build a placeholder frame
            # with zeros in the same feature columns.
            fset_cesium = pd.DataFrame({f: {'channel': {0: 0}} for f in features_general})
        else:
            fset_cesium = featurize.featurize_time_series(times=ttime, values=tFluxRenorm, errors=tFluxErrRenorm, features_to_use=features_general)
        cesium[pb] = fset_cesium
    # Cache and return the per-passband feature frames.
    self.cesium = cesium
    return cesium