def next_split(self):
    """Return the next stratified k-fold (fit, predict) pair of data objects.

    On the first call (``self.count == 0``) a shuffled ``StratifiedKFold``
    with ``self.p['kfold']`` splits, seeded with ``self.p['seed']``, is
    built over ``data.x`` / ``data.y`` and its split generator is stored in
    ``self.cv``. Every call then pulls one fold from that generator.

    When ``self.p['train_only']`` is true, only ``data['train']`` is used,
    both for computing the folds and for selecting the rows.

    Returns:
        tuple: ``(dfit, dpre)``, two ``nx.Data`` objects built from the rows
        selected by the fold's fit indices and predict indices.

    Raises:
        StopIteration: when the split generator has no folds left.
    """
    data = self.p['data']
    # The fold indices are positions within the data handed to ``cv.split``.
    # Restrict to the train region on every call, not just the first one,
    # so that ``take`` below indexes the same rows the folds were built on.
    if self.p['train_only']:
        data = data['train']
    # NOTE(review): self.count is presumably advanced/reset by the enclosing
    # class; that code is not visible here.
    if self.count == 0:
        cv = StratifiedKFold(n_splits=self.p['kfold'],
                             random_state=self.p['seed'],
                             shuffle=True)
        self.cv = cv.split(data.x, data.y)
    # The built-in next() works on Python 2 and Python 3 iterators alike,
    # so no interpreter-version branch is needed.
    fit_index, pre_index = next(self.cv)
    dfit = nx.Data(data.df.take(fit_index))
    dpre = nx.Data(data.df.take(pre_index))
    return dfit, dpre
def micro_data(index=None):
    """Build a ten-row data object small enough for unit tests.

    The frame has the columns era, region, x1, x2, x3 and kazutsugi and the
    row labels 'index0' through 'index9'. Era and region strings are mapped
    through ERA_STR_TO_FLOAT and REGION_STR_TO_FLOAT before the frame is
    wrapped in ``nx.Data``.

    Parameters
    ----------
    index : optional
        If not None, passed to ``df.iloc`` to keep only the selected rows.

    Returns
    -------
    nx.Data
        Data object wrapping a copy of the (possibly subsetted) frame.
    """
    columns = ['era', 'region', 'x1', 'x2', 'x3', 'kazutsugi']
    rows = [
        ['era1', 'train', 0.00, 0.01, 0.02, 0],
        ['era2', 'train', 0.10, 0.11, 0.12, 1],
        ['era2', 'train', 0.20, 0.21, 0.22, 0],
        ['era3', 'validation', 0.30, 0.31, 0.32, 1],
        ['era3', 'validation', 0.40, 0.41, 0.42, 0],
        ['era3', 'validation', 0.50, 0.51, 0.52, 1],
        ['era4', 'validation', 0.60, 0.61, 0.62, 0],
        ['eraX', 'test', 0.70, 0.71, 0.72, 0],
        ['eraX', 'test', 0.80, 0.81, 0.82, 0],
        ['eraX', 'live', 0.90, 0.91, 0.92, 0],
    ]

    # Rows are appended one at a time, in order, onto an empty frame.
    df = pd.DataFrame(columns=columns)
    for position, row in enumerate(rows):
        df.loc['index%d' % position] = row

    df['era'] = df['era'].map(ERA_STR_TO_FLOAT)
    df['region'] = df['region'].map(REGION_STR_TO_FLOAT)

    if index is not None:
        df = df.iloc[index]
    df = df.copy()  # assure contiguous memory
    return nx.Data(df)
def micro_data(index=None, nfeatures=3):
    """Build a ten-row data object with `nfeatures` features for unit tests.

    Columns are era, region, x1..x<nfeatures> and y; row labels run from
    'index0' to 'index9'. Within a row every feature column holds the same
    value.

    Parameters
    ----------
    index : optional
        If not None, passed to ``df.iloc`` to keep only the selected rows.
    nfeatures : int
        Number of feature columns to create (default 3).

    Returns
    -------
    nx.Data
        Data object wrapping the (possibly subsetted) frame.
    """
    feature_names = ['x' + str(i) for i in range(1, nfeatures + 1)]
    columns = ['era', 'region'] + feature_names + ['y']

    # (era, region, value repeated across all features, y)
    spec = [
        ('era1', 'train', 0.0, 0.),
        ('era2', 'train', 0.1, 1.),
        ('era2', 'train', 0.2, 0.),
        ('era3', 'validation', 0.3, 1.),
        ('era3', 'validation', 0.4, 0.),
        ('era3', 'validation', 0.5, 1.),
        ('era4', 'validation', 0.6, 0.),
        ('eraX', 'test', 0.7, 1.),
        ('eraX', 'test', 0.8, 0.),
        ('eraX', 'live', 0.9, 1.),
    ]

    # Rows are appended one at a time, in order, onto an empty frame.
    df = pd.DataFrame(columns=columns)
    for position, (era, region, x, y) in enumerate(spec):
        df.loc['index' + str(position)] = [era, region] + [x] * nfeatures + [y]

    if index is not None:
        df = df.iloc[index]
    return nx.Data(df)
def test_data_hash():
    "data.hash gives the same value when called twice on the same object"
    msg = "data.hash not reproduceable"

    # Full micro data set.
    full = micro_data()
    first = full.hash()
    second = full.hash()
    ok_(first == second, msg)

    # Data object built from every other row of the same frame.
    strided = nx.Data(full.df[::2])
    first = strided.hash()
    second = strided.hash()
    ok_(first == second, msg)