예제 #1
0
파일: splitter.py 프로젝트: sovaa/numerox
 def next_split(self):
     """Return the next (fit, predict) pair of data objects.

     On the first call (``self.count == 0``) a shuffled ``StratifiedKFold``
     splitter is built from the ``kfold`` and ``seed`` entries of
     ``self.p`` and its split generator is stored on ``self.cv``. Every
     call then draws one ``(fit_index, pre_index)`` pair from that
     generator.

     Returns a tuple ``(dfit, dpre)`` of ``nx.Data`` objects built from the
     rows selected by the fit and predict indices. Raises ``StopIteration``
     once the generator has no folds left.
     """
     data = self.p['data']
     # The fold indices are positions within the (possibly restricted) data
     # that was handed to the splitter, so the same restriction has to be
     # applied on every call -- not only on the first one -- before the
     # indices are used with ``take`` below.
     # NOTE(review): data['train'] presumably selects the train region;
     # confirm against nx.Data.__getitem__.
     if self.p['train_only']:
         data = data['train']
     if self.count == 0:
         cv = StratifiedKFold(n_splits=self.p['kfold'],
                              random_state=self.p['seed'],
                              shuffle=True)
         self.cv = cv.split(data.x, data.y)
     # The builtin next() works on both Python 2 and 3 generators, so no
     # interpreter-version branch is needed.
     fit_index, pre_index = next(self.cv)
     dfit = nx.Data(data.df.take(fit_index))
     dpre = nx.Data(data.df.take(pre_index))
     return dfit, dpre
예제 #2
0
def micro_data(index=None):
    """Build a ten-row data object for use in unit tests.

    The frame has the columns 'era', 'region', 'x1', 'x2', 'x3' and
    'kazutsugi', with rows labelled 'index0' through 'index9'. The 'era'
    and 'region' columns are mapped through ERA_STR_TO_FLOAT and
    REGION_STR_TO_FLOAT. If `index` is given it is applied with
    ``df.iloc`` before the frame is wrapped in ``nx.Data``.
    """
    columns = ['era', 'region', 'x1', 'x2', 'x3', 'kazutsugi']
    rows = [
        ['era1', 'train', 0.00, 0.01, 0.02, 0],
        ['era2', 'train', 0.10, 0.11, 0.12, 1],
        ['era2', 'train', 0.20, 0.21, 0.22, 0],
        ['era3', 'validation', 0.30, 0.31, 0.32, 1],
        ['era3', 'validation', 0.40, 0.41, 0.42, 0],
        ['era3', 'validation', 0.50, 0.51, 0.52, 1],
        ['era4', 'validation', 0.60, 0.61, 0.62, 0],
        ['eraX', 'test', 0.70, 0.71, 0.72, 0],
        ['eraX', 'test', 0.80, 0.81, 0.82, 0],
        ['eraX', 'live', 0.90, 0.91, 0.92, 0],
    ]

    df = pd.DataFrame(columns=columns)
    # Insert the rows one at a time, in order, under their string labels.
    for position, row in enumerate(rows):
        df.loc['index%d' % position] = row

    df['era'] = df['era'].map(ERA_STR_TO_FLOAT)
    df['region'] = df['region'].map(REGION_STR_TO_FLOAT)

    if index is not None:
        df = df.iloc[index]

    # Copy before wrapping; intended to give the data object contiguous memory.
    return nx.Data(df.copy())
예제 #3
0
def micro_data(index=None, nfeatures=3):
    """Build a ten-row data object for use in unit tests.

    The frame has the columns 'era', 'region', 'x1'..'x<nfeatures>' and
    'y', with rows labelled 'index0' through 'index9'. Within a row every
    feature column holds the same value. If `index` is given it is applied
    with ``df.iloc`` before the frame is wrapped in ``nx.Data``.
    """
    feature_names = ['x%d' % k for k in range(1, nfeatures + 1)]
    columns = ['era', 'region'] + feature_names + ['y']

    # One entry per row: (era, region, value repeated in each feature, target).
    spec = [
        ('era1', 'train', 0.0, 0.),
        ('era2', 'train', 0.1, 1.),
        ('era2', 'train', 0.2, 0.),
        ('era3', 'validation', 0.3, 1.),
        ('era3', 'validation', 0.4, 0.),
        ('era3', 'validation', 0.5, 1.),
        ('era4', 'validation', 0.6, 0.),
        ('eraX', 'test', 0.7, 1.),
        ('eraX', 'test', 0.8, 0.),
        ('eraX', 'live', 0.9, 1.),
    ]

    df = pd.DataFrame(columns=columns)
    # Insert the rows one at a time, in order, under their string labels.
    for position, (era, region, value, target) in enumerate(spec):
        df.loc['index%d' % position] = [era, region] + [value] * nfeatures + [target]

    if index is not None:
        df = df.iloc[index]
    return nx.Data(df)
예제 #4
0
def test_data_hash():
    "test data.hash"
    # Hashing the same data object twice must give equal results.
    full = micro_data()
    first_hash = full.hash()
    second_hash = full.hash()
    ok_(first_hash == second_hash, "data.hash not reproduceable")

    # Same check on a data object built from every other row of the frame.
    every_other = nx.Data(full.df[::2])
    first_hash = every_other.hash()
    second_hash = every_other.hash()
    ok_(first_hash == second_hash, "data.hash not reproduceable")