def long2wide(in_fname, id, dvs, between=[], within=[], covariates=[], out_fname=None, nested=True): # load in_fname into a PyvtTbl object print('reading "%s"...' % in_fname) cls = DataFrame() cls.read_tbl(in_fname) # loop through DVs and append within columns d = [sorted(set(cls[id]))] header = [id] + covariates + between for col in covariates + between: z = cls.pivot(col, cols=[id], aggregate='arbitrary') d.extend(list(z)) # start controls whether nested factors are examined if nested: start = 1 else: start = len(within) for i, dv in enumerate(dvs): print('\ncollaborating %s' % dv) for j in _xrange(start, len(within) + 1): for factors in _xunique_combinations(within, j): print(' pivoting', factors, '...') z = cls.pivot(dv, rows=factors, cols=[id], aggregate='avg') d.extend(list(z)) # process headers for names in z.rnames: h = '_'.join(('%s.%s' % (f, str(c)) for (f, c) in names)) header.append('%s__%s' % (dv, h)) # Now we can write the data if out_fname == None: out_fname = 'wide_data.csv' with open(out_fname, 'wb') as f: wtr = csv.writer(f) wtr.writerow([n.upper() for n in header]) wtr.writerows(zip(*d)) # transpose and write
def test0(self): df = DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', ['TIMEOFDAY'], ['COURSE']) self.assertAlmostEqual(3.22222222222, pt[1, 0], 5)
def test1(self): df = DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', ['TIMEOFDAY', 'MODEL'], ['COURSE']) self.assertEqual(repr(eval(repr(pt))), repr(pt))
def test3(self): """method='full', aggregate=tolist, invalid row""" R = """\ tolist(id) Name Year member=N member=Y ========================================== name1 2010 [None, None] [0.0, 0.0] name1 2011 [1.0, 1.0] [None, None] name2 2010 [None, None] [None, None] name2 2011 [None, None] [2.0, 2.0] """ df = DataFrame() df.insert({'id':0,'Name':'name1','Year':2010,'member':'Y','rep':1}) df.insert({'id':1,'Name':'name1','Year':2011,'member':'N','rep':1}) df.insert({'id':2,'Name':'name2','Year':2011,'member':'Y','rep':1}) df.insert({'id':0,'Name':'name1','Year':2010,'member':'Y','rep':2}) df.insert({'id':1,'Name':'name1','Year':2011,'member':'N','rep':2}) df.insert({'id':2,'Name':'name2','Year':2011,'member':'Y','rep':2}) my_pivot = df.pivot('id',rows = ['Name','Year'], cols = ['member'], aggregate='tolist', method='full') ## print(my_pivot) self.assertEqual(R,str(my_pivot))
def test4(self): """method='full', aggregate=tolist, invalid col""" R = """\ tolist(id) member Name=name1, Name=name1, Name=name2, Name=name2, Year=2010 Year=2011 Year=2010 Year=2011 ================================================================== N [None, None] [1.0, 1.0] [None, None] [None, None] Y [0.0, 0.0] [None, None] [None, None] [2.0, 2.0] """ df = DataFrame() df.insert({'id':0,'Name':'name1','Year':2010,'member':'Y','rep':1}) df.insert({'id':1,'Name':'name1','Year':2011,'member':'N','rep':1}) df.insert({'id':2,'Name':'name2','Year':2011,'member':'Y','rep':1}) df.insert({'id':0,'Name':'name1','Year':2010,'member':'Y','rep':2}) df.insert({'id':1,'Name':'name1','Year':2011,'member':'N','rep':2}) df.insert({'id':2,'Name':'name2','Year':2011,'member':'Y','rep':2}) my_pivot = df.pivot('id',rows = ['member'], cols = ['Name','Year'], aggregate='tolist', method='full') ## print(my_pivot) self.assertEqual(R,str(my_pivot))
def test2(self): R = """\ GROUP=AA, GROUP=AA, GROUP=AB, GROUP=AB, GROUP=LAB, GROUP=LAB, AGE=old AGE=young AGE=old AGE=young AGE=old AGE=young ======================================================================= 22.188 9.813 29.188 10.041 34.141 11.875 """ df = DataFrame() df.read_tbl('data/suppression~subjectXgroupXageXcycleXphase.csv') pt = df.pivot('SUPPRESSION', cols=['GROUP', 'AGE']) df2 = pt.to_dataframe() self.assertEqual(str(df2),R)
def test3(self): R = np.array( [[7.16666667, 6.5, 4., 3.22222222, 2.88888889, 1.55555556]]) df = DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', cols=['TIMEOFDAY', 'COURSE']) self.assertEqual(str(R.shape), str(pt.shape)) for r, L in zip(R.flatten(), pt.flatten()): self.assertAlmostEqual(r, L)
def test2(self): R = [ 7.16666666667, 6.5, 4.0, 3.22222222222, 2.88888888889, 1.55555555556 ] df = DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', cols=['TIMEOFDAY', 'COURSE']) pt_flat = pt.flatten() for r, d in zip(R, pt_flat): self.assertAlmostEqual(r, d)
def test4(self): R = """\ avg(ERROR) TIMEOFDAY COURSE=C1 COURSE=C2 COURSE=C3 ============================================= T2 3.222 2.889 1.556 """ df = DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', ['TIMEOFDAY'], ['COURSE']) self.assertEqual(R, str(pt[-1:]))
def test2(self): R = [ 7.16666666667, 6.5, 4.0, 3.22222222222, 2.88888888889, 1.55555555556 ] df = DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', ['TIMEOFDAY', 'COURSE']) for r, L in zip(R, pt.flat): self.assertEqual('%.5f' % r, '%.5f' % L)
def test5(self): R = """\ avg(ERROR) TIMEOFDAY COURSE=C1 ===================== T1 7.167 T2 3.222 """ df = DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', ['TIMEOFDAY'], ['COURSE']) self.assertEqual(R, str(pt[:, 0]))
def test1(self): R ="""\ N/A(ERROR) TIMEOFDAY COURSE=C1 COURSE=C2 COURSE=C3 Total ====================================================== T1 51.361 42.250 16 31.574 T2 10.383 8.346 2.420 6.531 ====================================================== Total 23.040 18.778 7.716 15.178 """ df=DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') sums = df.pivot('ERROR', ['TIMEOFDAY'],['COURSE'],aggregate='sum') counts = df.pivot('ERROR', ['TIMEOFDAY'],['COURSE'],aggregate='count') aves = df.pivot('ERROR', ['TIMEOFDAY'],['COURSE'],aggregate='avg') calc_aves = sums/counts.astype(np.float64) ## print('\n'.join(str(aves).split('\n')[1:])) ## print('\n'.join(str(calc_aves).split('\n')[1:])) self.assertEqual('\n'.join(str(aves).split('\n')[1:]), '\n'.join(str(calc_aves).split('\n')[1:]))
def test0(self): R = """\ TIME CONDITION=A CONDITION=B ================================= day 1.864 1.933 night 1.622 1.731 """ df = DataFrame() df.read_tbl('data/example.csv') df['LOG10_X']=[math.log10(x) for x in df['X']] pt = df.pivot('LOG10_X', ['TIME'], ['CONDITION']) df2 = pt.to_dataframe() self.assertEqual(str(df2),R)
def test2(self): # __add__ ndarray R ="""\ N/A(ERROR) TIMEOFDAY COURSE=C1 COURSE=C2 COURSE=C3 ============================================= T1 12.167 11.500 9 T2 8.222 7.889 6.556 """ df=DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', ['TIMEOFDAY'],['COURSE']) pt2=pt+np.array([[5,5,5], [5,5,5]]) self.assertEqual(str(pt2),R)
def test1(self): R ="""\ avg(ERROR) TIMEOFDAY=T1, TIMEOFDAY=T1, TIMEOFDAY=T1, TIMEOFDAY=T2, TIMEOFDAY=T2, TIMEOFDAY=T2, Total COURSE=C1 COURSE=C2 COURSE=C3 COURSE=C1 COURSE=C2 COURSE=C3 ===================================================================================================== 7.167 6.500 4 3.222 2.889 1.556 3.896 """ df=DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', rows=['TIMEOFDAY','COURSE']) pt2 = pt.transpose() self.assertEqual(str(pt2),R)
def test0(self): # __add__ constant R ="""\ N/A(ERROR) TIMEOFDAY COURSE=C1 COURSE=C2 COURSE=C3 Total ====================================================== T1 12.167 11.500 9 10.619 T2 8.222 7.889 6.556 7.556 ====================================================== Total 9.800 9.333 7.778 8.896 """ df=DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', ['TIMEOFDAY'],['COURSE']) self.assertAlmostEqual(np.sum(pt),25.3333333333, 5)
def test2(self): # __mul__ ndarray R ="""\ N/A(ERROR) TIMEOFDAY COURSE=C1 COURSE=C2 COURSE=C3 ============================================= T1 35.833 32.500 20 T2 16.111 14.444 7.778 """ df=DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', ['TIMEOFDAY'],['COURSE']) pt2=pt*np.array([[5,5,5], [5,5,5]]) ## print(pt2) self.assertEqual(str(pt2),R)
def test0(self): R = """\ avg(ERROR) TIMEOFDAY COURSE=C1 COURSE=C2 COURSE=C3 Total ===================================================== T1 7.167 6.500 4 5.619 T2 0 2.889 1.556 2.556 ===================================================== Total 4.800 4.333 2.778 3.896 """ df = DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', ['TIMEOFDAY'], ['COURSE']) pt[1, 0] = 0. self.assertEqual(R, str(pt))
def test3(self): R = """\ GROUP AGE Value ====================== AA old 22.188 AA young 9.813 AB old 29.188 AB young 10.041 LAB old 34.141 LAB young 11.875 """ df = DataFrame() df.read_tbl('data/suppression~subjectXgroupXageXcycleXphase.csv') pt = df.pivot('SUPPRESSION', rows=['GROUP', 'AGE']) df2 = pt.to_dataframe() self.assertEqual(str(df2),R)
def test3(self): R = [ 7.16666666667, 6.5, 4.0, 3.22222222222, 2.88888888889, 1.55555555556 ] Rinds = [(0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5)] df = DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', cols=['TIMEOFDAY', 'COURSE']) i = 0 for inds, L in pt.ndenumerate(): self.assertEqual('%.5f' % L, '%.5f' % R[i]) self.assertEqual(str(inds), str(Rinds[i])) i += 1
def test0(self): R ="""\ avg(ERROR) COURSE TIMEOFDAY=T1 TIMEOFDAY=T2 Total ============================================ C1 7.167 3.222 4.800 C2 6.500 2.889 4.333 C3 4 1.556 2.778 ============================================ Total 5.619 2.556 3.896 """ df=DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', ['TIMEOFDAY'],['COURSE']) pt2 = pt.transpose() self.assertEqual(str(pt2),R)
def test1(self): # __add__ PyvtTbl R ="""\ N/A(ERROR) TIMEOFDAY COURSE=C1 COURSE=C2 COURSE=C3 Total ====================================================== T1 14.333 13 8 11.238 T2 6.444 5.778 3.111 5.111 ====================================================== Total 9.600 8.667 5.556 7.792 """ df=DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', ['TIMEOFDAY'],['COURSE']) pt2=pt+pt self.assertEqual(str(pt2),R)
def test1(self): R = [ """\ avg(ERROR) TIMEOFDAY COURSE=C1 COURSE=C2 COURSE=C3 ============================================= T1 7.167 6.500 4 """, """\ avg(ERROR) TIMEOFDAY COURSE=C1 COURSE=C2 COURSE=C3 ============================================= T2 3.222 2.889 1.556 """ ] df = DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', ['TIMEOFDAY'], ['COURSE']) for r, L in zip(R, pt): self.assertAlmostEqual(r, str(L))
def test2(self): R = """\ avg(ERROR) TIMEOFDAY COURSE=C1 COURSE=C2 COURSE=C3 Total ===================================================== T1 -- -- -- 5.619 T2 3.222 2.889 1.556 2.556 ===================================================== Total 4.800 4.333 2.778 3.896 """ df = DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', ['TIMEOFDAY'], ['COURSE']) pt[0, :] = [0, 0, 0] pt.mask[0, :] = [True, True, True] self.assertEqual(R, str(pt))
def test2(self): R ="""\ avg(ERROR) TIMEOFDAY COURSE Value ========================== T1 C1 7.167 T1 C2 6.500 T1 C3 4 T2 C1 3.222 T2 C2 2.889 T2 C3 1.556 ========================== Total 3.896 """ df=DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', cols=['TIMEOFDAY','COURSE']) pt2 = pt.transpose() self.assertEqual(str(pt2),R)
def test0(self): # __mul__ constant R ="""\ N/A(ERROR) TIMEOFDAY COURSE=C1 COURSE=C2 COURSE=C3 Total ====================================================== T1 35.833 32.500 20 28.095 T2 16.111 14.444 7.778 12.778 ====================================================== Total 24 21.667 13.889 19.479 """ df=DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', ['TIMEOFDAY'],['COURSE']) pt2=pt*5 ## print(pt2) self.assertEqual(str(pt2),R)
def test4(self): R = [ """\ avg(ERROR) TIMEOFDAY COURSE=C1 COURSE=C2 COURSE=C3 ================================================================================================================================================================= T1 [10.0, 8.0, 6.0, 8.0, 7.0, 4.0, None, None, None] [9.0, 10.0, 6.0, 4.0, 7.0, 3.0, None, None, None] [7.0, 6.0, 3.0, 4.0, 5.0, 2.0, 3.0, 4.0, 2.0] """, """\ avg(ERROR) TIMEOFDAY COURSE=C1 COURSE=C2 COURSE=C3 ========================================================================================================================================================= T2 [5.0, 4.0, 3.0, 4.0, 3.0, 3.0, 4.0, 1.0, 2.0] [4.0, 3.0, 3.0, 4.0, 2.0, 2.0, 3.0, 3.0, 2.0] [2.0, 2.0, 1.0, 2.0, 3.0, 2.0, 1.0, 0.0, 1.0] """ ] df = DataFrame() df.read_tbl('data/error~subjectXtimeofdayXcourseXmodel_MISSING.csv') pt = df.pivot('ERROR', ['TIMEOFDAY'], ['COURSE'], aggregate='tolist') for r, L in zip(R, pt): self.assertEqual(r, str(L))
def test1(self): R = """\ CYCLE PHASE GROUP=AA, GROUP=AA, GROUP=AB, GROUP=AB, GROUP=LAB, GROUP=LAB, AGE=old AGE=young AGE=old AGE=young AGE=old AGE=young ======================================================================================= 1 I 17.750 8.675 12.625 5.525 21.625 7.825 1 II 20.875 8.300 22.750 8.675 36.250 13.750 2 I 22.375 10.225 23.500 8.825 21.375 9.900 2 II 28.125 10.250 41.125 13.100 46.875 14.375 3 I 23.125 10.500 20.000 9.125 23.750 9.500 3 II 20.750 9.525 46.125 14.475 50.375 15.575 4 I 20.250 9.925 15.625 7.750 26.375 9.650 4 II 24.250 11.100 51.750 12.850 46.500 14.425 """ df = DataFrame() df.read_tbl('data/suppression~subjectXgroupXageXcycleXphase.csv') pt = df.pivot('SUPPRESSION', rows=['CYCLE', 'PHASE'], cols=['GROUP', 'AGE']) df2 = pt.to_dataframe() self.assertEqual(str(df2),R)
def test1(self): """method='valid', aggregate=count, invalid row""" R = """\ count(id) Name Year member=N member=Y Total ========================================== name1 2010 0 1 1 name1 2011 1 0 1 name2 2011 0 1 1 ========================================== Total 1 2 3 """ df = DataFrame() df.insert({'id':0,'Name':'name1','Year':2010,'member':'Y'}) df.insert({'id':1,'Name':'name1','Year':2011,'member':'N'}) df.insert({'id':2,'Name':'name2','Year':2011,'member':'Y'}) my_pivot = df.pivot('id',rows = ['Name','Year'], cols = ['member'], aggregate='count') self.assertEqual(R,str(my_pivot))
def test2(self): """method='valid', aggregate=count, invalid col""" R = """\ count(id) member Name=name1, Name=name1, Name=name2, Total Year=2010 Year=2011 Year=2011 ======================================================== N 0 1 0 1 Y 1 0 1 2 ======================================================== Total 1 1 1 3 """ df = DataFrame() df.insert({'id':0,'Name':'name1','Year':2010,'member':'Y'}) df.insert({'id':1,'Name':'name1','Year':2011,'member':'N'}) df.insert({'id':2,'Name':'name2','Year':2011,'member':'Y'}) my_pivot = df.pivot('id',rows = ['member'], cols = ['Name','Year'], aggregate='count') self.assertEqual(R,str(my_pivot))