def test_group_boolean4():
    n = 43701
    DT = dt.Frame(A=range(2*n), B=[False, True]*n)
    DTR = DT[:, dt.sum(f.A), by(f.B)]
    assert_equals(DTR, dt.Frame(B=[False, True],
                                A=[sum(range(0, 2*n, 2)), sum(range(1, 2*n, 2))],
                                stypes={"A": dt.int64}))
def test_groupby_with_filter2():
    # Check that the rowindex works even when applied to a view
    n = 10000
    src0 = [random.getrandbits(2) for _ in range(n)]
    src1 = [random.gauss(1, 1) for _ in range(n)]
    f0 = dt.Frame({"key": src0, "val": src1})
    f1 = f0[f.val >= 0, :]
    f2 = f1[f.val <= 2, sum(f.val), f.key]
    answer = [sum(src1[i] for i in range(n)
                  if src0[i] == key and 0 <= src1[i] <= 2)
              for key in range(4)]
    assert f2.to_list() == [[0, 1, 2, 3], answer]
def test_reduce_sum():
    f0 = dt.Frame({"color": ["red", "blue", "green", "red", "green"],
                   "size": [5, 2, 7, 13, -1]})
    f1 = f0[:, sum(f.size), f.color]
    f1.internal.check()
    assert f1.topython() == [["blue", "green", "red"], [2, 6, 18]]
def test_groupby_multi_large(seed):
    random.seed(seed)
    letters = "abcdefghijklmn"
    n = 100 + int(random.expovariate(0.0001))
    col0 = [random.choice([True, False]) for _ in range(n)]
    col1 = [random.randint(-10, 10) for _ in range(n)]
    col2 = [random.choice(letters) for _ in range(n)]
    col3 = [random.random() for _ in range(n)]
    rows = [(col0[i], col1[i], col2[i], col3[i]) for i in range(n)]
    rows.sort()
    grouped = []
    lastkey = rows[0][:3]
    sumval = 0
    for i in range(n):
        ikey = rows[i][:3]
        if ikey != lastkey:
            grouped.append(lastkey + (sumval, ))
            lastkey = ikey
            sumval = 0
        sumval += rows[i][3]
    grouped.append(lastkey + (sumval, ))
    DT0 = dt.Frame([col0, col1, col2, col3], names=["A", "B", "C", "D"])
    DT1 = DT0[:, sum(f.D), by(f.A, f.B, f.C)]
    DT2 = dt.Frame(grouped)
    assert same_iterables(DT1.to_list(), DT2.to_list())
def test_reduce_sum():
    f0 = dt.Frame({"color": ["red", "blue", "green", "red", "green"],
                   "size": [5, 2, 7, 13, -1]})
    f1 = f0[:, sum(f.size), f.color]
    frame_integrity_check(f1)
    assert f1.to_list() == [["blue", "green", "red"], [2, 6, 18]]
def py_dt_one_group_proportions_summary(DT, por):
    DT_summary = DT[:, dt.count(), by(f[por])
                   ][:, f[:].extend({'grand_tot': dt.sum(f.count)})
                   ][:, f[:].extend({'prop': f.count / f.grand_tot})
                   ][:, f[:].remove(f.grand_tot), dt.sort(-f.prop)]
    return DT_summary
def py_dt_two_group_proportions_summary(DT, por1, por2):
    DT_summary = DT[:, dt.count(), by(f[por1], f[por2])
                   ][:, f[:].extend({'group_tot': dt.sum(f.count)}), by(f[por1])
                   ][:, f[:].extend({'prop': f.count / f.group_tot})
                   ][:, f[:].remove(f[1])]
    return DT_summary
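A minimal usage sketch for the helper above, assuming `datatable` is imported as `dt` with `f` and `by` in scope; the toy frame, its column name, and the expected values are made up purely for illustration.

import datatable as dt
from datatable import f, by

# Hypothetical one-column frame of categories.
toy = dt.Frame(species=["cat", "dog", "cat", "cat", "bird"])

# Per-species count and share of all rows, sorted by descending share:
# "cat" (3 rows, prop 0.6) first, then "dog" and "bird" (1 row, prop 0.2 each).
print(py_dt_one_group_proportions_summary(toy, "species"))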
def test_reduce_sum_same_column():
    # See issue #3110
    f0 = dt.Frame({"ints": [0, 1, 0, 0, 1, 2]})
    f1 = f0[:, {"sum": sum(f.ints)}, f.ints]
    frame_integrity_check(f1)
    assert_equals(f1, dt.Frame({"ints": [0, 1, 2],
                                "sum": [0, 2, 2] / dt.int64}))
def test_issue_2242(seed):
    n = 25000
    X = dt.Frame(AGE=[random.randint(1, 50) for i in range(n)],
                 PAY=[random.choice([True, False]) for i in range(n)])
    RES = X[:, dt.math.log((count() + 1) / (sum(f.PAY) + 0.5) - 1), by(f.AGE)]
    assert RES.shape == (50, 2)
    data = RES.to_list()
    assert data[0] == list(range(1, 51))
    assert all(isinstance(x, float) for x in data[1])
def test_shift_reduced_column():
    DT = dt.Frame(A=[1, 2, 1, 1, 2, 1], B=range(6))
    RES = DT[:, shift(dt.sum(f.B)), by(f.A)]
    assert_equals(RES, dt.Frame(A=[1, 1, 1, 1, 2, 2],
                                B=[None, 10, 10, 10, None, 5],
                                stypes={"A": dt.int32, "B": dt.int64}))
def test_sum_empty_frame():
    DT = dt.Frame([[]] * 4, names=list("ABCD"),
                  stypes=(dt.bool8, dt.int32, dt.float32, dt.float64))
    assert DT.shape == (0, 4)
    RZ = DT[:, sum(f[:])]
    frame_integrity_check(RZ)
    assert RZ.shape == (1, 4)
    assert RZ.names == ("A", "B", "C", "D")
    assert RZ.stypes == (dt.int64, dt.int64, dt.float32, dt.float64)
    assert RZ.to_list() == [[0], [0], [0], [0]]
    assert str(RZ)
def test_group_reduce_all_columns():
    DT = dt.Frame(id=[3, 3, 3, 3, 4, 4, 4, 4],
                  beef=[23, None, None, None, None, None, None, None],
                  eggs=[None, 33, None, None, 197, 103, None, None],
                  fork=[None, None, 10, None, None, None, 210, None],
                  veg=[17, None, None, 40, 1, 2, None, 340])
    assert_equals(DT[:, sum(f[:]), by(f.id)],
                  dt.Frame(id=[3, 4],
                           beef=[23, 0] / dt.int64,
                           eggs=[33, 300] / dt.int64,
                           fork=[10, 210] / dt.int64,
                           veg=[57, 343] / dt.int64))
def loadFlaechen(fileName="covid-19-germany-landkreise.csv"):
    geodata = dt.fread(fileName)
    sKeys = geodata[:, 'Regional code'].to_list()[0]
    values = geodata[:, 'Cadastral area'].to_list()[0]
    bundeslaenderFlaechen = geodata[:, [dt.sum(dt.f['Cadastral area'])],
                                    dt.by(dt.f['Land ID'])]
    sKeys = sKeys + bundeslaenderFlaechen[:, 'Land ID'].to_list()[0]
    values = values + bundeslaenderFlaechen[:, 'Cadastral area'].to_list()[0]
    deutschlandFlaeche = bundeslaenderFlaechen[:, 'Cadastral area'].sum()
    sKeys = sKeys + [0]
    values = values + deutschlandFlaeche.to_list()[0]
    valuesDict = dict(zip(sKeys, values))
    #print(valuesDict)
    return valuesDict
def test_groupby_large_random_integers(seed):
    random.seed(seed)
    ngrps1 = random.choice([1, 1, 2, 2, 2, 3, 4, 5])
    n0 = 1 << random.choice([1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6, 7])
    chunks = ([random.sample(range(n0), random.randint(1, n0))] +
              [random.sample([0] * 100 + list(range(256)), random.randint(1, 20))
               for i in range(ngrps1)])
    n = int(random.expovariate(0.0001)) + 10
    sample = [sum(random.choice(chunks[i]) << (8 * i)
                  for i in range(len(chunks)))
              for _ in range(n)]
    nuniques = len(set(sample))
    f0 = dt.Frame(sample)
    assert f0.nunique1() == nuniques
    f1 = dt.rbind(*([f0] * random.randint(2, 20)))
    assert f1.nunique1() == nuniques
def analyzeDayRange(fullTable, fromDay, toDay):
    dayTable = fullTable[(dt.f.DatenstandTag >= fromDay) & (dt.f.DatenstandTag < toDay), :]

    cases_to_count = dayTable[(dt.f.NeuerFall == 0) | (dt.f.NeuerFall == 1), :]
    cases = cases_to_count[:, [dt.sum(dt.f.AnzahlFall)], dt.by(dt.f.DatenstandTag)]

    new_cases_to_count = dayTable[(dt.f.NeuerFall == -1) | (dt.f.NeuerFall == 1), :]
    new_cases = new_cases_to_count[:, [dt.sum(dt.f.AnzahlFall)], dt.by(dt.f.DatenstandTag)]

    dead_to_count = dayTable[(dt.f.NeuerTodesfall == 0) | (dt.f.NeuerTodesfall == 1), :]
    dead = dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)], dt.by(dt.f.DatenstandTag)]

    new_dead_to_count = dayTable[(dt.f.NeuerTodesfall == -1) | (dt.f.NeuerTodesfall == 1), :]
    new_dead = new_dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)], dt.by(dt.f.DatenstandTag)]

    # NOTE: the print and return below assume the "NeuGenesen" column is present,
    # i.e. that this branch is taken; otherwise recovered/new_recovered are undefined.
    if "NeuGenesen" in dayTable.keys():
        recovered_to_count = dayTable[(dt.f.NeuGenesen == 0) | (dt.f.NeuGenesen == 1), :]
        recovered = recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)], dt.by(dt.f.DatenstandTag)]
        new_recovered_to_count = dayTable[(dt.f.NeuGenesen == -1) | (dt.f.NeuGenesen == 1), :]
        new_recovered = new_recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)], dt.by(dt.f.DatenstandTag)]

    #lastDay=fullTable[:,'MeldeDay'].max()[0,0]
    #lastnewCaseOnDay=fullTable[:,'newCaseOnDay'].max()[0,0]
    print("From {}-{} Day {}-{}: cases {} (+{}), dead {} (+{}), recovered {} (+{})"
          .format(cd.dateStrDMFromDay(fromDay), cd.dateStrDMFromDay(toDay),
                  fromDay, toDay,
                  cases.to_list(), new_cases.to_list(),
                  dead.to_list(), new_dead.to_list(),
                  recovered.to_list(), new_recovered.to_list()))
    return cases, new_cases, dead, new_dead, recovered, new_recovered
def test_groupby_multi():
    DT = dt.Frame(A=[1, 2, 3] * 3, B=[1, 2] * 4 + [1], C=range(9))
    res = DT[:, sum(f.C), by("A", "B")]
    assert res.to_list() == [[1, 1, 2, 2, 3, 3],
                             [1, 2, 1, 2, 1, 2],
                             [6, 3, 4, 8, 10, 5]]
def test_groupby_with_filter1():
    f0 = dt.Frame({"KEY": [1, 2, 1, 2, 1, 2], "X": [-10, 2, 3, 0, 1, -7]})
    f1 = f0[f.X > 0, sum(f.X), f.KEY]
    assert f1.to_list() == [[1, 2], [4, 2]]
cache = "TRUE" src_grp = os.environ['SRC_GRP_LOCAL'] data_name = src_grp[:-4] print("loading dataset %s" % data_name) x = dt.fread(os.path.join("data", src_grp)) print(x.nrows) print("grouping...") question = "sum v1 by id1" #1 gc.collect() t_start = timeit.default_timer() ans = x[:, {"v1": sum(f.v1)}, by(f.id1)] print(ans.shape) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = ans[:, sum(f.v1)] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git,
on_disk = "FALSE" data_name = os.environ['SRC_GRP_LOCAL'] src_grp = os.path.join("data", data_name+".csv") print("loading dataset %s" % data_name, flush=True) x = dt.fread(src_grp) print(x.nrows, flush=True) task_init = timeit.default_timer() print("grouping...", flush=True) question = "sum v1 by id1" # q1 gc.collect() t_start = timeit.default_timer() ans = x[:, {"v1": sum(f.v1)}, by(f.id1)] print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = ans[:, sum(f.v1)] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = timeit.default_timer() ans = x[:, {"v1": sum(f.v1)}, by(f.id1)] print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer()
      flush=True, file=sys.stderr)
exit(0)

# fread string with NAs generates extra distinct group h2oai/datatable#2808
x = dt.fread(src_grp, na_strings=[''])
print(x.nrows, flush=True)

task_init = timeit.default_timer()
print("grouping...", flush=True)

question = 'sum v1 by id1'  # q1
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {'v1': sum(f.v1)}, by(f.id1)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.v1)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,
          version=ver, git=git,
def test_sum_simple():
    DT = dt.Frame(A=range(5))
    R = DT[:, sum(f.A)]
    frame_integrity_check(R)
    assert R.to_list() == [[10]]
    assert str(R)
task_init = timeit.default_timer()
print("joining...", flush=True)

question = "small inner on int"  # q1
gc.collect()
y = small.copy(deep=True)
t_start = timeit.default_timer()
y.key = 'id1'
ans = x[:, :, join(y)][isfinite(f.v2), :]  # , on='id1'
tmp = ans.copy(deep=True)  ## ensure join results materialized #141
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v2)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,
          version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache,
def loadAndProcessData(dataFilename):
    print("Loading " + dataFilename)
    fullTable = dt.fread(dataFilename)
    print("Done loading table from '" + dataFilename + "', keys:")
    print(fullTable.keys())

    cases = fullTable[:, 'AnzahlFall'].sum()[0, 0]
    dead = fullTable[:, 'AnzahlTodesfall'].sum()[0, 0]
    lastDay = fullTable[:, 'MeldeDay'].max()[0, 0]
    lastnewCaseOnDay = fullTable[:, 'newCaseOnDay'].max()[0, 0]
    print("File stats: lastDay {} lastnewCaseOnDay {} cases {} dead {}".format(
        lastDay, lastnewCaseOnDay, cases, dead))

    newTable = fullTable[:, dt.f[:].extend({"erkMeldeDelay": dt.f.MeldeDay - dt.f.RefDay})]
    #print(newTable.keys())
    #dt.by(dt.f.Bundesland)]

    alldays = fullTable[:, [dt.sum(dt.f.AnzahlFall),
                            dt.sum(dt.f.FaellePro100k),
                            dt.sum(dt.f.AnzahlTodesfall),
                            dt.sum(dt.f.TodesfaellePro100k),
                            dt.mean(dt.f.Bevoelkerung),
                            dt.max(dt.f.MeldeDay),
                            dt.first(dt.f.LandkreisTyp),
                            dt.first(dt.f.Bundesland)],
                        dt.by(dt.f.Landkreis)]

    last7days = fullTable[dt.f.newCaseOnDay > lastDay - 7, :][:, [
        dt.sum(dt.f.AnzahlFall),
        dt.sum(dt.f.FaellePro100k),
        dt.sum(dt.f.AnzahlTodesfall),
        dt.sum(dt.f.TodesfaellePro100k)], dt.by(dt.f.Landkreis)]
    last7days.names = ["Landkreis", "AnzahlFallLetzte7Tage", "FaellePro100kLetzte7Tage",
                       "AnzahlTodesfallLetzte7Tage", "TodesfaellePro100kLetzte7Tage"]
    last7days[dt.f.AnzahlFallLetzte7Tage < 0, "AnzahlFallLetzte7Tage"] = 0
    last7days[dt.f.FaellePro100kLetzte7Tage < 0, "FaellePro100kLetzte7Tage"] = 0
    last7days[dt.f.AnzahlTodesfallLetzte7Tage < 0, "AnzahlTodesfallLetzte7Tage"] = 0
    last7days[dt.f.TodesfaellePro100kLetzte7Tage < 0, "TodesfaellePro100kLetzte7Tage"] = 0

    lastWeek7days = fullTable[(dt.f.newCaseOnDay > lastDay - 14) &
                              (dt.f.newCaseOnDay <= lastDay - 7), :][:, [
        dt.sum(dt.f.AnzahlFall),
        dt.sum(dt.f.FaellePro100k),
        dt.sum(dt.f.AnzahlTodesfall),
        dt.sum(dt.f.TodesfaellePro100k)], dt.by(dt.f.Landkreis)]
    #lastWeek7days[dt.f[1:] < 0, dt.f[1:]] = 0
    lastWeek7days.names = ["Landkreis", "AnzahlFallLetzte7TageDavor",
                           "FaellePro100kLetzte7TageDavor",
                           "AnzahlTodesfallLetzte7TageDavor",
                           "TodesfaellePro100kLetzte7TageDavor"]
    lastWeek7days[dt.f.AnzahlFallLetzte7TageDavor < 0, "AnzahlFallLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.FaellePro100kLetzte7TageDavor < 0, "FaellePro100kLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.AnzahlTodesfallLetzte7TageDavor < 0, "AnzahlTodesfallLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.TodesfaellePro100kLetzte7TageDavor < 0, "TodesfaellePro100kLetzte7TageDavor"] = 0

    allDaysExt0 = merge(alldays, last7days, "Landkreis")
    allDaysExt1 = merge(allDaysExt0, lastWeek7days, "Landkreis")

    Rw = dt.f.AnzahlFallLetzte7Tage / dt.f.AnzahlFallLetzte7TageDavor
    allDaysExt2 = allDaysExt1[:, dt.f[:].extend({"AnzahlFallTrend": Rw})]
    allDaysExt3 = allDaysExt2[:, dt.f[:].extend({
        "FaellePro100kTrend": dt.f.FaellePro100kLetzte7Tage - dt.f.FaellePro100kLetzte7TageDavor})]
    allDaysExt4 = allDaysExt3[:, dt.f[:].extend({
        "TodesfaellePro100kTrend": dt.f.TodesfaellePro100kLetzte7Tage - dt.f.TodesfaellePro100kLetzte7TageDavor})]
    allDaysExt5 = allDaysExt4[:, dt.f[:].extend({
        "Kontaktrisiko": dt.f.Bevoelkerung / 6.25 /
            ((dt.f.AnzahlFallLetzte7Tage + dt.f.AnzahlFallLetzte7TageDavor) * Rw)})]
    allDaysExt6 = allDaysExt5[:, dt.f[:].extend({"LetzteMeldung": lastDay - dt.f.MeldeDay})]

    # Kontaktrisiko * 2 == Kontaktrisiko holds for infinite (and zero) values;
    # cap such rows at a large sentinel so the sort below puts them last.
    allDaysExt6[dt.f.Kontaktrisiko * 2 == dt.f.Kontaktrisiko, "Kontaktrisiko"] = 999999

    sortedByRisk = allDaysExt6.sort(["Kontaktrisiko", "LetzteMeldung", "FaellePro100k"])
    #print(sortedByRisk)
    allDaysExt = sortedByRisk[:, dt.f[:].extend({"Rang": 0})]
    allDaysExt[:, "Rang"] = np.arange(1, allDaysExt.nrows + 1)
    #print(allDaysExt)
    print("Column names frame order:", list(enumerate(allDaysExt.names)))

    data = allDaysExt.to_pandas()
    return data
def transform(self, X: dt.Frame):
    return X[:, dt.sum([(dt.f[x] > 0) for x in range(X.ncols)])]
cache = "TRUE" wc_lines = subprocess.run(['wc','-l',data_name], stdout=subprocess.PIPE).stdout.decode('utf-8').split(" ", 1)[0] in_rows = int(wc_lines)-1 print("reading...") question = "all rows" #1 gc.collect() t_start = timeit.default_timer() ans = dt.fread(data_name, show_progress=False) print(ans.shape) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = ans[:, sum(f.v3)] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt) del ans gc.collect() t_start = timeit.default_timer() ans = dt.fread(data_name, show_progress=False) print(ans.shape) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = ans[:, sum(f.v3)] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt) del ans gc.collect()
fun = "[.datatable" cache = "TRUE" data_name = os.environ['SRC_GRP_LOCAL'] src_grp = os.path.join("data", data_name+".csv") print("loading dataset %s" % data_name, flush=True) x = dt.fread(src_grp) print(x.nrows, flush=True) print("grouping...", flush=True) question = "sum v1 by id1" # q1 gc.collect() t_start = timeit.default_timer() ans = x[:, {"v1": sum(f.v1)}, by(f.id1)] print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = ans[:, sum(f.v1)] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt) del ans gc.collect() t_start = timeit.default_timer() ans = x[:, {"v1": sum(f.v1)}, by(f.id1)] print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer()
def transform(self, X: dt.Frame):
    if X.ncols == 0:
        return np.zeros((X.nrows, 1))
    return X[:, dt.sum([dt.isna(dt.f[x]) for x in range(X.ncols)])]
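For context, a small sketch of what the reducer inside this transform computes, assuming `datatable` is imported as `dt`; the toy frame is invented. `dt.sum()` over a list of boolean `dt.isna` indicators reduces each indicator to a per-column count of missing values.

import datatable as dt

toy = dt.Frame(a=[1, None, 3], b=[None, None, 5.0])
na_counts = toy[:, dt.sum([dt.isna(dt.f[i]) for i in range(toy.ncols)])]
print(na_counts.to_list())   # [[1], [2]]: one missing value in "a", two in "b"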
def test_group_empty_frame4():
    DT = dt.Frame(A=[], stype=dt.float32)
    D2 = DT[:, sum(f.A), by(f.A)]
    frame_integrity_check(D2)
    assert D2.shape == (0, 2)
    assert D2.stypes == (dt.float32, dt.float32)
cache = "TRUE" print("loading dataset...") x = dt.fread(data_name) print("sorting...") gc.collect() t_start = timeit.default_timer() ans = x.sort('KEY') print(ans.shape) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = ans[:, sum(f.X2)] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt) del ans gc.collect() t_start = timeit.default_timer() ans = x.sort('KEY') print(ans.shape) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = ans[:, sum(f.X2)] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt) del ans
fun = "[.datatable" cache = "TRUE" data_name = os.environ['SRC_GRP_LOCAL'] src_grp = os.path.join("data", data_name + ".csv") print("loading dataset %s" % data_name, flush=True) x = dt.fread(src_grp) print(x.nrows, flush=True) print("grouping...", flush=True) question = "sum v1 by id1" # q1 gc.collect() t_start = timeit.default_timer() ans = x[:, {"v1": sum(f.v1)}, by(f.id1)] print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = ans[:, sum(f.v1)] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git,
Same as the above ones.

# Grouping based on NAs in a field
seatle_bikes_dt_tidy[:, {'total': count()},
                     by(dt.isna(f.ped_count), f.year, f.crossing)]

##### EDA - 4.1 Syntax explanation

dt.isna is a function that checks whether an observation is null or not; here the number of ped_count NAs per year and crossing is calculated.

#### 3. EDA - 5: When in the day (hours) do people bike through these Seattle crossings?

alt.Chart(seatle_bikes_dt_tidy[:, {'bike_count': dt.sum(f.bike_count)},
                               by(f.crossing, f.hour)
                              ][:, {'hour': f.hour,
                                    'pct_bike': dt.math.rint((f.bike_count / dt.sum(f.bike_count)) * 100)},
                                by(f.crossing)
                              ].to_pandas()).mark_line().encode(
    alt.X('hour:N'), alt.Y('pct_bike'), alt.Color('crossing', legend=None)).properties(
    width=280, height=160).facet(facet='crossing', columns=4)

##### EDA - 5 Syntax explanation

In the first chain of the DT expression, grouping is done on the crossing and hour fields so that the total bike count is summed and assigned to the variable bike_count; in the second chain, grouping is applied only on crossing to calculate the percentage of bikes crossing in each direction and hour. Here **dt.math** is a module that provides mathematical functions. A minimal standalone version of this two-step pattern is sketched after this cell.

# Glance at the data
seatle_bikes_dt_tidy
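A self-contained sketch of the same two-step groupby pattern, with a made-up frame standing in for seatle_bikes_dt_tidy; all names and numbers are illustrative only.

import datatable as dt
from datatable import f, by

# Hypothetical counts: two crossings observed at two hours each.
toy = dt.Frame(crossing=["A", "A", "B", "B"],
               hour=[7, 8, 7, 8],
               bike_count=[30, 10, 5, 15])

# Step 1: total bikes per (crossing, hour).
# Step 2: percentage of each crossing's total that falls in each hour;
#         dt.sum(f.bike_count) is evaluated per group and broadcast to its rows.
pct = toy[:, {"bike_count": dt.sum(f.bike_count)}, by(f.crossing, f.hour)
         ][:, {"hour": f.hour,
               "pct_bike": dt.math.rint(f.bike_count / dt.sum(f.bike_count) * 100)},
           by(f.crossing)]
print(pct.to_list())   # crossing A -> 75/25, crossing B -> 25/75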
fullTable = dt.fread("full-latest.csv") print(fullTable.keys()) cases = fullTable[:,'AnzahlFall'].sum()[0,0] dead = fullTable[:,'AnzahlTodesfall'].sum()[0,0] lastDay=fullTable[:,'MeldeDay'].max()[0,0] print("lastDay {} cases {} dead{}".format(lastDay, cases, dead)) newTable=fullTable[:,dt.f[:].extend({"erkMeldeDelay": dt.f.MeldeDay-dt.f.RefDay})] #print(newTable.keys()) #dt.by(dt.f.Bundesland)] alldays=fullTable[:, [dt.sum(dt.f.AnzahlFall), dt.sum(dt.f.FaellePro100k), dt.sum(dt.f.AnzahlTodesfall), dt.sum(dt.f.TodesfaellePro100k), dt.mean(dt.f.Bevoelkerung)], dt.by(dt.f.Landkreis)] last7days=fullTable[dt.f.newCaseOnDay>lastDay-7,:][:, [dt.sum(dt.f.AnzahlFall), dt.sum(dt.f.FaellePro100k), dt.sum(dt.f.AnzahlTodesfall), dt.sum(dt.f.TodesfaellePro100k)], dt.by(dt.f.Landkreis)] last7days.names=["Landkreis","AnzahlFallLetzte7Tage","FaellePro100kLetzte7Tage","AnzahlTodesfallLetzte7Tage","TodesfaellePro100kLetzte7Tage"] def merge(largerTable, smallerTable, keyFieldName):