def test_median_wrong_stype():
    # median() is undefined for string columns and must raise a TypeError
    # naming the offending stype.
    DT = dt.Frame(A=["foo"], B=["moo"], stypes={"A": dt.str32, "B": dt.str64})
    for column, typename in ((f.A, "str32"), (f.B, "str64")):
        with pytest.raises(TypeError) as e:
            noop(DT[:, median(column)])
        expected = ("Unable to apply reduce function median() to a column "
                    "of type " + typename)
        assert expected in str(e.value)
def test_median_int_odd_nrows(st):
    # Sorted: -5 0 4 4 7 [8] 10 11 12 23 45 -> the single middle value is 8,
    # returned as float64 regardless of the integer stype.
    values = [4, -5, 12, 11, 4, 7, 0, 23, 45, 8, 10]
    RES = dt.Frame(A=values, stype=st)[:, median(f.A)]
    assert RES.shape == (1, 1)
    assert RES.stypes == (dt.float64,)
    assert RES[0, 0] == 8.0
def _infer_caluclate(DT, stat): if stat == 'mean': return DT[:, {'mean_val': dt.mean(f[1])}, by(f[0])] elif stat == 'median': return DT[:, {'median_val': dt.median(f[1])}, by(f[0])] else: pass
def test_median_int_even_nrows(st):
    # Sorted: -2 0 3 3 [5 7] 11 12 12 91 -> mean of the two middle values is 6.0
    values = [7, 11, -2, 3, 0, 12, 12, 3, 5, 91]
    RES = dt.Frame(A=values, stype=st)[:, median(f.A)]
    assert RES.shape == (1, 1)
    assert RES.stypes == (dt.float64,)
    assert RES[0, 0] == 6.0
def test_median_grouped():
    # Grouped median: NA entries inside a group are skipped.
    DT = dt.Frame(A=[0, 0, 0, 0, 1, 1, 1, 1, 1],
                  B=[2, 6, 1, 0, -3, 4, None, None, -1],
                  stypes={"A": dt.int16, "B": dt.int32})
    result = DT[:, median(f.B), by(f.A)]
    assert result.shape == (2, 2)
    # Group key keeps its stype; the median column is float64.
    assert result.stypes == (dt.int16, dt.float64)
    # group 0: {0,1,2,6} -> 1.5;  group 1 (NAs dropped): {-3,-1,4} -> -1.0
    assert result.to_list() == [[0, 1], [1.5, -1.0]]
def test_issue1857(numpy):
    # Regression test for issue #1857: grouped median over a float32 column.
    nrows = 3620
    numpy.random.seed(364)
    DT = dt.Frame(n1=numpy.random.rand(nrows).astype(numpy.float32),
                  g1=numpy.random.randint(0, 10, nrows),
                  g2=numpy.random.randint(0, 10, nrows))
    result = DT[:, {"M": dt.median(f.n1)}, by(f.g1, f.g2)]
    # 10 x 10 group combinations, median keeps the float32 stype.
    assert result.shape == (100, 3)
    assert result.names == ("g1", "g2", "M")
    assert result.stypes == (stype.int64, stype.int64, stype.float32)
    assert result.sum().to_tuples()[0] == (450, 450, 51.63409462571144)
def fit_transform(self, X: dt.Frame, y: np.ndarray = None):
    """Fit per-group target statistics on (X, y), then encode and return X.

    A temporary target column is appended to ``X``; group-wise means
    (numeric target) or group-wise medians of label-encoded values
    (non-numeric target) are stored in ``self._group_means``, keyed by
    ``self.input_feature_names``; the temporary column is removed before
    delegating to ``self.transform``.  NOTE(review): ``X`` is mutated in
    place while fitting (column added then deleted).
    """
    target = '__internal_target__'
    # Attach y as a temporary column so datatable can aggregate it
    # alongside the feature columns.
    X[:, target] = dt.Frame(y)
    # Selecting [bool, int, float] keeps only numeric columns, so a
    # nonzero width means the target column is numeric.
    target_is_numeric = X[:, target][:, [bool, int, float]].shape[1] > 0
    if target_is_numeric:
        self._group_means = X[:, dt.mean(dt.f[target]),
                              dt.by(*self.input_feature_names)]
    else:
        # Non-numeric target: label-encode it (via a pandas round-trip),
        # then aggregate with the median of the encoded labels.
        X[:, target] = dt.Frame(LabelEncoder().fit_transform(
            X[:, target].to_pandas().iloc[:, 0].values).ravel())
        self._group_means = X[:, dt.median(dt.f[target]),
                              dt.by(*self.input_feature_names)]
    # Drop the temporary column and key the lookup table by the features
    # so transform() can join against it.
    del X[:, target]
    self._group_means.key = self.input_feature_names
    return self.transform(X)
def analyzeDaily(fullTable, filter, prefix, postfix, byDateColName):
    """Aggregate RKI-style case records into one row per day.

    Parameters
    ----------
    fullTable : dt.Frame
        Record-level table of reported cases.
    filter :
        Row-filter expression applied to ``fullTable`` first.
        NOTE(review): the name shadows the ``filter`` builtin; kept as-is.
    prefix, postfix : str
        Strings prepended/appended to the generated column names.
    byDateColName : str
        Date column to group by ("DatenstandTag" for publication-day
        grouping; otherwise, presumably "MeldeTag" — running sums are
        used to rebuild cumulative columns in that case).

    Returns
    -------
    dt.Frame keyed by ``byDateColName`` with summed case/death counts and,
    for the unprefixed DatenstandTag variant, recovered counts plus
    publication-delay statistics.
    """
    print("analyzeDaily prefix='{}' postfix='{}' byDateColName='{}'".format(prefix, postfix, byDateColName))
    #print("analyzeDaily filter='{}' '".format(filter))
    byDate = dt.f[byDateColName]
    #print("----- analyzeDaily:"+postfix)
    #dayTable = fullTable[(dt.f.DatenstandTag >= fromDay) & (dt.f.DatenstandTag < toDay) & (dt.f.IdLandkreis == forIdLandkreis),:]
    dayTable = fullTable[filter,:]
    # Cumulative cases: records flagged NeuerFall 0 or 1.
    cases_to_count = dayTable[(dt.f.NeuerFall == 0) | (dt.f.NeuerFall == 1),:]
    cases = cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],dt.by(byDate)]
    cases.names = [byDateColName, prefix+"AnzahlFall"+postfix]
    cases.key = byDateColName
    print("cases rows = {}, cases_to_count = {}".format(cases.nrows, cases_to_count.nrows))
    #print(cases)
    byDayTable = cases
    if byDateColName == "DatenstandTag":
        # New cases on the publication day: NeuerFall -1 or 1.
        new_cases_to_count = dayTable[(dt.f.NeuerFall == -1) | (dt.f.NeuerFall == 1),:]
        new_cases = new_cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],dt.by(byDate)]
        new_cases.names = [byDateColName, prefix+"AnzahlFallNeu"+postfix]
        new_cases.key = byDateColName
        print("new_cases rows = {}, new_cases_to_count = {}".format(new_cases.nrows, new_cases_to_count.nrows))
        #new_cases_to_count.to_csv("new_cases_to_count.csv")
        byDayTable = byDayTable[:,:,dt.join(new_cases)]
    else:
        # add days by MeldeTag: per-day totals become the "new" column and
        # the cumulative column is rebuilt as a running sum.
        byDayTable.names = {prefix+"AnzahlFall"+postfix: prefix+"AnzahlFallNeu"+postfix}
        byDayTable = addRunningSumColumn(byDayTable, prefix+"AnzahlFallNeu"+postfix, prefix+"AnzahlFall"+postfix)
    # Deaths: same flag convention via NeuerTodesfall.
    dead_to_count = dayTable[(dt.f.NeuerTodesfall == 0) | (dt.f.NeuerTodesfall == 1),:]
    dead = dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],dt.by(byDate)]
    dead.names = [byDateColName, prefix+"AnzahlTodesfall"+postfix]
    dead.key = byDateColName
    #print("dead rows = {}".format(dead.nrows))
    byDayTable = byDayTable[:,:,dt.join(dead)]
    if byDateColName == "DatenstandTag":
        new_dead_to_count = dayTable[(dt.f.NeuerTodesfall == -1) | (dt.f.NeuerTodesfall == 1),:]
        new_dead = new_dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],dt.by(byDate)]
        new_dead.names = [byDateColName, prefix+"AnzahlTodesfallNeu"+postfix]
        new_dead.key = byDateColName
        #print("new_dead rows = {}".format(new_dead.nrows))
        byDayTable = byDayTable[:,:,dt.join(new_dead)]
    else:
        # add days by MeldeTag
        byDayTable.names = {prefix+"AnzahlTodesfall"+postfix: prefix+"AnzahlTodesfallNeu"+postfix}
        byDayTable = addRunningSumColumn(byDayTable, prefix+"AnzahlTodesfallNeu"+postfix, prefix+"AnzahlTodesfall"+postfix)
    byDayTable.key = byDateColName
    if postfix == "" and prefix == "" and byDateColName == "DatenstandTag":
        # Only for the plain publication-day table: reporting-delay
        # statistics and recovered counts.
        new_cases_to_count_delay = new_cases_to_count[(dt.f.AnzahlFall > 0), :] # measure delay only for positive cases
        new_cases_to_count_delay.materialize()
        new_cases_delay = new_cases_to_count_delay[:, [dt.min(dt.f.MeldeDelay), dt.max(dt.f.MeldeDelay), dt.mean(dt.f.MeldeDelay), dt.median(dt.f.MeldeDelay), dt.sd(dt.f.MeldeDelay), dt.sum(dt.f.AnzahlFall), dt.max(dt.f.DatenstandTag)], dt.by(byDate)]
        new_cases_delay.names = ["DatenstandTag",
                                 "PublikationsdauerFallNeu_Min" + postfix,
                                 "PublikationsdauerFallNeu_Max" + postfix,
                                 "PublikationsdauerFallNeu_Schnitt" + postfix,
                                 "PublikationsdauerFallNeu_Median" + postfix,
                                 "PublikationsdauerFallNeu_StdAbw" + postfix,
                                 "PublikationsdauerFallNeu_Fallbasis" + postfix,
                                 "DatenstandTag_Max" + postfix]
        new_cases_delay.key = "DatenstandTag"
        print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(new_cases_delay.nrows, new_cases_to_count_delay.nrows))
        # Recovered counts: NeuGenesen uses the same 0/1/-1 flag convention.
        recovered_to_count = dayTable[(dt.f.NeuGenesen == 0) | (dt.f.NeuGenesen == 1),:]
        recovered = recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],dt.by(byDate)]
        recovered.names = ["DatenstandTag", "AnzahlGenesen"+postfix]
        recovered.key = "DatenstandTag"
        #print("recovered rows = {}".format(recovered.nrows))
        new_recovered_to_count = dayTable[(dt.f.NeuGenesen == -1) | (dt.f.NeuGenesen == 1),:]
        new_recovered = new_recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],dt.by(byDate)]
        new_recovered.names = ["DatenstandTag", "AnzahlGenesenNeu"+postfix]
        new_recovered.key = "DatenstandTag"
        #print("new_recovered rows = {}".format(new_recovered.nrows))
        byDayTable = byDayTable[:, :, dt.join(recovered)][:, :, dt.join(new_recovered)][:, :,dt.join(new_cases_delay)]
        #byDayTable = byDayTable[:,:,dt.join(recovered)][:,:,dt.join(new_recovered)]\
        #    [:,:,dt.join(new_cases_strict)][:,:,dt.join(new_cases_strict_14)][:,:,dt.join(new_cases_delay)]
        byDayTable.key = byDateColName
    #print("byDayTable rows = {}".format(byDayTable.nrows))
    #print(byDayTable)
    return byDayTable
def test_median_issue2802_2():
    # Select rows through an int64 index frame, then compute the median.
    index = dt.Frame(list(range(13)), stype=dt.int64)
    DT = dt.Frame(A=range(13))[index, :]
    assert_equals(DT[:, median(f.A)], dt.Frame(A=[6.0]))
def analyzeDaily(fullTable, filter, postfix):
    """Aggregate case records into one row per publication day (DatenstandTag).

    Parameters
    ----------
    fullTable : dt.Frame
        Record-level table of reported cases.
    filter :
        Row-filter expression applied to ``fullTable`` first.
        NOTE(review): the name shadows the ``filter`` builtin; kept as-is.
    postfix : str
        Suffix appended to all generated column names.

    Returns
    -------
    dt.Frame keyed by "DatenstandTag" joining cumulative and new
    case/death/recovered totals, 7- and 14-day "strict" new-case counts,
    and per-day reporting-delay statistics.
    """
    #print("----- analyzeDaily:"+postfix)
    #dayTable = fullTable[(dt.f.DatenstandTag >= fromDay) & (dt.f.DatenstandTag < toDay) & (dt.f.IdLandkreis == forIdLandkreis),:]
    dayTable = fullTable[filter, :]
    # Cumulative cases: records flagged NeuerFall 0 or 1.
    cases_to_count = dayTable[(dt.f.NeuerFall == 0) | (dt.f.NeuerFall == 1), :]
    cases = cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],
                           dt.by(dt.f.DatenstandTag)]
    cases.names = ["DatenstandTag", "AnzahlFall" + postfix]
    cases.key = "DatenstandTag"
    print("cases rows = {}, cases_to_count = {}".format(
        cases.nrows, cases_to_count.nrows))
    #print(cases)
    # Newly published cases: NeuerFall -1 or 1.
    new_cases_to_count = dayTable[(dt.f.NeuerFall == -1) |
                                  (dt.f.NeuerFall == 1), :]
    new_cases = new_cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],
                                   dt.by(dt.f.DatenstandTag)]
    new_cases.names = ["DatenstandTag", "AnzahlFallNeu" + postfix]
    new_cases.key = "DatenstandTag"
    print("new_cases rows = {}, new_cases_to_count = {}".format(
        new_cases.nrows, new_cases_to_count.nrows))
    #new_cases_to_count.to_csv("new_cases_to_count.csv")
    new_cases_to_count_delay = new_cases_to_count[(
        dt.f.AnzahlFall > 0), :]  # measure delay only for positive cases
    new_cases_to_count_delay.materialize()
    # Per-day reporting-delay statistics (min/max/mean/median/stddev).
    new_cases_delay = new_cases_to_count_delay[:, [
        dt.min(dt.f.MeldeDelay),
        dt.max(dt.f.MeldeDelay),
        dt.mean(dt.f.MeldeDelay),
        dt.median(dt.f.MeldeDelay),
        dt.sd(dt.f.MeldeDelay),
        dt.sum(dt.f.AnzahlFall),
        dt.max(dt.f.DatenstandTag)
    ], dt.by(dt.f.DatenstandTag)]
    new_cases_delay.names = [
        "DatenstandTag", "MeldeDauerFallNeu-Min" + postfix,
        "MeldeDauerFallNeu-Max" + postfix,
        "MeldeDauerFallNeu-Schnitt" + postfix,
        "MeldeDauerFallNeu-Median" + postfix,
        "MeldeDauerFallNeu-StdAbw" + postfix,
        "MeldeDauerFallNeu-Fallbasis" + postfix, "DatenstandTag-Max" + postfix
    ]
    new_cases_delay.key = "DatenstandTag"
    print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(
        new_cases_delay.nrows, new_cases_to_count_delay.nrows))
    #new_cases_delay = new_cases_to_count_delay[:, [dt.mean(dt.f.DatenstandTag-dt.f.MeldeTag)],dt.by(dt.f.DatenstandTag)]
    # delays = delayRecs[:, [dt.mean(dt.f.MeldeDelay), dt.median(dt.f.MeldeDelay), dt.sd(dt.f.MeldeDelay), dt.sum(dt.f.AnzahlFall)], dt.by(dt.f.Landkreis)]
    # new_cases_stddev = new_cases_to_count_delay[:, [dt.mean(dt.f.DatenstandTag - dt.f.MeldeTag)],
    #                                             dt.by(dt.f.DatenstandTag)]
    # new_cases_delay.names = ["DatenstandTag", "AnzahlFallNeu-MeldeDauer" + postfix]
    # new_cases_delay.key = "DatenstandTag"
    # print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(new_cases_delay.nrows,
    #                                                                         new_cases_to_count_delay.nrows))
    # "Strict" new cases: only records reported within the last 7 days
    # (negative corrections always kept).
    new_cases_to_count_strict = new_cases_to_count[(
        dt.f.DatenstandTag - dt.f.MeldeTag < 7) | (dt.f.AnzahlFall < 0), :]
    new_cases_strict = new_cases_to_count_strict[:, [dt.sum(dt.f.AnzahlFall)],
                                                 dt.by(dt.f.DatenstandTag)]
    new_cases_strict.names = [
        "DatenstandTag", "AnzahlFallNeu-Meldung-letze-7-Tage" + postfix
    ]
    new_cases_strict.key = "DatenstandTag"
    print("new_cases_strict rows = {}, new_cases_to_count_strict = {}".format(
        new_cases_strict.nrows, new_cases_to_count_strict.nrows))
    #new_cases_to_count_strict.to_csv("new_cases_to_count_strict.csv")
    # Same as above with a 14-day reporting window.
    new_cases_to_count_strict_14 = new_cases_to_count[(
        dt.f.DatenstandTag - dt.f.MeldeTag < 14) | (dt.f.AnzahlFall < 0), :]
    new_cases_strict_14 = new_cases_to_count_strict_14[:, [
        dt.sum(dt.f.AnzahlFall)
    ], dt.by(dt.f.DatenstandTag)]
    new_cases_strict_14.names = [
        "DatenstandTag", "AnzahlFallNeu-Meldung-letze-14-Tage" + postfix
    ]
    new_cases_strict_14.key = "DatenstandTag"
    print("new_cases_strict_14 rows = {}, new_cases_to_count_strict_14 = {}".
          format(new_cases_strict_14.nrows, new_cases_to_count_strict_14.nrows))
    # Deaths: cumulative (NeuerTodesfall 0 or 1) and new (-1 or 1).
    dead_to_count = dayTable[(dt.f.NeuerTodesfall == 0) |
                             (dt.f.NeuerTodesfall == 1), :]
    dead = dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],
                         dt.by(dt.f.DatenstandTag)]
    dead.names = ["DatenstandTag", "AnzahlTodesfall" + postfix]
    dead.key = "DatenstandTag"
    #print("dead rows = {}".format(dead.nrows))
    new_dead_to_count = dayTable[(dt.f.NeuerTodesfall == -1) |
                                 (dt.f.NeuerTodesfall == 1), :]
    new_dead = new_dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],
                                 dt.by(dt.f.DatenstandTag)]
    new_dead.names = ["DatenstandTag", "AnzahlTodesfallNeu" + postfix]
    new_dead.key = "DatenstandTag"
    #print("new_dead rows = {}".format(new_dead.nrows))
    # Recovered: cumulative (NeuGenesen 0 or 1) and new (-1 or 1).
    recovered_to_count = dayTable[(dt.f.NeuGenesen == 0) |
                                  (dt.f.NeuGenesen == 1), :]
    recovered = recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],
                                   dt.by(dt.f.DatenstandTag)]
    recovered.names = ["DatenstandTag", "AnzahlGenesen" + postfix]
    recovered.key = "DatenstandTag"
    #print("recovered rows = {}".format(recovered.nrows))
    new_recovered_to_count = dayTable[(dt.f.NeuGenesen == -1) |
                                      (dt.f.NeuGenesen == 1), :]
    new_recovered = new_recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],
                                           dt.by(dt.f.DatenstandTag)]
    new_recovered.names = ["DatenstandTag", "AnzahlGenesenNeu" + postfix]
    new_recovered.key = "DatenstandTag"
    #print("new_recovered rows = {}".format(new_recovered.nrows))
    # Join every per-day table into a single frame keyed by DatenstandTag.
    byDayTable = cases[:,:,dt.join(new_cases)]\
        [:,:,dt.join(dead)][:,:,dt.join(new_dead)][:,:,dt.join(recovered)][:,:,dt.join(new_recovered)]\
        [:,:,dt.join(new_cases_strict)][:,:,dt.join(new_cases_strict_14)][:,:,dt.join(new_cases_delay)]
    byDayTable.key = "DatenstandTag"
    #print("byDayTable rows = {}".format(byDayTable.nrows))
    print(byDayTable)
    return byDayTable
# NOTE(review): this chunk belongs to a flat groupby-benchmark script
# (db-benchmark style).  It is truncated at both edges: the first lines
# are the keyword-argument tail of the previous question's write_log()
# call, and the final write_log() call continues past the end of the
# chunk.  Kept byte-for-byte; only comments added.
          fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache,
          chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt,
          on_disk=on_disk)
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans

question = 'median v3 sd v3 by id4 id5'  # q6
gc.collect()
# Time the grouped median/sd aggregation itself...
t_start = timeit.default_timer()
ans = x[:, {'median_v3': median(f.v3), 'sd_v3': sd(f.v3)}, by(f.id4, f.id5)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
# ...then time the checksum pass over the answer separately.
t_start = timeit.default_timer()
chk = ans[:, [sum(f.median_v3), sum(f.sd_v3)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,
          version=ver, git=git,
def test_median_some_nas():
    # NA entries are ignored: median of {5, 12, -3, 4} is (4 + 5) / 2 = 4.5
    frame = dt.Frame(S=[None, 5, None, 12, None, -3, None, None, None, 4])
    res = frame[:, median(f.S)]
    assert res.shape == (1, 1)
    assert res.stypes == (dt.float64,)
    assert res[0, 0] == 4.5
def test_median_all_nas():
    # An all-NaN column yields an NA median, still typed float64.
    res = dt.Frame(N=[math.nan] * 8)[:, median(f.N)]
    assert res.shape == (1, 1)
    assert res.stypes == (dt.float64,)
    assert res[0, 0] is None
def test_median_float(st):
    # The median of a float column keeps the column's own stype; infinities
    # sort to the ends and the middle element wins.
    frame = dt.Frame(W=[0.0, 5.5, 7.9, math.inf, -math.inf], stype=st)
    res = frame[:, median(f.W)]
    assert res.shape == (1, 1)
    assert res.stypes == (st,)
    # 5.5 is exactly representable in both float32 and float64
    assert res[0, 0] == 5.5
def test_median_int_no_overflow():
    # Averaging 111 and 112 directly in int8 would overflow to a negative
    # value; the median must be computed in a wider type.
    frame = dt.Frame(A=[111, 112], stype=dt.int8)
    assert frame[:, median(f.A)][0, 0] == 111.5
def test_median():
    # f.A.median() is the method-call spelling of dt.median(f.A); both the
    # expression repr and the evaluated result must agree.
    assert str(dt.median(f.A)) == str(f.A.median())
    assert str(dt.median(f[:])) == str(f[:].median())
    DT = dt.Frame(A=[2, 3, 5, 5, 9, -1, 2.2])
    assert_equals(DT[:, f.A.median()], DT[:, dt.median(f.A)])
def test_median_bygroup():
    frame = dt.Frame(A=[0.1, 0.2, 0.5, 0.4, 0.3, 0],
                     B=[1, 2, 1, 1, 2, 2])
    res = frame[:, median(f.A), by(f.B)]
    # group 1 holds {0.1, 0.4, 0.5} -> 0.4;  group 2 holds {0.0, 0.2, 0.3} -> 0.2
    assert res.to_list() == [[1, 2], [0.4, 0.2]]
def test_median_bool_odd_nrows():
    # Boolean median is reported as float64; middle of [False, True, True] is True.
    res = dt.Frame(B=[True, False, True])[:, median(f.B)]
    assert res.shape == (1, 1)
    assert res.stypes == (dt.float64,)
    assert res[0, 0] == 1.0
def test_median_bool_even_nrows():
    # Two Trues and two Falses average out to 0.5, returned as float64.
    res = dt.Frame(A=[True, False, True, False])[:, median(f.A)]
    assert res.shape == (1, 1)
    assert res.stypes == (dt.float64,)
    assert res[0, 0] == 0.5
def test_median_empty_frame():
    # The median of an empty column is a single NA cell.
    res = dt.Frame(A=[])[:, median(f.A)]
    assert res.shape == (1, 1)
    assert res.to_list() == [[None]]