Example #1
def test_group_boolean4():
    n = 43701
    DT = dt.Frame(A=range(2*n), B=[False, True]*n)
    DTR = DT[:, dt.sum(f.A), by(f.B)]
    assert_equals(DTR, dt.Frame(B=[False, True],
                                A=[sum(range(0, 2*n, 2)),
                                   sum(range(1, 2*n, 2))],
                                stypes={"A": dt.int64}))
Example #2
def test_groupby_with_filter2():
    # Check that rowindex works even when applied to a view
    n = 10000
    src0 = [random.getrandbits(2) for _ in range(n)]
    src1 = [random.gauss(1, 1) for _ in range(n)]
    f0 = dt.Frame({"key": src0, "val": src1})
    f1 = f0[f.val >= 0, :]
    f2 = f1[f.val <= 2, sum(f.val), f.key]
    answer = [sum(src1[i] for i in range(n)
                  if src0[i] == key and 0 <= src1[i] <= 2)
              for key in range(4)]
    assert f2.to_list() == [[0, 1, 2, 3], answer]
Example #3
def test_reduce_sum():
    f0 = dt.Frame({"color": ["red", "blue", "green", "red", "green"],
                   "size": [5, 2, 7, 13, -1]})
    f1 = f0[:, sum(f.size), f.color]
    f1.internal.check()
    assert f1.topython() == [["blue", "green", "red"],
                             [2, 6, 18]]
Example #4
def test_groupby_multi_large(seed):
    random.seed(seed)
    letters = "abcdefghijklmn"
    n = 100 + int(random.expovariate(0.0001))
    col0 = [random.choice([True, False]) for _ in range(n)]
    col1 = [random.randint(-10, 10) for _ in range(n)]
    col2 = [random.choice(letters) for _ in range(n)]
    col3 = [random.random() for _ in range(n)]
    rows = [(col0[i], col1[i], col2[i], col3[i]) for i in range(n)]
    rows.sort()
    grouped = []
    lastkey = rows[0][:3]
    sumval = 0
    for i in range(n):
        ikey = rows[i][:3]
        if ikey != lastkey:
            grouped.append(lastkey + (sumval, ))
            lastkey = ikey
            sumval = 0
        sumval += rows[i][3]
    grouped.append(lastkey + (sumval, ))
    DT0 = dt.Frame([col0, col1, col2, col3], names=["A", "B", "C", "D"])
    DT1 = DT0[:, sum(f.D), by(f.A, f.B, f.C)]
    DT2 = dt.Frame(grouped)
    assert same_iterables(DT1.to_list(), DT2.to_list())
Example #5
def test_reduce_sum():
    f0 = dt.Frame({"color": ["red", "blue", "green", "red", "green"],
                   "size": [5, 2, 7, 13, -1]})
    f1 = f0[:, sum(f.size), f.color]
    frame_integrity_check(f1)
    assert f1.to_list() == [["blue", "green", "red"],
                            [2, 6, 18]]
Example #6
def py_dt_one_group_proportions_summary(DT,por):
    
    DT_summary = DT[:,dt.count(),by(f[por])
                   ][:,f[:].extend({'grand_tot':dt.sum(f.count)})
                    ][:,f[:].extend({'prop':f.count/f.grand_tot})
                     ][:,f[:].remove(f.grand_tot),dt.sort(-f.prop)
                      ]
    return DT_summary
Example #7
def py_dt_two_group_proportions_summary(DT,por1,por2):
    
    DT_summary = DT[:,dt.count(),by(f[por1],f[por2])
                   ][:,f[:].extend({'group_tot':dt.sum(f.count)}),by(f[por1])
                    ][:,f[:].extend({'prop':f.count/f.group_tot})
                     ][:,f[:].remove(f[1])
                      ]
    
    return DT_summary
Example #8
def test_reduce_sum_same_column():
    # See issue #3110
    f0 = dt.Frame({"ints": [0, 1, 0, 0, 1, 2]})
    f1 = f0[:, {"sum": sum(f.ints)}, f.ints]
    frame_integrity_check(f1)
    assert_equals(f1, dt.Frame({
        "ints": [0, 1, 2],
        "sum": [0, 2, 2] / dt.int64
    }))
Example #9
def test_issue_2242(seed):
    n = 25000
    X = dt.Frame(AGE=[random.randint(1, 50) for i in range(n)],
                 PAY=[random.choice([True, False]) for i in range(n)])
    RES = X[:, dt.math.log((count() + 1) / (sum(f.PAY) + 0.5) - 1), by(f.AGE)]
    assert RES.shape == (50, 2)
    data = RES.to_list()
    assert data[0] == list(range(1, 51))
    assert all(isinstance(x, float) for x in data[1])
Example #10
def test_shift_reduced_column():
    DT = dt.Frame(A=[1, 2, 1, 1, 2, 1], B=range(6))
    RES = DT[:, shift(dt.sum(f.B)), by(f.A)]
    assert_equals(
        RES,
        dt.Frame(A=[1, 1, 1, 1, 2, 2],
                 B=[None, 10, 10, 10, None, 5],
                 stypes={
                     "A": dt.int32,
                     "B": dt.int64
                 }))
Example #11
def test_sum_empty_frame():
    DT = dt.Frame([[]] * 4, names=list("ABCD"),
                  stypes=(dt.bool8, dt.int32, dt.float32, dt.float64))
    assert DT.shape == (0, 4)
    RZ = DT[:, sum(f[:])]
    frame_integrity_check(RZ)
    assert RZ.shape == (1, 4)
    assert RZ.names == ("A", "B", "C", "D")
    assert RZ.stypes == (dt.int64, dt.int64, dt.float32, dt.float64)
    assert RZ.to_list() == [[0], [0], [0], [0]]
    assert str(RZ)
Example #12
def test_group_reduce_all_columns():
    DT = dt.Frame(id=[3, 3, 3, 3, 4, 4, 4, 4],
                  beef=[23, None, None, None, None, None, None, None],
                  eggs=[None, 33, None, None, 197, 103, None, None],
                  fork=[None, None, 10, None, None, None, 210, None],
                  veg=[17, None, None, 40, 1, 2, None, 340])
    assert_equals(
        DT[:, sum(f[:]), by(f.id)],
        dt.Frame(id=[3, 4],
                 beef=[23, 0] / dt.int64,
                 eggs=[33, 300] / dt.int64,
                 fork=[10, 210] / dt.int64,
                 veg=[57, 343] / dt.int64))
Example #13
def loadFlaechen(fileName="covid-19-germany-landkreise.csv"):
    geodata = dt.fread(fileName)
    sKeys = geodata[:, 'Regional code'].to_list()[0]
    values = geodata[:, 'Cadastral area'].to_list()[0]

    bundeslaenderFlaechen = geodata[:, [dt.sum(dt.f['Cadastral area'])],
                                    dt.by(dt.f['Land ID'])]
    sKeys = sKeys + bundeslaenderFlaechen[:, 'Land ID'].to_list()[0]
    values = values + bundeslaenderFlaechen[:, 'Cadastral area'].to_list()[0]

    deutschlandFlaeche = bundeslaenderFlaechen[:,'Cadastral area'].sum()
    sKeys = sKeys + [0]
    values = values + deutschlandFlaeche.to_list()[0]

    valuesDict = dict(zip(sKeys, values))
    #print(valuesDict)
    return valuesDict
Example #14
def test_groupby_large_random_integers(seed):
    random.seed(seed)
    ngrps1 = random.choice([1, 1, 2, 2, 2, 3, 4, 5])
    n0 = 1 << random.choice([1, 1, 2, 2, 2, 3, 3, 3, 4, 5, 6, 7])
    chunks = ([random.sample(range(n0), random.randint(1, n0))] +
              [random.sample([0] * 100 + list(range(256)),
                             random.randint(1, 20))
               for i in range(ngrps1)])
    n = int(random.expovariate(0.0001)) + 10
    sample = [sum(random.choice(chunks[i]) << (8 * i)
                  for i in range(len(chunks)))
              for _ in range(n)]
    nuniques = len(set(sample))
    f0 = dt.Frame(sample)
    assert f0.nunique1() == nuniques
    f1 = dt.rbind(*([f0] * random.randint(2, 20)))
    assert f1.nunique1() == nuniques
Example #15
def analyzeDayRange(fullTable, fromDay, toDay):
    dayTable = fullTable[(dt.f.DatenstandTag >= fromDay) &
                         (dt.f.DatenstandTag < toDay), :]

    cases_to_count = dayTable[(dt.f.NeuerFall == 0) | (dt.f.NeuerFall == 1), :]
    cases = cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],
                           dt.by(dt.f.DatenstandTag)]

    new_cases_to_count = dayTable[(dt.f.NeuerFall == -1) |
                                  (dt.f.NeuerFall == 1), :]
    new_cases = new_cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],
                                   dt.by(dt.f.DatenstandTag)]

    dead_to_count = dayTable[(dt.f.NeuerTodesfall == 0) |
                             (dt.f.NeuerTodesfall == 1), :]
    dead = dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],
                         dt.by(dt.f.DatenstandTag)]

    new_dead_to_count = dayTable[(dt.f.NeuerTodesfall == -1) |
                                 (dt.f.NeuerTodesfall == 1), :]
    new_dead = new_dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],
                                 dt.by(dt.f.DatenstandTag)]

    if "NeuGenesen" in dayTable.keys():
        recovered_to_count = dayTable[(dt.f.NeuGenesen == 0) |
                                      (dt.f.NeuGenesen == 1), :]
        recovered = recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],
                                       dt.by(dt.f.DatenstandTag)]
        new_recovered_to_count = dayTable[(dt.f.NeuGenesen == -1) |
                                          (dt.f.NeuGenesen == 1), :]
        new_recovered = new_recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],
                                               dt.by(dt.f.DatenstandTag)]

    #lastDay=fullTable[:,'MeldeDay'].max()[0,0]
    #lastnewCaseOnDay=fullTable[:,'newCaseOnDay'].max()[0,0]
    print(
        "From {}-{} Day {}-{}: cases {} (+{}), dead {} (+{}), recovered {} (+{})"
        .format(cd.dateStrDMFromDay(fromDay), cd.dateStrDMFromDay(toDay),
                fromDay, toDay, cases.to_list(), new_cases.to_list(),
                dead.to_list(), new_dead.to_list(), recovered.to_list(),
                new_recovered.to_list()))
    return cases, new_cases, dead, new_dead, recovered, new_recovered
Example #16
def test_groupby_multi():
    DT = dt.Frame(A=[1, 2, 3] * 3, B=[1, 2] * 4 + [1], C=range(9))
    res = DT[:, sum(f.C), by("A", "B")]
    assert res.to_list() == [[1, 1, 2, 2, 3, 3], [1, 2, 1, 2, 1, 2],
                             [6, 3, 4, 8, 10, 5]]
Example #17
def test_groupby_with_filter1():
    f0 = dt.Frame({"KEY": [1, 2, 1, 2, 1, 2], "X": [-10, 2, 3, 0, 1, -7]})
    f1 = f0[f.X > 0, sum(f.X), f.KEY]
    assert f1.to_list() == [[1, 2], [4, 2]]
Example #18
cache = "TRUE"

src_grp = os.environ['SRC_GRP_LOCAL']
data_name = src_grp[:-4]
print("loading dataset %s" % data_name)

x = dt.fread(os.path.join("data", src_grp))

print(x.nrows)

print("grouping...")

question = "sum v1 by id1"  #1
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1)}, by(f.id1)]
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.v1)]
chkt = timeit.default_timer() - t_start
write_log(task=task,
          data=data_name,
          in_rows=x.shape[0],
          question=question,
          out_rows=ans.shape[0],
          out_cols=ans.shape[1],
          solution=solution,
          version=ver,
          git=git,
Example #19
on_disk = "FALSE"

data_name = os.environ['SRC_GRP_LOCAL']
src_grp = os.path.join("data", data_name+".csv")
print("loading dataset %s" % data_name, flush=True)

x = dt.fread(src_grp)
print(x.nrows, flush=True)

task_init = timeit.default_timer()
print("grouping...", flush=True)

question = "sum v1 by id1" # q1
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1)}, by(f.id1)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.v1)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk)
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1)}, by(f.id1)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
Example #20
          flush=True,
          file=sys.stderr)
    exit(
        0
    )  # fread string with NAs generates extra distinct group h2oai/datatable#2808

x = dt.fread(src_grp, na_strings=[''])
print(x.nrows, flush=True)

task_init = timeit.default_timer()
print("grouping...", flush=True)

question = 'sum v1 by id1'  # q1
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {'v1': sum(f.v1)}, by(f.id1)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.v1)]
chkt = timeit.default_timer() - t_start
write_log(task=task,
          data=data_name,
          in_rows=x.shape[0],
          question=question,
          out_rows=ans.shape[0],
          out_cols=ans.shape[1],
          solution=solution,
          version=ver,
          git=git,
Example #21
def test_sum_simple():
    DT = dt.Frame(A=range(5))
    R = DT[:, sum(f.A)]
    frame_integrity_check(R)
    assert R.to_list() == [[10]]
    assert str(R)
Example #22
task_init = timeit.default_timer()
print("joining...", flush=True)

question = "small inner on int"  # q1
gc.collect()
y = small.copy(deep=True)
t_start = timeit.default_timer()
y.key = 'id1'
ans = x[:, :, join(y)][isfinite(f.v2), :]  # , on='id1'
tmp = ans.copy(deep=True)  ## ensure join results materialized #141
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v2)]]
chkt = timeit.default_timer() - t_start
write_log(task=task,
          data=data_name,
          in_rows=x.shape[0],
          question=question,
          out_rows=ans.shape[0],
          out_cols=ans.shape[1],
          solution=solution,
          version=ver,
          git=git,
          fun=fun,
          run=1,
          time_sec=t,
          mem_gb=m,
          cache=cache,
Example #23
def loadAndProcessData(dataFilename):
    print("Loading " + dataFilename)

    fullTable = dt.fread(dataFilename)
    print("Loading done loading table from ‘" + dataFilename + "‘, keys:")
    print(fullTable.keys())
    cases = fullTable[:, 'AnzahlFall'].sum()[0, 0]
    dead = fullTable[:, 'AnzahlTodesfall'].sum()[0, 0]

    lastDay = fullTable[:, 'MeldeDay'].max()[0, 0]
    lastnewCaseOnDay = fullTable[:, 'newCaseOnDay'].max()[0, 0]
    print("File stats: lastDay {} lastnewCaseOnDay {} cases {} dead {}".format(
        lastDay, lastnewCaseOnDay, cases, dead))

    newTable = fullTable[:, dt.f[:].
                         extend({"erkMeldeDelay": dt.f.MeldeDay -
                                 dt.f.RefDay})]
    #print(newTable.keys())

    #dt.by(dt.f.Bundesland)]
    alldays = fullTable[:, [
        dt.sum(dt.f.AnzahlFall),
        dt.sum(dt.f.FaellePro100k),
        dt.sum(dt.f.AnzahlTodesfall),
        dt.sum(dt.f.TodesfaellePro100k),
        dt.mean(dt.f.Bevoelkerung),
        dt.max(dt.f.MeldeDay),
        dt.first(dt.f.LandkreisTyp),
        dt.first(dt.f.Bundesland)
    ],
                        dt.by(dt.f.Landkreis)]

    last7days = fullTable[dt.f.newCaseOnDay > lastDay -
                          7, :][:, [
                              dt.sum(dt.f.AnzahlFall),
                              dt.sum(dt.f.FaellePro100k),
                              dt.sum(dt.f.AnzahlTodesfall),
                              dt.sum(dt.f.TodesfaellePro100k)
                          ],
                                dt.by(dt.f.Landkreis)]
    last7days.names = [
        "Landkreis", "AnzahlFallLetzte7Tage", "FaellePro100kLetzte7Tage",
        "AnzahlTodesfallLetzte7Tage", "TodesfaellePro100kLetzte7Tage"
    ]
    last7days[dt.f.AnzahlFallLetzte7Tage < 0, "AnzahlFallLetzte7Tage"] = 0
    last7days[dt.f.FaellePro100kLetzte7Tage < 0,
              "FaellePro100kLetzte7Tage"] = 0
    last7days[dt.f.AnzahlTodesfallLetzte7Tage < 0,
              "AnzahlTodesfallLetzte7Tage"] = 0
    last7days[dt.f.TodesfaellePro100kLetzte7Tage < 0,
              "TodesfaellePro100kLetzte7Tage"] = 0

    lastWeek7days = fullTable[(dt.f.newCaseOnDay > lastDay - 14) & (
        dt.f.newCaseOnDay <= lastDay - 7), :][:, [
            dt.sum(dt.f.AnzahlFall),
            dt.sum(dt.f.FaellePro100k),
            dt.sum(dt.f.AnzahlTodesfall),
            dt.sum(dt.f.TodesfaellePro100k)
        ],
                                              dt.by(dt.f.Landkreis)]
    #lastWeek7days[dt.f[1:] < 0, dt.f[1:]] = 0
    lastWeek7days.names = [
        "Landkreis", "AnzahlFallLetzte7TageDavor",
        "FaellePro100kLetzte7TageDavor", "AnzahlTodesfallLetzte7TageDavor",
        "TodesfaellePro100kLetzte7TageDavor"
    ]
    lastWeek7days[dt.f.AnzahlFallLetzte7TageDavor < 0,
                  "AnzahlFallLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.FaellePro100kLetzte7TageDavor < 0,
                  "FaellePro100kLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.AnzahlTodesfallLetzte7TageDavor < 0,
                  "AnzahlTodesfallLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.TodesfaellePro100kLetzte7TageDavor < 0,
                  "TodesfaellePro100kLetzte7TageDavor"] = 0

    allDaysExt0 = merge(alldays, last7days, "Landkreis")
    allDaysExt1 = merge(allDaysExt0, lastWeek7days, "Landkreis")

    Rw = dt.f.AnzahlFallLetzte7Tage / dt.f.AnzahlFallLetzte7TageDavor

    allDaysExt2 = allDaysExt1[:, dt.f[:].extend({"AnzahlFallTrend": Rw})]
    allDaysExt3 = allDaysExt2[:, dt.f[:].extend({
        "FaellePro100kTrend":
        dt.f.FaellePro100kLetzte7Tage - dt.f.FaellePro100kLetzte7TageDavor
    })]
    allDaysExt4 = allDaysExt3[:, dt.f[:].extend({
        "TodesfaellePro100kTrend":
        dt.f.TodesfaellePro100kLetzte7Tage -
        dt.f.TodesfaellePro100kLetzte7TageDavor
    })]

    allDaysExt5 = allDaysExt4[:, dt.f[:].extend({
        "Kontaktrisiko":
        dt.f.Bevoelkerung / 6.25 /
        ((dt.f.AnzahlFallLetzte7Tage + dt.f.AnzahlFallLetzte7TageDavor) * Rw)
    })]
    allDaysExt6 = allDaysExt5[:, dt.f[:].extend(
        {"LetzteMeldung": lastDay - dt.f.MeldeDay})]

    allDaysExt6[dt.f.Kontaktrisiko * 2 == dt.f.Kontaktrisiko,
                "Kontaktrisiko"] = 999999

    sortedByRisk = allDaysExt6.sort(
        ["Kontaktrisiko", "LetzteMeldung", "FaellePro100k"])
    #print(sortedByRisk)
    allDaysExt = sortedByRisk[:, dt.f[:].extend({"Rang": 0})]
    allDaysExt[:, "Rang"] = np.arange(1, allDaysExt.nrows + 1)
    #print(allDaysExt)

    print("Column names frame order:", list(enumerate(allDaysExt.names)))

    data = allDaysExt.to_pandas()
    return data
Example #24
    def transform(self, X: dt.Frame):
        return X[:, dt.sum([(dt.f[x] > 0) for x in range(X.ncols)])]
Example #25
cache = "TRUE"

wc_lines = subprocess.run(['wc','-l',data_name], stdout=subprocess.PIPE).stdout.decode('utf-8').split(" ", 1)[0]
in_rows = int(wc_lines)-1

print("reading...")

question = "all rows" #1
gc.collect()
t_start = timeit.default_timer()
ans = dt.fread(data_name, show_progress=False)
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.v3)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
del ans
gc.collect()
t_start = timeit.default_timer()
ans = dt.fread(data_name, show_progress=False)
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.v3)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=in_rows, question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
del ans
gc.collect()
Example #26
fun = "[.datatable"
cache = "TRUE"

data_name = os.environ['SRC_GRP_LOCAL']
src_grp = os.path.join("data", data_name+".csv")
print("loading dataset %s" % data_name, flush=True)

x = dt.fread(src_grp)
print(x.nrows, flush=True)

print("grouping...", flush=True)

question = "sum v1 by id1" # q1
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1)}, by(f.id1)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.v1)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1)}, by(f.id1)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
Example #27
    def transform(self, X: dt.Frame):
        if X.ncols == 0:
            return np.zeros((X.nrows, 1))
        return X[:, dt.sum([dt.isna(dt.f[x]) for x in range(X.ncols)])]
Example #28
def test_group_empty_frame4():
    DT = dt.Frame(A=[], stype=dt.float32)
    D2 = DT[:, sum(f.A), by(f.A)]
    frame_integrity_check(D2)
    assert D2.shape == (0, 2)
    assert D2.stypes == (dt.float32, dt.float32)
Example #29
cache = "TRUE"

print("loading dataset...")

x = dt.fread(data_name)

print("sorting...")

gc.collect()
t_start = timeit.default_timer()
ans = x.sort('KEY')
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.X2)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
del ans

gc.collect()
t_start = timeit.default_timer()
ans = x.sort('KEY')
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.X2)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.topython())), chk_time_sec=chkt)
del ans
Example #30
fun = "[.datatable"
cache = "TRUE"

data_name = os.environ['SRC_GRP_LOCAL']
src_grp = os.path.join("data", data_name + ".csv")
print("loading dataset %s" % data_name, flush=True)

x = dt.fread(src_grp)
print(x.nrows, flush=True)

print("grouping...", flush=True)

question = "sum v1 by id1"  # q1
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1)}, by(f.id1)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.v1)]
chkt = timeit.default_timer() - t_start
write_log(task=task,
          data=data_name,
          in_rows=x.shape[0],
          question=question,
          out_rows=ans.shape[0],
          out_cols=ans.shape[1],
          solution=solution,
          version=ver,
          git=git,
Example #31
Same as the ones above

# Grouping based on the NA field
seatle_bikes_dt_tidy[:,{
    'total':count()
},by(dt.isna(f.ped_count),
     f.year,f.crossing)
]

##### EDA - 4.1 Syntax explanation

dt.isna is a function that checks whether an observation is null or not; here it is used to calculate how many NA values of ped_count occur per year and crossing.
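
For reference, a minimal, self-contained sketch of the same pattern on made-up data (the small frame below is illustrative and not part of the Seattle dataset):

import datatable as dt
from datatable import f, by, count

# Two years, with some missing ped_count observations.
small = dt.Frame(year=[2019, 2019, 2020, 2020],
                 ped_count=[12, None, None, 7])
# Grouping by dt.isna(f.ped_count) produces one row per (is-NA, year) combination,
# so the count() column shows how many NA vs. non-NA observations each year has.
print(small[:, {'total': count()}, by(dt.isna(f.ped_count), f.year)])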

#### 3. EDA - 5: When in the day (hours) do people bike through these Seattle crossings?

alt.Chart(seatle_bikes_dt_tidy[:,{'bike_count':dt.sum(f.bike_count)},by(f.crossing,f.hour)
                    ][:,{'hour':f.hour,
                         'pct_bike' : dt.math.rint((f.bike_count/dt.sum(f.bike_count))*100)
                        },by(f.crossing)
                     ].to_pandas()).mark_line().encode(
    alt.X('hour:N'),alt.Y('pct_bike'),alt.Color('crossing',legend=None)).properties(
    width=280,height=160).facet(
    facet='crossing',
    columns=4)

##### EDA - 5 Syntax explanation

In the first chain of the DT expression, grouping is done on the crossing and hour fields so that the total bike count is summed and assigned to the variable bike_count; in the second chain, grouping is applied only on crossing to calculate the percentage of bikes crossing in each direction per hour. Here **dt.math** is a module that provides mathematical functions.
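
A minimal sketch of the same two-step chain on toy data (the column names and values below are illustrative only, not the Seattle dataset):

import datatable as dt
from datatable import f, by

toy = dt.Frame(crossing=['A', 'A', 'B', 'B'],
               hour=[7, 8, 7, 8],
               bike_count=[30, 10, 5, 15])
# Step 1: total bikes per crossing and hour.
step1 = toy[:, {'bike_count': dt.sum(f.bike_count)}, by(f.crossing, f.hour)]
# Step 2: within each crossing, convert the hourly totals to rounded percentages.
step2 = step1[:, {'hour': f.hour,
                  'pct_bike': dt.math.rint(f.bike_count / dt.sum(f.bike_count) * 100)},
              by(f.crossing)]
print(step2)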

# Glance at data
seatle_bikes_dt_tidy
Example #32
File: app.py  Project: pavel-mayer/covid
fullTable = dt.fread("full-latest.csv")
print(fullTable.keys())
cases = fullTable[:,'AnzahlFall'].sum()[0,0]
dead = fullTable[:,'AnzahlTodesfall'].sum()[0,0]

lastDay=fullTable[:,'MeldeDay'].max()[0,0]
print("lastDay {} cases {} dead{}".format(lastDay, cases, dead))

newTable=fullTable[:,dt.f[:].extend({"erkMeldeDelay": dt.f.MeldeDay-dt.f.RefDay})]
#print(newTable.keys())


#dt.by(dt.f.Bundesland)]
alldays=fullTable[:,
          [dt.sum(dt.f.AnzahlFall),
           dt.sum(dt.f.FaellePro100k),
           dt.sum(dt.f.AnzahlTodesfall),
           dt.sum(dt.f.TodesfaellePro100k),
           dt.mean(dt.f.Bevoelkerung)],
   dt.by(dt.f.Landkreis)]

last7days=fullTable[dt.f.newCaseOnDay>lastDay-7,:][:,
          [dt.sum(dt.f.AnzahlFall),
           dt.sum(dt.f.FaellePro100k),
           dt.sum(dt.f.AnzahlTodesfall),
           dt.sum(dt.f.TodesfaellePro100k)],
   dt.by(dt.f.Landkreis)]
last7days.names=["Landkreis","AnzahlFallLetzte7Tage","FaellePro100kLetzte7Tage","AnzahlTodesfallLetzte7Tage","TodesfaellePro100kLetzte7Tage"]

def merge(largerTable, smallerTable, keyFieldName):