Example #1
          fun=fun,
          run=2,
          time_sec=t,
          mem_gb=m,
          cache=cache,
          chk=make_chk(flatten(chk.to_list())),
          chk_time_sec=chkt,
          on_disk=on_disk)
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans

question = 'sum v1 mean v3 by id3'  # q3
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {'v1': sum(f.v1), 'v3': mean(f.v3)}, by(f.id3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v3)]]
chkt = timeit.default_timer() - t_start
write_log(task=task,
          data=data_name,
          in_rows=x.shape[0],
          question=question,
          out_rows=ans.shape[0],
          out_cols=ans.shape[1],
          solution=solution,
          version=ver,
          git=git,
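The benchmark fragment above is cut off at both ends. A self-contained sketch of the same grouped sum/mean pattern on a toy frame (the frame and column names here stand in for the benchmark's globals):

import datatable as dt
from datatable import f, by
from datatable import sum, mean  # datatable's reducers, shadowing the builtins

x = dt.Frame(id3=["a", "a", "b"], v1=[1, 2, 3], v3=[0.5, 1.5, 4.0])
# q3: one row per id3 group, with v1 summed and v3 averaged
ans = x[:, {"v1": sum(f.v1), "v3": mean(f.v3)}, by(f.id3)]
print(ans.to_list())  # [['a', 'b'], [3, 3], [1.0, 4.0]]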
Example #2
def test_mean_simple():
    DT = dt.Frame(A=range(5))
    RZ = DT[:, mean(f.A)]
    frame_integrity_check(RZ)
    assert RZ.stypes == (dt.float64,)
    assert RZ.to_list() == [[2.0]]
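A closely related check, not part of the test above, is that mean() skips missing values; a minimal sketch:

import datatable as dt
from datatable import f, mean

DT = dt.Frame(A=[1, 2, None, 5])
RZ = DT[:, mean(f.A)]
assert abs(RZ[0, 0] - 8 / 3) < 1e-12  # NA rows are excluded from the mean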
Example #3
def loadAndProcessData(dataFilename):
    print("Loading " + dataFilename)

    fullTable = dt.fread(dataFilename)
    print("Loading done loading table from ‘" + dataFilename + "‘, keys:")
    print(fullTable.keys())
    cases = fullTable[:, 'AnzahlFall'].sum()[0, 0]
    dead = fullTable[:, 'AnzahlTodesfall'].sum()[0, 0]

    lastDay = fullTable[:, 'MeldeDay'].max()[0, 0]
    lastnewCaseOnDay = fullTable[:, 'newCaseOnDay'].max()[0, 0]
    print("File stats: lastDay {} lastnewCaseOnDay {} cases {} dead {}".format(
        lastDay, lastnewCaseOnDay, cases, dead))

    newTable = fullTable[:, dt.f[:].
                         extend({"erkMeldeDelay": dt.f.MeldeDay -
                                 dt.f.RefDay})]
    #print(newTable.keys())

    #dt.by(dt.f.Bundesland)]
    alldays = fullTable[:, [dt.sum(dt.f.AnzahlFall),
                            dt.sum(dt.f.FaellePro100k),
                            dt.sum(dt.f.AnzahlTodesfall),
                            dt.sum(dt.f.TodesfaellePro100k),
                            dt.mean(dt.f.Bevoelkerung),
                            dt.max(dt.f.MeldeDay),
                            dt.first(dt.f.LandkreisTyp),
                            dt.first(dt.f.Bundesland)],
                        dt.by(dt.f.Landkreis)]

    last7days = fullTable[dt.f.newCaseOnDay > lastDay - 7, :][
        :, [dt.sum(dt.f.AnzahlFall),
            dt.sum(dt.f.FaellePro100k),
            dt.sum(dt.f.AnzahlTodesfall),
            dt.sum(dt.f.TodesfaellePro100k)],
        dt.by(dt.f.Landkreis)]
    last7days.names = [
        "Landkreis", "AnzahlFallLetzte7Tage", "FaellePro100kLetzte7Tage",
        "AnzahlTodesfallLetzte7Tage", "TodesfaellePro100kLetzte7Tage"
    ]
    last7days[dt.f.AnzahlFallLetzte7Tage < 0, "AnzahlFallLetzte7Tage"] = 0
    last7days[dt.f.FaellePro100kLetzte7Tage < 0,
              "FaellePro100kLetzte7Tage"] = 0
    last7days[dt.f.AnzahlTodesfallLetzte7Tage < 0,
              "AnzahlTodesfallLetzte7Tage"] = 0
    last7days[dt.f.TodesfaellePro100kLetzte7Tage < 0,
              "TodesfaellePro100kLetzte7Tage"] = 0

    lastWeek7days = fullTable[(dt.f.newCaseOnDay > lastDay - 14) &
                              (dt.f.newCaseOnDay <= lastDay - 7), :][
        :, [dt.sum(dt.f.AnzahlFall),
            dt.sum(dt.f.FaellePro100k),
            dt.sum(dt.f.AnzahlTodesfall),
            dt.sum(dt.f.TodesfaellePro100k)],
        dt.by(dt.f.Landkreis)]
    #lastWeek7days[dt.f[1:] < 0, dt.f[1:]] = 0
    lastWeek7days.names = [
        "Landkreis", "AnzahlFallLetzte7TageDavor",
        "FaellePro100kLetzte7TageDavor", "AnzahlTodesfallLetzte7TageDavor",
        "TodesfaellePro100kLetzte7TageDavor"
    ]
    lastWeek7days[dt.f.AnzahlFallLetzte7TageDavor < 0,
                  "AnzahlFallLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.FaellePro100kLetzte7TageDavor < 0,
                  "FaellePro100kLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.AnzahlTodesfallLetzte7TageDavor < 0,
                  "AnzahlTodesfallLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.TodesfaellePro100kLetzte7TageDavor < 0,
                  "TodesfaellePro100kLetzte7TageDavor"] = 0

    allDaysExt0 = merge(alldays, last7days, "Landkreis")
    allDaysExt1 = merge(allDaysExt0, lastWeek7days, "Landkreis")

    Rw = dt.f.AnzahlFallLetzte7Tage / dt.f.AnzahlFallLetzte7TageDavor

    allDaysExt2 = allDaysExt1[:, dt.f[:].extend({"AnzahlFallTrend": Rw})]
    allDaysExt3 = allDaysExt2[:, dt.f[:].extend({
        "FaellePro100kTrend":
        dt.f.FaellePro100kLetzte7Tage - dt.f.FaellePro100kLetzte7TageDavor
    })]
    allDaysExt4 = allDaysExt3[:, dt.f[:].extend({
        "TodesfaellePro100kTrend":
        dt.f.TodesfaellePro100kLetzte7Tage -
        dt.f.TodesfaellePro100kLetzte7TageDavor
    })]

    allDaysExt5 = allDaysExt4[:, dt.f[:].extend({
        "Kontaktrisiko":
        dt.f.Bevoelkerung / 6.25 /
        ((dt.f.AnzahlFallLetzte7Tage + dt.f.AnzahlFallLetzte7TageDavor) * Rw)
    })]
    allDaysExt6 = allDaysExt5[:, dt.f[:].extend(
        {"LetzteMeldung": lastDay - dt.f.MeldeDay})]

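    # Note: x * 2 == x holds only for 0 and ±infinity, so the line below
    # replaces infinities from a zero denominator with a sentinel value.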
    allDaysExt6[dt.f.Kontaktrisiko * 2 == dt.f.Kontaktrisiko,
                "Kontaktrisiko"] = 999999

    sortedByRisk = allDaysExt6.sort(
        ["Kontaktrisiko", "LetzteMeldung", "FaellePro100k"])
    #print(sortedByRisk)
    allDaysExt = sortedByRisk[:, dt.f[:].extend({"Rang": 0})]
    allDaysExt[:, "Rang"] = np.arange(1, allDaysExt.nrows + 1)
    #print(allDaysExt)

    print("Column names frame order:", list(enumerate(allDaysExt.names)))

    data = allDaysExt.to_pandas()
    return data
Example #4
def test_rows_mean():
    from datatable import mean
    df0 = dt.Frame(A=range(10))
    df1 = df0[f.A > mean(f.A), :]
    df1.internal.check()
    assert df1.to_list() == [[5, 6, 7, 8, 9]]
Example #5
# First five rows, columns 2 through 5, of DT
penguins_dt[:5, 2:6]

# Last five observations from DT
penguins_dt[-5:, :]

# All observations for last 3 columns
penguins_dt[:, -3:]

# Rows where sex is missing but body_mass_g is present
penguins_dt[(dt.isna(f.sex) & ~dt.isna(f.body_mass_g)), :]

# Mean of all numeric columns per penguin sex category
penguins_dt[~dt.isna(f.sex), :][:,
                                dt.mean((f[dt.int32].remove(f.year),
                                         f[dt.float64])),
                                by(f.sex)]

# step 1: flag penguins whose body_mass_g equals the per-sex maximum
penguins_dt[:, update(temp=f.body_mass_g == dt.max(f.body_mass_g)), by(f.sex)]

# step 2: keep the flagged rows and drop the helper column
penguins_dt[f.temp == 1, f[:].remove(f.temp)]

# step 1: flag penguins whose body_mass_g equals the per-sex minimum
penguins_dt[:, update(temp=f.body_mass_g == dt.min(f.body_mass_g)), by(f.sex)]

penguins_dt[f.temp == 1, f[:].remove(f.temp)]

del penguins_dt["temp"]
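The flag-and-filter pattern above generalizes to any per-group extreme. A compact, self-contained sketch on a made-up frame:

import datatable as dt
from datatable import f, by, update

DT = dt.Frame(sex=["m", "f", "m", "f"], mass=[10, 8, 12, 9])
DT[:, update(temp=f.mass == dt.max(f.mass)), by(f.sex)]  # flag per-sex maxima
heaviest = DT[f.temp == 1, f[:].remove(f.temp)]  # keep flagged rows, drop flag
del DT["temp"]
print(heaviest.to_list())  # [['m', 'f'], [12, 9]]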
Example #6
def analyzeDaily(fullTable, filter, postfix):

    #print("----- analyzeDaily:"+postfix)
    #dayTable = fullTable[(dt.f.DatenstandTag >= fromDay) & (dt.f.DatenstandTag < toDay) & (dt.f.IdLandkreis == forIdLandkreis),:]
    dayTable = fullTable[filter, :]

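    # In the RKI case data, NeuerFall == 0 marks records already present in
    # the previous publication, 1 marks records new in the current one, and
    # -1 marks retractions; so 0|1 gives the current total and -1|1 the
    # day-over-day change.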
    cases_to_count = dayTable[(dt.f.NeuerFall == 0) | (dt.f.NeuerFall == 1), :]
    cases = cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],
                           dt.by(dt.f.DatenstandTag)]
    cases.names = ["DatenstandTag", "AnzahlFall" + postfix]
    cases.key = "DatenstandTag"
    print("cases rows = {}, cases_to_count = {}".format(
        cases.nrows, cases_to_count.nrows))
    #print(cases)

    new_cases_to_count = dayTable[(dt.f.NeuerFall == -1) |
                                  (dt.f.NeuerFall == 1), :]
    new_cases = new_cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],
                                   dt.by(dt.f.DatenstandTag)]
    new_cases.names = ["DatenstandTag", "AnzahlFallNeu" + postfix]
    new_cases.key = "DatenstandTag"
    print("new_cases rows = {}, new_cases_to_count = {}".format(
        new_cases.nrows, new_cases_to_count.nrows))
    #new_cases_to_count.to_csv("new_cases_to_count.csv")

    new_cases_to_count_delay = new_cases_to_count[(
        dt.f.AnzahlFall > 0), :]  # measure delay only for positive cases
    new_cases_to_count_delay.materialize()
    new_cases_delay = new_cases_to_count_delay[:, [
        dt.min(dt.f.MeldeDelay),
        dt.max(dt.f.MeldeDelay),
        dt.mean(dt.f.MeldeDelay),
        dt.median(dt.f.MeldeDelay),
        dt.sd(dt.f.MeldeDelay),
        dt.sum(dt.f.AnzahlFall),
        dt.max(dt.f.DatenstandTag)
    ], dt.by(dt.f.DatenstandTag)]
    new_cases_delay.names = [
        "DatenstandTag", "MeldeDauerFallNeu-Min" + postfix,
        "MeldeDauerFallNeu-Max" + postfix,
        "MeldeDauerFallNeu-Schnitt" + postfix,
        "MeldeDauerFallNeu-Median" + postfix,
        "MeldeDauerFallNeu-StdAbw" + postfix,
        "MeldeDauerFallNeu-Fallbasis" + postfix, "DatenstandTag-Max" + postfix
    ]
    new_cases_delay.key = "DatenstandTag"
    print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(
        new_cases_delay.nrows, new_cases_to_count_delay.nrows))
    #new_cases_delay = new_cases_to_count_delay[:, [dt.mean(dt.f.DatenstandTag-dt.f.MeldeTag)],dt.by(dt.f.DatenstandTag)]

    #     delays = delayRecs[:, [dt.mean(dt.f.MeldeDelay), dt.median(dt.f.MeldeDelay), dt.sd(dt.f.MeldeDelay), dt.sum(dt.f.AnzahlFall)], dt.by(dt.f.Landkreis)]

    # new_cases_stddev = new_cases_to_count_delay[:, [dt.mean(dt.f.DatenstandTag - dt.f.MeldeTag)],
    #                   dt.by(dt.f.DatenstandTag)]
    # new_cases_delay.names = ["DatenstandTag", "AnzahlFallNeu-MeldeDauer" + postfix]
    # new_cases_delay.key = "DatenstandTag"
    # print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(new_cases_delay.nrows,
    #                                                                         new_cases_to_count_delay.nrows))

    new_cases_to_count_strict = new_cases_to_count[(
        dt.f.DatenstandTag - dt.f.MeldeTag < 7) | (dt.f.AnzahlFall < 0), :]
    new_cases_strict = new_cases_to_count_strict[:, [dt.sum(dt.f.AnzahlFall)],
                                                 dt.by(dt.f.DatenstandTag)]
    new_cases_strict.names = [
        "DatenstandTag", "AnzahlFallNeu-Meldung-letze-7-Tage" + postfix
    ]
    new_cases_strict.key = "DatenstandTag"
    print("new_cases_strict rows = {}, new_cases_to_count_strict = {}".format(
        new_cases_strict.nrows, new_cases_to_count_strict.nrows))
    #new_cases_to_count_strict.to_csv("new_cases_to_count_strict.csv")

    new_cases_to_count_strict_14 = new_cases_to_count[
        (dt.f.DatenstandTag - dt.f.MeldeTag < 14) | (dt.f.AnzahlFall < 0), :]
    new_cases_strict_14 = new_cases_to_count_strict_14[
        :, [dt.sum(dt.f.AnzahlFall)], dt.by(dt.f.DatenstandTag)]
    new_cases_strict_14.names = [
        "DatenstandTag", "AnzahlFallNeu-Meldung-letze-14-Tage" + postfix
    ]
    new_cases_strict_14.key = "DatenstandTag"
    print("new_cases_strict_14 rows = {}, new_cases_to_count_strict_14 = {}".
          format(new_cases_strict_14.nrows,
                 new_cases_to_count_strict_14.nrows))

    dead_to_count = dayTable[(dt.f.NeuerTodesfall == 0) |
                             (dt.f.NeuerTodesfall == 1), :]
    dead = dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],
                         dt.by(dt.f.DatenstandTag)]
    dead.names = ["DatenstandTag", "AnzahlTodesfall" + postfix]
    dead.key = "DatenstandTag"
    #print("dead rows = {}".format(dead.nrows))

    new_dead_to_count = dayTable[(dt.f.NeuerTodesfall == -1) |
                                 (dt.f.NeuerTodesfall == 1), :]
    new_dead = new_dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],
                                 dt.by(dt.f.DatenstandTag)]
    new_dead.names = ["DatenstandTag", "AnzahlTodesfallNeu" + postfix]
    new_dead.key = "DatenstandTag"
    #print("new_dead rows = {}".format(new_dead.nrows))

    recovered_to_count = dayTable[(dt.f.NeuGenesen == 0) |
                                  (dt.f.NeuGenesen == 1), :]
    recovered = recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],
                                   dt.by(dt.f.DatenstandTag)]
    recovered.names = ["DatenstandTag", "AnzahlGenesen" + postfix]
    recovered.key = "DatenstandTag"
    #print("recovered rows = {}".format(recovered.nrows))

    new_recovered_to_count = dayTable[(dt.f.NeuGenesen == -1) |
                                      (dt.f.NeuGenesen == 1), :]
    new_recovered = new_recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],
                                           dt.by(dt.f.DatenstandTag)]
    new_recovered.names = ["DatenstandTag", "AnzahlGenesenNeu" + postfix]
    new_recovered.key = "DatenstandTag"
    #print("new_recovered rows = {}".format(new_recovered.nrows))

    byDayTable = cases[:, :, dt.join(new_cases)] \
        [:, :, dt.join(dead)][:, :, dt.join(new_dead)] \
        [:, :, dt.join(recovered)][:, :, dt.join(new_recovered)] \
        [:, :, dt.join(new_cases_strict)][:, :, dt.join(new_cases_strict_14)] \
        [:, :, dt.join(new_cases_delay)]
    byDayTable.key = "DatenstandTag"
    #print("byDayTable rows = {}".format(byDayTable.nrows))
    print(byDayTable)

    return byDayTable
"""Compute new features based on aggregates, e.g. distance from mean"""

# Compute per-column expressions (signed distance from the mean in this example)
# for all numeric (int, float) columns with stats computed by groups and
# new column added for each original numeric feature.
# see: https://stackoverflow.com/questions/62974899/updating-or-adding-multiple-columns-with-pydatatable-in-style-of-r-datables-sd
#
# Specification:
# Inputs:
#   X: datatable - primary data set
# Parameters:
#   group_by_cols: list of column names - group columns to compute stats by
# Output:
#   dataset augmented with computed statistics

from datatable import f, by, sort, update, shift, isna, mean

group_by_cols = ["user_id"]

new_dataset_name = "new_dataset_name_with_stats"

aggs = {f"{col}_dist_from_mean": mean(dt.f[col]) - f[col]
        for col in X[:, f[int].extend(f[float])].names}

X[:, update(**aggs), by(*group_by_cols)]

return {new_dataset_name: X}
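A minimal end-to-end run of the recipe above on a toy frame (user_id and value are invented names; the return statement belongs to the surrounding recipe scaffolding and is omitted here):

import datatable as dt
from datatable import f, by, update, mean

X = dt.Frame(user_id=[1, 1, 2, 2], value=[10.0, 20.0, 3.0, 5.0])
aggs = {f"{col}_dist_from_mean": mean(dt.f[col]) - f[col]
        for col in X[:, f[float]].names}
X[:, update(**aggs), by("user_id")]
print(X.to_list())  # value_dist_from_mean: [5.0, -5.0, 1.0, -1.0]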
Example #8
          solution=solution,
          version=ver,
          git=git,
          fun=fun,
          run=2,
          time_sec=t,
          mem_gb=m,
          cache=cache,
          chk=make_chk(flatten(chk.topython())),
          chk_time_sec=chkt)
del ans

question = "sum v1 mean v3 by id3"  #3
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1), "v3": mean(f.v3)}, by(f.id3)]
print(ans.shape)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v3)]]
chkt = timeit.default_timer() - t_start
write_log(task=task,
          data=data_name,
          in_rows=x.shape[0],
          question=question,
          out_rows=ans.shape[0],
          out_cols=ans.shape[1],
          solution=solution,
          version=ver,
          git=git,
Example #9
# Hour wise arrests
hour_wise_arrests_dt = py_dt_two_group_proportions_summary(policia_tidy_dt_v1,'stop_hour','is_arrested')

# Visualization
alt.Chart(hour_wise_arrests_dt.to_pandas()).mark_bar().encode(
    alt.X('stop_hour:N'),
    alt.Y('count'),
    alt.Color('is_arrested')
).properties(
    title='Hour wise arrest trends'
)

# Hour wise arrest rates
hour_wise_arrests_rates_dt = hour_wise_arrests_dt[
    f.is_arrested == True, :][:, dt.mean(f.count), by(f.stop_hour)]

# Visualization
alt.Chart(hour_wise_arrests_rates_dt.to_pandas()).mark_line().encode(
    alt.X('stop_hour'),
    alt.Y('count')
).properties(
    title='Hourly average arrest rates'
)

py_dt_one_group_proportions_summary(policia_tidy_dt_v1,'drugs_related_stop')

# Stop date column, converted to a pandas frame
stop_date_df = policia_tidy_dt[:,(f.stop_date)].to_pandas()
Example #10
amigos_info_dt = dt.fread('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-08/friends_info.csv')
amigos_dt = dt.fread('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-08/friends.csv')

# Glance
amigos_info_dt

# Seasons
amigos_info_dt[:,count(),by(f.season)]

# Unique episodes per season
amigos_info_dt[:,count(),by(f.season,f.episode)
              ][:,{'unique_episodes':count()},by(f.season)
               ]

# average views and ratings per season
amigos_info_dt[:,dt.mean(f[-2:]),by(f.season)]

# Highest-rated title
amigos_info_dt[f.imdb_rating==dt.max(f.imdb_rating),:]

# Lowest-rated title
amigos_info_dt[f.imdb_rating==dt.min(f.imdb_rating),:]

# Top 2 highest-rated titles per season
amigos_info_dt[:2,:,by(f.season),sort(-f.imdb_rating)]

# find a title info
amigos_info_dt[f.title=="The Last One",:]

# Select observations up to row 235
amigos_info_dt[[slice(None,235)],:]
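In the views-and-ratings line above, f[-2:] selects the last two columns of the frame, so the mean is taken over both per group. A tiny sketch with invented column names:

import datatable as dt
from datatable import f, by

DT = dt.Frame(season=[1, 1, 2], views=[21.5, 20.1, 19.8], rating=[8.3, 8.1, 8.2])
DT[:, dt.mean(f[-2:]), by(f.season)]  # per-season mean of views and rating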
Example #11
def agregar_variables_nuevas(dataset: Frame) -> Frame:
    dataset['tarjetas_status01'] = dataset[
        :, dt.rowmax([f.Master_status, f.Visa_status])]  # 3
    dataset['tarjetas_status02'] = dataset[
        :, dt.rowmin([f.Master_status, f.Visa_status])]  # 2
    dataset['tarjetas_fultimo_cierre01'] = dataset[
        :, dt.rowmax([f.Master_fultimo_cierre, f.Visa_fultimo_cierre])]  # 479
    dataset['tarjetas_fultimo_cierre02'] = dataset[
        :, dt.rowmin([f.Master_fultimo_cierre, f.Visa_fultimo_cierre])]  # 421
    dataset['tarjetas_Finiciomora'] = dataset[
        :, dt.rowmin([f.Master_Finiciomora, f.Visa_Finiciomora])]  # 12
    dataset['tarjetas_Fvencimiento'] = dataset[
        :, dt.rowmin([f.Master_Fvencimiento, f.Visa_Fvencimiento])]  # 359
    dataset['tarjetas_delinquency'] = dataset[
        :, dt.rowmax([f.Master_delinquency, f.Visa_delinquency])]  # 18
    dataset['tarjetas_mfinanciacion_limite'] = dataset[:, dt.rowsum(
        [f.Master_mfinanciacion_limite, f.Visa_mfinanciacion_limite])]  # 230
    dataset['tarjetas_msaldototal'] = dataset[
        :, f.Master_msaldototal + f.Visa_msaldototal]  # 57
    dataset['tarjetas_msaldopesos'] = dataset[
        :, f.Master_msaldopesos + f.Visa_msaldopesos]  # 46
    dataset['tarjetas_msaldodolares'] = dataset[
        :, f.Master_msaldodolares + f.Visa_msaldodolares]  # 1142, but a derived one 104
    dataset['tarjetas_mconsumospesos'] = dataset[
        :, f.Master_mconsumospesos + f.Visa_mconsumospesos]  # 400
    dataset['tarjetas_mconsumosdolares'] = dataset[
        :, f.Master_mconsumosdolares + f.Visa_mconsumosdolares]  # 891, but with derived ones 352
    dataset['tarjetas_mlimitecompra'] = dataset[
        :, f.Master_mlimitecompra + f.Visa_mlimitecompra]  # 186, but with derived ones 26
    dataset['tarjetas_madelantopesos'] = dataset[
        :, f.Master_madelantopesos + f.Visa_madelantopesos]  # 666, but derived ones 26
    dataset['tarjetas_madelantodolares'] = dataset[
        :, f.Master_madelantodolares + f.Visa_madelantodolares]  # 294, and derived ones 33
    dataset['tarjetas_fultimo_cierre'] = dataset[
        :, dt.rowmax([f.Master_fultimo_cierre, f.Visa_fultimo_cierre])]  # 448
    dataset['tarjetas_mpagado'] = dataset[
        :, f.Master_mpagado + f.Visa_mpagado]  # 384, and derived ones 29
    dataset['tarjetas_mpagospesos'] = dataset[
        :, f.Master_mpagospesos + f.Visa_mpagospesos]  # 28
    dataset['tarjetas_mpagosdolares'] = dataset[
        :, f.Master_mpagosdolares + f.Visa_mpagosdolares]  # 1017, and derived ones 255
    dataset['tarjetas_fechaalta'] = dataset[
        :, dt.rowmax([f.Master_fechaalta, f.Visa_fechaalta])]  # 159
    dataset['tarjetas_mconsumototal'] = dataset[
        :, f.Master_mconsumototal + f.Visa_mconsumototal]  # 512, and derived ones 365
    dataset['tarjetas_cconsumos'] = dataset[
        :, f.Master_cconsumos + f.Visa_cconsumos]  # 424
    dataset['tarjetas_cadelantosefectivo'] = dataset[
        :, f.Master_cadelantosefectivo + f.Visa_cadelantosefectivo]  # 750
    dataset['tarjetas_mpagominimo'] = dataset[
        :, f.Master_mpagominimo + f.Visa_mpagominimo]  # 98
    dataset['ratio_tarjetas_msaldodolares__tarjetas_mlimitecompra'] = dataset[
        :, f.tarjetas_msaldodolares / f.tarjetas_mlimitecompra]  # 104
    dataset['ratio_tarjetas_msaldodolares__tarjetas_msaldototal'] = dataset[
        :, f.tarjetas_msaldodolares / f.tarjetas_msaldototal]  # 611
    dataset['ratio_tarjetas_mconsumospesos__tarjetas_mlimitecompra'] = dataset[
        :, f.tarjetas_mconsumospesos / f.tarjetas_mlimitecompra]  # 244
    dataset['ratio_tarjetas_madelantopesos__tarjetas_mlimitecompra'] = dataset[
        :, f.tarjetas_madelantopesos / f.tarjetas_mlimitecompra]  # 26
    dataset['ratio_tarjetas_madelantodolares__tarjetas_mlimitecompra'] = dataset[
        :, f.tarjetas_madelantodolares / f.tarjetas_mlimitecompra]  # 33
    dataset['ratio_tarjetas_mpagospesos__tarjetas_mlimitecompra'] = dataset[
        :, f.tarjetas_mpagospesos / f.tarjetas_mlimitecompra]  # 38
    dataset['ratio_tarjetas_mpagominimo__tarjetas_mlimitecompra'] = dataset[
        :, f.tarjetas_mpagominimo / f.tarjetas_mlimitecompra]  # 100
    dataset['ratio_tarjetas_mpagado__tarjetas_mlimitecompra'] = dataset[
        :, f.tarjetas_mpagado / f.tarjetas_mlimitecompra]  # 29
    dataset['ratio_tarjetas_mpagosdolares__tarjetas_mlimitecompra'] = dataset[
        :, f.tarjetas_mpagosdolares / f.tarjetas_mlimitecompra]  # 255
    dataset['ratio_tarjetas_mconsumototal__tarjetas_mlimitecompra'] = dataset[
        :, f.tarjetas_mconsumototal / f.tarjetas_mlimitecompra]  # 365
    dataset['ratio_tarjetas_mconsumosdolares__tarjetas_mlimitecompra'] = dataset[
        :, f.tarjetas_mconsumosdolares / f.tarjetas_mlimitecompra]  # 352
    dataset['ratio_tarjetas_msaldopesos__tarjetas_mlimitecompra'] = dataset[
        :, f.tarjetas_msaldopesos / f.tarjetas_mlimitecompra]  # 270
    dataset['ratio_tarjetas_msaldopesos__tarjetas_msaldototal'] = dataset[
        :, f.tarjetas_msaldopesos / f.tarjetas_msaldototal]  # 414
    dataset['ratio_Master_mlimitecompra__tarjetas_mlimitecompra'] = dataset[
        :, f.Master_mlimitecompra / f.tarjetas_mlimitecompra]  # 367
    dataset['ratio_Visa_mlimitecompra__tarjetas_mlimitecompra'] = dataset[
        :, f.Visa_mlimitecompra / f.tarjetas_mlimitecompra]  # 192

    # v2
    dataset['ctarjetas_credito'] = dataset[
        :, f.ctarjeta_master + f.ctarjeta_visa]  # 27
    dataset['ctarjetas'] = dataset[
        :, f.ctarjetas_credito + f.ctarjeta_debito]  # 623
    dataset['ratio_mprestamos_personales__cprestamos_personales'] = dataset[
        :, f.mprestamos_personales / f.cprestamos_personales]  # 127
    dataset['cextracciones'] = dataset[
        :, f.cextraccion_autoservicio + f.ccajas_extracciones]  # 157
    dataset['ratio_mextraccion_autoservicio__mcuentas_saldo'] = dataset[
        :, f.mextraccion_autoservicio / f.mcuentas_saldo]  # 565
    dataset['ccomisiones'] = dataset[
        :, f.ccomisiones_mantenimiento + f.ccomisiones_otras]  # 578
    dataset['ratio_mcomisiones__ccomisiones'] = dataset[
        :, f.mcomisiones / f.ccomisiones]  # 508
    dataset['ctransacciones'] = dataset[
        :, f.ccallcenter_transacciones + f.chomebanking_transacciones +
        f.ccajas_transacciones]  # 485
    dataset['ratio_ctransacciones__cproductos'] = dataset[
        :, f.ctransacciones / f.cproductos]  # 472

    # v3
    dataset['mpayroll_total'] = dataset[:, f.mpayroll + f.mpayroll2]  # 68
    dataset['ratio_mpayroll_total__cliente_edad'] = dataset[
        :, f.mpayroll_total / f.cliente_edad]  # 87
    dataset['ratio_mcaja_ahorro__cliente_edad'] = dataset[
        :, f.mcaja_ahorro / f.cliente_edad]  # 23
    dataset['ratio_mcuentas_saldo__cliente_edad'] = dataset[
        :, f.mcuentas_saldo / f.cliente_edad]  # 102
    dataset['cseguros_total'] = dataset[
        :, f.cseguro_vida + f.cseguro_auto + f.cseguro_vivienda +
        f.cseguro_accidentes_personales]  # 454
    dataset['ratio_cseguros_total__cliente_antiguedad'] = dataset[
        :, f.cseguros_total / f.cliente_antiguedad]  # 628

    # v7
    dataset['tarjetas_mconsumo_mes'] = dataset[
        :, f.mtarjeta_visa_consumo + f.mtarjeta_master_consumo]  # 45
    dataset['tarjetas_mconsumototal'] = dataset[
        :, f.Master_mconsumototal + f.Visa_mconsumototal]  # 419
    dataset['ratio_tarjetas_consumo_mes__cliente_edad'] = dataset[
        :, f.tarjetas_mconsumo_mes / f.cliente_edad]  # 51
    dataset['score_04'] = dataset[
        :, (f.ctarjetas_credito * f.tarjetas_delinquency) / f.cliente_edad]  # 695
    dataset['score_04_relativo'] = dataset[
        :, f.score_04 / mean(f.score_04)]  # 267

    # These turned out not to be important

    # v1
    # dataset['ratio_tarjetas_msaldototal__tarjetas_mlimitecompra'] = dataset[:, f.tarjetas_mlimitecompra / f.tarjetas_mlimitecompra] # 2544

    # v2
    # dataset['ratio_mrentabilidad__cproductos'] = dataset[:, f.mrentabilidad / f.cproductos] # 911
    # dataset['dif_tarjetas_mconsumototal__tarjetas_mpagado'] = dataset[:, f.tarjetas_mconsumototal - f.tarjetas_mpagado] # 1277
    # dataset['ratio_mrentabilidad__mcomisiones'] = dataset[:, f.mrentabilidad / f.mcomisiones] # 1100

    # v3
    # dataset['ratio_mrentabilidad__mcuentas_saldo'] = dataset[:, f.mrentabilidad / f.mcuentas_saldo] # 2042
    # dataset['ratio_mrentabilidad__cliente_antiguedad'] = dataset[:, f.mrentabilidad / f.cliente_antiguedad] # 1854
    # dataset['ratio_mrentabilidad__cliente_edad'] = dataset[:, f.mrentabilidad / f.cliente_edad] # 1811
    # dataset['ratio_cliente_antiguedad__cliente_edad'] = dataset[:, f.cliente_antiguedad / f.cliente_edad] # 1719

    # v7
    # dataset['score_01_relativo'] = dataset[:, f.score_01 / mean(f.score_01)] # does not appear
    # dataset['score_02_relativo'] = dataset[:, f.score_02 / mean(f.score_02)] # 2507
    # dataset['score_03_relativo'] = dataset[:, f.score_03 / mean(f.score_03)] # 2454
    # dataset['ratio_tarjetas_mconsumototal__cliente_edad'] = dataset[:, f.tarjetas_mconsumototal / f.cliente_edad] # 2459
    # dataset['ratio_Visa_mconsumospesos__cliente_edad'] = dataset[:, f.Visa_mconsumospesos / f.cliente_edad] # 2485
    # dataset['ratio_Visa_mconsumosdolares__cliente_edad'] = dataset[:, f.Visa_mconsumosdolares / f.cliente_edad] # 2486
    # dataset['ratio_Visa_mconsumototal__cliente_edad'] = dataset[:, f.Visa_mconsumototal / f.cliente_edad] # 2429
    # dataset['ratio_Master_mconsumospesos__cliente_edad'] = dataset[:, f.Master_mconsumospesos / f.cliente_edad] # 2501
    # dataset['ratio_Master_mconsumosdolares__cliente_edad'] = dataset[:, f.Master_mconsumosdolares / f.cliente_edad] # 2345
    # dataset['ratio_Master_mconsumototal__cliente_edad'] = dataset[:, f.Master_mconsumototal / f.cliente_edad] # 2493
    # dataset['ratio_ctransacciones__cliente_edad'] = dataset[:, f.ctransacciones / f.cliente_edad] # 2508
    # dataset['score_01'] = dataset[:, (f.ctarjetas * f.mrentabilidad) / f.ctrx_quarter] # 2575
    # dataset['score_02'] = dataset[:, (f.ctarjetas * f.ctransacciones) / f.ctrx_quarter] # 2507
    # dataset['score_03'] = dataset[:, (f.ctarjetas * f.ctransacciones) / f.cliente_edad] # 2498

    return dataset
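The score_04_relativo line works because a reducer such as mean() used inside a row expression is computed over the whole column and broadcast back to every row. A stripped-down sketch:

import datatable as dt
from datatable import f, mean

DT = dt.Frame(score=[1.0, 2.0, 3.0])
DT["score_rel"] = DT[:, f.score / mean(f.score)]
print(DT.to_list())  # [[1.0, 2.0, 3.0], [0.5, 1.0, 1.5]]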
Example #12
    def score(self,
              actual: np.array,
              predicted: np.array,
              sample_weight: typing.Optional[np.array] = None,
              labels: typing.Optional[List[typing.Any]] = None,
              X: typing.Optional[dt.Frame] = None,
              **kwargs) -> float:

        # Get the logger if it exists
        logger = self.get_experiment_logger()

        # hard-coded as access to experiment parameters (such as self.tgc) not yet available
        tgc = ["Store", "Dept"]
        # tgc = ["state"]
        # tgc = None

        # enable weighted average over TS R2 scores: weighted based on TS share of rows
        isR2AverageWeighted = False

        # obtain a scorer for metric to use
        scorer = self.get_scorer()

        if tgc is None or not all(col in X.names for col in tgc):
            loggerinfo(
                logger,
                f"TS R2 computes single R2 on {X.nrows} rows as either tgc {tgc} is not defined or incorrect."
            )
            return scorer.score(actual, predicted, sample_weight, labels,
                                **kwargs)
        else:
            tgc_values = X[:, {
                "weight": count() / X.nrows,
                "r2": 0.0
            }, by(tgc)]
            loggerinfo(
                logger,
                f"TS R2 computes multiple R2 on {X.nrows} rows, tgc {tgc} with weighting is {isR2AverageWeighted}."
            )
            none_values = [None] * X.nrows
            X = cbind(
                X[:, tgc],
                Frame(actual=actual,
                      predicted=predicted,
                      sample_weight=sample_weight
                      if sample_weight is not None else none_values))

            for i in range(0, tgc_values.nrows):
                current_tgc = tgc_values[i, :]
                current_tgc.key = tgc
                ts_frame = X[:, :, join(current_tgc)][~isna(f.r2), :]
                r2_score = scorer.score(
                    ts_frame['actual'].to_numpy(),
                    ts_frame['predicted'].to_numpy(),
                    ts_frame['sample_weight'].to_numpy()
                    if sample_weight is not None else None, labels, **kwargs)
                tgc_values[i, f.r2] = r2_score

                loggerinfo(
                    logger,
                    f"TS R2 = {r2_score} on {ts_frame.nrows} rows, tgc = {current_tgc[0, tgc].to_tuples()}"
                )

            if isR2AverageWeighted:
                # return np.average(tgc_values["r2"].to_numpy(), weights=tgc_values["weight"].to_numpy())
                return tgc_values[:, mean(f.r2 * f.weight)][0, 0]
            else:
                return tgc_values[:, mean(f.r2)][0, 0]
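The loop above filters X down to one time series by keying a one-row frame and joining it back: unmatched rows receive NA and are dropped with ~isna(). A minimal sketch of that trick with made-up names:

import datatable as dt
from datatable import f, join, isna

X = dt.Frame(store=["a", "a", "b"], y=[1.0, 2.0, 3.0])
current = dt.Frame(store=["a"], r2=[0.0])
current.key = "store"
subset = X[:, :, join(current)][~isna(f.r2), :]  # only store "a" rows remain
print(subset.to_list())  # [['a', 'a'], [1.0, 2.0], [0.0, 0.0]]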
Example #13
ans = x[:, {"v1": sum(f.v1)}, by(f.id1, f.id2)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.v1)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,
          version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache,
          chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
print(ans.head(3).to_pandas(), flush=True)
print(ans.tail(3).to_pandas(), flush=True)
del ans

question = "sum v1 mean v3 by id3" # q3
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1), "v3": mean(f.v3)}, by(f.id3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v1), sum(f.v3)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,
          version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache,
          chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v1": sum(f.v1), "v3": mean(f.v3)}, by(f.id3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
Example #14
File: app.py  Project: pavel-mayer/covid
dead = fullTable[:,'AnzahlTodesfall'].sum()[0,0]

lastDay = fullTable[:, 'MeldeDay'].max()[0, 0]
print("lastDay {} cases {} dead {}".format(lastDay, cases, dead))

newTable = fullTable[:, dt.f[:].extend({"erkMeldeDelay": dt.f.MeldeDay - dt.f.RefDay})]
#print(newTable.keys())


#dt.by(dt.f.Bundesland)]
alldays = fullTable[:, [dt.sum(dt.f.AnzahlFall),
                        dt.sum(dt.f.FaellePro100k),
                        dt.sum(dt.f.AnzahlTodesfall),
                        dt.sum(dt.f.TodesfaellePro100k),
                        dt.mean(dt.f.Bevoelkerung)],
                    dt.by(dt.f.Landkreis)]

last7days = fullTable[dt.f.newCaseOnDay > lastDay - 7, :][
    :, [dt.sum(dt.f.AnzahlFall),
        dt.sum(dt.f.FaellePro100k),
        dt.sum(dt.f.AnzahlTodesfall),
        dt.sum(dt.f.TodesfaellePro100k)],
    dt.by(dt.f.Landkreis)]
last7days.names = ["Landkreis", "AnzahlFallLetzte7Tage", "FaellePro100kLetzte7Tage",
                   "AnzahlTodesfallLetzte7Tage", "TodesfaellePro100kLetzte7Tage"]

def merge(largerTable, smallerTable, keyFieldName):
    keys = smallerTable[:, keyFieldName].to_list()[0]
    extTable = largerTable.copy()
    for colName in smallerTable.names:
        if colName != keyFieldName:
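The helper breaks off here. One plausible completion, consistent with how merge() is called in Example #3 (a sketch under that assumption, not the original code; it relies on the file's import datatable as dt):

def merge(largerTable, smallerTable, keyFieldName):
    # Append every non-key column of smallerTable to largerTable,
    # matching rows through the shared key column.
    keys = smallerTable[:, keyFieldName].to_list()[0]
    index = {k: i for i, k in enumerate(keys)}
    largeKeys = largerTable[:, keyFieldName].to_list()[0]
    extTable = largerTable.copy()
    for colName in smallerTable.names:
        if colName != keyFieldName:
            values = smallerTable[:, colName].to_list()[0]
            extTable[colName] = dt.Frame(
                [values[index[k]] if k in index else None for k in largeKeys])
    return extTable

The idiomatic datatable alternative would be a keyed join: set smallerTable.key = keyFieldName and take largerTable[:, :, dt.join(smallerTable)].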
Example #15
def analyzeDaily(fullTable, filter, prefix, postfix, byDateColName):

    print("analyzeDaily prefix='{}' postfix='{}' byDateColName='{}'".format(prefix, postfix, byDateColName))
    #print("analyzeDaily filter='{}' '".format(filter))
    byDate = dt.f[byDateColName]
    #print("----- analyzeDaily:"+postfix)
    #dayTable = fullTable[(dt.f.DatenstandTag >= fromDay) & (dt.f.DatenstandTag < toDay) & (dt.f.IdLandkreis == forIdLandkreis),:]

    dayTable = fullTable[filter,:]

    cases_to_count = dayTable[(dt.f.NeuerFall == 0) | (dt.f.NeuerFall == 1),:]
    cases = cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],dt.by(byDate)]
    cases.names = [byDateColName, prefix+"AnzahlFall"+postfix]
    cases.key = byDateColName
    print("cases rows = {}, cases_to_count = {}".format(cases.nrows, cases_to_count.nrows))
    #print(cases)
    byDayTable = cases

    if byDateColName == "DatenstandTag":
        new_cases_to_count = dayTable[(dt.f.NeuerFall == -1) | (dt.f.NeuerFall == 1),:]
        new_cases = new_cases_to_count[:, [dt.sum(dt.f.AnzahlFall)],dt.by(byDate)]
        new_cases.names = [byDateColName, prefix+"AnzahlFallNeu"+postfix]
        new_cases.key = byDateColName
        print("new_cases rows = {}, new_cases_to_count = {}".format(new_cases.nrows, new_cases_to_count.nrows))
        #new_cases_to_count.to_csv("new_cases_to_count.csv")
        byDayTable = byDayTable[:,:,dt.join(new_cases)]
    else:
        # add days by MeldeTag
        byDayTable.names = {prefix+"AnzahlFall"+postfix: prefix+"AnzahlFallNeu"+postfix}
        byDayTable = addRunningSumColumn(byDayTable, prefix+"AnzahlFallNeu"+postfix, prefix+"AnzahlFall"+postfix)

    dead_to_count = dayTable[(dt.f.NeuerTodesfall == 0) | (dt.f.NeuerTodesfall == 1),:]
    dead = dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],dt.by(byDate)]
    dead.names = [byDateColName, prefix+"AnzahlTodesfall"+postfix]
    dead.key = byDateColName
    #print("dead rows = {}".format(dead.nrows))
    byDayTable = byDayTable[:,:,dt.join(dead)]

    if byDateColName == "DatenstandTag":
        new_dead_to_count = dayTable[(dt.f.NeuerTodesfall == -1) | (dt.f.NeuerTodesfall == 1),:]
        new_dead = new_dead_to_count[:, [dt.sum(dt.f.AnzahlTodesfall)],dt.by(byDate)]
        new_dead.names = [byDateColName, prefix+"AnzahlTodesfallNeu"+postfix]
        new_dead.key = byDateColName
        #print("new_dead rows = {}".format(new_dead.nrows))
        byDayTable = byDayTable[:,:,dt.join(new_dead)]
    else:
        # add days by MeldeTag
        byDayTable.names = {prefix+"AnzahlTodesfall"+postfix: prefix+"AnzahlTodesfallNeu"+postfix}
        byDayTable = addRunningSumColumn(byDayTable, prefix+"AnzahlTodesfallNeu"+postfix, prefix+"AnzahlTodesfall"+postfix)

    byDayTable.key = byDateColName

    if postfix == "" and prefix == "" and byDateColName == "DatenstandTag":
        new_cases_to_count_delay = new_cases_to_count[(dt.f.AnzahlFall > 0), :]  # measure delay only for positive cases
        new_cases_to_count_delay.materialize()
        new_cases_delay = new_cases_to_count_delay[:, [dt.min(dt.f.MeldeDelay), dt.max(dt.f.MeldeDelay),
                                                       dt.mean(dt.f.MeldeDelay), dt.median(dt.f.MeldeDelay),
                                                       dt.sd(dt.f.MeldeDelay), dt.sum(dt.f.AnzahlFall),
                                                       dt.max(dt.f.DatenstandTag)], dt.by(byDate)]
        new_cases_delay.names = ["DatenstandTag",
                                 "PublikationsdauerFallNeu_Min" + postfix, "PublikationsdauerFallNeu_Max" + postfix,
                                 "PublikationsdauerFallNeu_Schnitt" + postfix, "PublikationsdauerFallNeu_Median" + postfix,
                                 "PublikationsdauerFallNeu_StdAbw" + postfix, "PublikationsdauerFallNeu_Fallbasis" + postfix,
                                 "DatenstandTag_Max" + postfix]
        new_cases_delay.key = "DatenstandTag"
        print("new_cases_delay rows = {}, new_cases_to_count_delay = {}".format(new_cases_delay.nrows,
                                                                                new_cases_to_count_delay.nrows))

        recovered_to_count = dayTable[(dt.f.NeuGenesen == 0) | (dt.f.NeuGenesen == 1),:]
        recovered = recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],dt.by(byDate)]
        recovered.names = ["DatenstandTag", "AnzahlGenesen"+postfix]
        recovered.key = "DatenstandTag"
        #print("recovered rows = {}".format(recovered.nrows))

        new_recovered_to_count = dayTable[(dt.f.NeuGenesen == -1) | (dt.f.NeuGenesen == 1),:]
        new_recovered = new_recovered_to_count[:, [dt.sum(dt.f.AnzahlGenesen)],dt.by(byDate)]
        new_recovered.names = ["DatenstandTag", "AnzahlGenesenNeu"+postfix]
        new_recovered.key = "DatenstandTag"
        #print("new_recovered rows = {}".format(new_recovered.nrows))

        byDayTable = byDayTable[:, :, dt.join(recovered)][:, :, dt.join(new_recovered)][:, :,dt.join(new_cases_delay)]
        #byDayTable = byDayTable[:,:,dt.join(recovered)][:,:,dt.join(new_recovered)]\
        #    [:,:,dt.join(new_cases_strict)][:,:,dt.join(new_cases_strict_14)][:,:,dt.join(new_cases_delay)]

    byDayTable.key = byDateColName
    #print("byDayTable rows = {}".format(byDayTable.nrows))
    #print(byDayTable)
    return byDayTable
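addRunningSumColumn() is referenced above but not defined in this fragment. A plausible sketch, assuming datatable >= 1.1.0 where dt.cumsum is available (the name and signature are inferred from the call sites):

def addRunningSumColumn(table, srcColName, destColName):
    # Cumulative sum over rows, which are already ordered by the date key.
    table[destColName] = table[:, dt.cumsum(dt.f[srcColName])]
    return table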
Example #16
def test_rows_mean():
    from datatable import mean
    df0 = dt.Frame(range(10), names=["A"])
    df1 = df0(f.A > mean(f.A), engine="eager")
    df1.internal.check()
    assert df1.topython() == [[5, 6, 7, 8, 9]]
Example #17
def test_mean():
    assert str(dt.mean(f.A)) == str(f.A.mean())
    assert str(dt.mean(f[:])) == str(f[:].mean())
    DT = dt.Frame(A=range(1, 10))
    assert_equals(DT[:, f.A.mean()], DT[:, dt.mean(f.A)])
Example #18
def test_rows_mean():
    from datatable import mean
    df0 = dt.Frame(A=range(10))
    df1 = df0[f.A > mean(f.A), :]
    frame_integrity_check(df1)
    assert df1.to_list() == [[5, 6, 7, 8, 9]]
Example #19
def test_groups1():
    f0 = dt.Frame({"A": [1, 2, 1, 2, 1, 3, 1, 1],
                   "B": [0, 1, 2, 3, 4, 5, 6, 7]})
    f1 = f0(select=mean(f.B), groupby=f.A)
    assert f1.stypes == (dt.float64,)
    assert f1.topython() == [[3.8, 2.0, 5.0]]
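The call style above (f0(select=..., groupby=...)) comes from an early datatable API; the same grouping in the current bracket syntax would be:

import datatable as dt
from datatable import f, by, mean

f0 = dt.Frame(A=[1, 2, 1, 2, 1, 3, 1, 1], B=[0, 1, 2, 3, 4, 5, 6, 7])
f1 = f0[:, mean(f.B), by(f.A)]
assert f1.to_list() == [[1, 2, 3], [3.8, 2.0, 5.0]]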