Example #1
def test_group_reverse_flag():
    DT = dt.Frame({"A": [1, 2, 1, 2, 2, 3, 3], "B": [2, 2, 4, 4, 23, 5, 30]})
    EXPECTED = DT[:, :, dt.by(dt.f.A), dt.sort(-dt.f.B)]
    RES1 = DT[:, :, dt.by("A"), dt.sort("B", reverse=True)]
    RES2 = DT[:, :, dt.by(dt.f.A), dt.sort(dt.f.B, reverse=True)]
    assert_equals(EXPECTED, RES1)
    assert_equals(RES1, RES2)
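For reference, the grouped reverse sort above works out by hand to:

EXPECTED.to_list()  # [[1, 1, 2, 2, 2, 3, 3], [4, 2, 23, 4, 2, 30, 5]]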
Example #2
def test_sort_consts():
    DT = dt.Frame(A=[5], B=[7.9], C=["Hello"], D=[None])
    DT = dt.repeat(DT, 1000)
    assert_equals(DT[:, :, sort(f.A)], DT)
    assert_equals(DT[:, :, sort(f.B)], DT)
    assert_equals(DT[:, :, sort(f.C)], DT)
    assert_equals(DT[:, :, sort(f.D)], DT)
Example #3
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame], Dict[
                   str, str],  # {data set names : paths}
               Dict[str, dt.Frame],  # {data set names : dt frames}
               Dict[str, np.ndarray],  # {data set names : np arrays}
               Dict[str, pd.DataFrame],  # {data set names : pd frames}
               ]:
        # define date column and forecast horizon
        date_col = 'date'
        forecast_len = 7

        # get COVID19 new cases data from Our World in Data github
        X = dt.fread(
            "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
        )

        # remove country aggregates like 'World' and 'International'
        X = X[~(dt.f.iso_code == '') & ~(dt.f.continent == ''), :]

        # determine threshold to split train and test based on forecast horizon
        dates = dt.unique(X[:, date_col])
        split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
        test_date = dates[-1, :, dt.sort(date_col)][0, 0]

        # split data to honor forecast horizon in test set
        train = X[dt.f[date_col] <= split_date, :]
        test = X[dt.f[date_col] > split_date, :]

        # return [train, test] and rename dataset names as needed
        return {
            f"covid19_daily_{split_date}_by_countries_train": train,
            f"covid19_daily_{test_date}_by_countries_test": test
        }
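A minimal sketch of the split-date trick used above, with made-up dates: dt.sort() in the third slot is applied before the row slice, so -(forecast_len + 1): selects the last forecast_len + 1 dates in chronological order.

import datatable as dt

dates = dt.Frame(date=["2020-03-02", "2020-03-01", "2020-03-03"])  # illustrative values
forecast_len = 1
# rows are ordered by date first, then the negative slice is taken
split_date = dates[-(forecast_len + 1):, :, dt.sort("date")][0, 0]
assert split_date == "2020-03-02"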
Example #4
def test_sort_expr():
    df = dt.Frame(A=[1, 2, 1, 2], B=[3.9, 2.7, 0.1, 4.5])
    assert_equals(df[:, :, sort("A")],
                  dt.Frame(A=[1, 1, 2, 2], B=[3.9, 0.1, 2.7, 4.5]))
    assert_equals(df[:, :, sort(f.B)],
                  dt.Frame(A=[1, 2, 1, 2], B=[0.1, 2.7, 3.9, 4.5]))
    assert_equals(df[:, 'B', by("A"), sort("B")],
                  dt.Frame(A=[1, 1, 2, 2], B=[0.1, 3.9, 2.7, 4.5]))
Example #5
def test_group_negate_column():
    DT = dt.Frame({"A": [1, 2, 1, 2, 2, 3, 3], "B": [2, 2, 4, 4, 23, 5, 30]})
    EXPECTED = dt.Frame({
        "A": [3, 3, 2, 2, 2, 1, 1],
        "B": [30, 5, 23, 4, 2, 4, 2]
    })
    RES1 = DT[:, :, dt.by(-dt.f.A), dt.sort(-dt.f.B)]
    RES2 = DT[:, :, dt.by(-dt.f.A), dt.sort(dt.f.B, reverse=True)]
    assert_equals(EXPECTED, RES1)
    assert_equals(RES1, RES2)
Example #6
def test_sort_strings_reverse_large():
    src = ['klein', 'nim', 'toapr', 'f', '', 'zleu', '?34', '.............']
    src *= 10
    src += ['adferg', 'reneeas', 'ldodls', 'qu', 'zleuss', 'ni'] * 7
    src *= 25
    src += ['shoo!', 'zzZzzZ' * 5]
    DT = dt.Frame(A=src)
    RES = dt.Frame(A=sorted(src, reverse=True))
    assert_equals(DT[:, :, sort(-f.A)], RES)
    assert_equals(DT[:, :, sort(f.A, reverse=True)], RES)
Example #7
def test_issue2348():
    DT = dt.Frame(A=[1, 2, 3, 1, 2, 3], B=list('akdfnv'),
                  C=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                  D=[11]*6, E=[2]*6)
    # Check that these expressions do not crash
    DT[:, :, by(f.A), sort(f.A, f.E)]
    DT[:, :, by(f.A, f.B), sort(f.A, f.B)]
    assert_equals(DT[:, dt.count(), by(f.D), sort(f.E, f.A)],
                  dt.Frame([[11], [6]],
                           names=["D", "count"],
                           stypes=[dt.int32, dt.int64]))
Example #8
def test_sort_with_reverse_list_true_true(numpy):
    DT = dt.Frame({
        'A': ['o1', 'o2', 'o3', 'o4', 'o5'],
        'B': ['c1', 'c1', 'c2', 'c2', 'c3'],
        'C': [5, 1, 3, numpy.NaN, numpy.NaN]
    })
    EXP = DT[:, :, dt.sort(-f.A, -f.B)]
    RES1 = DT[:, :, dt.sort("B", "A", reverse=[True, True])]
    RES2 = DT[:, :, dt.sort(1, 0, reverse=[True, True])]
    RES3 = DT[:, :, dt.sort(["B", "A"], reverse=[True, True])]
    assert_equals(EXP, RES1)
    assert_equals(EXP, RES2)
    assert_equals(EXP, RES3)
Example #9
def test_int16_random(n):
    random.seed(n)
    nn = int(random.expovariate(0.001)) + 1
    span = min(65535, int(random.expovariate(0.01)) + 3)
    data = [random.randint(-span, span) for _ in range(nn)]
    DT0 = dt.Frame(A=data, stype=dt.int16)
    DT1 = dt.Frame(A=sorted(data), stype=dt.int16)
    if random.choice([True, False]):
        DTS = DT0[:, :, sort(f.A)]
        assert_equals(DTS, DT1)
    else:
        DTS = DT0[:, :, sort(-f.A)]
        assert_equals(DTS, DT1[::-1, :])
Example #10
    def create_data(X: dt.Frame = None) -> Union[
        str, List[str],
        dt.Frame, List[dt.Frame],
        np.ndarray, List[np.ndarray],
        pd.DataFrame, List[pd.DataFrame],
        Dict[str, str],  # {data set names : paths}
        Dict[str, dt.Frame],  # {data set names : dt frames}
        Dict[str, np.ndarray],  # {data set names : np arrays}
        Dict[str, pd.DataFrame],  # {data set names : pd frames}
    ]:
        # define date column and forecast horizon
        date_col = 'date'
        forecast_len = 7

        # get COVID19 data from NYTimes github
        us_total = dt.fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv")

        # produce lag of 1 unit and add as new feature for each column in the list
        series_cols = ["cases", "deaths"]
        aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
        us_total[:, update(**aggs), sort(date_col)]

        # update NA lags to 0
        aggs = {f"{col}_yesterday": 0 for col in series_cols}
        us_total[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

        # compute daily values by differencing the cumulative totals
        aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in series_cols}
        us_total[:, update(**aggs), sort(date_col)]

        # delete columns with yesterday (shift) values
        series_cols_to_delete = [f"{col}_yesterday" for col in series_cols]
        del us_total[:, series_cols_to_delete]

        # set negative daily values to 0
        us_total[f.cases_daily < 0, [f.cases_daily]] = 0
        us_total[f.deaths_daily < 0, [f.deaths_daily]] = 0

        # determine threshold to split train and test based on forecast horizon
        dates = dt.unique(us_total[:, date_col])
        split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
        test_date = dates[-1, :, dt.sort(date_col)][0, 0]

        # split data to honor forecast horizon in test set
        df = us_total[date_col].to_pandas()
        train = us_total[df[date_col] <= split_date, :]
        test = us_total[df[date_col] > split_date, :]

        # return [train, test] and rename dataset names as needed
        return {f"covid19_daily_{split_date}_us_train": train,
                f"covid19_daily_{test_date}_us_test": test}
Example #11
def test_sort_api():
    df = dt.Frame([[1, 2, 1, 2], [3.3, 2.7, 0.1, 4.5]], names=["A", "B"])
    df1 = df.sort("A")
    df2 = df.sort("B")
    df3 = df.sort("A", "B")
    df4 = df.sort(["A", "B"])
    df5 = df.sort()  # issue 1354
    df6 = df[:, :, dt.sort()]
    df7 = df[:, :, dt.sort(["A", "B"])]
    assert df1.to_list() == [[1, 1, 2, 2], [3.3, 0.1, 2.7, 4.5]]
    assert df2.to_list() == [[1, 2, 1, 2], [0.1, 2.7, 3.3, 4.5]]
    assert df3.to_list() == [[1, 1, 2, 2], [0.1, 3.3, 2.7, 4.5]]
    assert df4.to_list() == df7.to_list()
    assert df5.to_list() == df6.to_list()
Example #12
def test_int8_small_stable():
    DT0 = dt.Frame(A=[5, 3, 5, None, 100, None, 3, None] / dt.int8,
                   B=[1, 5, 10, 20, 50, 100, 200, 500])
    DT1 = dt.Frame(A=[None, None, None, 3, 3, 5, 5, 100] / dt.int8,
                   B=[20, 100, 500, 5, 200, 1, 10, 50])
    DTS = DT0[:, :, sort(f.A)]
    assert_equals(DTS, DT1)
Example #13
def load_join_write(name,
                    data_dir,
                    output_dir,
                    foreign_keys=[],
                    join_dfs=None,
                    add_index=True):
    """
    Given the name of a table, load all PSet tables of that name from data_dir,
    join them to any foreign key tables (specified by foreign_keys), and write
    the final combined and joined table to output_dir as a CSV.

    @param name: [`string`] The name of the table
    @param data_dir: [`string`] File path to the directory with all PSet tables
    @param output_dir: [`string`] The file path to the final tables
    @param foreign_keys: [`list(string)`] An optional list of tables that this table
        needs to be joined with
    @param join_dfs: [`dict(string: datatable.Frame)`] An optional dictionary of join
        tables (for building out foreign keys); keys are table names
    @param add_index: [`bool`] Indicates whether or not to add a primary key (1-nrows)
        when writing the final table to a .jay
    @return: [`datatable.Frame`] The final combined and joined table
    """
    df = load_table(name, data_dir)
    for fk in foreign_keys:
        logger.info(f"Joining {name} table with {fk} table...")
        if join_dfs is None or fk not in join_dfs:
            raise KeyError(f"The {name} table has the foreign key {fk}_id but "
                           f"there is no {fk} table in the join tables dictionary.")
        df = join_tables(df, join_dfs[fk], f"{fk}_id")
    fk_columns = [f"{fk}_id" for fk in foreign_keys]
    df = df[:, :, sort(fk_columns)]
    df = write_table(df, name, output_dir, add_index)
    return df
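A hypothetical call, with made-up table and directory names (load_table is the same helper load_join_write uses internally):

df = load_join_write(
    name="dose_response",         # hypothetical table name
    data_dir="procdata",          # hypothetical input directory
    output_dir="latest",          # hypothetical output directory
    foreign_keys=["experiment"],  # this table joins on experiment_id
    join_dfs={"experiment": load_table("experiment", "procdata")},
)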
Example #14
def test_bool8_small():
    DT0 = dt.Frame([True, False, False, None, True, True, None])
    DT1 = dt.Frame([None, None, False, False, True, True, True])
    DTS = DT0[:, :, sort("C0")]
    assert DT0.stype == dt.bool8
    assert isview(DTS)
    assert_equals(DTS, DT1)
Example #15
def test_bool8_small_descending():
    DT0 = dt.Frame([True, False, False, None, True, True, None])
    DT1 = dt.Frame([None, None, True, True, True, False, False])
    DTS = DT0[:, :, sort(-f.C0)]
    assert DT0.stype == dt.bool8
    assert isview(DTS)
    assert_equals(DTS, DT1)
Example #16
def test_bool8_large_stable(n):
    DT0 = dt.Frame(A=[True, False, None] * n, B=range(3 * n))
    DT1 = dt.Frame(B=list(range(2, 3 * n, 3)) +
                     list(range(1, 3 * n, 3)) +
                     list(range(0, 3 * n, 3)))
    DTS = DT0[:, f.B, sort(f.A)]
    assert_equals(DTS, DT1)
Example #17
def test_sort_ints_reverse(st):
    DT = dt.Frame(A=[5, 17, 9, -12, 0, 111, 3, 5], B=list('abcdefgh'),
                  stypes={"A": st, "B": dt.str32})
    assert_equals(DT[:, :, sort(-f.A)],
                  dt.Frame(A=[111, 17, 9, 5, 5, 3, 0, -12],
                           B=list('fbcahged'),
                           stypes={"A": st, "B": dt.str32}))
Example #18
def test_sort_view1():
    DT0 = dt.Frame([5, 10])
    DT1 = DT0[[i % 2 for i in range(10)], :]
    assert DT1.shape == (10, 1)
    assert isview(DT1)
    DT2 = DT1[:, :, sort(0)]
    assert_equals(DT2, dt.Frame([5] * 5 + [10] * 5))
Example #19
def test_sort_view3():
    d0 = dt.Frame(range(1000))
    d1 = d0[::-5, :]
    d2 = d1[:, :, sort(0)]
    d2.internal.check()
    assert d2.shape == (200, 1)
    assert d2.to_list() == [list(range(4, 1000, 5))]
Example #20
    def __call__(self,
                 rows=None,
                 select=None,
                 verbose=False,
                 timeit=False,
                 groupby=None,
                 join=None,
                 sort=None,
                 engine=None):
        """DEPRECATED, use DT[i, j, ...] instead."""
        warnings.warn(
            "`DT(rows, select, ...)` is deprecated and will be removed in "
            "version 0.9.0. Please use `DT[i, j, ...]` instead",
            category=FutureWarning)
        time0 = time.time() if timeit else 0
        function = type(lambda: None)
        if isinstance(rows, function):
            rows = rows(datatable.f)
        if isinstance(select, function):
            select = select(datatable.f)

        res = self[rows, select,
                   datatable.join(join),
                   datatable.by(groupby),
                   datatable.sort(sort)]
        if timeit:
            print("Time taken: %d ms" % (1000 * (time.time() - time0)))
        return res
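A rough sketch of the equivalence the warning points to, assuming a frame DT with columns A and B:

# deprecated call form
res_old = DT(rows=lambda f: f.A > 1, select="B", sort="B")
# its DT[i, j, ...] replacement
res_new = DT[datatable.f.A > 1, "B", datatable.sort("B")]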
Example #21
def test_int16_small_descending():
    DT0 = dt.Frame(A=[4, 12, 1000, None, 2, 4, 0, -444, 95, None, 7] /
                   dt.int16)
    DT1 = dt.Frame(A=[None, None, 1000, 95, 12, 7, 4, 4, 2, 0, -444] /
                   dt.int16)
    DTS = DT0[:, :, sort(-f.A)]
    assert_equals(DTS, DT1)
Example #22
def test_int8_large_stable(n):
    src = [None, 10, -10] * (n // 3)
    DT = dt.Frame([src, range(n)], names=("A", "B"), stypes={"A": "int8"})
    assert DT["A"].stype == dt.int8
    d1 = DT[:, f.B, sort(f.A)]
    assert d1.to_list() == [
        list(range(0, n, 3)) + list(range(2, n, 3)) + list(range(1, n, 3))
    ]
Example #23
def test_bool8_small():
    d0 = dt.Frame([True, False, False, None, True, True, None])
    assert d0.stypes == (stype.bool8, )
    d1 = d0[:, :, sort("C0")]
    assert d1.stypes == d0.stypes
    assert d1.internal.isview
    d1.internal.check()
    assert d1.to_list() == [[None, None, False, False, True, True, True]]
Example #24
def test_int8_large_stable(n):
    src = [None, 10, -10] * (n // 3)
    d0 = dt.Frame([src, range(n)], names=("A", "B"))
    assert d0.stypes[0] == stype.int8
    d1 = d0[:, f.B, sort(f.A)]
    assert d1.to_list() == [
        list(range(0, n, 3)) + list(range(2, n, 3)) + list(range(1, n, 3))
    ]
Example #25
def test_int32_large_stable(n):
    src = [None, 100, 100000] * (n // 3)
    d0 = dt.Frame([src, range(n)], names=["A", "B"])
    assert d0.stypes[0] == stype.int32
    d1 = d0[:, "B", sort("A")]
    assert d1.to_list() == [
        list(range(0, n, 3)) + list(range(1, n, 3)) + list(range(2, n, 3))
    ]
Example #26
def test_bool8_small():
    d0 = dt.Frame([True, False, False, None, True, True, None])
    assert d0.stypes == (stype.bool8, )
    d1 = d0[:, :, sort("C0")]
    assert d1.stypes == d0.stypes
    assert isview(d1)
    frame_integrity_check(d1)
    assert d1.to_list() == [[None, None, False, False, True, True, True]]
Example #27
def py_dt_one_group_proportions_summary(DT, por):
    DT_summary = DT[:, dt.count(), by(f[por])
                   ][:, f[:].extend({'grand_tot': dt.sum(f.count)})
                    ][:, f[:].extend({'prop': f.count / f.grand_tot})
                     ][:, f[:].remove(f.grand_tot), dt.sort(-f.prop)
                      ]
    return DT_summary
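A quick worked example of the helper above, assuming the same dt/f/by imports the function relies on:

DT = dt.Frame(grp=["a", "b", "a", "a"])
py_dt_one_group_proportions_summary(DT, "grp").to_list()
# -> [["a", "b"], [3, 1], [0.75, 0.25]]   (grp, count, prop)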
Example #28
def test_int16_small_stable():
    DT0 = dt.Frame(A=[0, 1000, 0, 0, 1000, 0, 0, 1000, 0] / dt.int16,
                   B=[1, 2, 3, 4, 5, 6, 7, 8, 9])
    DT1 = dt.Frame(A=[0, 0, 0, 0, 0, 0, 1000, 1000, 1000] / dt.int16,
                   B=[1, 3, 4, 6, 7, 9, 2, 5, 8])
    DTS = DT0[:, :, sort(f.A)]
    assert DT0['A'].stype == dt.int16
    assert_equals(DTS, DT1)
Example #29
    def create_data(X: dt.Frame = None) -> Union[
        str, List[str],
        dt.Frame, List[dt.Frame],
        np.ndarray, List[np.ndarray],
        pd.DataFrame, List[pd.DataFrame],
        Dict[str, str],  # {data set names : paths}
        Dict[str, dt.Frame],  # {data set names : dt frames}
        Dict[str, np.ndarray],  # {data set names : np arrays}
        Dict[str, pd.DataFrame],  # {data set names : pd frames}
    ]:
        # Download files
        # Location in DAI file system where we will save the data set
        temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
        os.makedirs(temp_path, exist_ok=True)

        # URLs of the IMDb data sets to download
        link_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
        link_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"
        link_episodes = "https://datasets.imdbws.com/title.episode.tsv.gz"

        # Download the files
        file_basics = download(link_basics, dest_path=temp_path)
        file_ratings = download(link_ratings, dest_path=temp_path)
        file_episodes = download(link_episodes, dest_path=temp_path)

        # read the downloaded IMDb files
        basics = dt.fread(file_basics, fill=True)
        ratings = dt.fread(file_ratings, fill=True)
        episodes = dt.fread(file_episodes, na_strings=['\\N'], fill=True)

        # remove files
        os.remove(file_basics)
        os.remove(file_ratings)
        os.remove(file_episodes)

        # Create Title with Ratings dataset
        # join titles with non-null ratings
        ratings = ratings[~dt.isna(dt.f.averageRating), :]
        ratings.key = "tconst"
        basics_ratings = basics[:, :, dt.join(ratings)]

        # Create Episodes dataset
        episodes = episodes[~dt.isna(dt.f.seasonNumber) & ~dt.isna(dt.f.episodeNumber), :]
        episode_ratings = episodes[:, :, dt.join(ratings)]
        episode_ratings.names = {'tconst': 'episodeTconst', 'parentTconst': 'tconst', 'averageRating': 'episodeAverageRating', 'numVotes': 'episodeNumVotes'}
        basics_ratings.key = 'tconst'
        title_episode_ratings = episode_ratings[:, :, dt.join(basics_ratings)]

        # enumerate series episodes from 1 to N
        title_episode_ratings = title_episode_ratings[:, :, dt.sort(dt.f.tconst, dt.f.seasonNumber, dt.f.episodeNumber)]
        result = title_episode_ratings[:, dt.count(), dt.by(dt.f.tconst)][:, 'count'].to_list()
        from itertools import chain
        cumcount = chain.from_iterable([i + 1 for i in range(n)] for n in result[0])
        title_episode_ratings['episodeSequence'] = dt.Frame(tuple(cumcount))

        # return datasets
        return {f"imdb_title_ratings": basics_ratings,
                f"imdb_episode_ratings": title_episode_ratings}
Example #30
def test_h2oai7014(tempfile):
    data = dt.Frame([[None, 't'], [3580, 1047]], names=["ID", "count"])
    data.to_jay(tempfile)
    # The data has to be opened from file
    counts = dt.open(tempfile)
    counts = counts[1:, :]
    counts = counts[:, :, sort("count")]
    counts.materialize()
    assert counts.to_list() == [['t'], [1047]]
Example #31
ans = x[:, {"range_v1_v2": max(f.v1)-min(f.v2)}, by(f.id2, f.id4)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.range_v1_v2)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
print(ans.head(3).to_pandas(), flush=True)
print(ans.tail(3).to_pandas(), flush=True)
del ans

question = "largest two v3 by id2 id4" # q8
gc.collect()
t_start = timeit.default_timer()
ans = x[:2, {"largest2_v3": f.v3}, by(f.id2, f.id4), sort(-f.v3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, sum(f.largest2_v3)]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
del ans
gc.collect()
t_start = timeit.default_timer()
ans = x[:2, {"largest2_v3": f.v3}, by(f.id2, f.id4), sort(-f.v3)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()