def test_group_reverse_flag():
    DT = dt.Frame({"A": [1, 2, 1, 2, 2, 3, 3],
                   "B": [2, 2, 4, 4, 23, 5, 30]})
    EXPECTED = DT[:, :, dt.by(dt.f.A), dt.sort(-dt.f.B)]
    RES1 = DT[:, :, dt.by("A"), dt.sort("B", reverse=True)]
    RES2 = DT[:, :, dt.by(dt.f.A), dt.sort(dt.f.B, reverse=True)]
    assert_equals(EXPECTED, RES1)
    assert_equals(RES1, RES2)
def test_sort_consts():
    DT = dt.Frame(A=[5], B=[7.9], C=["Hello"], D=[None])
    DT = dt.repeat(DT, 1000)
    assert_equals(DT[:, :, sort(f.A)], DT)
    assert_equals(DT[:, :, sort(f.B)], DT)
    assert_equals(DT[:, :, sort(f.C)], DT)
    assert_equals(DT[:, :, sort(f.D)], DT)
def create_data(X: dt.Frame = None) -> Union[
    str, List[str],
    dt.Frame, List[dt.Frame],
    np.ndarray, List[np.ndarray],
    pd.DataFrame, List[pd.DataFrame],
    Dict[str, str],           # {data set names : paths}
    Dict[str, dt.Frame],      # {data set names : dt frames}
    Dict[str, np.ndarray],    # {data set names : np arrays}
    Dict[str, pd.DataFrame],  # {data set names : pd frames}
]:
    # define the date column and forecast horizon
    date_col = 'date'
    forecast_len = 7

    # get COVID19 new-cases data from the Our World in Data github
    X = dt.fread(
        "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
    )

    # remove country aggregates like 'World' and 'International'
    X = X[~(dt.f.iso_code == '') & ~(dt.f.continent == ''), :]

    # determine the threshold that splits train and test based on the forecast horizon
    dates = dt.unique(X[:, date_col])
    split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
    test_date = dates[-1, :, dt.sort(date_col)][0, 0]

    # split the data to honor the forecast horizon in the test set
    train = X[dt.f[date_col] <= split_date, :]
    test = X[dt.f[date_col] > split_date, :]

    # return the train/test frames as a dict; rename dataset names as needed
    return {
        f"covid19_daily_{split_date}_by_countries_train": train,
        f"covid19_daily_{test_date}_by_countries_test": test,
    }
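# A minimal sketch (toy, made-up dates) of the split idiom used in
# create_data above: sort the unique dates, keep the last
# `forecast_len + 1` of them, and take the earliest date of that tail as
# the train/test boundary.
def _split_date_sketch():
    toy = dt.Frame(date=['2020-03-03', '2020-03-01', '2020-03-02',
                         '2020-03-04', '2020-03-05', '2020-03-01'])
    forecast_len = 2
    dates = dt.unique(toy[:, 'date'])
    # sort first, then slice off the tail of the sorted frame
    split_date = dates[-(forecast_len + 1):, :, dt.sort('date')][0, 0]
    assert split_date == '2020-03-03'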
def test_sort_expr():
    df = dt.Frame(A=[1, 2, 1, 2], B=[3.9, 2.7, 0.1, 4.5])
    assert_equals(df[:, :, sort("A")],
                  dt.Frame(A=[1, 1, 2, 2], B=[3.9, 0.1, 2.7, 4.5]))
    assert_equals(df[:, :, sort(f.B)],
                  dt.Frame(A=[1, 2, 1, 2], B=[0.1, 2.7, 3.9, 4.5]))
    assert_equals(df[:, 'B', by("A"), sort("B")],
                  dt.Frame(A=[1, 1, 2, 2], B=[0.1, 3.9, 2.7, 4.5]))
def test_group_negate_column():
    DT = dt.Frame({"A": [1, 2, 1, 2, 2, 3, 3],
                   "B": [2, 2, 4, 4, 23, 5, 30]})
    EXPECTED = dt.Frame({"A": [3, 3, 2, 2, 2, 1, 1],
                         "B": [30, 5, 23, 4, 2, 4, 2]})
    RES1 = DT[:, :, dt.by(-dt.f.A), dt.sort(-dt.f.B)]
    RES2 = DT[:, :, dt.by(-dt.f.A), dt.sort(dt.f.B, reverse=True)]
    assert_equals(EXPECTED, RES1)
    assert_equals(RES1, RES2)
def test_sort_strings_reverse_large():
    src = ['klein', 'nim', 'toapr', 'f', '', 'zleu', '?34', '.............']
    src *= 10
    src += ['adferg', 'reneeas', 'ldodls', 'qu', 'zleuss', 'ni'] * 7
    src *= 25
    src += ['shoo!', 'zzZzzZ' * 5]
    DT = dt.Frame(A=src)
    RES = dt.Frame(A=sorted(src, reverse=True))
    assert_equals(DT[:, :, sort(-f.A)], RES)
    assert_equals(DT[:, :, sort(f.A, reverse=True)], RES)
def test_issue2348():
    DT = dt.Frame(A=[1, 2, 3, 1, 2, 3],
                  B=list('akdfnv'),
                  C=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                  D=[11] * 6,
                  E=[2] * 6)
    # Check that these expressions do not crash
    DT[:, :, by(f.A), sort(f.A, f.E)]
    DT[:, :, by(f.A, f.B), sort(f.A, f.B)]
    assert_equals(DT[:, dt.count(), by(f.D), sort(f.E, f.A)],
                  dt.Frame([[11], [6]], names=["D", "count"],
                           stypes=[dt.int32, dt.int64]))
def test_sort_with_reverse_list_true_true(numpy):
    DT = dt.Frame({'A': ['o1', 'o2', 'o3', 'o4', 'o5'],
                   'B': ['c1', 'c1', 'c2', 'c2', 'c3'],
                   'C': [5, 1, 3, numpy.nan, numpy.nan]})
    EXP = DT[:, :, dt.sort(-f.B, -f.A)]
    RES1 = DT[:, :, dt.sort("B", "A", reverse=[True, True])]
    RES2 = DT[:, :, dt.sort(1, 0, reverse=[True, True])]
    RES3 = DT[:, :, dt.sort(["B", "A"], reverse=[True, True])]
    assert_equals(EXP, RES1)
    assert_equals(EXP, RES2)
    assert_equals(EXP, RES3)
def test_int16_random(n):
    random.seed(n)
    nn = int(random.expovariate(0.001)) + 1
    span = min(32767, int(random.expovariate(0.01)) + 3)  # stay within int16 range
    data = [random.randint(-span, span) for _ in range(nn)]
    DT0 = dt.Frame(A=data, stype=dt.int16)
    DT1 = dt.Frame(A=sorted(data), stype=dt.int16)
    if random.choice([True, False]):
        DTS = DT0[:, :, sort(f.A)]
        assert_equals(DTS, DT1)
    else:
        DTS = DT0[:, :, sort(-f.A)]
        assert_equals(DTS, DT1[::-1, :])
def create_data(X: dt.Frame = None) -> Union[
    str, List[str],
    dt.Frame, List[dt.Frame],
    np.ndarray, List[np.ndarray],
    pd.DataFrame, List[pd.DataFrame],
    Dict[str, str],           # {data set names : paths}
    Dict[str, dt.Frame],      # {data set names : dt frames}
    Dict[str, np.ndarray],    # {data set names : np arrays}
    Dict[str, pd.DataFrame],  # {data set names : pd frames}
]:
    # define the date column and forecast horizon
    date_col = 'date'
    forecast_len = 7

    # get COVID19 data from the NYTimes github
    us_total = dt.fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv")

    # produce a lag of 1 unit and add it as a new feature for each column in the list
    series_cols = ["cases", "deaths"]
    aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
    us_total[:, update(**aggs), sort(date_col)]

    # update NA lags to 0
    aggs = {f"{col}_yesterday": 0 for col in series_cols}
    us_total[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

    # compute daily values by differencing
    aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in series_cols}
    us_total[:, update(**aggs), sort(date_col)]

    # delete the yesterday (shift) columns
    series_cols_to_delete = [f"{col}_yesterday" for col in series_cols]
    del us_total[:, series_cols_to_delete]

    # set negative daily values to 0
    us_total[f.cases_daily < 0, [f.cases_daily]] = 0
    us_total[f.deaths_daily < 0, [f.deaths_daily]] = 0

    # determine the threshold that splits train and test based on the forecast horizon
    dates = dt.unique(us_total[:, date_col])
    split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
    test_date = dates[-1, :, dt.sort(date_col)][0, 0]

    # split the data to honor the forecast horizon in the test set
    df = us_total[date_col].to_pandas()
    train = us_total[df[date_col] <= split_date, :]
    test = us_total[df[date_col] > split_date, :]

    # return the train/test frames as a dict; rename dataset names as needed
    return {f"covid19_daily_{split_date}_us_train": train,
            f"covid19_daily_{test_date}_us_test": test}
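# A minimal, self-contained sketch of the lag-and-difference idiom used in
# create_data above, on a toy frame (the column names here are made up; it
# assumes the same datatable imports: f, shift, update, sort, isna).
def _lag_difference_sketch():
    toy = dt.Frame(date=['d1', 'd2', 'd3', 'd4'], cases=[1, 3, 6, 10])
    # lag by one row in date order, zero-fill the leading NA, then difference
    toy[:, update(cases_yesterday=shift(f.cases)), sort('date')]
    toy[isna(f.cases_yesterday), update(cases_yesterday=0)]
    toy[:, update(cases_daily=f.cases - f.cases_yesterday)]
    assert toy['cases_daily'].to_list() == [[1, 2, 3, 4]]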
def test_sort_api():
    df = dt.Frame([[1, 2, 1, 2], [3.3, 2.7, 0.1, 4.5]], names=["A", "B"])
    df1 = df.sort("A")
    df2 = df.sort("B")
    df3 = df.sort("A", "B")
    df4 = df.sort(["A", "B"])
    df5 = df.sort()  # issue 1354
    df6 = df[:, :, dt.sort()]
    df7 = df[:, :, dt.sort(["A", "B"])]
    assert df1.to_list() == [[1, 1, 2, 2], [3.3, 0.1, 2.7, 4.5]]
    assert df2.to_list() == [[1, 2, 1, 2], [0.1, 2.7, 3.3, 4.5]]
    assert df3.to_list() == [[1, 1, 2, 2], [0.1, 3.3, 2.7, 4.5]]
    assert df4.to_list() == df7.to_list()
    assert df5.to_list() == df6.to_list()
def test_int8_small_stable():
    DT0 = dt.Frame(A=[5, 3, 5, None, 100, None, 3, None] / dt.int8,
                   B=[1, 5, 10, 20, 50, 100, 200, 500])
    DT1 = dt.Frame(A=[None, None, None, 3, 3, 5, 5, 100] / dt.int8,
                   B=[20, 100, 500, 5, 200, 1, 10, 50])
    DTS = DT0[:, :, sort(f.A)]
    assert_equals(DTS, DT1)
def load_join_write(name, data_dir, output_dir, foreign_keys=[],
                    join_dfs=None, add_index=True):
    """
    Given the name of a table, load all PSet tables of that name from
    data_dir, join them to any foreign key tables (specified by
    foreign_keys), and write the final combined and joined table to
    output_dir as a CSV.

    @param name: [`string`] The name of the table
    @param data_dir: [`string`] File path to the directory with all PSet tables
    @param output_dir: [`string`] The file path to the final tables
    @param foreign_keys: [`list(string)`] An optional list of tables that this
        table needs to be joined with
    @param join_dfs: [`dict(string: datatable.Frame)`] An optional dictionary
        of join tables (for building out foreign keys); keys are table names
    @param add_index: [`bool`] Indicates whether or not to add a primary key
        (1..nrows) when writing the final table to a .jay
    @return: [`datatable.Frame`] The final combined and joined table
    """
    df = load_table(name, data_dir)
    for fk in foreign_keys:
        logger.info(f"Joining {name} table with {fk} table...")
        if fk not in join_dfs:
            raise KeyError(f"The {name} table has the foreign key {fk}_id "
                           f"but there is no {fk} table in the join tables "
                           "dictionary.")
        df = join_tables(df, join_dfs[fk], f"{fk}_id")
    fk_columns = [f"{fk}_id" for fk in foreign_keys]
    df = df[:, :, sort(fk_columns)]
    df = write_table(df, name, output_dir, add_index)
    return df
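# Hypothetical usage of load_join_write (the table names, directories, and
# foreign keys below are illustrative only, not part of the real pipeline):
def _load_join_write_example():
    # build the join-table dictionary for the foreign keys first
    join_dfs = {
        'cell': load_table('cell', 'procdata'),
        'compound': load_table('compound', 'procdata'),
    }
    # join 'dose_response' to both foreign-key tables, sort by the fk
    # columns, and write the result
    return load_join_write('dose_response', 'procdata', 'latest',
                           foreign_keys=['cell', 'compound'],
                           join_dfs=join_dfs)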
def test_bool8_small():
    DT0 = dt.Frame([True, False, False, None, True, True, None])
    DT1 = dt.Frame([None, None, False, False, True, True, True])
    DTS = DT0[:, :, sort("C0")]
    assert DT0.stype == dt.bool8
    assert isview(DTS)
    assert_equals(DTS, DT1)
def test_bool8_small_descending():
    DT0 = dt.Frame([True, False, False, None, True, True, None])
    DT1 = dt.Frame([None, None, True, True, True, False, False])
    DTS = DT0[:, :, sort(-f.C0)]
    assert DT0.stype == dt.bool8
    assert isview(DTS)
    assert_equals(DTS, DT1)
def test_bool8_large_stable(n):
    DT0 = dt.Frame(A=[True, False, None] * n, B=range(3 * n))
    DT1 = dt.Frame(B=list(range(2, 3 * n, 3)) +
                     list(range(1, 3 * n, 3)) +
                     list(range(0, 3 * n, 3)))
    DTS = DT0[:, f.B, sort(f.A)]
    assert_equals(DTS, DT1)
def test_sort_ints_reverse(st):
    DT = dt.Frame(A=[5, 17, 9, -12, 0, 111, 3, 5],
                  B=list('abcdefgh'),
                  stypes={"A": st, "B": dt.str32})
    assert_equals(DT[:, :, sort(-f.A)],
                  dt.Frame(A=[111, 17, 9, 5, 5, 3, 0, -12],
                           B=list('fbcahged'),
                           stypes={"A": st, "B": dt.str32}))
def test_sort_view1():
    DT0 = dt.Frame([5, 10])
    DT1 = DT0[[i % 2 for i in range(10)], :]
    assert DT1.shape == (10, 1)
    assert isview(DT1)
    DT2 = DT1[:, :, sort(0)]
    assert_equals(DT2, dt.Frame([5] * 5 + [10] * 5))
def test_sort_view3():
    d0 = dt.Frame(range(1000))
    d1 = d0[::-5, :]
    d2 = d1[:, :, sort(0)]
    d2.internal.check()
    assert d2.shape == (200, 1)
    assert d2.to_list() == [list(range(4, 1000, 5))]
def __call__(self, rows=None, select=None, verbose=False, timeit=False,
             groupby=None, join=None, sort=None, engine=None):
    """DEPRECATED, use DT[i, j, ...] instead."""
    warnings.warn(
        "`DT(rows, select, ...)` is deprecated and will be removed in "
        "version 0.9.0. Please use `DT[i, j, ...]` instead",
        category=FutureWarning)
    time0 = time.time() if timeit else 0
    function = type(lambda: None)
    if isinstance(rows, function):
        rows = rows(datatable.f)
    if isinstance(select, function):
        select = select(datatable.f)
    res = self[rows, select, datatable.join(join), datatable.by(groupby),
               datatable.sort(sort)]
    if timeit:
        print("Time taken: %d ms" % (1000 * (time.time() - time0)))
    return res
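# For reference, a sketch of the equivalence this deprecated shim preserves
# (assuming a frame DT with columns A and B):
#
#     DT(rows=lambda f: f.A > 1, select=lambda f: f.B, groupby="A", sort="B")
#
# behaves like the recommended spelling
#
#     DT[f.A > 1, f.B, datatable.by("A"), datatable.sort("B")]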
def test_int16_small_descending():
    DT0 = dt.Frame(A=[4, 12, 1000, None, 2, 4, 0, -444, 95, None, 7] / dt.int16)
    DT1 = dt.Frame(A=[None, None, 1000, 95, 12, 7, 4, 4, 2, 0, -444] / dt.int16)
    DTS = DT0[:, :, sort(-f.A)]
    assert_equals(DTS, DT1)
def test_int8_large_stable(n):
    src = [None, 10, -10] * (n // 3)
    DT = dt.Frame([src, range(n)], names=("A", "B"), stypes={"A": "int8"})
    assert DT["A"].stype == dt.int8
    d1 = DT[:, f.B, sort(f.A)]
    assert d1.to_list() == [list(range(0, n, 3)) +
                            list(range(2, n, 3)) +
                            list(range(1, n, 3))]
def test_bool8_small():
    d0 = dt.Frame([True, False, False, None, True, True, None])
    assert d0.stypes == (stype.bool8, )
    d1 = d0[:, :, sort("C0")]
    assert d1.stypes == d0.stypes
    assert d1.internal.isview
    d1.internal.check()
    assert d1.to_list() == [[None, None, False, False, True, True, True]]
def test_int8_large_stable(n):
    src = [None, 10, -10] * (n // 3)
    d0 = dt.Frame([src, range(n)], names=("A", "B"))
    assert d0.stypes[0] == stype.int8
    d1 = d0[:, f.B, sort(f.A)]
    assert d1.to_list() == [list(range(0, n, 3)) +
                            list(range(2, n, 3)) +
                            list(range(1, n, 3))]
def test_int32_large_stable(n):
    src = [None, 100, 100000] * (n // 3)
    d0 = dt.Frame([src, range(n)], names=["A", "B"])
    assert d0.stypes[0] == stype.int32
    d1 = d0[:, "B", sort("A")]
    assert d1.to_list() == [list(range(0, n, 3)) +
                            list(range(1, n, 3)) +
                            list(range(2, n, 3))]
def test_bool8_small():
    d0 = dt.Frame([True, False, False, None, True, True, None])
    assert d0.stypes == (stype.bool8, )
    d1 = d0[:, :, sort("C0")]
    assert d1.stypes == d0.stypes
    assert isview(d1)
    frame_integrity_check(d1)
    assert d1.to_list() == [[None, None, False, False, True, True, True]]
def py_dt_one_group_proportions_summary(DT, por):
    # count rows per level of `por`, attach the grand total, convert the
    # counts to proportions, then drop the helper column and sort the
    # groups largest-share-first
    DT_summary = DT[:, dt.count(), by(f[por])
                    ][:, f[:].extend({'grand_tot': dt.sum(f.count)})
                    ][:, f[:].extend({'prop': f.count / f.grand_tot})
                    ][:, f[:].remove(f.grand_tot), dt.sort(-f.prop)]
    return DT_summary
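# Example usage on toy data (assuming `import datatable as dt` and
# `from datatable import f, by`): one row per level, largest share first.
def _proportions_example():
    DT = dt.Frame(species=['cat', 'dog', 'cat', 'cat'])
    summary = py_dt_one_group_proportions_summary(DT, 'species')
    #   species  count  prop
    #   cat      3      0.75
    #   dog      1      0.25
    assert summary.to_list() == [['cat', 'dog'], [3, 1], [0.75, 0.25]]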
def test_int16_small_stable():
    DT0 = dt.Frame(A=[0, 1000, 0, 0, 1000, 0, 0, 1000, 0] / dt.int16,
                   B=[1, 2, 3, 4, 5, 6, 7, 8, 9])
    DT1 = dt.Frame(A=[0, 0, 0, 0, 0, 0, 1000, 1000, 1000] / dt.int16,
                   B=[1, 3, 4, 6, 7, 9, 2, 5, 8])
    DTS = DT0[:, :, sort(f.A)]
    assert DT0['A'].stype == dt.int16
    assert_equals(DTS, DT1)
def create_data(X: dt.Frame = None) -> Union[
    str, List[str],
    dt.Frame, List[dt.Frame],
    np.ndarray, List[np.ndarray],
    pd.DataFrame, List[pd.DataFrame],
    Dict[str, str],           # {data set names : paths}
    Dict[str, dt.Frame],      # {data set names : dt frames}
    Dict[str, np.ndarray],    # {data set names : np arrays}
    Dict[str, pd.DataFrame],  # {data set names : pd frames}
]:
    # location in the DAI file system where we will save the data set
    temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
    os.makedirs(temp_path, exist_ok=True)

    # URLs of the desired IMDb datasets
    link_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
    link_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"
    link_episodes = "https://datasets.imdbws.com/title.episode.tsv.gz"

    # download the files
    file_basics = download(link_basics, dest_path=temp_path)
    file_ratings = download(link_ratings, dest_path=temp_path)
    file_episodes = download(link_episodes, dest_path=temp_path)

    # read the downloaded IMDb files
    basics = dt.fread(file_basics, fill=True)
    ratings = dt.fread(file_ratings, fill=True)
    episodes = dt.fread(file_episodes, na_strings=['\\N'], fill=True)

    # remove the downloaded files
    os.remove(file_basics)
    os.remove(file_ratings)
    os.remove(file_episodes)

    # create the Title-with-Ratings dataset: join titles with non-null ratings
    ratings = ratings[~dt.isna(dt.f.averageRating), :]
    ratings.key = "tconst"
    basics_ratings = basics[:, :, dt.join(ratings)]

    # create the Episodes dataset
    episodes = episodes[~dt.isna(dt.f.seasonNumber) & ~dt.isna(dt.f.episodeNumber), :]
    episode_ratings = episodes[:, :, dt.join(ratings)]
    episode_ratings.names = {'tconst': 'episodeTconst',
                             'parentTconst': 'tconst',
                             'averageRating': 'episodeAverageRating',
                             'numVotes': 'episodeNumVotes'}
    basics_ratings.key = 'tconst'
    title_episode_ratings = episode_ratings[:, :, dt.join(basics_ratings)]

    # enumerate each series' episodes from 1 to N
    title_episode_ratings = title_episode_ratings[
        :, :, dt.sort(dt.f.tconst, dt.f.seasonNumber, dt.f.episodeNumber)]
    result = title_episode_ratings[:, dt.count(), dt.by(dt.f.tconst)][:, 'count'].to_list()
    from itertools import chain
    cumcount = chain.from_iterable(range(1, n + 1) for n in result[0])
    title_episode_ratings['episodeSequence'] = dt.Frame(tuple(cumcount))

    # return the datasets
    return {"imdb_title_ratings": basics_ratings,
            "imdb_episode_ratings": title_episode_ratings}
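# A minimal sketch of the group-enumeration trick used in create_data above:
# with the frame already sorted by the group column, by() returns groups in
# the same order as the row blocks, so each per-group count n can be
# expanded into the sequence 1..n and assigned back as a column.
def _episode_sequence_sketch():
    from itertools import chain
    DT = dt.Frame(show=['a', 'a', 'a', 'b', 'b'])
    counts = DT[:, dt.count(), dt.by(dt.f.show)][:, 'count'].to_list()[0]
    DT['seq'] = dt.Frame(tuple(chain.from_iterable(
        range(1, n + 1) for n in counts)))
    assert DT['seq'].to_list() == [[1, 2, 3, 1, 2]]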
def test_h2oai7014(tempfile):
    data = dt.Frame([[None, 't'], [3580, 1047]], names=["ID", "count"])
    data.to_jay(tempfile)
    # The data has to be opened from the file
    counts = dt.open(tempfile)
    counts = counts[1:, :]
    counts = counts[:, :, sort("count")]
    counts.materialize()
    assert counts.to_list() == [['t'], [1047]]
ans = x[:, {"range_v1_v2": max(f.v1)-min(f.v2)}, by(f.id2, f.id4)] print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = ans[:, sum(f.range_v1_v2)] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt) print(ans.head(3).to_pandas(), flush=True) print(ans.tail(3).to_pandas(), flush=True) del ans question = "largest two v3 by id2 id4" # q8 gc.collect() t_start = timeit.default_timer() ans = x[:2, {"largest2_v3": f.v3}, by(f.id2, f.id4), sort(-f.v3)] print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = ans[:, sum(f.largest2_v3)] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt) del ans gc.collect() t_start = timeit.default_timer() ans = x[:2, {"largest2_v3": f.v3}, by(f.id2, f.id4), sort(-f.v3)] print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer()