def test_groups_large2_str(n, seed):
    random.seed(seed)
    while n == 0:
        n = int(random.expovariate(0.0005))
    src = ["%x" % random.getrandbits(6) for _ in range(n)]
    f0 = dt.Frame({"A": src})
    f1 = f0[:, count(), by("A")]
    f1.internal.check()
    assert f1.nrows == len(set(src))
def py_dt_two_group_proportions_summary(DT, por1, por2):
    DT_summary = DT[:, dt.count(), by(f[por1], f[por2])
                    ][:, f[:].extend({'group_tot': dt.sum(f.count)}), by(f[por1])
                    ][:, f[:].extend({'prop': f.count / f.group_tot})
                    ][:, f[:].remove(f[1])
                    ]
    return DT_summary
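A minimal usage sketch for the helper above. The frame and its column names are illustrative assumptions, not taken from the original snippet, and it presumes the helper is already in scope together with the usual datatable imports:

import datatable as dt
from datatable import f, by

# Hypothetical input: traffic stops with a gender and a violation column.
stops = dt.Frame(driver_gender=["M", "F", "M", "M", "F", "F"],
                 violation=["speeding", "speeding", "seatbelt",
                            "speeding", "seatbelt", "speeding"])

# For each driver_gender group, compute the share of each violation
# (count / group_tot); the final step of the helper drops a column by position.
summary = py_dt_two_group_proportions_summary(stops, "driver_gender", "violation")
print(summary)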
def test_issue_2242(seed):
    n = 25000
    X = dt.Frame(AGE=[random.randint(1, 50) for i in range(n)],
                 PAY=[random.choice([True, False]) for i in range(n)])
    RES = X[:, dt.math.log((count() + 1) / (sum(f.PAY) + 0.5) - 1), by(f.AGE)]
    assert RES.shape == (50, 2)
    data = RES.to_list()
    assert data[0] == list(range(1, 51))
    assert all(isinstance(x, float) for x in data[1])
def test_issue2348():
    DT = dt.Frame(A=[1, 2, 3, 1, 2, 3],
                  B=list('akdfnv'),
                  C=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
                  D=[11] * 6,
                  E=[2] * 6)
    # Check that these expressions do not crash
    DT[:, :, by(f.A), sort(f.A, f.E)]
    DT[:, :, by(f.A, f.B), sort(f.A, f.B)]
    assert_equals(DT[:, dt.count(), by(f.D), sort(f.E, f.A)],
                  dt.Frame([[11], [6]], names=["D", "count"],
                           stypes=[dt.int32, dt.int64]))
def test_sort_multicolumn1():
    # See issue #3141
    DT = dt.Frame(A=[111] * 100,
                  B=['a', 'b'] * 50,
                  C=['ads', 'adfv', 'adfv', 'adsfv'] * 25)
    RES1 = DT[:, dt.count(), dt.by(f.A, f.B, f.C)]
    assert_equals(RES1, dt.Frame(A=[111, 111, 111, 111],
                                 B=['a', 'a', 'b', 'b'],
                                 C=['adfv', 'ads', 'adfv', 'adsfv'],
                                 count=[25, 25, 25, 25] / dt.int64))
    RES2 = DT[:, dt.count(), dt.by(f.B, f.C)]
    assert_equals(RES2, dt.Frame(B=['a', 'a', 'b', 'b'],
                                 C=['adfv', 'ads', 'adfv', 'adsfv'],
                                 count=[25, 25, 25, 25] / dt.int64))
    RES3 = DT[:, dt.count(), dt.by(f.A, f.B)]
    assert_equals(RES3, dt.Frame(A=[111, 111],
                                 B=['a', 'b'],
                                 count=[50, 50] / dt.int64))
def test_group_empty_frame2():
    DT = dt.Frame(A=[])
    D1 = DT[:, count(), by(f.A)]
    frame_integrity_check(D1)
    assert D1.shape == (0, 2)
    assert D1.stypes == (DT.stype, dt.int64)
def test_group_boolean2():
    DT = dt.Frame(A=[True, False, False] * 500 + [None, True])
    DTR = DT[:, count(), by(f.A)]
    assert_equals(DTR, dt.Frame(A=[None, False, True],
                                count=[1, 1000, 501],
                                stypes={"count": dt.int64}))
          time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())),
          chk_time_sec=chkt, on_disk=on_disk)
print(ans.head(3), flush=True)
print(ans.tail(3), flush=True)
del ans

question = 'sum v3 count by id1:id6'  # q10
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {'v3': sum(f.v3), 'count': count()},
        by(f.id1, f.id2, f.id3, f.id4, f.id5, f.id6)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v3), sum(f.count)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,
Here is the basic syntax representation of a datatable frame: DT[I, J, BY|SORT|JOIN]

We create a sample dataframe with a single column, **languages**, and count how many students are interested in learning each language category, using by together with aggregations such as count, min, max, mean, etc. The function **count** calculates the number of observations; let us see how it works below.

prog_lang_dt = dt.Frame(languages=['html', 'R', 'R', 'html', 'R', 'javascript', 'R', 'javascript', 'html'])
prog_lang_dt

prog_lang_dt[:, count(), by(f.languages)]

If we would like to rename the count column as total, it can be done as follows:

prog_lang_dt[:, {'total': count()}, by(f.languages)]

**count** can also take a column name as an argument and report the number of non-missing entries in that specific column. For this example we will create a new dataframe, Y.

data = """
id|charges|payment_method
634-VHG|28|Cheque
365-DQC|33.5|Credit card
264-PPR|631|--
845-AJO|42.3|
789-KPO|56.9|Bank Transfer
"""
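As a rough sketch of that idea, assuming the pipe-delimited sample above is parsed with dt.fread and that "--" is intended as a missing-value marker (an assumption on our part; blank fields are treated as missing by default):

import datatable as dt
from datatable import f, by, count

# Read the sample text defined above; '--' is mapped to a missing value.
Y = dt.fread(data, na_strings=["--"])

# count() with no argument counts rows, while count(f.payment_method)
# counts only the non-missing entries of that column.
Y[:, [count(), count(f.payment_method)]]

# Non-missing charges per payment method group.
Y[:, count(f.charges), by(f.payment_method)]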
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str],
           dt.Frame, List[dt.Frame],
           np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame],
           Dict[str, str],  # {data set names : paths}
           Dict[str, dt.Frame],  # {data set names : dt frames}
           Dict[str, np.ndarray],  # {data set names : np arrays}
           Dict[str, pd.DataFrame],  # {data set names : pd frames}
           ]:
    # Download files
    # Location in DAI file system where we will save the data set
    temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
    os.makedirs(temp_path, exist_ok=True)

    # URLs of the IMDb datasets
    link_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
    link_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"
    link_episodes = "https://datasets.imdbws.com/title.episode.tsv.gz"

    # Download the files
    file_basics = download(link_basics, dest_path=temp_path)
    file_ratings = download(link_ratings, dest_path=temp_path)
    file_episodes = download(link_episodes, dest_path=temp_path)

    # Parse the downloaded files
    basics = dt.fread(file_basics, fill=True)
    ratings = dt.fread(file_ratings, fill=True)
    episodes = dt.fread(file_episodes, na_strings=['\\N'], fill=True)

    # remove files
    os.remove(file_basics)
    os.remove(file_ratings)
    os.remove(file_episodes)

    # Create Title with Ratings dataset
    # join titles with non-null ratings
    ratings = ratings[~dt.isna(dt.f.averageRating), :]
    ratings.key = "tconst"
    basics_ratings = basics[:, :, dt.join(ratings)]

    # Create Episodes dataset
    episodes = episodes[~dt.isna(dt.f.seasonNumber) & ~dt.isna(dt.f.episodeNumber), :]
    episode_ratings = episodes[:, :, dt.join(ratings)]
    episode_ratings.names = {'tconst': 'episodeTconst',
                             'parentTconst': 'tconst',
                             'averageRating': 'episodeAverageRating',
                             'numVotes': 'episodeNumVotes'}
    basics_ratings.key = 'tconst'
    title_episode_ratings = episode_ratings[:, :, dt.join(basics_ratings)]

    # enumerate series episodes from 1 to N
    title_episode_ratings = title_episode_ratings[:, :, dt.sort(dt.f.tconst,
                                                                dt.f.seasonNumber,
                                                                dt.f.episodeNumber)]
    result = title_episode_ratings[:, dt.count(), dt.by(dt.f.tconst)][:, 'count'].to_list()
    from itertools import chain
    cumcount = chain.from_iterable([i + 1 for i in range(n)] for n in result[0])
    title_episode_ratings['episodeSequence'] = dt.Frame(tuple(cumcount))

    # return datasets
    return {f"imdb_title_ratings": basics_ratings,
            f"imdb_episode_ratings": title_episode_ratings}
# Output:
#   dataset with downsampled majority class

from sklearn.utils import resample
import pandas as pd
from datatable import f, count, sort, by, min, max, rbind

# parameters
target_col = "Known_Fraud"
times = 5
random_seed = 123
new_dataset_name = "new_dataset_name_with_downsampled_majority"

# counts by target groups
g = X[:, {"count": count()}, by(target_col)]
if not g.shape[1] == 2:
    raise ValueError(
        "Not a binary target - target column must contain exactly 2 values.")

# find sizes and target values for minority and majority class partitions
n_minority = g[:, min(f.count)][0, 0]
n_majority = g[:, max(f.count)][0, 0]
target_minority = g[f.count == n_minority, target_col][0, 0]
target_majority = g[f.count == n_majority, target_col][0, 0]

# validate that times indeed downsamples majority class
if times * n_minority >= n_majority:
    raise ValueError(
        "Downsampling coefficient `times` is too large: downsampled dataset results in inflated majority class.")
# ~ 3b ~
# Investigate Unknown HPO Codes
# For each code listed in 'hpo_codes_not_found', investigate the code to
# determine how to reconcile the invalid code. In some instances, you can
# add a new record in `rd3_phenotype`, but some codes may have changed or
# are no longer used. If you are still unable to find an answer, contact
# the data provider.
#

# prep "HPO codes to verify" dataset
hpoCodes = dt.Frame(
    hpo_codes_not_found,
    types={'id': str, 'hpo': str}
)[:, {
    'frequency': count(),
    'url': None,
    'status': 'not.found',
    'action': None
}, by(f.hpo)]

# set url for HPO code if it exists
hpoCodes['url'] = dt.Frame([
    f'http://purl.obolibrary.org/obo/{d}'
    for d in hpoCodes['hpo'].to_list()[0]
])

# manually check each link and search for code. Either add the code or follow
# up with SolveRD project data coordinators.
rbind(
    fread('data/unknown_hpo_codes.csv'),
    hpoCodes
).to_csv('data/unknown_hpo_codes.csv')
'TMIN': "temp_min", 'TMAX': "temp_max"} weather_dt weather_dt.key="stop_date" # count the number of missing values policia_dt.countna() del policia_dt[:,['county_name', 'state']] # glance policia_dt policia_dt[:,count(),by(f.driver_gender)] policia_tidy_dt = policia_dt[~dt.isna(f.driver_gender),:] policia_tidy_dt[:,count(),by(f.violation) ][:,f[:].extend({'grand_tot':dt.sum(f.count)}) ][:,f[:].extend({'prop':f.count/f.grand_tot}) ][:,f[:].remove(f.grand_tot),sort(-f.prop) ] # custom function to generate a summary report per a single group column def py_dt_one_group_proportions_summary(DT,por): DT_summary = DT[:,dt.count(),by(f[por]) ][:,f[:].extend({'grand_tot':dt.sum(f.count)}) ][:,f[:].extend({'prop':f.count/f.grand_tot})
gains = defaultdict(list)
for experimento in os.listdir(f'../experimentos/'):
    for file in os.listdir(f'../experimentos/{experimento}/'):
        if file.endswith('importance.csv'):
            with open(f'../experimentos/{experimento}/{file}') as csvfile:
                reader = csv.DictReader(csvfile)
                for row in reader:
                    weights[row['variable']].append(float(row['weight']))
                    gains[row['variable']].append(float(row['gain']))

weights = pad_dict_list(weights)
gains = pad_dict_list(gains)

df_weights = dt.Frame(weights).mean().to_pandas().T
df_gains = dt.Frame(gains).mean().to_pandas().T
df_count = dt.Frame(weights)[:, dt.count(dt.f[:])].to_pandas().T

df = pd.DataFrame(columns=['variable', 'weight', 'gain'])
df['variable'] = df_weights.index.values
df['gain'] = df_gains[0].values
df['count'] = df_count[0].values
df['weight'] = df_weights[0].values
df = df.sort_values(by=['gain', 'count', 'weight'], ascending=False)
df.to_csv('../analisis_feature_importance.csv', index=False)
# step - 2 : finding a max value of body_mass of penguins per sex
penguins_dt[:, update(temp=f.body_mass_g == dt.max(f.body_mass_g)), by(f.sex)]
penguins_dt[f.temp == 1, f[:].remove(f.temp)]

# step - 1 : finding a min value of body_mass of penguins per sex
penguins_dt[:, update(temp=f.body_mass_g == dt.min(f.body_mass_g)), by(f.sex)]
penguins_dt[f.temp == 1, f[:].remove(f.temp)]

del penguins_dt["temp"]

penguins_tidy_dt = penguins_dt[~dt.isna(f.sex), :]

penguins_year_island = penguins_tidy_dt[:, {'total': count()}, by(f.year, f.island)]
penguins_year = penguins_year_island[:, {'gr_total': dt.sum(f.total)}, by(f.year)]
penguins_year

penguins_year.key = "year"
penguins_year_island = penguins_year_island[:, :, dt.join(penguins_year)]
penguins_year_island[:, update(perc=f.total / f.gr_total)]
penguins_year_island
def score(self,
          actual: np.array,
          predicted: np.array,
          sample_weight: typing.Optional[np.array] = None,
          labels: typing.Optional[List[any]] = None,
          X: typing.Optional[dt.Frame] = None,
          **kwargs) -> float:
    # Get the logger if it exists
    logger = self.get_experiment_logger()

    # hard-coded as access to experiment parameters (such as self.tgc) not yet available
    tgc = ["Store", "Dept"]
    # tgc = ["state"]
    # tgc = None

    # enable weighted average over TS R2 scores: weighted based on TS share of rows
    isR2AverageWeighted = False

    # obtain a scorer for metric to use
    scorer = self.get_scorer()

    if tgc is None or not all(col in X.names for col in tgc):
        loggerinfo(
            logger,
            f"TS R2 computes single R2 on {X.nrows} rows as either tgc {tgc} is not defined or incorrect.")
        return scorer.score(actual, predicted, sample_weight, labels, **kwargs)
    else:
        tgc_values = X[:, {"weight": count() / X.nrows, "r2": 0.0}, by(tgc)]
        loggerinfo(
            logger,
            f"TS R2 computes multiple R2 on {X.nrows} rows, tgc {tgc} with weighting is {isR2AverageWeighted}.")
        none_values = [None] * X.nrows
        X = cbind(
            X[:, tgc],
            Frame(actual=actual,
                  predicted=predicted,
                  sample_weight=sample_weight if sample_weight is not None else none_values))

        for i in range(0, tgc_values.nrows):
            current_tgc = tgc_values[i, :]
            current_tgc.key = tgc
            ts_frame = X[:, :, join(current_tgc)][~isna(f.r2), :]
            r2_score = scorer.score(
                ts_frame['actual'].to_numpy(),
                ts_frame['predicted'].to_numpy(),
                ts_frame['sample_weight'].to_numpy() if sample_weight is not None else None,
                labels,
                **kwargs)
            tgc_values[i, f.r2] = r2_score
            loggerinfo(
                logger,
                f"TS R2 = {r2_score} on {ts_frame.nrows} rows, tgc = {current_tgc[0, tgc].to_tuples()}")

        if isR2AverageWeighted:
            # return np.average(tgc_values["r2"].to_numpy(), weights=tgc_values["weight"].to_numpy())
            return tgc_values[:, mean(f.r2 * f.weight)][0, 0]
        else:
            return tgc_values[:, mean(f.r2)][0, 0]
weather_dt = dt.fread(
    'https://assets.datacamp.com/production/repositories/1497/datasets/02f3fb2d4416d3f6626e1117688e0386784e8e55/weather.csv',
    na_strings=[""])

policia_dt
weather_dt

# count the number of missing values
policia_dt.countna()

del policia_dt[:, ['county_name', 'state']]

policia_dt

policia_dt[:, count(), by(f.driver_gender)]

policia_tidy_dt = policia_dt[~dt.isna(f.driver_gender), :]
policia_tidy_df = policia_tidy_dt.to_pandas()
policia_tidy_df.info()
policia_tidy_df

policia_tidy_dt[:, count(), by(f.violation)
                ][:, f[:].extend({'grand_tot': dt.sum(f.count)})
                ][:, f[:].extend({'prop': f.count / f.grand_tot})
                ][:, f[:].remove(f.grand_tot), sort(-f.prop)]
def test_group_empty_frame3():
    DT = dt.Frame(A=[], stype=dt.float32)
    D2 = DT[:, count(f.A), by(f.A)]
    frame_integrity_check(D2)
    assert D2.shape == (0, 2)
    assert D2.stypes == (DT.stype, dt.int64)
def test_count_with_i():
    # See issue 1316
    DT = dt.Frame(A=range(100))
    assert DT[:5, count()][0, 0] == 5
    assert DT[-12:, count()][0, 0] == 12
    assert DT[::3, count()][0, 0] == 34
def test_group_boolean3():
    DT = dt.Frame(A=[True] * 1234)
    DTR = DT[:, count(), by(f.A)]
    assert_equals(DTR, dt.Frame(A=[True], count=[1234],
                                stypes={"count": dt.int64}))
def test_count_2d_array_integer():
    a_in = [[9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1],
            [0, 1, 0, 5, 3, 8, 1, 0, 2, 5, None, 8, 1]]
    a_reduce = count(a_in)
    assert a_reduce == 2
from datatable import f, by, count, update, sort, join
import re

dt.options.display.head_nrows = 4
dt.options.display.tail_nrows = 4
dt.init_styles()

# Importing data
amigos_info_dt = dt.fread('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-08/friends_info.csv')
amigos_dt = dt.fread('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-08/friends.csv')

# Glance
amigos_info_dt

# Seasons
amigos_info_dt[:, count(), by(f.season)]

# Unique episodes per season
amigos_info_dt[:, count(), by(f.season, f.episode)
               ][:, {'unique_episodes': count()}, by(f.season)]

# average views and ratings per season
amigos_info_dt[:, dt.mean(f[-2:]), by(f.season)]

# Highest rating title
amigos_info_dt[f.imdb_rating == dt.max(f.imdb_rating), :]

# lowest rating title
amigos_info_dt[f.imdb_rating == dt.min(f.imdb_rating), :]
#ans = x[:, {"r2": cor(v1, v2)^2}, by(f.id2, f.id4)] #print(ans.shape, flush=True) #t = timeit.default_timer() - t_start #m = memory_usage() #t_start = timeit.default_timer() #chk = ans[:, sum(f.r2)] #chkt = timeit.default_timer() - t_start #write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt) #print(ans.head(3).to_pandas(), flush=True) #print(ans.tail(3).to_pandas(), flush=True) #del ans question = "sum v3 count by id1:id6" # q10 gc.collect() t_start = timeit.default_timer() ans = x[:, {"v3": sum(f.v3), "count": count()}, by(f.id1, f.id2, f.id3, f.id4, f.id5, f.id6)] print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = ans[:, [sum(f.v3), sum(f.count)]] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt) del ans gc.collect() t_start = timeit.default_timer() ans = x[:, {"v3": sum(f.v3), "count": count()}, by(f.id1, f.id2, f.id3, f.id4, f.id5, f.id6)] print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer()
def test_groups_large1():
    n = 251 * 4000
    xs = [(i * 19) % 251 for i in range(n)]
    f0 = dt.Frame({"A": xs})
    f1 = f0[:, count(), by("A")]
    assert f1.to_list() == [list(range(251)), [4000] * 251]
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str],
           dt.Frame, List[dt.Frame],
           np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame],
           Dict[str, str],  # {data set names : paths}
           Dict[str, dt.Frame],  # {data set names : dt frames}
           Dict[str, np.ndarray],  # {data set names : np arrays}
           Dict[str, pd.DataFrame],  # {data set names : pd frames}
           ]:
    # define date column and forecast horizon
    date_col = 'date'
    group_by_cols = ["state"]
    forecast_len = 7

    # state codes lookup table
    us_state_codes = dt.Frame(
        code=['AL', 'AK', 'AS', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL',
              'GA', 'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME',
              'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
              'NM', 'NY', 'NC', 'ND', 'MP', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI',
              'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VI', 'VA', 'WA', 'WV', 'WI', 'WY'],
        state=['Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas',
               'California', 'Colorado', 'Connecticut', 'Delaware',
               'District of Columbia', 'Florida', 'Georgia', 'Guam', 'Hawaii',
               'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
               'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
               'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
               'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
               'North Carolina', 'North Dakota', 'Northern Mariana Islands',
               'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico',
               'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
               'Texas', 'Utah', 'Vermont', 'Virgin Islands', 'Virginia',
               'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'])
    us_state_codes.key = "state"

    # get states population lookup table
    us_states_pop = dt.fread(
        "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv")
    us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
    us_states_pop = us_states_pop[dt.f.STATE > 0, :]
    us_states_pop.key = "state"

    # join state codes and population into single lookup table
    us_states_pop[:, dt.update(code=dt.g.code), dt.join(us_state_codes)]
    us_states_pop.key = "code"

    # US Covid Tracking API: https://covidtracking.com/data/api
    us_states = dt.fread("https://covidtracking.com/api/v1/states/daily.csv")

    # remove deprecated fields
    deprecated = ['checkTimeEt', 'commercialScore', 'dateChecked', 'dateModified',
                  'grade', 'hash', 'hospitalized', 'negativeIncrease',
                  'negativeRegularScore', 'negativeScore', 'posNeg',
                  'positiveScore', 'score', 'total']
    us_states = us_states[:, list(set(us_states.names) - set(deprecated))]
    us_states.names = {'state': 'code'}

    series_cols = ["positive", "negative", "hospitalizedCumulative",
                   "inIcuCumulative", "onVentilatorCumulative", "recovered", "death"]
    aggs = {f"{col}100k": f[col] / (g.pop / 100000) for col in series_cols}
    us_states[:, dt.update(state=g.state, pop=g.pop, pop100k=g.pop / 10000, **aggs),
              join(us_states_pop)]
    us_states = us_states[~dt.isna(dt.f.state), :]

    # produce lag of 1 unit and add as new feature for each shift column
    series_cols.extend([col + "100k" for col in series_cols])
    aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
    us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

    # update NA lags
    aggs = {f"{col}_yesterday": 0 for col in series_cols}
    us_states[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

    aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in series_cols}
    us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

    for col in series_cols:
        del us_states[:, f[f"{col}_yesterday"]]
    # validate dataset
    if us_states[:, count(), by(dt.f.state, f.date)][f.count > 1, :].shape[0] > 1:
        raise ValueError("Found duplicate elements for the same date and state.")

    # determine threshold to split train and test based on forecast horizon
    dates = dt.unique(us_states[:, date_col])
    split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
    test_date = dates[-1, :, dt.sort(date_col)][0, 0]

    # split data to honor forecast horizon in test set
    df = us_states[date_col].to_pandas()
    train = us_states[df[date_col] <= split_date, :]
    test = us_states[df[date_col] > split_date, :]

    return {f"covidtracking_daily_{split_date}_by_us_states_train": train,
            f"covidtracking_daily_{test_date}_by_us_states_test": test}
def test_count_array_integer():
    a_in = [9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1]
    a_reduce = count(a_in)
    assert a_reduce == 10
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str],
           dt.Frame, List[dt.Frame],
           np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config
    import bz2

    def extract_bz2(file, output_file):
        zipfile = bz2.BZ2File(file)
        data = zipfile.read()
        open(output_file, 'wb').write(data)

    temp_path = os.path.join(config.data_directory,
                             config.contrib_relative_directory, "airlines")
    os.makedirs(temp_path, exist_ok=True)

    # specify which years are used for training and testing
    training = [2007]
    testing = [2008]

    # download and unzip files
    files = []
    for f in ["%d.csv.bz2" % year for year in training + testing]:
        link = "http://stat-computing.org/dataexpo/2009/%s" % f
        file = download(link, dest_path=temp_path)
        output_file = file.replace(".bz2", "")
        extract_bz2(file, output_file)
        files.append(output_file)

    # parse with datatable
    X = dt.rbind(*[dt.fread(x) for x in files])

    # add date
    date_col = 'Date'
    X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f['DayofMonth']
    cols_to_keep = ['Date']

    # add number of flights in/out for each airport per given interval
    timeslice_mins = 60
    for name, new_col, col, group in [
        ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
        ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
    ]:
        X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
        group_cols = [date_col, group, new_col]
        new_name = 'flights_%s' % name
        flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
        flights.key = group_cols
        cols_to_keep.append(new_name)
        X = X[:, :, dt.join(flights)]

    # Fill NaNs with 0s
    X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0

    cols_to_keep.extend([
        'DepDelay',
        'Year', 'Month', 'DayofMonth', 'DayOfWeek',
        'CRSDepTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
        'CRSElapsedTime', 'Origin', 'Dest', 'Distance',
        # Leaks for delay
        # 'DepTime',
        # 'ArrTime', #'CRSArrTime',
        # 'ActualElapsedTime',
        # 'AirTime', #'ArrDelay', #'DepDelay',
        # 'TaxiIn', #'TaxiOut', #'Cancelled', #'CancellationCode', #'Diverted', #'CarrierDelay',
        # #'WeatherDelay', #'NASDelay', #'SecurityDelay', #'LateAircraftDelay',
    ])
    X = X[:, cols_to_keep]

    # Join in some extra info
    join_files = [('UniqueCarrier', 'carriers.csv', 'Code'),
                  ('Origin', 'airports.csv', 'iata'),
                  ('Dest', 'airports.csv', 'iata'),
                  ('TailNum', 'plane-data.csv', 'tailnum')]

    for join_key, file, col in join_files:
        file = download('http://stat-computing.org/dataexpo/2009/%s' % file,
                        dest_path=temp_path)
        X_join = dt.fread(file, fill=True)
        X_join.names = {col: join_key}
        X_join.names = [join_key] + [join_key + "_" + x
                                     for x in X_join.names if x != join_key]
        X_join.key = join_key
        X = X[:, :, dt.join(X_join)]
        del X[:, join_key]

    split = False

    if not split:
        filename = os.path.join(
            temp_path,
            "flight_delays_regression_%d-%d.jay" % (min(training), max(testing)))
        X.to_jay(filename)
        return filename
    else:
        # prepare splits (by year) and create binary .jay files for import into Driverless AI
        output_files = []
        for condition, name in [
            ((min(training) <= dt.f['Year']) & (dt.f['Year'] <= max(training)), 'training'),
            ((min(testing) <= dt.f['Year']) & (dt.f['Year'] <= max(testing)), 'test'),
        ]:
            X_split = X[condition, :]
            filename = os.path.join(temp_path, "flight_delays_%s.jay" % name)
            X_split.to_jay(filename)
            output_files.append(filename)
        return output_files
def test_count_array_string():
    a_in = [None, "blue", "green", "indico", None, None, "orange",
            "red", "violet", "yellow", "green", None, "blue"]
    a_reduce = count(a_in)
    assert a_reduce == 9
ans = x[:, {"r2": corr(f.v1, f.v2)**2}, by(f.id2, f.id4)] print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = ans[:, sum(f.r2)] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk) print(ans.head(3).to_pandas(), flush=True) print(ans.tail(3).to_pandas(), flush=True) del ans question = "sum v3 count by id1:id6" # q10 gc.collect() t_start = timeit.default_timer() ans = x[:, {"v3": sum(f.v3), "count": count()}, by(f.id1, f.id2, f.id3, f.id4, f.id5, f.id6)] print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer() chk = ans[:, [sum(f.v3), sum(f.count)]] chkt = timeit.default_timer() - t_start write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=1, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt, on_disk=on_disk) del ans gc.collect() t_start = timeit.default_timer() ans = x[:, {"v3": sum(f.v3), "count": count()}, by(f.id1, f.id2, f.id3, f.id4, f.id5, f.id6)] print(ans.shape, flush=True) t = timeit.default_timer() - t_start m = memory_usage() t_start = timeit.default_timer()
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str],
           dt.Frame, List[dt.Frame],
           np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config
    import bz2

    def extract_bz2(file, output_file):
        zipfile = bz2.BZ2File(file)
        data = zipfile.read()
        open(output_file, 'wb').write(data)

    temp_path = os.path.join(user_dir(), "recipe_tmp", "airlines")
    os.makedirs(temp_path, exist_ok=True)
    dt.options.nthreads = 8

    # specify which years are used for training and testing
    training = list(range(2005, 2008))
    testing = [2008]

    # download and unzip files
    files = []
    for f in ["%d.csv.bz2" % year for year in training + testing]:
        link = AirlinesData.base_url + "%s" % f
        file = download(link, dest_path=temp_path)
        output_file = file.replace(".bz2", "")
        if not os.path.exists(output_file):
            extract_bz2(file, output_file)
        files.append(output_file)

    # parse with datatable
    X = dt.rbind(*[dt.fread(x) for x in files])

    # add date
    date_col = 'Date'
    X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f['DayofMonth']
    cols_to_keep = ['Date']

    # add number of flights in/out for each airport per given interval
    timeslice_mins = 60
    for name, new_col, col, group in [
        ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
        ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
    ]:
        X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
        group_cols = [date_col, group, new_col]
        new_name = 'flights_%s_per_%d_min' % (name, timeslice_mins)
        flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
        flights.key = group_cols
        cols_to_keep.append(new_name)
        X = X[:, :, dt.join(flights)]

    # select flights leaving from SFO only
    X = X[dt.f['Origin'] == 'SFO', :]

    # Fill NaNs in DepDelay column
    X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0

    # create binary target column
    depdelay_threshold_mins = 15
    target = 'DepDelay%dm' % depdelay_threshold_mins
    X[:, target] = dt.f['DepDelay'] > depdelay_threshold_mins

    cols_to_keep.extend([
        target,
        'Year', 'Month', 'DayofMonth', 'DayOfWeek',
        'CRSDepTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
        'CRSElapsedTime', 'Origin', 'Dest', 'Distance',
        # Leaks for delay
        # 'DepTime',
        # 'ArrTime', #'CRSArrTime',
        # 'ActualElapsedTime',
        # 'AirTime', #'ArrDelay', #'DepDelay',
        # 'TaxiIn', #'TaxiOut', #'Cancelled', #'CancellationCode', #'Diverted', #'CarrierDelay',
        # #'WeatherDelay', #'NASDelay', #'SecurityDelay', #'LateAircraftDelay',
    ])
    X = X[:, cols_to_keep]

    # Join in some extra info
    join_files = [('UniqueCarrier', 'carriers.csv', 'Code'),
                  ('Origin', 'airports.csv', 'iata'),
                  ('Dest', 'airports.csv', 'iata'),
                  ('TailNum', 'plane-data.csv', 'tailnum')]

    for join_key, file, col in join_files:
        file = download(
            'https://0xdata-public.s3.amazonaws.com/data_recipes_data/%s' % file,
            dest_path=temp_path)
        X_join = dt.fread(file, fill=True)
        X_join.names = {col: join_key}
        X_join.names = [join_key] + [join_key + "_" + x
                                     for x in X_join.names if x != join_key]
        X_join.key = join_key
        X = X[:, :, dt.join(X_join)]
        del X[:, join_key]

    split = True

    if not split:
        filename = os.path.join(
            temp_path,
            "flight_delays_data_recipe_%d-%d.csv" % (min(training), max(testing)))
        X.to_csv(filename)
        return filename
    else:
        # prepare splits (by year) and create binary .jay files for import into Driverless AI
        output_files = []
        for condition, name in [
            ((min(training) <= dt.f['Year']) & (dt.f['Year'] <= max(training)), 'training'),
            ((min(testing) <= dt.f['Year']) & (dt.f['Year'] <= max(testing)), 'test'),
        ]:
            X_split = X[condition, :]
            filename = os.path.join(
                temp_path,
                "augmented_flights_%s-%d_%s.csv" % (X_split[:, 'Year'].min1(),
                                                    X_split[:, 'Year'].max1(), name))
            X_split.to_csv(filename)
            output_files.append(filename)

        return output_files
#t = timeit.default_timer() - t_start
#m = memory_usage()
#t_start = timeit.default_timer()
#chk = ans[:, sum(f.r2)]
#chkt = timeit.default_timer() - t_start
#write_log(task=task, data=data_name, in_rows=x.shape[0], question=question, out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution, version=ver, git=git, fun=fun, run=2, time_sec=t, mem_gb=m, cache=cache, chk=make_chk(flatten(chk.to_list())), chk_time_sec=chkt)
#print(ans.head(3).to_pandas(), flush=True)
#print(ans.tail(3).to_pandas(), flush=True)
#del ans

question = "sum v3 count by id1:id6"  # q10
gc.collect()
t_start = timeit.default_timer()
ans = x[:, {"v3": sum(f.v3), "count": count()},
        by(f.id1, f.id2, f.id3, f.id4, f.id5, f.id6)]
print(ans.shape, flush=True)
t = timeit.default_timer() - t_start
m = memory_usage()
t_start = timeit.default_timer()
chk = ans[:, [sum(f.v3), sum(f.count)]]
chkt = timeit.default_timer() - t_start
write_log(task=task, data=data_name, in_rows=x.shape[0], question=question,
          out_rows=ans.shape[0], out_cols=ans.shape[1], solution=solution,