def test_assign_string_columns():
    f0 = dt.Frame(A=["One", "two", "three", None, "five"])
    f0[dt.isna(f.A), f.A] = dt.Frame(["FOUR"])
    assert f0.names == ("A",)
    assert f0.stypes == (dt.stype.str32,)
    assert f0.to_list() == [["One", "two", "three", "FOUR", "five"]]
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    # Get column names
    orig_cols = list(X.names)

    from h2oaicore.tensorflow_dynamic import got_cpu_tf, got_gpu_tf
    import tensorflow as tf
    import shap
    import scipy
    import pandas as pd

    self.setup_keras_session()

    import h2oaicore.keras as keras
    import matplotlib.pyplot as plt

    if not hasattr(self, 'save_model_path'):
        model_id = str(uuid.uuid4())[:8]
        self.save_model_path = os.path.join(user_dir(), "custom_xnn_model.hdf5")

    np.random.seed(self.random_state)
    my_init = keras.initializers.RandomUniform(seed=self.random_state)

    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir)

    # Set up temp folder
    tmp_folder = self._create_tmp_folder(logger)

    # define base model
    def xnn_initialize(features, ridge_functions=3, arch=[20, 12], learning_rate=0.01,
                       bg_samples=100, beta1=0.9, beta2=0.999, dec=0.0, ams=True,
                       bseed=None, is_categorical=False):
        #
        # Prepare model architecture
        #
        # Input to the network: an observation containing all the features
        input = keras.layers.Input(shape=(features,), name='main_input')

        # Record current column names
        loggerinfo(logger, "XNN LOG")
        loggerdata(logger, "Feature list:")
        loggerdata(logger, str(orig_cols))

        # Input to ridge function number i is the dot product of our original
        # input vector times coefficients
        ridge_input = keras.layers.Dense(ridge_functions, name="projection_layer",
                                         activation='linear')(input)

        ridge_networks = []
        # Each subnetwork uses only 1 neuron from the projection layer as input,
        # so we need to split it
        ridge_inputs = SplitLayer(ridge_functions)(ridge_input)
        for i, ridge_input in enumerate(ridge_inputs):
            # Generate subnetwork i
            mlp = _mlp(ridge_input, i, arch)
            ridge_networks.append(mlp)

        added = keras.layers.Concatenate(name='concatenate_1')(ridge_networks)

        # Add the correct output layer for the problem
        if is_categorical:
            out = keras.layers.Dense(1, activation='sigmoid',
                                     input_shape=(ridge_functions,),
                                     name='main_output')(added)
        else:
            out = keras.layers.Dense(1, activation='linear',
                                     input_shape=(ridge_functions,),
                                     name='main_output')(added)

        model = keras.models.Model(inputs=input, outputs=out)
        optimizer = keras.optimizers.Adam(lr=learning_rate, beta_1=beta1,
                                          beta_2=beta2, decay=dec, amsgrad=ams)

        # Use the correct loss for the problem
        if is_categorical:
            model.compile(loss={'main_output': 'binary_crossentropy'}, optimizer=optimizer)
        else:
            model.compile(loss={'main_output': 'mean_squared_error'}, optimizer=optimizer)

        return model

    def _mlp(input, idx, arch=[20, 12], activation='relu'):
        # Set up a subnetwork

        # Hidden layers
        mlp = keras.layers.Dense(arch[0], activation=activation,
                                 name='mlp_{}_dense_0'.format(idx),
                                 kernel_initializer=my_init)(input)
        for i, layer in enumerate(arch[1:]):
            mlp = keras.layers.Dense(layer, activation=activation,
                                     name='mlp_{}_dense_{}'.format(idx, i + 1),
                                     kernel_initializer=my_init)(mlp)

        # Output of the MLP
        mlp = keras.layers.Dense(1, activation='linear',
                                 name='mlp_{}_dense_last'.format(idx),
                                 kernel_regularizer=keras.regularizers.l1(1e-3),
                                 kernel_initializer=my_init)(mlp)
        return mlp

    def get_shap(X, model):
        # Calculate the Shap values
        np.random.seed(24)
        bg_samples = min(X.shape[0], 1000)
        if isinstance(X, pd.DataFrame):
            background = X.iloc[np.random.choice(X.shape[0], bg_samples, replace=False)]
        else:
            background = X[np.random.choice(X.shape[0], bg_samples, replace=False)]

        # Explain predictions of the model on the subset
        explainer = shap.DeepExplainer(model, background)
        shap_values = explainer.shap_values(X)

        # Return the mean absolute value of each shap value for each dataset
        xnn_shap = np.abs(shap_values[0]).mean(axis=0)

        return xnn_shap

    # Initialize the xnn's
    features = X.shape[1]
    orig_cols = list(X.names)

    if self.num_classes >= 2:
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)

        self.is_cat = True
        xnn1 = xnn_initialize(features=features, ridge_functions=features,
                              arch=self.params["arch"], learning_rate=self.params["lr"],
                              beta1=self.params["beta_1"], beta2=self.params["beta_1"],
                              dec=self.params["decay"], ams=self.params["amsgrad"],
                              is_categorical=self.is_cat)
        xnn = xnn_initialize(features=features, ridge_functions=features,
                             arch=self.params["arch"], learning_rate=self.params["lr"],
                             beta1=self.params["beta_1"], beta2=self.params["beta_1"],
                             dec=self.params["decay"], ams=self.params["amsgrad"],
                             is_categorical=self.is_cat)
    else:
        self.is_cat = False
        xnn1 = xnn_initialize(features=features, ridge_functions=features,
                              arch=self.params["arch"], learning_rate=self.params["lr"],
                              beta1=self.params["beta_1"], beta2=self.params["beta_1"],
                              dec=self.params["decay"], ams=self.params["amsgrad"],
                              is_categorical=self.is_cat)
        xnn = xnn_initialize(features=features, ridge_functions=features,
                             arch=self.params["arch"], learning_rate=self.params["lr"],
                             beta1=self.params["beta_1"], beta2=self.params["beta_1"],
                             dec=self.params["decay"], ams=self.params["amsgrad"],
                             is_categorical=self.is_cat)

    # Replace missing values with a value smaller than all observed values
    self.min = dict()
    for col in X.names:
        XX = X[:, col]
        self.min[col] = XX.min1()
        if self.min[col] is None or np.isnan(self.min[col]):
            self.min[col] = -1e10
        else:
            self.min[col] -= 1
        XX.replace(None, self.min[col])
        X[:, col] = XX
        assert X[dt.isna(dt.f[col]), col].nrows == 0
    X = X.to_numpy()

    inputs = {'main_input': X}
    validation_set = 0
    verbose = 0

    # Train the neural network once with early stopping and a validation set
    history = keras.callbacks.History()
    es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min')
    history = xnn1.fit(inputs, y, epochs=self.params["n_estimators"],
                       batch_size=self.params["batch_size"],
                       validation_split=0.3, verbose=verbose,
                       callbacks=[history, es])

    # Train again on the full data
    number_of_epochs_it_ran = len(history.history['loss'])
    xnn.fit(inputs, y, epochs=number_of_epochs_it_ran,
            batch_size=self.params["batch_size"],
            validation_split=0.0, verbose=verbose)

    # Get the mean absolute Shapley values
    importances = np.array(get_shap(X, xnn))

    int_output = {}
    int_weights = {}
    int_bias = {}
    int_input = {}

    original_activations = {}

    x_labels = list(map(lambda x: 'x' + str(x), range(features)))

    intermediate_output = []

    # Record and plot the projection weights
    weight_list = []
    for layer in xnn.layers:
        layer_name = layer.get_config()['name']
        if layer_name != "main_input":
            print(layer_name)
            weights = layer.get_weights()

            # Record the biases
            try:
                bias = layer.get_weights()[1]
                int_bias[layer_name] = bias
            except:
                print("No Bias")

            # Record outputs for the test set
            intermediate_layer_model = keras.models.Model(
                inputs=xnn.input, outputs=xnn.get_layer(layer_name).output)

            # Record the outputs from the training set
            if self.is_cat and (layer_name == 'main_output'):
                original_activations[layer_name] = scipy.special.logit(
                    intermediate_layer_model.predict(X))
                original_activations[layer_name + "_p"] = intermediate_layer_model.predict(X)
            else:
                original_activations[layer_name] = intermediate_layer_model.predict(X)

            # Record other weights, inputs, and outputs
            int_weights[layer_name] = weights
            int_input[layer_name] = layer.input
            int_output[layer_name] = layer.output

        # Plot the projection layers
        if "projection_layer" in layer.get_config()['name']:
            # print(layer.get_config()['name'])

            # Record the weights for each projection layer
            weights = [np.transpose(layer.get_weights()[0])]

            weight_list2 = []
            for i, weight in enumerate(weights[0]):
                weight_list.append(weight)
                weight_list2.append(list(np.reshape(weight, (1, features))[0]))

                # Plot weights
                plt.bar(orig_cols, abs(np.reshape(weight, (1, features))[0]),
                        1, color="blue")
                plt.ylabel("Coefficient value")
                plt.title("Projection Layer Weights {}".format(i), fontdict={'fontsize': 10})
                plt.xticks(rotation=90)
                plt.show()
                plt.savefig(os.path.join(tmp_folder, 'projection_layer_' + str(i) + '.png'),
                            bbox_inches="tight")
                plt.clf()

        if "main_output" in layer.get_config()['name']:
            weights_main = layer.get_weights()
            print(weights_main)

    pd.DataFrame(weight_list2).to_csv(os.path.join(tmp_folder, "projection_data.csv"),
                                      index=False)

    intermediate_output = []
    for feature_num in range(features):
        intermediate_layer_model = keras.models.Model(
            inputs=xnn.input,
            outputs=xnn.get_layer('mlp_' + str(feature_num) + '_dense_last').output)
        intermediate_output.append(intermediate_layer_model.predict(X))

    # Record and plot the ridge functions
    ridge_x = []
    ridge_y = []
    for weight_number in range(len(weight_list)):
        ridge_x.append(list(sum(X[:, ii] * weight_list[weight_number][ii]
                                for ii in range(features))))
        ridge_y.append(list(intermediate_output[weight_number]))

        plt.plot(sum(X[:, ii] * weight_list[weight_number][ii] for ii in range(features)),
                 intermediate_output[weight_number], 'o')
        plt.xlabel("Input")
        plt.ylabel("Subnetwork " + str(weight_number))
        plt.title("Ridge Function {}".format(weight_number), fontdict={'fontsize': 10})
        plt.show()
        plt.savefig(os.path.join(tmp_folder, 'ridge_' + str(weight_number) + '.png'))
        plt.clf()

    # Output the ridge function importance
    weights2 = np.array([item[0] for item in list(weights)[0]])
    output_activations = np.abs(np.array([
        item * weights2 for item in list(original_activations["concatenate_1"])
    ])).mean(axis=0)
    loggerinfo(logger, str(output_activations))
    pd.DataFrame(output_activations).to_csv(os.path.join(tmp_folder, "ridge_weights.csv"),
                                            index=False)

    plt.bar(x_labels, output_activations, 1, color="blue")
    plt.xlabel("Ridge function number")
    plt.ylabel("Feature importance")
    plt.title("Ridge function importance", fontdict={'fontsize': 10})
    plt.show()
    plt.savefig(os.path.join(tmp_folder, 'Ridge_function_importance.png'))

    pd.DataFrame(ridge_y).applymap(lambda x: x[0]).to_csv(
        os.path.join(tmp_folder, "ridge_y.csv"), index=False)
    pd.DataFrame(ridge_x).to_csv(os.path.join(tmp_folder, "ridge_x.csv"), index=False)
    pd.DataFrame(orig_cols).to_csv(os.path.join(tmp_folder, "input_columns.csv"),
                                   index=False)

    self.set_model_properties(model=xnn,
                              features=orig_cols,
                              importances=importances.tolist(),
                              iterations=self.params['n_estimators'])
def transform(self, X: dt.Frame):
    return X[:, dt.isna(dt.f[0])]
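Note that the dt.isna(...) expression sits in the j (column) position, so this transform returns a boolean is-missing indicator column rather than filtering rows. A minimal sketch of that behavior on a hypothetical frame:

import datatable as dt

DT = dt.Frame(A=[1, None, 3, None])
# isna in the j position yields a bool8 indicator column, one value per row
print(DT[:, dt.isna(dt.f[0])].to_list())  # [[False, True, False, True]]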
def test_assign_string_columns():
    DT = dt.Frame(A=["One", "two", "three", None, "five"])
    DT[dt.isna(f.A), f.A] = dt.Frame(["FOUR"])
    assert_equals(DT, dt.Frame(A=["One", "two", "three", "FOUR", "five"]))
def test_del_rows_nas():
    d0 = dt.Frame({"A": [1, 5, None, 12, 7, None, -3]})
    del d0[isna(f.A), :]
    frame_integrity_check(d0)
    assert d0.to_list() == [[1, 5, 12, 7, -3]]
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str],
           dt.Frame, List[dt.Frame],
           np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame],
           Dict[str, str],  # {data set names : paths}
           Dict[str, dt.Frame],  # {data set names : dt frames}
           Dict[str, np.ndarray],  # {data set names : np arrays}
           Dict[str, pd.DataFrame],  # {data set names : pd frames}
           ]:
    # Location in the DAI file system where we will save the data set
    temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
    os.makedirs(temp_path, exist_ok=True)

    # URLs of the IMDb data sets to download
    link_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
    link_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"
    link_episodes = "https://datasets.imdbws.com/title.episode.tsv.gz"

    # Download the files
    file_basics = download(link_basics, dest_path=temp_path)
    file_ratings = download(link_ratings, dest_path=temp_path)
    file_episodes = download(link_episodes, dest_path=temp_path)

    # Parse the downloaded files
    basics = dt.fread(file_basics, fill=True)
    ratings = dt.fread(file_ratings, fill=True)
    episodes = dt.fread(file_episodes, na_strings=['\\N'], fill=True)

    # Remove the downloaded files
    os.remove(file_basics)
    os.remove(file_ratings)
    os.remove(file_episodes)

    # Create titles-with-ratings dataset:
    # join titles with non-null ratings
    ratings = ratings[~dt.isna(dt.f.averageRating), :]
    ratings.key = "tconst"
    basics_ratings = basics[:, :, dt.join(ratings)]

    # Create episodes dataset
    episodes = episodes[~dt.isna(dt.f.seasonNumber) & ~dt.isna(dt.f.episodeNumber), :]
    episode_ratings = episodes[:, :, dt.join(ratings)]
    episode_ratings.names = {'tconst': 'episodeTconst',
                             'parentTconst': 'tconst',
                             'averageRating': 'episodeAverageRating',
                             'numVotes': 'episodeNumVotes'}
    basics_ratings.key = 'tconst'
    title_episode_ratings = episode_ratings[:, :, dt.join(basics_ratings)]

    # enumerate series episodes from 1 to N
    title_episode_ratings = title_episode_ratings[
        :, :, dt.sort(dt.f.tconst, dt.f.seasonNumber, dt.f.episodeNumber)]
    result = title_episode_ratings[:, dt.count(), dt.by(dt.f.tconst)][:, 'count'].to_list()

    from itertools import chain
    cumcount = chain.from_iterable([i + 1 for i in range(n)] for n in result[0])
    title_episode_ratings['episodeSequence'] = dt.Frame(tuple(cumcount))

    # return datasets
    return {"imdb_title_ratings": basics_ratings,
            "imdb_episode_ratings": title_episode_ratings}
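The episode-sequence step above relies on the frame being sorted by series: count rows per group, then chain together the runs 1..n. A toy sketch of the same trick (frame and values invented for illustration):

import datatable as dt
from itertools import chain

DT = dt.Frame(tconst=["a", "a", "a", "b", "b"])  # already sorted by group
# per-group row counts, in group order
counts = DT[:, dt.count(), dt.by(dt.f.tconst)][:, 'count'].to_list()[0]
# emit 1..n for each group and attach as a new column
DT['seq'] = dt.Frame(list(chain.from_iterable(range(1, n + 1) for n in counts)))
print(DT.to_list())  # [['a', 'a', 'a', 'b', 'b'], [1, 2, 3, 1, 2]]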
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str],
           dt.Frame, List[dt.Frame],
           np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame],
           Dict[str, str],  # {data set names : paths}
           Dict[str, dt.Frame],  # {data set names : dt frames}
           Dict[str, np.ndarray],  # {data set names : np arrays}
           Dict[str, pd.DataFrame],  # {data set names : pd frames}
           ]:
    # define date column and forecast horizon
    date_col = 'date'
    group_by_cols = ["state"]
    forecast_len = 7

    # state codes lookup table
    us_state_codes = dt.Frame(
        code=['AL', 'AK', 'AS', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL',
              'GA', 'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME',
              'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
              'NM', 'NY', 'NC', 'ND', 'MP', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI',
              'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VI', 'VA', 'WA', 'WV', 'WI', 'WY'],
        state=['Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas',
               'California', 'Colorado', 'Connecticut', 'Delaware',
               'District of Columbia', 'Florida', 'Georgia', 'Guam', 'Hawaii',
               'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
               'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
               'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
               'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
               'North Carolina', 'North Dakota', 'Northern Mariana Islands',
               'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico',
               'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
               'Texas', 'Utah', 'Vermont', 'Virgin Islands', 'Virginia',
               'Washington', 'West Virginia', 'Wisconsin', 'Wyoming'])
    us_state_codes.key = "state"

    # get states population lookup table
    us_states_pop = dt.fread(
        "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv")
    us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
    us_states_pop = us_states_pop[dt.f.STATE > 0, :]
    us_states_pop.key = "state"

    # join state codes and population into a single lookup table
    us_states_pop[:, dt.update(code=dt.g.code), dt.join(us_state_codes)]
    us_states_pop.key = "code"

    # US Covid Tracking API: https://covidtracking.com/data/api
    us_states = dt.fread("https://covidtracking.com/api/v1/states/daily.csv")

    # remove deprecated fields
    deprecated = ['checkTimeEt', 'commercialScore', 'dateChecked', 'dateModified',
                  'grade', 'hash', 'hospitalized', 'negativeIncrease',
                  'negativeRegularScore', 'negativeScore', 'posNeg',
                  'positiveScore', 'score', 'total']
    us_states = us_states[:, list(set(us_states.names) - set(deprecated))]
    us_states.names = {'state': 'code'}

    series_cols = ["positive", "negative", "hospitalizedCumulative",
                   "inIcuCumulative", "onVentilatorCumulative", "recovered", "death"]
    aggs = {f"{col}100k": f[col] / (g.pop / 100000) for col in series_cols}
    us_states[:, dt.update(state=g.state, pop=g.pop, pop100k=g.pop / 10000, **aggs),
              join(us_states_pop)]
    us_states = us_states[~dt.isna(dt.f.state), :]

    # produce lag of 1 unit and add as new feature for each shift column
    series_cols.extend([col + "100k" for col in series_cols])
    aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
    us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

    # update NA lags
    aggs = {f"{col}_yesterday": 0 for col in series_cols}
    us_states[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

    aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in series_cols}
    us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

    for col in series_cols:
        del us_states[:, f[f"{col}_yesterday"]]

    # validate dataset: no state/date combination may appear more than once
    if us_states[:, count(), by(f.state, f.date)][f.count > 1, :].shape[0] > 0:
        raise ValueError("Found duplicate elements for the same date and state.")

    # determine threshold to split train and test based on forecast horizon
    dates = dt.unique(us_states[:, date_col])
    split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
    test_date = dates[-1, :, dt.sort(date_col)][0, 0]

    # split data to honor forecast horizon in test set
    df = us_states[date_col].to_pandas()
    train = us_states[df[date_col] <= split_date, :]
    test = us_states[df[date_col] > split_date, :]

    return {f"covidtracking_daily_{split_date}_by_us_states_train": train,
            f"covidtracking_daily_{test_date}_by_us_states_test": test}
# X: datatable - primary data set
# Parameters:
#   time_col: date/time/int - time column to order rows before the shift op
#   group_by_cols: list of column names - group columns
#   shift_cols: list of column names - columns to shift
# Output:
#   dataset augmented with shifted columns

from datatable import f, by, sort, update, shift, isna

time_col = "date"
group_by_cols = ["state"]
shift_cols = ["cases", "deaths"]
new_dataset_name = "new_dataset_name_with_shift"

# produce lag of 1 unit and add as new feature for each shift column
aggs = {f"{col}_yesterday": shift(f[col]) for col in shift_cols}
X[:, update(**aggs), sort(time_col), by(*group_by_cols)]

# update NA lags
aggs = {f"{col}_yesterday": 0 for col in shift_cols}
X[isna(f[f"{shift_cols[0]}_yesterday"]), update(**aggs)]

# compute daily deltas from the lagged values
aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in shift_cols}
X[:, update(**aggs), sort(time_col), by(group_by_cols)]

# drop the intermediate lag columns
for col in shift_cols:
    del X[:, f[f"{col}_yesterday"]]

return {new_dataset_name: X}
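A small usage sketch of this lag recipe on a hypothetical frame (column values invented for illustration): each state's cases column is shifted within its group, NA lags become 0, and the daily value is the difference.

import datatable as dt
from datatable import f, by, sort, update, shift, isna

X = dt.Frame(date=[1, 2, 1, 2], state=["CA", "CA", "NY", "NY"], cases=[5, 8, 3, 7])
X[:, update(cases_yesterday=shift(f.cases)), sort("date"), by("state")]
X[isna(f.cases_yesterday), update(cases_yesterday=0)]
X[:, update(cases_daily=f.cases - f.cases_yesterday)]
print(X[:, [f.state, f.date, f.cases_daily]].to_list())
# [['CA', 'CA', 'NY', 'NY'], [1, 2, 1, 2], [5, 3, 3, 4]]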
weather_dt
weather_dt.key = "stop_date"

# count the number of missing values
policia_dt.countna()

del policia_dt[:, ['county_name', 'state']]

# glance at policia_dt
policia_dt

policia_dt[:, count(), by(f.driver_gender)]

policia_tidy_dt = policia_dt[~dt.isna(f.driver_gender), :]

policia_tidy_dt[:, count(), by(f.violation)
                ][:, f[:].extend({'grand_tot': dt.sum(f.count)})
                ][:, f[:].extend({'prop': f.count / f.grand_tot})
                ][:, f[:].remove(f.grand_tot), sort(-f.prop)]

# custom function to generate a summary report for a single group column
def py_dt_one_group_proportions_summary(DT, por):
    DT_summary = DT[:, dt.count(), by(f[por])
                    ][:, f[:].extend({'grand_tot': dt.sum(f.count)})
                    ][:, f[:].extend({'prop': f.count / f.grand_tot})
                    ][:, f[:].remove(f.grand_tot), dt.sort(-f.prop)]
    return DT_summary
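The helper reproduces the chained violation summary above in a single call, for example:

py_dt_one_group_proportions_summary(policia_tidy_dt, 'violation')
# -> one row per violation with its count and proportion, sorted by prop descending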
# Top 10 directors who have made the most titles
directors_dt = amigos_info_dt[:, count(), by(f.directed_by)
                              ][:10, :, dt.sort(-f.count)]

# setting a key on DT
directors_dt.key = 'directed_by'

# First 5 and last 5 observations
directors_views_rating[[slice(5), slice(25, None)], :]

# directors with their avg title rating and total titles
directors_views_rating_v1 = directors_views_rating[:, :, dt.join(directors_dt)
                                                   ][~dt.isna(f.count), :
                                                   ][:, :, dt.sort(-f.count)]
directors_views_rating_v1

alt.Chart(directors_views_rating_v1.to_pandas()).mark_bar().encode(
    alt.Y('directed_by', sort='-x'),
    alt.X('count'),
    alt.Color('imdb_rating')
).properties(
    title='Top directors title counts and imdb ratings'
)

alt.Chart(directors_views_rating_v1.to_pandas()).mark_bar().encode(
def score(self,
          actual: np.array,
          predicted: np.array,
          sample_weight: typing.Optional[np.array] = None,
          labels: typing.Optional[List[any]] = None,
          X: typing.Optional[dt.Frame] = None,
          **kwargs) -> float:
    # Get the logger if it exists
    logger = self.get_experiment_logger()

    # hard-coded as access to experiment parameters (such as self.tgc) is not yet available
    tgc = ["Store", "Dept"]
    # tgc = ["state"]
    # tgc = None

    # enable weighted average over TS R2 scores: weighted based on TS share of rows
    isR2AverageWeighted = False

    # obtain a scorer for the metric to use
    scorer = self.get_scorer()

    if tgc is None or not all(col in X.names for col in tgc):
        loggerinfo(logger,
                   f"TS R2 computes single R2 on {X.nrows} rows as either tgc {tgc} is not defined or incorrect.")
        return scorer.score(actual, predicted, sample_weight, labels, **kwargs)
    else:
        tgc_values = X[:, {"weight": count() / X.nrows, "r2": 0.0}, by(tgc)]
        loggerinfo(logger,
                   f"TS R2 computes multiple R2 on {X.nrows} rows, tgc {tgc} with weighting is {isR2AverageWeighted}.")
        none_values = [None] * X.nrows
        X = cbind(X[:, tgc],
                  Frame(actual=actual, predicted=predicted,
                        sample_weight=sample_weight if sample_weight is not None else none_values))

        for i in range(0, tgc_values.nrows):
            current_tgc = tgc_values[i, :]
            current_tgc.key = tgc
            ts_frame = X[:, :, join(current_tgc)][~isna(f.r2), :]
            r2_score = scorer.score(ts_frame['actual'].to_numpy(),
                                    ts_frame['predicted'].to_numpy(),
                                    ts_frame['sample_weight'].to_numpy()
                                    if sample_weight is not None else None,
                                    labels, **kwargs)
            tgc_values[i, f.r2] = r2_score
            loggerinfo(logger,
                       f"TS R2 = {r2_score} on {ts_frame.nrows} rows, tgc = {current_tgc[0, tgc].to_tuples()}")

        if isR2AverageWeighted:
            # return np.average(tgc_values["r2"].to_numpy(), weights=tgc_values["weight"].to_numpy())
            return tgc_values[:, mean(f.r2 * f.weight)][0, 0]
        else:
            return tgc_values[:, mean(f.r2)][0, 0]
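The key trick above is the keyed join used as a group filter: joining the single-row keyed frame for the current group attaches its r2 column only to matching rows, so ~isna(f.r2) keeps exactly the rows of that time series. A minimal sketch on invented toy data:

import datatable as dt
from datatable import f, by, join, isna

X = dt.Frame(Store=[1, 1, 2], Dept=[3, 3, 4], actual=[10.0, 12.0, 20.0])
groups = X[:, {"r2": 0.0}, by("Store", "Dept")]
one_group = groups[0, :]               # first group: Store=1, Dept=3
one_group.key = ["Store", "Dept"]
# rows outside the group get NA in the joined r2 column and are dropped
ts_rows = X[:, :, join(one_group)][~isna(f.r2), :]
print(ts_rows.to_list())               # only the Store=1/Dept=3 rows remain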
na_strings=[""]) policia_dt weather_dt # count the number of missing values policia_dt.countna() del policia_dt[:, ['county_name', 'state']] policia_dt policia_dt[:, count(), by(f.driver_gender)] policia_tidy_dt = policia_dt[~dt.isna(f.driver_gender), :] policia_tidy_df = policia_tidy_dt.to_pandas() policia_tidy_df.info() policia_tidy_df policia_tidy_dt[:, count(), by(f.violation)][:, f[:].extend({'grand_tot': dt.sum( f.count)})][:, f[:].extend({'prop': f.count / f.grand_tot} )][:, f[:].remove(f.grand_tot), sort(-f.prop)] policia_tidy_dt[:, count( ), by(f.driver_gender, f.violation
payments_dt

datatable has a function called **countna**; applying it to a frame counts the NAs in each column and returns a one-row frame:

payments_dt.countna()

Here we have 2 NAs in the charges column and 5 NAs in payment_method. In datatable syntax, a filter goes in the i position: DT[i, :]. We can call the **isna** function from datatable and pass it the column whose NAs should be checked. For example, to filter the rows where charges is NA:

payments_dt[dt.isna(f.charges), :]

What if we would rather exclude the NA observations in charges? We simply prefix the isna expression with the negation symbol (~):

payments_dt[~dt.isna(f.charges), :]

We can also use **isna** in the by position to count how many observations have NAs in a specified column:

payments_dt[:, count(), by(dt.isna(f.payment_method))]

We observe that 5 rows have NAs and 6 rows have values. In some cases we need to check whether two or more columns are NA together; for example, we can count the rows where both payment_method and charges are NA. This uses a logical operator that will be discussed when a case comes up; for now, just remember the syntax:

payments_dt[:, count(), by((dt.isna(f.payment_method)) & (dt.isna(f.charges)))]
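To try these pieces end to end, here is a self-contained toy version (the frame below is invented; the real payments_dt is loaded earlier in the tutorial):

import datatable as dt
from datatable import f, count, by

demo = dt.Frame(charges=[100.0, None, 250.0, None],
                payment_method=["card", None, "cash", "card"])
demo.countna()                                    # NA count per column
demo[dt.isna(f.charges), :]                       # rows where charges is NA
demo[~dt.isna(f.charges), :]                      # rows where charges is present
demo[:, count(), by(dt.isna(f.payment_method))]   # NA vs non-NA row counts
demo[:, count(), by((dt.isna(f.payment_method)) & (dt.isna(f.charges)))]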
def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                             dt.Frame, List[dt.Frame],
                                             np.ndarray, List[np.ndarray],
                                             pd.DataFrame, List[pd.DataFrame]]:
    import os
    from h2oaicore.systemutils_more import download
    from h2oaicore.systemutils import config
    import bz2

    def extract_bz2(file, output_file):
        zipfile = bz2.BZ2File(file)
        data = zipfile.read()
        open(output_file, 'wb').write(data)

    temp_path = os.path.join(user_dir(), "recipe_tmp", "airlines")
    os.makedirs(temp_path, exist_ok=True)

    # specify which years are used for training and testing
    training = [2007]
    testing = [2008]

    # download and unzip files
    files = []
    for f in ["%d.csv.bz2" % year for year in training + testing]:
        link = AirlinesData.base_url + "%s" % f
        file = download(link, dest_path=temp_path)
        output_file = file.replace(".bz2", "")
        extract_bz2(file, output_file)
        files.append(output_file)

    # parse with datatable
    X = dt.rbind(*[dt.fread(x) for x in files])

    # add date
    date_col = 'Date'
    X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f['DayofMonth']
    cols_to_keep = ['Date']

    # add number of flights in/out for each airport per given interval
    timeslice_mins = 60
    for name, new_col, col, group in [
            ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
            ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")]:
        X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
        group_cols = [date_col, group, new_col]
        new_name = 'flights_%s' % name
        flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
        flights.key = group_cols
        cols_to_keep.append(new_name)
        X = X[:, :, dt.join(flights)]

    # Fill NaNs with 0s
    X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0
    cols_to_keep.extend(['DepDelay', 'Year', 'Month', 'DayofMonth', 'DayOfWeek',
                         'CRSDepTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
                         'CRSElapsedTime', 'Origin', 'Dest', 'Distance',
                         # Leaks for delay
                         # 'DepTime',
                         # 'ArrTime', 'CRSArrTime',
                         # 'ActualElapsedTime',
                         # 'AirTime', 'ArrDelay', 'DepDelay',
                         # 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode',
                         # 'Diverted', 'CarrierDelay',
                         # 'WeatherDelay', 'NASDelay', 'SecurityDelay',
                         # 'LateAircraftDelay',
                         ])
    X = X[:, cols_to_keep]

    # Join in some extra info
    join_files = [('UniqueCarrier', 'carriers.csv', 'Code'),
                  ('Origin', 'airports.csv', 'iata'),
                  ('Dest', 'airports.csv', 'iata'),
                  ('TailNum', 'plane-data.csv', 'tailnum')]
    for join_key, file, col in join_files:
        file = download('http://stat-computing.org/dataexpo/2009/%s' % file,
                        dest_path=temp_path)
        X_join = dt.fread(file, fill=True)
        X_join.names = {col: join_key}
        X_join.names = [join_key] + [join_key + "_" + x for x in X_join.names
                                     if x != join_key]
        X_join.key = join_key
        X = X[:, :, dt.join(X_join)]
        del X[:, join_key]

    split = False
    if not split:
        filename = os.path.join(temp_path,
                                "flight_delays_regression_%d-%d.jay"
                                % (min(training), max(testing)))
        X.to_jay(filename)
        return filename
    else:
        # prepare splits (by year) and create binary .jay files for import into Driverless AI
        output_files = []
        for condition, name in [
                ((min(training) <= dt.f['Year']) & (dt.f['Year'] <= max(training)), 'training'),
                ((min(testing) <= dt.f['Year']) & (dt.f['Year'] <= max(testing)), 'test')]:
            X_split = X[condition, :]
            filename = os.path.join(temp_path, "flight_delays_%s.jay" % name)
            X_split.to_jay(filename)
            output_files.append(filename)
        return output_files
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    X = dt.Frame(X)
    orig_cols = list(X.names)

    if self.num_classes >= 2:
        feature_model = NuSVC(kernel='linear', nu=self.params['nu'])
        model = NuSVC(nu=self.params['nu'], kernel=self.params['kernel'],
                      degree=self.params['degree'],
                      probability=self.params['probability'])
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)
    else:
        feature_model = NuSVR(kernel='linear', nu=self.params['nu'])
        model = NuSVR(nu=self.params['nu'], kernel=self.params['kernel'],
                      degree=self.params['degree'])

    # impute missing values with the column mean (0 if the mean itself is NaN)
    self.means = dict()
    for col in X.names:
        XX = X[:, col]
        self.means[col] = XX.mean1()
        if np.isnan(self.means[col]):
            self.means[col] = 0
        XX.replace(None, self.means[col])
        X[:, col] = XX
        assert X[dt.isna(dt.f[col]), col].nrows == 0
    X = X.to_numpy()

    # nu is infeasible sometimes:
    # doing quaternary search on both sides of selected nu
    valid_nu = None
    while valid_nu is None:
        try:
            model.fit(X, y)
            valid_nu = self.params['nu']
        except:
            if self.params['nu'] > 0.5:
                self.params['nu'] = 1.0 - self.params['nu']
            else:
                self.params['nu'] = (4.0 - 3.0 * self.params['nu']) / 4.0
            if self.num_classes >= 2:
                feature_model = NuSVC(kernel='linear', nu=self.params['nu'])
                model = NuSVC(nu=self.params['nu'], kernel=self.params['kernel'],
                              degree=self.params['degree'],
                              probability=self.params['probability'])
            else:
                feature_model = NuSVR(kernel='linear', nu=self.params['nu'])
                model = NuSVR(nu=self.params['nu'], kernel=self.params['kernel'],
                              degree=self.params['degree'])

    # fit a linear-kernel twin model to extract coefficient-based importances
    feature_model.fit(X, y)
    importances = np.array(abs(feature_model.coef_)).ravel()

    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=importances.tolist(),
                              iterations=0)
def transform(self, X: dt.Frame):
    if X.ncols == 0:
        return np.zeros((X.nrows, 1))
    return X[:, dt.sum([dt.isna(dt.f[x]) for x in range(X.ncols)])]
def fit_transform(self, X: dt.Frame, y: np.array = None, **kwargs):
    X_original = X
    X = X[:, dt.f[int].extend(dt.f[float]).extend(dt.f[bool]).extend(dt.f[str])]

    if hasattr(self, 'run_count'):
        self.run_count += 1
    else:
        self.run_count = 0

    # Get the logger if it exists
    logger = None
    if self.context and self.context.experiment_id:
        logger = make_experiment_logger(
            experiment_id=self.context.experiment_id,
            tmp_dir=self.context.tmp_dir,
            experiment_tmp_dir=self.context.experiment_tmp_dir,
            username=self.context.username,
        )

    survival_event = self.__class__._survival_event
    if survival_event in X.names:
        raise ValueError("Consider renaming feature '{}'.".format(survival_event))

    # bind y to X to use as event in CoxPH
    X[:, survival_event] = np.array(LabelEncoder().fit_transform(y))

    # sanity check that target is binary
    if X[survival_event].nunique()[0, 0] != 2:
        raise ValueError("Too many values {} in event column - must be exactly 2.".format(
            X[survival_event].nunique()[0, 0]))

    # redress target values into 0, 1
    event_max = X[survival_event].max()[0, 0]
    X[dt.f[survival_event] != event_max, survival_event] = 0
    X[dt.f[survival_event] == event_max, survival_event] = 1

    stop_column_name = self.__class__._stop_column_name
    ignored_columns = self.__class__._ignored_columns

    if stop_column_name is None:
        raise ValueError("Stop column name can't be null.")

    main_message = "Survival Analysis CoxPH pre-transformer will use event '{}' and time '{}' columns.". \
        format(survival_event, stop_column_name)

    # in acceptance test, simply return input X
    if stop_column_name not in X.names:
        loggerwarning(logger,
                      "Survival Analysis CoxPH pre-transformer found no time column '{}'.".format(
                          stop_column_name))
        return X_original

    if not X[:, stop_column_name].stype in [dt.bool8, dt.int8, dt.int16, dt.int32,
                                            dt.int64, dt.float32, dt.float64]:
        raise ValueError("Stop column '{}' type must be numeric, but found '{}'".format(
            stop_column_name, X[:, stop_column_name].stype))

    # remove stop column from X
    del X_original[:, stop_column_name]

    self._output_feature_names = list(X_original.names)
    self._feature_desc = list(X_original.names)

    if self.run_count == 0 and self.context and self.context.experiment_id:
        loggerinfo(logger, main_message)
        task = kwargs.get('task')
        if task and main_message is not None:
            task.sync(key=self.context.experiment_id,
                      progress=dict(type='update', message=main_message))
            task.flush()

    # Validate CoxPH requirements on stop column
    if X[stop_column_name].min()[0, 0] < 0:
        X[dt.f[stop_column_name] < 0, stop_column_name] = 0
        loggerwarning(logger, "Stop column can't be negative: replaced negative values with 0.")
    if X[stop_column_name].countna()[0, 0] > 0:
        X[dt.isna(dt.f[stop_column_name]), stop_column_name] = 0
        loggerwarning(logger, "Stop column can't contain NULLs: replaced NULL with 0.")

    h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
    model = H2OCoxProportionalHazardsEstimator(stop_column=stop_column_name,
                                               ties=self.ties,
                                               max_iterations=self.max_iterations)
    frame = h2o.H2OFrame(X.to_pandas())
    model_path = None
    risk_frame = None
    try:
        model.train(y=survival_event, training_frame=frame, ignored_columns=ignored_columns)
        self.id = model.model_id
        model_path = os.path.join(temporary_files_path, "h2o_model." + str(uuid.uuid4()))
        model_path = h2o.save_model(model=model, path=model_path)
        with open(model_path, "rb") as f:
            self.raw_model_bytes = f.read()
        risk_frame = model.predict(frame)
        X_original[:, "risk_score_coxph_{}_{}".format(self.ties, self.max_iterations)] = \
            risk_frame.as_data_frame(header=False)
        self._output_feature_names.append(
            f"{self.display_name}{orig_feat_prefix}riskscore_coxph{extra_prefix}{self.ties}_{self.max_iterations}")
        self._feature_desc.append(
            f"CoxPH model risk score [ties={self.ties}, max.iter={self.max_iterations}]")
        return X_original
    finally:
        if model_path is not None:
            remove(model_path)
        h2o.remove(model)
        h2o.remove(frame)
        if risk_frame is not None:
            h2o.remove(risk_frame)
def test_rows_isna(df1):
    from datatable import isna
    dt1 = df1[isna(f.A), :]
    frame_integrity_check(dt1)
    assert dt1.names == df1.names
    assert dt1.to_list() == [[None, None], [None, 8]]
def create_data(X: dt.Frame = None) -> Union[
        str, List[str],
        dt.Frame, List[dt.Frame],
        np.ndarray, List[np.ndarray],
        pd.DataFrame, List[pd.DataFrame],
        Dict[str, str],  # {data set names : paths}
        Dict[str, dt.Frame],  # {data set names : dt frames}
        Dict[str, np.ndarray],  # {data set names : np arrays}
        Dict[str, pd.DataFrame],  # {data set names : pd frames}
]:
    # define date column and forecast horizon
    date_col = 'date'
    group_by_cols = ["state"]
    forecast_len = 7

    # get COVID19 data from NYTimes github
    us_states = dt.fread(
        "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")

    # get states population
    us_states_pop = dt.fread(
        "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv")
    us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
    us_states_pop.key = "state"

    # augment data with state population figures and create adjusted case and death counts
    series_cols = ["cases", "deaths"]
    aggs = {f"{col}100k": dt.f[col] / (dt.g.pop / 100000) for col in series_cols}
    us_states[:, dt.update(pop=g.pop, pop100k=g.pop / 10000, **aggs), join(us_states_pop)]

    # remove rows without state defined (resulted in unmatched rows after left outer join)
    del us_states[isna(f.pop), :]

    # produce lag of 1 unit and add as new feature for each column in the list
    series_cols.extend([col + "100k" for col in series_cols])
    aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
    us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

    # update NA lags to 0
    aggs = {f"{col}_yesterday": 0 for col in series_cols}
    us_states[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

    # compute daily values by differentiating
    aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in series_cols}
    us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

    # delete columns with yesterday (shift) values
    series_cols_to_delete = [f"{col}_yesterday" for col in series_cols]
    del us_states[:, series_cols_to_delete]

    # set negative daily values to 0
    us_states[f.cases_daily < 0, [f.cases_daily, f.cases100k_daily]] = 0
    us_states[f.deaths_daily < 0, [f.deaths_daily, f.deaths100k_daily]] = 0

    # determine threshold to split train and test based on forecast horizon
    dates = dt.unique(us_states[:, date_col])
    split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
    test_date = dates[-1, :, dt.sort(date_col)][0, 0]

    # split data to honor forecast horizon in test set
    df = us_states[date_col].to_pandas()
    train = us_states[df[date_col] <= split_date, :]
    test = us_states[df[date_col] > split_date, :]

    # return [train, test] and rename dataset names as needed
    return {f"covid19_daily_{split_date}_by_states_train": train,
            f"covid19_daily_{test_date}_by_states_test": test}
def test_del_rows_from_view2():
    f0 = dt.Frame([1, 3, None, 4, 5, None, None, 2, None, None, None])
    f1 = f0[5:, :]
    del f1[isna(f[0]), :]
    assert f1.to_list() == [[2]]
def test_rows_isna(df1):
    from datatable import isna
    dt1 = df1[isna(f.A), :]
    dt1.internal.check()
    assert dt1.names == df1.names
    assert dt1.to_list() == [[None, None], [None, 8]]
def test_del_rows_nas():
    d0 = dt.Frame({"A": [1, 5, None, 12, 7, None, -3]})
    del d0[isna(f.A), :]
    d0.internal.check()
    assert d0.topython() == [[1, 5, 12, 7, -3]]
# Displaying DT names and their types
for cname, ctype in zip(penguins_dt.names, penguins_dt.stypes):
    print(f'{cname} is of type: {ctype}')

# First five observations of columns 2 to 5 in DT
penguins_dt[:5, 2:6]

# Last five observations from DT
penguins_dt[-5:, :]

# All observations for the last 3 columns
penguins_dt[:, -3:]

# Rows where sex is NA but body_mass_g is not
penguins_dt[(dt.isna(f.sex) & ~dt.isna(f.body_mass_g)), :]

# mean of all numeric columns per penguin sex category
penguins_dt[~dt.isna(f.sex), :][:, dt.mean((f[dt.int32].remove(f.year), f[dt.float64])), by(f.sex)]

# step 1: flag the max value of body_mass_g per sex
penguins_dt[:, update(temp=f.body_mass_g == dt.max(f.body_mass_g)), by(f.sex)]

# step 2: filter on the flag and drop it
penguins_dt[f.temp == 1, f[:].remove(f.temp)]

# step 1: flag the min value of body_mass_g per sex
penguins_dt[:, update(temp=f.body_mass_g == dt.min(f.body_mass_g)), by(f.sex)]
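The two-step pattern above (flag the per-group extreme with update()+by(), then filter on the flag) generalizes to any reducer; a minimal sketch on an invented frame:

import datatable as dt
from datatable import f, by, update

DT = dt.Frame(sex=["m", "m", "f", "f"], body_mass_g=[4000, 5000, 3500, 3600])
# flag each group's maximum, then keep only flagged rows and drop the flag
DT[:, update(temp=f.body_mass_g == dt.max(f.body_mass_g)), by(f.sex)]
print(DT[f.temp == 1, f[:].remove(f.temp)].to_list())
# [['m', 'f'], [5000, 3600]]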