Example #1
def test_assign_string_columns():
    f0 = dt.Frame(A=["One", "two", "three", None, "five"])
    f0[dt.isna(f.A), f.A] = dt.Frame(["FOUR"])
    assert f0.names == ("A", )
    assert f0.stypes == (dt.stype.str32, )
    assert f0.to_list() == [["One", "two", "three", "FOUR", "five"]]
Example #2
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):

        # Get column names
        orig_cols = list(X.names)

        from h2oaicore.tensorflow_dynamic import got_cpu_tf, got_gpu_tf
        import tensorflow as tf
        import shap
        import scipy
        import pandas as pd

        self.setup_keras_session()

        import h2oaicore.keras as keras
        import matplotlib.pyplot as plt

        if not hasattr(self, 'save_model_path'):
            model_id = str(uuid.uuid4())[:8]
            self.save_model_path = os.path.join(user_dir(),
                                                "custom_xnn_model.hdf5")

        np.random.seed(self.random_state)

        my_init = keras.initializers.RandomUniform(seed=self.random_state)

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Set up temp folder
        tmp_folder = self._create_tmp_folder(logger)

        # define base model
        def xnn_initialize(features,
                           ridge_functions=3,
                           arch=[20, 12],
                           learning_rate=0.01,
                           bg_samples=100,
                           beta1=0.9,
                           beta2=0.999,
                           dec=0.0,
                           ams=True,
                           bseed=None,
                           is_categorical=False):

            #
            # Prepare model architecture
            #
            # Input to the network, our observation containing all the features
            input = keras.layers.Input(shape=(features, ), name='main_input')

            # Record current column names
            loggerinfo(logger, "XNN LOG")
            loggerdata(logger, "Feature list:")
            loggerdata(logger, str(orig_cols))

            # Input to ridge function number i is the dot product of our original input vector times coefficients
            ridge_input = keras.layers.Dense(ridge_functions,
                                             name="projection_layer",
                                             activation='linear')(input)

            ridge_networks = []
            # Each subnetwork uses only 1 neuron from the projection layer as input so we need to split it
            ridge_inputs = SplitLayer(ridge_functions)(ridge_input)
            for i, ridge_input in enumerate(ridge_inputs):
                # Generate subnetwork i
                mlp = _mlp(ridge_input, i, arch)
                ridge_networks.append(mlp)

            added = keras.layers.Concatenate(
                name='concatenate_1')(ridge_networks)

            # Add the correct output layer for the problem
            if is_categorical:
                out = keras.layers.Dense(1,
                                         activation='sigmoid',
                                         input_shape=(ridge_functions, ),
                                         name='main_output')(added)
            else:
                out = keras.layers.Dense(1,
                                         activation='linear',
                                         input_shape=(ridge_functions, ),
                                         name='main_output')(added)

            model = keras.models.Model(inputs=input, outputs=out)

            optimizer = keras.optimizers.Adam(lr=learning_rate,
                                              beta_1=beta1,
                                              beta_2=beta2,
                                              decay=dec,
                                              amsgrad=ams)

            # Use the correct loss for the problem
            if is_categorical:
                model.compile(loss={'main_output': 'binary_crossentropy'},
                              optimizer=optimizer)
            else:
                model.compile(loss={'main_output': 'mean_squared_error'},
                              optimizer=optimizer)

            return model

        def _mlp(input, idx, arch=[20, 12], activation='relu'):
            # Set up a subnetwork

            # Hidden layers
            mlp = keras.layers.Dense(arch[0],
                                     activation=activation,
                                     name='mlp_{}_dense_0'.format(idx),
                                     kernel_initializer=my_init)(input)
            for i, layer in enumerate(arch[1:]):
                mlp = keras.layers.Dense(layer,
                                         activation=activation,
                                         name='mlp_{}_dense_{}'.format(
                                             idx, i + 1),
                                         kernel_initializer=my_init)(mlp)

            # Output of the MLP
            mlp = keras.layers.Dense(
                1,
                activation='linear',
                name='mlp_{}_dense_last'.format(idx),
                kernel_regularizer=keras.regularizers.l1(1e-3),
                kernel_initializer=my_init)(mlp)
            return mlp

        def get_shap(X, model):
            # Calculate the Shap values
            np.random.seed(24)
            bg_samples = min(X.shape[0], 1000)

            if isinstance(X, pd.DataFrame):
                background = X.iloc[np.random.choice(X.shape[0],
                                                     bg_samples,
                                                     replace=False)]
            else:
                background = X[np.random.choice(X.shape[0],
                                                bg_samples,
                                                replace=False)]

            # Explain predictions of the model on the subset
            explainer = shap.DeepExplainer(model, background)
            shap_values = explainer.shap_values(X)

            # Return the mean absolute value of each shap value for each dataset
            xnn_shap = np.abs(shap_values[0]).mean(axis=0)

            return xnn_shap

        # Initialize the xnn's
        features = X.shape[1]
        orig_cols = list(X.names)
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

            self.is_cat = True
            xnn1 = xnn_initialize(features=features,
                                  ridge_functions=features,
                                  arch=self.params["arch"],
                                  learning_rate=self.params["lr"],
                                  beta1=self.params["beta_1"],
                                  beta2=self.params["beta_1"],
                                  dec=self.params["decay"],
                                  ams=self.params["amsgrad"],
                                  is_categorical=self.is_cat)
            xnn = xnn_initialize(features=features,
                                 ridge_functions=features,
                                 arch=self.params["arch"],
                                 learning_rate=self.params["lr"],
                                 beta1=self.params["beta_1"],
                                 beta2=self.params["beta_1"],
                                 dec=self.params["decay"],
                                 ams=self.params["amsgrad"],
                                 is_categorical=self.is_cat)
        else:
            self.is_cat = False
            xnn1 = xnn_initialize(features=features,
                                  ridge_functions=features,
                                  arch=self.params["arch"],
                                  learning_rate=self.params["lr"],
                                  beta1=self.params["beta_1"],
                                  beta2=self.params["beta_1"],
                                  dec=self.params["decay"],
                                  ams=self.params["amsgrad"],
                                  is_categorical=self.is_cat)
            xnn = xnn_initialize(features=features,
                                 ridge_functions=features,
                                 arch=self.params["arch"],
                                 learning_rate=self.params["lr"],
                                 beta1=self.params["beta_1"],
                                 beta2=self.params["beta_1"],
                                 dec=self.params["decay"],
                                 ams=self.params["amsgrad"],
                                 is_categorical=self.is_cat)

        # Replace missing values with a value smaller than all observed values
        self.min = dict()
        for col in X.names:
            XX = X[:, col]
            self.min[col] = XX.min1()
            if self.min[col] is None or np.isnan(self.min[col]):
                self.min[col] = -1e10
            else:
                self.min[col] -= 1
            XX.replace(None, self.min[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        X = X.to_numpy()

        inputs = {'main_input': X}
        validation_set = 0
        verbose = 0

        # Train the neural network once with early stopping and a validation set
        history = keras.callbacks.History()
        es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min')

        history = xnn1.fit(inputs,
                           y,
                           epochs=self.params["n_estimators"],
                           batch_size=self.params["batch_size"],
                           validation_split=0.3,
                           verbose=verbose,
                           callbacks=[history, es])

        # Train again on the full data
        number_of_epochs_it_ran = len(history.history['loss'])

        xnn.fit(inputs,
                y,
                epochs=number_of_epochs_it_ran,
                batch_size=self.params["batch_size"],
                validation_split=0.0,
                verbose=verbose)

        # Get the mean absolute Shapley values
        importances = np.array(get_shap(X, xnn))

        int_output = {}
        int_weights = {}
        int_bias = {}
        int_input = {}

        original_activations = {}

        x_labels = list(map(lambda x: 'x' + str(x), range(features)))

        intermediate_output = []

        # Record and plot the projection weights
        #
        weight_list = []
        for layer in xnn.layers:

            layer_name = layer.get_config()['name']
            if layer_name != "main_input":
                print(layer_name)
                weights = layer.get_weights()

                # Record the biases
                try:
                    bias = layer.get_weights()[1]
                    int_bias[layer_name] = bias
                except:
                    print("No Bias")

                # Record outputs for the test set
                intermediate_layer_model = keras.models.Model(
                    inputs=xnn.input, outputs=xnn.get_layer(layer_name).output)

                # Record the outputs from the training set
                if self.is_cat and (layer_name == 'main_output'):
                    original_activations[layer_name] = scipy.special.logit(
                        intermediate_layer_model.predict(X))
                    original_activations[
                        layer_name +
                        "_p"] = intermediate_layer_model.predict(X)
                else:
                    original_activations[
                        layer_name] = intermediate_layer_model.predict(X)

                # Record other weights, inputs, and outputs
                int_weights[layer_name] = weights
                int_input[layer_name] = layer.input
                int_output[layer_name] = layer.output

            # Plot the projection layers
            if "projection_layer" in layer.get_config()['name']:

                # print(layer.get_config()['name'])

                # Record the weights for each projection layer
                weights = [np.transpose(layer.get_weights()[0])]

                weight_list2 = []
                for i, weight in enumerate(weights[0]):
                    weight_list.append(weight)
                    weight_list2.append(
                        list(np.reshape(weight, (1, features))[0]))

                    # Plot weights
                    plt.bar(orig_cols,
                            abs(np.reshape(weight, (1, features))[0]),
                            1,
                            color="blue")
                    plt.ylabel("Coefficient value")
                    plt.title("Projection Layer Weights {}".format(i),
                              fontdict={'fontsize': 10})
                    plt.xticks(rotation=90)
                    plt.show()
                    plt.savefig(os.path.join(
                        tmp_folder, 'projection_layer_' + str(i) + '.png'),
                                bbox_inches="tight")
                    plt.clf()

            if "main_output" in layer.get_config()['name']:
                weights_main = layer.get_weights()
                print(weights_main)

        pd.DataFrame(weight_list2).to_csv(os.path.join(tmp_folder,
                                                       "projection_data.csv"),
                                          index=False)

        intermediate_output = []

        for feature_num in range(features):
            intermediate_layer_model = keras.models.Model(
                inputs=xnn.input,
                outputs=xnn.get_layer('mlp_' + str(feature_num) +
                                      '_dense_last').output)
            intermediate_output.append(intermediate_layer_model.predict(X))

        # Record and plot the ridge functions
        ridge_x = []
        ridge_y = []
        for weight_number in range(len(weight_list)):
            ridge_x.append(
                list(
                    sum(X[:, ii] * weight_list[weight_number][ii]
                        for ii in range(features))))
            ridge_y.append(list(intermediate_output[weight_number]))

            plt.plot(
                sum(X[:, ii] * weight_list[weight_number][ii]
                    for ii in range(features)),
                intermediate_output[weight_number], 'o')
            plt.xlabel("Input")
            plt.ylabel("Subnetwork " + str(weight_number))
            plt.title("Ridge Function {}".format(i), fontdict={'fontsize': 10})
            plt.show()
            plt.savefig(
                os.path.join(tmp_folder,
                             'ridge_' + str(weight_number) + '.png'))
            plt.clf()

        # Output the ridge function importance
        weights2 = np.array([item[0] for item in list(weights)[0]])

        output_activations = np.abs(
            np.array([
                item * weights2
                for item in list(original_activations["concatenate_1"])
            ])).mean(axis=0)
        loggerinfo(logger, str(output_activations))
        pd.DataFrame(output_activations).to_csv(os.path.join(
            tmp_folder, "ridge_weights.csv"),
                                                index=False)

        plt.bar(x_labels, output_activations, 1, color="blue")
        plt.xlabel("Ridge function number")
        plt.ylabel("Feature importance")
        plt.title("Ridge function importance", fontdict={'fontsize': 10})
        plt.show()
        plt.savefig(os.path.join(tmp_folder, 'Ridge_function_importance.png'))

        pd.DataFrame(ridge_y).applymap(lambda x: x[0]).to_csv(os.path.join(
            tmp_folder, "ridge_y.csv"),
                                                              index=False)
        pd.DataFrame(ridge_x).to_csv(os.path.join(tmp_folder, "ridge_x.csv"),
                                     index=False)

        pd.DataFrame(orig_cols).to_csv(os.path.join(tmp_folder,
                                                    "input_columns.csv"),
                                       index=False)

        self.set_model_properties(model=xnn,
                                  features=orig_cols,
                                  importances=importances.tolist(),
                                  iterations=self.params['n_estimators'])
Example #3
 def transform(self, X: dt.Frame):
     return X[:, dt.isna(dt.f[0])]
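
For clarity, a minimal usage sketch (toy frame assumed; not part of the original recipe) of what this transformer returns: a single boolean column flagging missing values in the first input column.

import datatable as dt

DT = dt.Frame(A=[1, None, 3])
# dt.isna(dt.f[0]) selects an indicator column rather than filtering rows
DT[:, dt.isna(dt.f[0])].to_list()   # [[False, True, False]]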
Example #4
def test_assign_string_columns():
    DT = dt.Frame(A=["One", "two", "three", None, "five"])
    DT[dt.isna(f.A), f.A] = dt.Frame(["FOUR"])
    assert_equals(DT, dt.Frame(A=["One", "two", "three", "FOUR", "five"]))
Example #5
def test_del_rows_nas():
    d0 = dt.Frame({"A": [1, 5, None, 12, 7, None, -3]})
    del d0[isna(f.A), :]
    frame_integrity_check(d0)
    assert d0.to_list() == [[1, 5, 12, 7, -3]]
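
A non-destructive alternative (sketch, same toy data) keeps the non-NA rows instead of deleting them in place:

import datatable as dt
from datatable import f, isna

d0 = dt.Frame({"A": [1, 5, None, 12, 7, None, -3]})
d1 = d0[~isna(f.A), :]   # keeps rows where A is not NA; d0 is left unchanged
assert d1.to_list() == [[1, 5, 12, 7, -3]]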
Example #6
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame], Dict[
                   str, str],  # {data set names : paths}
               Dict[str, dt.Frame],  # {data set names : dt frames}
               Dict[str, np.ndarray],  # {data set names : np arrays}
               Dict[str, pd.DataFrame],  # {data set names : pd frames}
               ]:
        # Download files
        # Location in DAI file system where we will save the data set
        temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
        os.makedirs(temp_path, exist_ok=True)

        # URLs of the desired data; these come from the IMDb datasets site
        link_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
        link_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"
        link_episodes = "https://datasets.imdbws.com/title.episode.tsv.gz"

        # Download the files
        file_basics = download(link_basics, dest_path=temp_path)
        file_ratings = download(link_ratings, dest_path=temp_path)
        file_episodes = download(link_episodes, dest_path=temp_path)

        # read the downloaded IMDb files with datatable
        basics = dt.fread(file_basics, fill=True)
        ratings = dt.fread(file_ratings, fill=True)
        episodes = dt.fread(file_episodes, na_strings=['\\N'], fill=True)

        # remove files
        os.remove(file_basics)
        os.remove(file_ratings)
        os.remove(file_episodes)

        # Create Title with Ratings dataset
        # join titles with non-null ratings
        ratings = ratings[~dt.isna(dt.f.averageRating), :]
        ratings.key = "tconst"
        basics_ratings = basics[:, :, dt.join(ratings)]

        # Create Episodes dataset
        episodes = episodes[~dt.isna(dt.f.seasonNumber)
                            & ~dt.isna(dt.f.episodeNumber), :]
        episode_ratings = episodes[:, :, dt.join(ratings)]
        episode_ratings.names = {
            'tconst': 'episodeTconst',
            'parentTconst': 'tconst',
            'averageRating': 'episodeAverageRating',
            'numVotes': 'episodeNumVotes'
        }
        basics_ratings.key = 'tconst'
        title_episode_ratings = episode_ratings[:, :, dt.join(basics_ratings)]

        # enumerate series episodes from 1 to N
        title_episode_ratings = title_episode_ratings[
            :, :, dt.sort(dt.f.tconst, dt.f.seasonNumber, dt.f.episodeNumber)]
        result = title_episode_ratings[
            :, dt.count(), dt.by(dt.f.tconst)][:, 'count'].to_list()
        from itertools import chain
        cumcount = chain.from_iterable([i + 1 for i in range(n)]
                                       for n in result[0])
        title_episode_ratings['episodeSequence'] = dt.Frame(tuple(cumcount))

        # return datasets
        return {
            f"imdb_title_ratings": basics_ratings,
            f"imdb_episode_ratings": title_episode_ratings
        }
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame], Dict[
                   str, str],  # {data set names : paths}
               Dict[str, dt.Frame],  # {data set names : dt frames}
               Dict[str, np.ndarray],  # {data set names : np arrays}
               Dict[str, pd.DataFrame],  # {data set names : pd frames}
               ]:
        # define date column and forecast horizon
        date_col = 'date'
        group_by_cols = ["state"]
        forecast_len = 7

        # state codes lookup table
        us_state_codes = dt.Frame(
            code=[
                'AL', 'AK', 'AS', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC',
                'FL', 'GA', 'GU', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY',
                'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE',
                'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'MP', 'OH', 'OK',
                'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT',
                'VI', 'VA', 'WA', 'WV', 'WI', 'WY'
            ],
            state=[
                'Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas',
                'California', 'Colorado', 'Connecticut', 'Delaware',
                'District of Columbia', 'Florida', 'Georgia', 'Guam', 'Hawaii',
                'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
                'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
                'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
                'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
                'New York', 'North Carolina', 'North Dakota',
                'Northern Mariana Islands', 'Ohio', 'Oklahoma', 'Oregon',
                'Pennsylvania', 'Puerto Rico', 'Rhode Island',
                'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
                'Vermont', 'Virgin Islands', 'Virginia', 'Washington',
                'West Virginia', 'Wisconsin', 'Wyoming'
            ])
        us_state_codes.key = "state"

        # get states population lookup table
        us_states_pop = dt.fread(
            "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv"
        )
        us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
        us_states_pop = us_states_pop[dt.f.STATE > 0, :]
        us_states_pop.key = "state"

        # join state codes and population into single lookup table
        us_states_pop[:, dt.update(code=dt.g.code), dt.join(us_state_codes)]
        us_states_pop.key = "code"

        # US Covid Tracking API: https://covidtracking.com/data/api
        us_states = dt.fread(
            "https://covidtracking.com/api/v1/states/daily.csv")
        # remove deprecated fields
        deprecated = [
            'checkTimeEt', 'commercialScore', 'dateChecked', 'dateModified',
            'grade', 'hash', 'hospitalized', 'negativeIncrease',
            'negativeRegularScore', 'negativeScore', 'posNeg', 'positiveScore',
            'score', 'total'
        ]
        us_states = us_states[:, list(set(us_states.names) - set(deprecated))]
        us_states.names = {'state': 'code'}

        series_cols = [
            "positive", "negative", "hospitalizedCumulative",
            "inIcuCumulative", "onVentilatorCumulative", "recovered", "death"
        ]
        aggs = {f"{col}100k": f[col] / (g.pop / 100000) for col in series_cols}
        us_states[:,
                  dt.update(
                      state=g.state, pop=g.pop, pop100k=g.pop / 10000, **aggs),
                  join(us_states_pop)]
        us_states = us_states[~dt.isna(dt.f.state), :]

        # produce lag of 1 unit and add as new feature for each shift column
        series_cols.extend([col + "100k" for col in series_cols])
        aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
        us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

        # update NA lags
        aggs = {f"{col}_yesterday": 0 for col in series_cols}
        us_states[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

        aggs = {
            f"{col}_daily": f[col] - f[f"{col}_yesterday"]
            for col in series_cols
        }
        us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

        for col in series_cols:
            del us_states[:, f[f"{col}_yesterday"]]

        # validate dataset
        if us_states[:, count(),
                     by(dt.f.state, f.date)][f.count > 1, :].shape[0] > 1:
            raise ValueError(
                "Found duplicate elements for the same date and state.")

        # determine threshold to split train and test based on forecast horizon
        dates = dt.unique(us_states[:, date_col])
        split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
        test_date = dates[-1, :, dt.sort(date_col)][0, 0]

        # split data to honor forecast horizon in test set
        df = us_states[date_col].to_pandas()
        train = us_states[df[date_col] <= split_date, :]
        test = us_states[df[date_col] > split_date, :]

        return {
            f"covidtracking_daily_{split_date}_by_us_states_train": train,
            f"covidtracking_daily_{test_date}_by_us_states_test": test
        }
Example #8
#   X: datatable - primary data set
# Parameters:
#   time_col: date/time/int - time column to order rows before the shift op
#   group_by_cols: list of column names - group columns
#   shift_cols: list of column names - columns to shift
# Output:
#   dataset augmented with shifted columns

from datatable import f, by, sort, update, shift, isna

time_col = "date"
group_by_cols = ["state"]
shift_cols = ["cases", "deaths"]

new_dataset_name = "new_dataset_name_with_shift"

# produce lag of 1 unit and add as new feature for each shift column
aggs = {f"{col}_yesterday": shift(f[col]) for col in shift_cols}
X[:, update(**aggs), sort(time_col), by(*group_by_cols)]

# update NA lags
aggs = {f"{col}_yesterday": 0 for col in shift_cols}
X[isna(f[f"{shift_cols[0]}_yesterday"]), update(**aggs)]

aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in shift_cols}
X[:, update(**aggs), sort(time_col), by(group_by_cols)]

for col in shift_cols:
    del X[:, f[f"{col}_yesterday"]]

return {new_dataset_name: X}
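
To make the recipe above concrete, here is a minimal standalone sketch of the same lag-and-difference pattern on a tiny made-up frame (the state/date/cases values are illustrative only, not real data):

import datatable as dt
from datatable import f, by, sort, update, shift, isna

X = dt.Frame(state=["CA", "CA", "CA", "NY", "NY"],
             date=["2020-03-01", "2020-03-02", "2020-03-03",
                   "2020-03-01", "2020-03-02"],
             cases=[5, 9, 14, 3, 7])

# lag of 1 within each state, ordered by date
X[:, update(cases_yesterday=shift(f.cases)), sort("date"), by("state")]
# the first row of each group has no lag, so replace its NA with 0
X[isna(f.cases_yesterday), update(cases_yesterday=0)]
# daily increase
X[:, update(cases_daily=f.cases - f.cases_yesterday)]
# X["cases_daily"] now holds [5, 4, 5, 3, 4]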
Example #9
weather_dt

weather_dt.key="stop_date"

# count the number of missing values
policia_dt.countna()

del policia_dt[:,['county_name', 'state']]

# glance
policia_dt

policia_dt[:,count(),by(f.driver_gender)]

policia_tidy_dt = policia_dt[~dt.isna(f.driver_gender),:]

policia_tidy_dt[:,count(),by(f.violation)
               ][:,f[:].extend({'grand_tot':dt.sum(f.count)})
                ][:,f[:].extend({'prop':f.count/f.grand_tot})
                 ][:,f[:].remove(f.grand_tot),sort(-f.prop)
                  ]

# custom function to generate a summary report for a single group column
def py_dt_one_group_proportions_summary(DT, por):

    DT_summary = DT[:, dt.count(), by(f[por])
                   ][:, f[:].extend({'grand_tot': dt.sum(f.count)})
                    ][:, f[:].extend({'prop': f.count / f.grand_tot})
                     ][:, f[:].remove(f.grand_tot), dt.sort(-f.prop)
                      ]
    return DT_summary

# Top 10 directors who have made the most titles
directors_dt = amigos_info_dt[:,count(),by(f.directed_by)
                             ][:10,:,dt.sort(-f.count)
                              ]

# setting a key on DT
directors_dt.key='directed_by'

# First 5 and last 5 observations
directors_views_rating[[slice(5),slice(25,None)],:]

# directors and their avg title rating and total titles
directors_views_rating_v1 = directors_views_rating[:,:,dt.join(directors_dt)
                                                  ][~dt.isna(f.count),:
                                                   ][:,:,dt.sort(-f.count)
                                                    ]

directors_views_rating_v1

alt.Chart(directors_views_rating_v1.to_pandas()).mark_bar().encode(
    alt.Y('directed_by',sort='-x'),
    alt.X('count'),
    alt.Color('imdb_rating')
).properties(
    
    title='Top directors title counts and imdb ratings'
)

    def score(self,
              actual: np.array,
              predicted: np.array,
              sample_weight: typing.Optional[np.array] = None,
              labels: typing.Optional[List[any]] = None,
              X: typing.Optional[dt.Frame] = None,
              **kwargs) -> float:

        # Get the logger if it exists
        logger = self.get_experiment_logger()

        # hard-coded as access to experiment parameters (such as self.tgc) not yet available
        tgc = ["Store", "Dept"]
        # tgc = ["state"]
        # tgc = None

        # enable weighted average over TS R2 scores: weighted based on TS share of rows
        isR2AverageWeighted = False

        # obtain a scorer for metric to use
        scorer = self.get_scorer()

        if tgc is None or not all(col in X.names for col in tgc):
            loggerinfo(
                logger,
                f"TS R2 computes single R2 on {X.nrows} rows as either tgc {tgc} is not defined or incorrect."
            )
            return scorer.score(actual, predicted, sample_weight, labels,
                                **kwargs)
        else:
            tgc_values = X[:, {
                "weight": count() / X.nrows,
                "r2": 0.0
            }, by(tgc)]
            loggerinfo(
                logger,
                f"TS R2 computes multiple R2 on {X.nrows} rows, tgc {tgc} with weighting is {isR2AverageWeighted}."
            )
            none_values = [None] * X.nrows
            X = cbind(
                X[:, tgc],
                Frame(actual=actual,
                      predicted=predicted,
                      sample_weight=sample_weight
                      if sample_weight is not None else none_values))

            for i in range(0, tgc_values.nrows):
                current_tgc = tgc_values[i, :]
                current_tgc.key = tgc
                ts_frame = X[:, :, join(current_tgc)][~isna(f.r2), :]
                r2_score = scorer.score(
                    ts_frame['actual'].to_numpy(),
                    ts_frame['predicted'].to_numpy(),
                    ts_frame['sample_weight'].to_numpy()
                    if sample_weight is not None else None, labels, **kwargs)
                tgc_values[i, f.r2] = r2_score

                loggerinfo(
                    logger,
                    f"TS R2 = {r2_score} on {ts_frame.nrows} rows, tgc = {current_tgc[0, tgc].to_tuples()}"
                )

            if isR2AverageWeighted:
                # return np.average(tgc_values["r2"].to_numpy(), weights=tgc_values["weight"].to_numpy())
                return tgc_values[:, mean(f.r2 * f.weight)][0, 0]
            else:
                return tgc_values[:, mean(f.r2)][0, 0]

policia_dt

weather_dt

# count the number of missing values
policia_dt.countna()

del policia_dt[:, ['county_name', 'state']]

policia_dt

policia_dt[:, count(), by(f.driver_gender)]

policia_tidy_dt = policia_dt[~dt.isna(f.driver_gender), :]

policia_tidy_df = policia_tidy_dt.to_pandas()

policia_tidy_df.info()

policia_tidy_df

policia_tidy_dt[:, count(),
                by(f.violation)][:, f[:].extend({'grand_tot': dt.sum(
                    f.count)})][:, f[:].extend({'prop': f.count / f.grand_tot}
                                               )][:, f[:].remove(f.grand_tot),
                                                  sort(-f.prop)]

policia_tidy_dt[:, count(), by(f.driver_gender, f.violation)]

payments_dt

datatable provides a function called **countna**; applying it to a frame counts how many NAs each column contains and returns a one-row frame, as below.

payments_dt.countna()

Here we have 2 NAs in the charges column and 5 NAs in the payment_method column.
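
(payments_dt itself is not defined in this excerpt; a hypothetical frame matching those counts could be built like this, with made-up values:)

import datatable as dt

payments_dt = dt.Frame(
    customer_id=list(range(1, 12)),
    charges=[100, None, 250, 80, None, 120, 90, 60, 300, 150, 220],
    payment_method=["card", "cash", None, "card", None, "cash",
                    None, "card", None, "cash", None])

payments_dt.countna()   # charges: 2, payment_method: 5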

As per the datatable syntax, we can filter rows in the I position:

                           DT[I, :]

We can now call the **isna** function from datatable, passing the column whose NAs should be checked. For example, to filter the NA rows of the charges column:

payments_dt[dt.isna(f.charges),:]

What if we would like to exclude the NA observations of the charges column? That can be done as follows; here we simply prefix the isna expression with the negation symbol (~).

payments_dt[~dt.isna(f.charges),:]

We can also use **isna** in the by position to count how many observations have NAs in a specified column.

payments_dt[:,count(),by(dt.isna(f.payment_method))]

We observe that there are 5 NA observations and 6 observations with values.

In some cases we need to check whether two or more columns are NA together. For example, we can check for NAs across payment_method and charges as below. This uses logical operators, which will be discussed when such a case comes up; for now, just remember the syntax.

payments_dt[:,count(),by((dt.isna(f.payment_method)) & (dt.isna(f.charges)))]
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config
        import bz2

        def extract_bz2(file, output_file):
            zipfile = bz2.BZ2File(file)
            data = zipfile.read()
            open(output_file, 'wb').write(data)

        temp_path = os.path.join(user_dir(), "recipe_tmp", "airlines")
        os.makedirs(temp_path, exist_ok=True)

        # specify which years are used for training and testing
        training = [2007]
        testing = [2008]

        # download and unzip files
        files = []
        for f in ["%d.csv.bz2" % year for year in training + testing]:
            link = AirlinesData.base_url + "%s" % f
            file = download(link, dest_path=temp_path)
            output_file = file.replace(".bz2", "")
            extract_bz2(file, output_file)
            files.append(output_file)

        # parse with datatable
        X = dt.rbind(*[dt.fread(x) for x in files])

        # add date
        date_col = 'Date'
        X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f[
            'DayofMonth']
        cols_to_keep = ['Date']

        # add number of flights in/out for each airport per given interval
        timeslice_mins = 60
        for name, new_col, col, group in [
            ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
            ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
        ]:
            X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
            group_cols = [date_col, group, new_col]
            new_name = 'flights_%s' % name
            flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
            flights.key = group_cols
            cols_to_keep.append(new_name)
            X = X[:, :, dt.join(flights)]

        # Fill NaNs with 0s
        X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0
        cols_to_keep.extend([
            'DepDelay',
            'Year',
            'Month',
            'DayofMonth',
            'DayOfWeek',
            'CRSDepTime',
            'UniqueCarrier',
            'FlightNum',
            'TailNum',
            'CRSElapsedTime',
            'Origin',
            'Dest',
            'Distance',
            # Leaks for delay
            # 'DepTime',
            # 'ArrTime', #'CRSArrTime',
            # 'ActualElapsedTime',
            # 'AirTime', #'ArrDelay', #'DepDelay',
            # 'TaxiIn', #'TaxiOut', #'Cancelled', #'CancellationCode', #'Diverted', #'CarrierDelay',
            # #'WeatherDelay', #'NASDelay', #'SecurityDelay', #'LateAircraftDelay',
        ])
        X = X[:, cols_to_keep]

        # Join in some extra info
        join_files = [('UniqueCarrier', 'carriers.csv', 'Code'),
                      ('Origin', 'airports.csv', 'iata'),
                      ('Dest', 'airports.csv', 'iata'),
                      ('TailNum', 'plane-data.csv', 'tailnum')]

        for join_key, file, col in join_files:
            file = download('http://stat-computing.org/dataexpo/2009/%s' %
                            file,
                            dest_path=temp_path)
            X_join = dt.fread(file, fill=True)
            X_join.names = {col: join_key}
            X_join.names = [join_key] + [
                join_key + "_" + x for x in X_join.names if x != join_key
            ]
            X_join.key = join_key
            X = X[:, :, dt.join(X_join)]
            del X[:, join_key]

        split = False

        if not split:
            filename = os.path.join(
                temp_path, "flight_delays_regression_%d-%d.jay" %
                (min(training), max(testing)))
            X.to_jay(filename)
            return filename
        else:
            # prepare splits (by year) and create binary .jay files for import into Driverless AI
            output_files = []
            for condition, name in [
                ((min(training) <= dt.f['Year']) &
                 (dt.f['Year'] <= max(training)), 'training'),
                ((min(testing) <= dt.f['Year']) &
                 (dt.f['Year'] <= max(testing)), 'test'),
            ]:
                X_split = X[condition, :]
                filename = os.path.join(temp_path,
                                        "flight_delays_%s.jay" % name)
                X_split.to_jay(filename)
                output_files.append(filename)
            return output_files
Example #15
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        X = dt.Frame(X)

        orig_cols = list(X.names)

        if self.num_classes >= 2:
            feature_model = NuSVC(kernel='linear', nu=self.params['nu'])
            model = NuSVC(nu=self.params['nu'],
                          kernel=self.params['kernel'],
                          degree=self.params['degree'],
                          probability=self.params['probability'])

            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
        else:
            feature_model = NuSVR(kernel='linear', nu=self.params['nu'])
            model = NuSVR(nu=self.params['nu'],
                          kernel=self.params['kernel'],
                          degree=self.params['degree'])

        self.means = dict()

        for col in X.names:
            XX = X[:, col]
            self.means[col] = XX.mean1()
            if np.isnan(self.means[col]):
                self.means[col] = 0
            XX.replace(None, self.means[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0

        X = X.to_numpy()

        # nu is infeasible sometimes
        # doing quaternary search on both sides of selected nu
        valid_nu = None
        while valid_nu is None:
            try:
                model.fit(X, y)
                valid_nu = self.params['nu']
            except:
                if self.params['nu'] > 0.5:
                    self.params['nu'] = 1.0 - self.params['nu']
                else:
                    self.params['nu'] = (4.0 - 3.0 * self.params['nu']) / 4.0
                if self.num_classes >= 2:
                    feature_model = NuSVC(kernel='linear',
                                          nu=self.params['nu'])
                    model = NuSVC(nu=self.params['nu'],
                                  kernel=self.params['kernel'],
                                  degree=self.params['degree'],
                                  probability=self.params['probability'])
                else:
                    feature_model = NuSVR(kernel='linear',
                                          nu=self.params['nu'])
                    model = NuSVR(nu=self.params['nu'],
                                  kernel=self.params['kernel'],
                                  degree=self.params['degree'])

        feature_model.fit(X, y)
        importances = np.array(abs(feature_model.coef_)).ravel()

        self.set_model_properties(model=model,
                                  features=orig_cols,
                                  importances=importances.tolist(),
                                  iterations=0)
Example #16
 def transform(self, X: dt.Frame):
     if X.ncols == 0:
         return np.zeros((X.nrows, 1))
     return X[:, dt.sum([dt.isna(dt.f[x]) for x in range(X.ncols)])]
Example #17
    def fit_transform(self, X: dt.Frame, y: np.array = None, **kwargs):

        X_original = X

        X = X[:, dt.f[int].extend(dt.f[float]).extend(dt.f[bool]).
              extend(dt.f[str])]

        if hasattr(self, 'run_count'):
            self.run_count += 1
        else:
            self.run_count = 0

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir,
                username=self.context.username,
            )

        survival_event = self.__class__._survival_event
        if survival_event in X.names:
            raise ValueError(
                "Consider renaming feature '{}'.".format(survival_event))

        # bind y to X to use as event in CoxPH
        X[:, survival_event] = np.array(LabelEncoder().fit_transform(y))

        # sanity check that target is binary
        if X[survival_event].nunique()[0, 0] != 2:
            raise ValueError(
                "Too many values {} in event column - must be exactly 2.".
                format(X[survival_event].nunique()[0, 0]))

        # redress target values into 0, 1
        event_max = X[survival_event].max()[0, 0]
        X[dt.f[survival_event] != event_max, survival_event] = 0
        X[dt.f[survival_event] == event_max, survival_event] = 1

        stop_column_name = self.__class__._stop_column_name
        ignored_columns = self.__class__._ignored_columns

        if stop_column_name is None:
            raise ValueError("Stop column name can't be null.")

        main_message = "Survival Analysis CoxPH pre-transformer will use event '{}' and time '{}' columns.". \
            format(survival_event, stop_column_name)

        # in acceptance tests, simply return the input X
        if stop_column_name not in X.names:
            loggerwarning(
                logger,
                "Survival Analysis CoxPH pre-transformer found no time column '{}'."
                .format(stop_column_name))
            return X_original

        if not X[:, stop_column_name].stype in [
                dt.bool8, dt.int8, dt.int16, dt.int32, dt.int64, dt.float32,
                dt.float64
        ]:
            raise ValueError(
                "Stop column `{}' type must be numeric, but found '{}'".format(
                    stop_column_name, X[:, stop_column_name].stype))

        # remove stop column from X
        del X_original[:, stop_column_name]

        self._output_feature_names = list(X_original.names)
        self._feature_desc = list(X_original.names)

        if self.run_count == 0 and self.context and self.context.experiment_id:
            loggerinfo(logger, main_message)
            task = kwargs.get('task')
            if task and main_message is not None:
                task.sync(key=self.context.experiment_id,
                          progress=dict(type='update', message=main_message))
                task.flush()

        # Validate CoxPH requirements on stop column
        if X[stop_column_name].min()[0, 0] < 0:
            X[dt.f[stop_column_name] < 0, stop_column_name] = 0
            loggerwarning(
                logger,
                "Stop column can't be negative: replaced negative values with 0."
            )
        if X[stop_column_name].countna()[0, 0] > 0:
            X[dt.isna(dt.f[stop_column_name]), stop_column_name] = 0
            loggerwarning(
                logger,
                "Stop column can't contain NULLs: replaced NULL with 0.")

        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model = H2OCoxProportionalHazardsEstimator(
            stop_column=stop_column_name,
            ties=self.ties,
            max_iterations=self.max_iterations)
        frame = h2o.H2OFrame(X.to_pandas())
        model_path = None
        risk_frame = None
        try:
            model.train(y=survival_event,
                        training_frame=frame,
                        ignored_columns=ignored_columns)
            self.id = model.model_id
            model_path = os.path.join(temporary_files_path,
                                      "h2o_model." + str(uuid.uuid4()))
            model_path = h2o.save_model(model=model, path=model_path)
            with open(model_path, "rb") as f:
                self.raw_model_bytes = f.read()
            risk_frame = model.predict(frame)
            X_original[:, "risk_score_coxph_{}_{}".format(
                self.ties, self.max_iterations)] = risk_frame.as_data_frame(
                    header=False)
            self._output_feature_names.append(
                f"{self.display_name}{orig_feat_prefix}riskscore_coxph{extra_prefix}{self.ties}_{self.max_iterations}"
            )
            self._feature_desc.append(
                f"CoxPH model risk score [ties={self.ties}, max.iter={self.max_iterations}"
            )
            return X_original
        finally:
            if model_path is not None:
                remove(model_path)
            h2o.remove(model)
            h2o.remove(frame)
            if risk_frame is not None:
                h2o.remove(risk_frame)
Example #18
def test_rows_isna(df1):
    from datatable import isna
    dt1 = df1[isna(f.A), :]
    frame_integrity_check(dt1)
    assert dt1.names == df1.names
    assert dt1.to_list() == [[None, None], [None, 8]]
Example #19
    def create_data(X: dt.Frame = None) -> Union[
        str, List[str],
        dt.Frame, List[dt.Frame],
        np.ndarray, List[np.ndarray],
        pd.DataFrame, List[pd.DataFrame],
        Dict[str, str],  # {data set names : paths}
        Dict[str, dt.Frame],  # {data set names : dt frames}
        Dict[str, np.ndarray],  # {data set names : np arrays}
        Dict[str, pd.DataFrame],  # {data set names : pd frames}
    ]:
        # define date column and forecast horizon
        date_col = 'date'
        group_by_cols = ["state"]
        forecast_len = 7

        # get COVID19 data from NYTimes github
        us_states = dt.fread("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv")

        # get states population
        us_states_pop = dt.fread(
            "http://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv")
        us_states_pop.names = {'NAME': 'state', 'POPESTIMATE2019': 'pop'}
        us_states_pop.key = "state"

        # augment data with state population figures and create adjusted case and death counts
        series_cols = ["cases", "deaths"]
        aggs = {f"{col}100k": dt.f[col] / (dt.g.pop / 100000) for col in series_cols}
        us_states[:, dt.update(pop = g.pop, pop100k = g.pop / 10000, **aggs), join(us_states_pop)]

        # remove rows without state defined (resulted in unmatched rows after left outer join)
        del us_states[isna(f.pop), :]

        # produce lag of 1 unit and add as new feature for each column in the list
        series_cols.extend([col + "100k" for col in series_cols])
        aggs = {f"{col}_yesterday": shift(f[col]) for col in series_cols}
        us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

        # update NA lags to 0
        aggs = {f"{col}_yesterday": 0 for col in series_cols}
        us_states[isna(f[f"{series_cols[0]}_yesterday"]), update(**aggs)]

        # compute daily values by differentiating
        aggs = {f"{col}_daily": f[col] - f[f"{col}_yesterday"] for col in series_cols}
        us_states[:, update(**aggs), sort(date_col), by(group_by_cols)]

        # delete columns with yesterday (shift) values
        series_cols_to_delete = [f"{col}_yesterday" for col in series_cols]
        del us_states[:, series_cols_to_delete]

        # set negative daily values to 0
        us_states[f.cases_daily < 0, [f.cases_daily, f.cases100k_daily]] = 0
        us_states[f.deaths_daily < 0, [f.deaths_daily, f.deaths100k_daily]] = 0

        # determine threshold to split train and test based on forecast horizon
        dates = dt.unique(us_states[:, date_col])
        split_date = dates[-(forecast_len + 1):, :, dt.sort(date_col)][0, 0]
        test_date = dates[-1, :, dt.sort(date_col)][0, 0]

        # split data to honor forecast horizon in test set
        df = us_states[date_col].to_pandas()
        train = us_states[df[date_col] <= split_date, :]
        test = us_states[df[date_col] > split_date, :]

        # return [train, test] and rename dataset names as needed
        return {f"covid19_daily_{split_date}_by_states_train": train,
                f"covid19_daily_{test_date}_by_states_test": test}
Example #20
def test_del_rows_from_view2():
    f0 = dt.Frame([1, 3, None, 4, 5, None, None, 2, None, None, None])
    f1 = f0[5:, :]
    del f1[isna(f[0]), :]
    assert f1.to_list() == [[2]]
Example #21
def test_rows_isna(df1):
    from datatable import isna
    dt1 = df1[isna(f.A), :]
    dt1.internal.check()
    assert dt1.names == df1.names
    assert dt1.to_list() == [[None, None], [None, 8]]
Example #22
def test_del_rows_nas():
    d0 = dt.Frame({"A": [1, 5, None, 12, 7, None, -3]})
    del d0[isna(f.A), :]
    d0.internal.check()
    assert d0.topython() == [[1, 5, 12, 7, -3]]
# Displaying DT  names and their types
for cname, ctype in zip(penguins_dt.names, penguins_dt.stypes):
    print(f'{cname}- is a type of: {ctype} ')

# First five observations from 2 to 5 columns in DT
penguins_dt[:5, 2:6]

# Last five observations from DT
penguins_dt[-5:, :]

# All observations for last 3 columns
penguins_dt[:, -3:]

# Rows where sex is NA but body_mass_g is not NA
penguins_dt[(dt.isna(f.sex) & ~dt.isna(f.body_mass_g)), :]

# mean of all numeric columns per penguin sex category
penguins_dt[~dt.isna(f.sex), :][:,
                                dt.mean((f[dt.int32].remove(f.year),
                                         f[dt.float64])),
                                by(f.sex)]

# step - 1 : finding a max value of body_mass of penguins per sex
penguins_dt[:, update(temp=f.body_mass_g == dt.max(f.body_mass_g)), by(f.sex)]

# step - 2 : keeping the rows flagged with the max value and dropping the temp column
penguins_dt[f.temp == 1, f[:].remove(f.temp)]

# step - 1 : finding a min value of body_mass of penguins per sex
penguins_dt[:, update(temp=f.body_mass_g == dt.min(f.body_mass_g)), by(f.sex)]