Пример #1
0
    def _create_tmp_folder(self, logger):
        """Create a uniquely named temp folder for xnn files and return its path.

        The folder is created under ``self.context.experiment_tmp_dir`` when an
        experiment context is available, otherwise under ``user_dir()``.  If the
        first attempt fails, one retry is made (see the handlers below); an error
        raised by the retry propagates to the caller.

        :param logger: logger handed to ``loggerwarning`` / ``loggerinfo``
        :return: path of the folder that was created
        """

        def unique_folder(parent_dir):
            # A fresh uuid4 per call keeps every attempt on a new name
            return os.path.join(parent_dir,
                                "%s_xnn_model_folder" % uuid.uuid4())

        # Default parent without context available (required to pass acceptance test)
        parent_dir = user_dir()
        # Use the real experiment tmp dir when an experiment is available
        if self.context and self.context.experiment_id:
            parent_dir = self.context.experiment_tmp_dir
        tmp_folder = unique_folder(parent_dir)

        # Now let's try to create that folder
        try:
            os.mkdir(tmp_folder)
        except PermissionError:
            # This should not occur, so log a warning and fall back to user_dir()
            loggerwarning(logger, "XNN was denied temp folder creation rights")
            tmp_folder = unique_folder(user_dir())
            os.mkdir(tmp_folder)
        except FileExistsError:
            # We should never be here since temp dir name is expected to be unique.
            # Retry in the same parent that was just used: going through
            # self.context here would fail when no context is available.
            loggerwarning(logger, "XNN temp folder already exists")
            tmp_folder = unique_folder(parent_dir)
            os.mkdir(tmp_folder)
        except Exception:
            # Any other failure: revert to the user_dir() location.
            # (Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit are not swallowed.)
            tmp_folder = unique_folder(user_dir())
            os.mkdir(tmp_folder)

        loggerinfo(logger, "XNN temp folder {}".format(tmp_folder))
        return tmp_folder
Пример #2
0
 def set_tagger(self):
     """Set ``self.pos_tagger`` to ``nltk.pos_tag`` and verify it with a probe call.

     The tagger data is first requested through ``nltk.download``.  If the probe
     call raises ``LookupError``, both tagger archives are downloaded directly,
     unzipped and copied into the ``taggers`` directory, and the probe is repeated.
     """
     import nltk
     data_dir = os.path.join(user_dir(),
                             config.contrib_env_relative_directory,
                             "nltk_data")
     staging_dir = os.path.join(user_dir(), "nltk_data")
     nltk.data.path.append(data_dir)
     nltk.download('averaged_perceptron_tagger', download_dir=data_dir)
     try:
         self.pos_tagger = nltk.pos_tag
         self.pos_tagger("test")
     except LookupError:
         taggers_dir = os.path.join(data_dir, "taggers")
         for directory in (data_dir, staging_dir, taggers_dir):
             os.makedirs(directory, exist_ok=True)
         archive_urls = (
             "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip",
             "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/maxent_treebank_pos_tagger.zip",
         )
         # Fetch both archives first, then unzip both, then copy both
         archives = [download(url, dest_path=staging_dir)
                     for url in archive_urls]
         for archive in archives:
             self.unzip_file(archive, taggers_dir)
         for archive in archives:
             self.atomic_copy(archive, taggers_dir)
         self.pos_tagger = nltk.pos_tag
         self.pos_tagger("test")
 def preprocess_image(self, source_img_path, check_only=False):
     """Locate (downloading if needed) an image and turn it into model input.

     :param source_img_path: local path or remote location of the image
     :param check_only: when true, only verify the image can be obtained
     :return: ``None`` if the path cannot be built or the download fails;
         ``True`` when ``check_only`` is set and the image is available;
         otherwise the array produced by the resnet50 ``preprocess_input``
         for the image loaded at 224x224 with a leading batch axis added.
     """
     try:
         final_img_path = os.path.join(user_dir(), self.uuid,
                                       os.path.basename(source_img_path))
     except Exception:  # we are sometimes getting np.float32, why?
         # Best effort: an unusable path value means "no image", but unlike a
         # bare except this lets KeyboardInterrupt / SystemExit propagate.
         return None
     delete = False
     if not os.path.exists(final_img_path):
         if not os.path.exists(source_img_path):
             # Neither cached nor local: fetch it into the cache location
             try:
                 self.download(source_img_path, final_img_path)
             except requests.RequestException:
                 return None
             delete = False  # True to avoid re-download or a race condition between multiple procs
         else:
             # Source is a local file: read it in place
             final_img_path = source_img_path
     if check_only:
         return True

     import h2oaicore.keras as keras
     importlib.reload(keras)
     img = keras.preprocessing.image.load_img(final_img_path,
                                              target_size=(224, 224))
     if delete:
         remove(final_img_path)
     x = keras.preprocessing.image.img_to_array(img)
     x = np.expand_dims(x, axis=0)
     x = keras.applications.resnet50.preprocess_input(x)
     return x
Пример #4
0
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        """Fit a ``CatBoostClassifier`` on ``X`` and record it as this model.

        ``y`` is label-encoded against ``self.labels``; the columns selected by
        ``X[:, [str, int]]`` are handed to CatBoost as categorical features.
        ``eval_set``, ``sample_weight_eval_set`` and ``kwargs`` are not used.
        """
        encoder = LabelEncoder()
        encoder.fit(self.labels)
        y = encoder.transform(y)

        feature_names = list(X.names)
        pandas_X = X.to_pandas()

        catboost_params = dict(
            train_dir=user_dir(),
            allow_writing_files=False,
            thread_count=10,
        )
        from catboost import CatBoostClassifier
        classifier = CatBoostClassifier(**catboost_params)

        # Amazon specific, also no early stopping
        categorical_features = list(X[:, [str, int]].names)
        classifier.fit(pandas_X,
                       y=y,
                       sample_weight=sample_weight,
                       verbose=False,
                       cat_features=categorical_features)

        # must always set best_iterations
        self.set_model_properties(model=classifier,
                                  features=feature_names,
                                  importances=classifier.feature_importances_,
                                  iterations=0)
    def predict(self, X, **kwargs):
        """Score ``X`` with the stored H2O model.

        The serialized model from ``get_model_properties()`` is written to a
        uniquely named file under ``user_dir()/<self.id>``, loaded into H2O and
        used for prediction.  The temp file and the H2O frames created here are
        removed on both success and failure.

        :param X: data to score; converted with ``dt.Frame`` and ``inf_impute``
        :param kwargs: if ``pred_contribs`` is truthy, prediction contributions
            are returned instead of predictions
        :return: contribution values when ``pred_contribs`` is set; otherwise
            all values raveled for ``num_classes == 1``, the last column for
            ``num_classes == 2``, and all columns from index 1 on otherwise
        """
        model, _, _, _ = self.get_model_properties()
        X = dt.Frame(X)
        X = self.inf_impute(X)
        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = os.path.join(user_dir(), self.id)
        model_file = os.path.join(model_path,
                                  "h2o_model." + str(uuid.uuid4()) + ".bin")
        os.makedirs(model_path, exist_ok=True)
        with open(model_file, "wb") as f:
            f.write(model)
        test_frame = None
        preds_frame = None

        # Loading and frame creation are inside the try so that a failure in
        # either still removes the temp model file.
        try:
            model = h2o.load_model(os.path.abspath(model_file))
            test_frame = h2o.H2OFrame(X.to_pandas(),
                                      column_types=self.col_types)
            if kwargs.get("pred_contribs"):
                # Track the contributions frame so it is removed in finally too
                preds_frame = model.predict_contributions(test_frame)
                return preds_frame.as_data_frame(header=False).values
            preds_frame = model.predict(test_frame)
            preds = preds_frame.as_data_frame(header=False)
            if self.num_classes == 1:
                return preds.values.ravel()
            elif self.num_classes == 2:
                return preds.iloc[:, -1].values.ravel()
            else:
                return preds.iloc[:, 1:].values
        finally:
            # h2o.remove(self.id) # Cannot remove id, do multiple predictions on same model
            remove(model_file)
            for h2o_frame in (test_frame, preds_frame):
                if h2o_frame is not None:
                    h2o.remove(h2o_frame)
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        """Download and decompress the 1990 and 1991 airlines CSV files.

        :param X: unused
        :return: list with the paths of the two decompressed CSV files
        """
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config
        import bz2
        import shutil

        def extract_bz2(file, output_file):
            # Stream the decompressed data in chunks instead of holding the
            # whole file in memory; the context managers close both handles.
            with bz2.BZ2File(file) as archive, \
                    open(output_file, 'wb') as target:
                shutil.copyfileobj(archive, target)

        temp_path = os.path.join(user_dir(), config.contrib_relative_directory,
                                 "airlines")
        os.makedirs(temp_path, exist_ok=True)

        output_files = []
        for year in (1990, 1991):
            link = AirlinesData.base_url + "%d.csv.bz2" % year
            file = download(link, dest_path=temp_path)
            output_file = file.replace(".bz2", "")
            print("%s %s" % (file, output_file))
            extract_bz2(file, output_file)
            output_files.append(output_file)

        return output_files
Пример #7
0
    def predict(self, X, **kwargs):
        """Score ``X`` with the stored H2O model.

        The serialized model from ``get_model_properties()` is written to
        ``user_dir()/<self.id>``, loaded into H2O, and the file is removed again
        whether or not loading succeeds.  The model and the frames created here
        are removed from H2O before returning.

        :param X: data to score; converted with ``dt.Frame``
        :param kwargs: if ``pred_contribs`` is truthy, prediction contributions
            are returned instead of predictions
        :return: contribution values when ``pred_contribs`` is set; otherwise
            all values raveled for ``num_classes == 1``, the last column for
            ``num_classes == 2``, and all columns from index 1 on otherwise
        """
        model, _, _, _ = self.get_model_properties()
        X = dt.Frame(X)
        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = os.path.join(user_dir(), self.id)
        with open(model_path, "wb") as f:
            f.write(model)
        try:
            model = h2o.load_model(os.path.abspath(model_path))
        finally:
            # Drop the temp file even when loading fails
            remove(model_path)
        test_frame = None
        preds_frame = None

        try:
            # Created inside the try so the loaded model is still cleaned up
            # if the upload of the frame fails.
            test_frame = h2o.H2OFrame(X.to_pandas(),
                                      column_types=self.col_types)
            if kwargs.get("pred_contribs"):
                # Track the contributions frame so it is removed in finally too
                preds_frame = model.predict_contributions(test_frame)
                return preds_frame.as_data_frame(header=False).values
            preds_frame = model.predict(test_frame)
            preds = preds_frame.as_data_frame(header=False)
            if self.num_classes == 1:
                return preds.values.ravel()
            elif self.num_classes == 2:
                return preds.iloc[:, -1].values.ravel()
            else:
                return preds.iloc[:, 1:].values
        finally:
            h2o.remove(self.id)
            for h2o_frame in (test_frame, preds_frame):
                if h2o_frame is not None:
                    h2o.remove(h2o_frame)
Пример #8
0
    def create_data(X: dt.Frame = None) -> Union[
        str, List[str],
        dt.Frame, List[dt.Frame],
        np.ndarray, List[np.ndarray],
        pd.DataFrame, List[pd.DataFrame],
        Dict[str, str],  # {data set names : paths}
        Dict[str, dt.Frame],  # {data set names : dt frames}
        Dict[str, np.ndarray],  # {data set names : np arrays}
        Dict[str, pd.DataFrame],  # {data set names : pd frames}
    ]:
        """Build two IMDb data sets from the public IMDb dumps.

        Downloads the title basics, ratings and episode files, joins them with
        datatable, and deletes the downloaded files again.

        :param X: unused
        :return: dict with ``imdb_title_ratings`` (titles joined with their
            ratings) and ``imdb_episode_ratings`` (rated episodes joined with
            their parent title, plus an ``episodeSequence`` counter per title)
        """
        # Location in DAI file system where we will save the downloads
        temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
        os.makedirs(temp_path, exist_ok=True)

        # URLs of the IMDb data dumps
        link_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
        link_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"
        link_episodes = "https://datasets.imdbws.com/title.episode.tsv.gz"

        # Download the files
        file_basics = download(link_basics, dest_path=temp_path)
        file_ratings = download(link_ratings, dest_path=temp_path)
        file_episodes = download(link_episodes, dest_path=temp_path)

        try:
            # Parse the downloaded dumps
            basics = dt.fread(file_basics, fill=True)
            ratings = dt.fread(file_ratings, fill=True)
            episodes = dt.fread(file_episodes, na_strings=['\\N'], fill=True)
        finally:
            # Remove the downloads even when parsing fails
            for downloaded in (file_basics, file_ratings, file_episodes):
                if os.path.exists(downloaded):
                    os.remove(downloaded)

        # Create Title with Ratings dataset
        # join titles with non-null ratings
        ratings = ratings[~dt.isna(dt.f.averageRating), :]
        ratings.key = "tconst"
        basics_ratings = basics[:, :, dt.join(ratings)]

        # Create Episodes dataset
        episodes = episodes[~dt.isna(dt.f.seasonNumber) & ~dt.isna(dt.f.episodeNumber), :]
        episode_ratings = episodes[:, :, dt.join(ratings)]
        # After the rename, 'tconst' holds the parent title id, which is the
        # column the join with basics_ratings below is keyed on.
        episode_ratings.names = {'tconst': 'episodeTconst', 'parentTconst': 'tconst', 'averageRating': 'episodeAverageRating', 'numVotes': 'episodeNumVotes'}
        basics_ratings.key = 'tconst'
        title_episode_ratings = episode_ratings[:, :, dt.join(basics_ratings)]

        # enumerate series episodes from 1 to N
        title_episode_ratings = title_episode_ratings[:, :, dt.sort(dt.f.tconst, dt.f.seasonNumber, dt.f.episodeNumber)]
        result = title_episode_ratings[:, dt.count(), dt.by(dt.f.tconst)][:, 'count'].to_list()
        from itertools import chain
        # result[0] is the episode count per title; emit 1..n for each title.
        # NOTE(review): relies on dt.by() returning groups in the same tconst
        # order as the sort above - confirm.
        cumcount = chain.from_iterable(range(1, n + 1) for n in result[0])
        title_episode_ratings['episodeSequence'] = dt.Frame(tuple(cumcount))

        # return datasets
        return {"imdb_title_ratings": basics_ratings,
                "imdb_episode_ratings": title_episode_ratings}
 def __init__(self, **kwargs):
     """Initialise model bookkeeping and make sure the H2O log directory exists.

     All keyword arguments are forwarded unchanged to the parent initialiser.
     """
     super().__init__(**kwargs)
     self.id = None
     self.target = "__target__"
     self.weight = "__weight__"
     self.col_types = None
     log_dir = os.path.join(user_dir(),
                            config.contrib_relative_directory, "h2o_log")
     self.my_log_dir = os.path.abspath(log_dir)
     if os.path.isdir(self.my_log_dir):
         return
     os.makedirs(self.my_log_dir, exist_ok=True)
    def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                                 dt.Frame, List[dt.Frame],
                                                 np.ndarray, List[np.ndarray],
                                                 pd.DataFrame, List[pd.DataFrame]]:
        """Download ``TestData.url`` into a fresh folder and return the file path.

        ``X`` is not used.
        """
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config

        # Each call gets its own uuid-named download folder
        contrib_dir = os.path.join(user_dir(), config.contrib_relative_directory)
        download_dir = os.path.join(contrib_dir,
                                    "testdata_%s" % str(uuid.uuid4()))
        os.makedirs(download_dir, exist_ok=True)

        return download(TestData.url, dest_path=download_dir)
Пример #11
0
 def __init__(self, batch_size=32, **kwargs):
     """Set up paths for the resnet model file and load its bytes.

     The model file is downloaded to ``self.model_path`` only when it is not
     already present there; its contents are then read into ``self.model_bytes``.

     :param batch_size: stored on the instance as ``self.batch_size``
     :param kwargs: forwarded unchanged to the parent initialiser
     """
     super().__init__(**kwargs)
     self.batch_size = batch_size
     self.model_name = "resnet_keras.h5p"
     # No random suffix in this id: it would change and force a re-load on every init
     class_name = self.__class__.__name__
     self.uuid = "%s-img-data-" % class_name + self.model_name
     self.uuid_tmp = str(uuid.uuid4())[:6]
     self.col_name = self.input_feature_names[0]
     self.model_path = os.path.join(user_dir(), self.uuid + ".model")
     self.model_tmp_path = self.model_path + "_" + self.uuid_tmp + ".tmp"

     model_is_cached = os.path.exists(self.model_path)
     if not model_is_cached:
         model_url = "http://s3.amazonaws.com/artifacts.h2o.ai/releases/ai/h2o/recipes/transformers/img/%s" % self.model_name
         self.download(url=model_url, dest=self.model_path)

     with open(self.model_path, 'rb') as model_file:
         self.model_bytes = model_file.read()
Пример #12
0
    def transform(self, X: dt.Frame):
        """Return the anomaly output of the stored H2O model for ``X``.

        ``self.raw_model_bytes`` is written to ``user_dir()/<self.id>`` and
        loaded into H2O; the file is removed again whether or not loading
        succeeds.  The model and the frames created here are removed from H2O
        before returning.

        :param X: frame to score
        :return: result of ``model.anomaly`` converted with
            ``as_data_frame(header=False)``
        """
        h2o.init(port=config.h2o_recipes_port)
        model_path = os.path.join(user_dir(), self.id)
        with open(model_path, "wb") as f:
            f.write(self.raw_model_bytes)
        try:
            model = h2o.load_model(os.path.abspath(model_path))
        finally:
            # Drop the temp file even when loading fails
            remove(model_path)
        frame = None
        anomaly_frame = None

        try:
            frame = h2o.H2OFrame(X.to_pandas())
            anomaly_frame = model.anomaly(frame)
            anomaly_frame_df = anomaly_frame.as_data_frame(header=False)
            return anomaly_frame_df
        finally:
            h2o.remove(self.id)
            # Skip frames that were never created, so a failure above is not
            # masked by an error from h2o.remove(None); the input frame is
            # removed as well so it does not linger in H2O.
            for h2o_frame in (frame, anomaly_frame):
                if h2o_frame is not None:
                    h2o.remove(h2o_frame)
Пример #13
0
    def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                                 dt.Frame, List[dt.Frame],
                                                 np.ndarray, List[np.ndarray],
                                                 pd.DataFrame, List[pd.DataFrame]]:
        """Download the two UN ODA disbursement CSVs and return their paths.

        ``X`` is not used.  The returned list holds the recipients file first,
        then the donors file.
        """
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config

        # Each call gets its own uuid-named download folder
        contrib_dir = os.path.join(user_dir(), config.contrib_relative_directory)
        download_dir = os.path.join(contrib_dir,
                                    "testdata_%s" % str(uuid.uuid4()))
        os.makedirs(download_dir, exist_ok=True)

        csv_links = (
            "http://data.un.org/_Docs/SYB/CSV/SYB63_226_202009_Net%20Disbursements%20from%20Official%20ODA%20to%20Recipients.csv",
            "http://data.un.org/_Docs/SYB/CSV/SYB63_223_202009_Net%20Disbursements%20from%20Official%20ODA%20from%20Donors.csv",
        )
        return [download(csv_link, dest_path=download_dir)
                for csv_link in csv_links]
Пример #14
0
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        """Reshape monthly rain data to long format and split it by date.

        :param X: frame with a ``date`` column plus one column per rain gauge
            (presumably the raw Seattle rain data set - verify against caller)
        :return: ``[]`` when ``X`` is None; otherwise the paths of the train
            CSV (2009-01-01 up to, but excluding, 2016-01-01) and the test CSV
            (calendar year 2016)
        """
        # exit gracefully if method is called as a data upload rather than data modify
        if X is None:
            return []
        import os
        from h2oaicore.systemutils import config

        # Change to pandas -> we can rewrite this as dt at a later date
        rain_raw = X.to_pandas()

        # Set index and pivot the data
        # Rows go from one row each month to one row each month & gauge
        rain_raw = rain_raw.set_index("date")
        rain_pivot = rain_raw.unstack().reset_index(name="rain_inches")
        rain_pivot.rename(columns={
            'level_0': 'rain_gauge',
            'date': 'end_of_month'
        },
                          inplace=True)

        # Format date appropriately
        rain_pivot['end_of_month'] = pd.to_datetime(rain_pivot['end_of_month'])

        # Split data into train and test by date
        # Train on 7 years of data, test on 1 year of data.
        # The upper bound is exclusive so that a row dated exactly 2016-01-01
        # cannot land in both the train and the test set.
        train_py = rain_pivot[(rain_pivot['end_of_month'] >= '2009-01-01')
                              & (rain_pivot['end_of_month'] < '2016-01-01')]
        test_py = rain_pivot[rain_pivot['end_of_month'].dt.year == 2016]

        # Set up to save to disk
        temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
        os.makedirs(temp_path, exist_ok=True)

        # Save files to disk
        file_train = os.path.join(temp_path, "seattle_rain_train.csv")
        train_py.to_csv(file_train)
        file_test = os.path.join(temp_path, "seattle_rain_test.csv")
        test_py.to_csv(file_test)

        return [file_train, file_test]
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        """Download both UCI iris data files and return their paths.

        ``X`` is not used.  The returned list holds ``iris.data`` first, then
        ``bezdekIris.data``.
        """
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config

        # Each call gets its own uuid-named download folder
        contrib_dir = os.path.join(user_dir(), config.contrib_relative_directory)
        download_dir = os.path.join(contrib_dir,
                                    "testdata_%s" % str(uuid.uuid4()))
        os.makedirs(download_dir, exist_ok=True)

        downloaded_files = []
        for data_link in (
                "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data",
                "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/bezdekIris.data",
        ):
            downloaded_files.append(download(data_link, dest_path=download_dir))

        return downloaded_files
Пример #16
0
    def transform(self, X: dt.Frame):
        """Return the anomaly output of the stored H2O model for ``X``.

        ``self.raw_model_bytes`` is written to a uniquely named file under
        ``user_dir()/<self.id>`` and loaded into H2O.  The temp file is removed
        on both success and failure; once the model has been loaded, the model
        and the frames created here are removed from H2O as well.

        :param X: frame to score
        :return: result of ``model.anomaly`` converted with
            ``as_data_frame(header=False)``
        """
        h2o.init(port=config.h2o_recipes_port)
        model_path = os.path.join(user_dir(), self.id)
        model_file = os.path.join(model_path,
                                  "h2o_model." + str(uuid.uuid4()) + ".bin")
        os.makedirs(model_path, exist_ok=True)
        with open(model_file, "wb") as f:
            f.write(self.raw_model_bytes)

        try:
            # Inside the try so a failed load still removes the temp file
            model = h2o.load_model(os.path.abspath(model_file))
            frame = None
            anomaly_frame = None
            try:
                frame = h2o.H2OFrame(X.to_pandas())
                anomaly_frame = model.anomaly(frame)
                anomaly_frame_df = anomaly_frame.as_data_frame(header=False)
                return anomaly_frame_df
            finally:
                h2o.remove(self.id)
                # Skip frames that were never created, so a failure above is
                # not masked by an error from h2o.remove(None); the input
                # frame is removed as well so it does not linger in H2O.
                for h2o_frame in (frame, anomaly_frame):
                    if h2o_frame is not None:
                        h2o.remove(h2o_frame)
        finally:
            remove(model_file)
Пример #17
0
 def fit_transform(self, X: dt.Frame, y: np.array = None):
     """Train an H2O autoencoder on ``X`` and return its anomaly output.

     The trained model's id is stored in ``self.id`` and its serialized bytes
     in ``self.raw_model_bytes``.  The saved model file, the model and the H2O
     frames created here are removed before returning.

     :param X: frame to train on and score; all columns are used
     :param y: unused
     :return: result of ``model.anomaly`` on the training frame, converted
         with ``as_data_frame(header=False)``
     """
     h2o.init(port=config.h2o_recipes_port)
     model = H2OAutoEncoderEstimator(activation='tanh',
                                     epochs=1,
                                     hidden=[50, 50],
                                     reproducible=True,
                                     seed=1234)
     frame = h2o.H2OFrame(X.to_pandas())
     model_path = None
     anomaly_frame = None
     try:
         model.train(x=list(range(X.ncols)), training_frame=frame)
         self.id = model.model_id
         model_path = os.path.join(user_dir(),
                                   "h2o_model." + str(uuid.uuid4()))
         model_path = h2o.save_model(model=model, path=model_path)
         with open(model_path, "rb") as f:
             self.raw_model_bytes = f.read()
         anomaly_frame = model.anomaly(frame)
         return anomaly_frame.as_data_frame(header=False)
     finally:
         if model_path is not None:
             remove(model_path)
         # Remove the frames first so they are cleaned up even if removing
         # the model raises; previously neither frame was removed at all.
         h2o.remove(frame)
         if anomaly_frame is not None:
             h2o.remove(anomaly_frame)
         h2o.remove(model)
    def predict(self, X, **kwargs):
        """Score ``X`` with the stored H2O model and return raveled predictions.

        The serialized model from ``get_model_properties()`` is written to
        ``user_dir()/<self.id>``, loaded into H2O, and the file is removed again
        whether or not loading succeeds.  The model and the frames created here
        are removed from H2O before returning.

        :param X: data to score; converted with ``dt.Frame``
        :param kwargs: unused
        :return: prediction values flattened with ``ravel()``
        """
        model, _, _, _ = self.get_model_properties()
        X = dt.Frame(X)
        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = os.path.join(user_dir(), self.id)
        with open(model_path, "wb") as f:
            f.write(model)
        try:
            model = h2o.load_model(os.path.abspath(model_path))
        finally:
            # Drop the temp file even when loading fails
            remove(model_path)
        test_frame = None
        preds_frame = None

        try:
            # Created inside the try so the loaded model is still cleaned up
            # if the upload of the frame fails.
            test_frame = h2o.H2OFrame(X.to_pandas(),
                                      column_types=self.col_types)
            preds_frame = model.predict(test_frame)
            preds = preds_frame.as_data_frame(header=False)

            return preds.values.ravel()

        finally:
            h2o.remove(self.id)
            for h2o_frame in (test_frame, preds_frame):
                if h2o_frame is not None:
                    h2o.remove(h2o_frame)
Пример #19
0
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        """Download the Seattle rain CSV, rename it, and return the new path.

        ``X`` is not used.
        """
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config

        # Folder in the DAI file system that receives the data set
        target_dir = os.path.join(user_dir(), config.contrib_relative_directory)
        os.makedirs(target_dir, exist_ok=True)

        # The data comes from the City of Seattle
        downloaded_file = download(
            "https://data.seattle.gov/resource/rdtp-hzy3.csv",
            dest_path=target_dir)

        # Swap the resource id in the path for a descriptive name for the UI
        renamed_file = downloaded_file.replace("rdtp-hzy3",
                                               "seattle_monthly_rain_raw")
        os.rename(downloaded_file, renamed_file)

        # Location on the DAI server for this data set
        return renamed_file
Пример #20
0
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        """Build the augmented flight-delay data set for flights leaving SFO.

        Downloads the yearly airlines files for 2005-2008, adds per-airport
        flight counts per time slice, a binary departure-delay target and
        carrier / airport / plane lookup columns, then writes the result to CSV.

        :param X: unused; the name is rebound to the parsed data
        :return: list with the paths of the training and test CSV files
            (the single-file branch is disabled by ``split = True``)
        """
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config
        import bz2
        import shutil

        def extract_bz2(file, output_file):
            # Stream in chunks instead of holding the whole decompressed file
            # in memory, and write to a sibling temp file that is renamed into
            # place: an interrupted extraction then never leaves a truncated
            # output_file that the os.path.exists() check below would reuse.
            partial_file = output_file + ".partial"
            with bz2.BZ2File(file) as archive, \
                    open(partial_file, 'wb') as target:
                shutil.copyfileobj(archive, target)
            os.replace(partial_file, output_file)

        temp_path = os.path.join(user_dir(), "recipe_tmp", "airlines")
        os.makedirs(temp_path, exist_ok=True)
        # NOTE(review): this sets a datatable option and is not restored afterwards
        dt.options.nthreads = 8

        # specify which years are used for training and testing
        training = list(range(2005, 2008))
        testing = [2008]

        # download and unzip files
        files = []
        for f in ["%d.csv.bz2" % year for year in training + testing]:
            link = AirlinesData.base_url + "%s" % f
            file = download(link, dest_path=temp_path)
            output_file = file.replace(".bz2", "")
            if not os.path.exists(output_file):
                extract_bz2(file, output_file)
            files.append(output_file)

        # parse with datatable
        X = dt.rbind(*[dt.fread(x) for x in files])

        # add date
        date_col = 'Date'
        X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f[
            'DayofMonth']
        cols_to_keep = ['Date']

        # add number of flights in/out for each airport per given interval
        timeslice_mins = 60
        for name, new_col, col, group in [
            ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
            ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
        ]:
            X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
            group_cols = [date_col, group, new_col]
            new_name = 'flights_%s_per_%d_min' % (name, timeslice_mins)
            flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
            flights.key = group_cols
            cols_to_keep.append(new_name)
            X = X[:, :, dt.join(flights)]

        # select flights leaving from SFO only
        X = X[dt.f['Origin'] == 'SFO', :]

        # Fill NaNs in DepDelay column
        X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0

        # create binary target column
        depdelay_threshold_mins = 15
        target = 'DepDelay%dm' % depdelay_threshold_mins
        X[:, target] = dt.f['DepDelay'] > depdelay_threshold_mins
        cols_to_keep.extend([
            target,
            'Year',
            'Month',
            'DayofMonth',
            'DayOfWeek',
            'CRSDepTime',
            'UniqueCarrier',
            'FlightNum',
            'TailNum',
            'CRSElapsedTime',
            'Origin',
            'Dest',
            'Distance',
            # Leaks for delay
            # 'DepTime',
            # 'ArrTime', #'CRSArrTime',
            # 'ActualElapsedTime',
            # 'AirTime', #'ArrDelay', #'DepDelay',
            # 'TaxiIn', #'TaxiOut', #'Cancelled', #'CancellationCode', #'Diverted', #'CarrierDelay',
            # #'WeatherDelay', #'NASDelay', #'SecurityDelay', #'LateAircraftDelay',
        ])
        X = X[:, cols_to_keep]

        # Join in some extra info
        join_files = [('UniqueCarrier', 'carriers.csv', 'Code'),
                      ('Origin', 'airports.csv', 'iata'),
                      ('Dest', 'airports.csv', 'iata'),
                      ('TailNum', 'plane-data.csv', 'tailnum')]

        for join_key, file, col in join_files:
            file = download(
                'https://0xdata-public.s3.amazonaws.com/data_recipes_data/%s' %
                file,
                dest_path=temp_path)
            X_join = dt.fread(file, fill=True)
            # Rename the lookup key to the join column, then prefix every
            # other lookup column with the join column's name.
            X_join.names = {col: join_key}
            X_join.names = [join_key] + [
                join_key + "_" + x for x in X_join.names if x != join_key
            ]
            X_join.key = join_key
            X = X[:, :, dt.join(X_join)]
            # The raw key column is dropped once its lookup columns are joined
            del X[:, join_key]

        split = True
        if not split:
            filename = os.path.join(
                temp_path, "flight_delays_data_recipe_%d-%d.csv" %
                (min(training), max(testing)))
            X.to_csv(filename)
            return filename
        else:
            # prepare splits (by year) and write one CSV per split for import into Driverless AI
            output_files = []
            for condition, name in [
                ((min(training) <= dt.f['Year']) &
                 (dt.f['Year'] <= max(training)), 'training'),
                ((min(testing) <= dt.f['Year']) &
                 (dt.f['Year'] <= max(testing)), 'test'),
            ]:
                X_split = X[condition, :]
                first_year = X_split[:, 'Year'].min1()
                last_year = X_split[:, 'Year'].max1()
                filename = os.path.join(
                    temp_path, "augmented_flights_%s-%d_%s.csv" %
                    (first_year, last_year, name))
                X_split.to_csv(filename)
                output_files.append(filename)
            return output_files
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        X = dt.Frame(X)
        X = self.inf_impute(X)
        self.transcribe(X=X)

        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = None

        if isinstance(self, H2ONBModel):
            # NB can only handle weights of 0 / 1
            if sample_weight is not None:
                sample_weight = (sample_weight != 0).astype(int)
            if sample_weight_eval_set is not None and len(
                    sample_weight_eval_set
            ) > 0 and sample_weight_eval_set[0] is not None:
                sample_weight_eval_set1 = sample_weight_eval_set[0]
                sample_weight_eval_set1[sample_weight_eval_set1 != 0] = 1
                sample_weight_eval_set1 = sample_weight_eval_set1.astype(int)
                sample_weight_eval_set = [sample_weight_eval_set1]

        train_X = h2o.H2OFrame(X.to_pandas())
        self.col_types = train_X.types
        train_y = h2o.H2OFrame(
            y,
            column_names=[self.target],
            column_types=[
                'categorical' if self.num_classes >= 2 else 'numeric'
            ])
        train_frame = train_X.cbind(train_y)
        if sample_weight is not None:
            train_w = h2o.H2OFrame(sample_weight,
                                   column_names=[self.weight],
                                   column_types=['numeric'])
            train_frame = train_frame.cbind(train_w)
        valid_frame = None
        valid_X = None
        valid_y = None
        model = None
        if eval_set is not None:
            valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(),
                                   column_types=self.col_types)
            valid_y = h2o.H2OFrame(
                eval_set[0][1],
                column_names=[self.target],
                column_types=[
                    'categorical' if self.num_classes >= 2 else 'numeric'
                ])
            valid_frame = valid_X.cbind(valid_y)
            if sample_weight is not None:
                if sample_weight_eval_set is None:
                    sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
                valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                                       column_names=[self.weight],
                                       column_types=['numeric'])
                valid_frame = valid_frame.cbind(valid_w)

        try:
            train_kwargs = dict()
            params = copy.deepcopy(self.params)
            if not isinstance(self, H2OAutoMLModel):
                # AutoML needs max_runtime_secs in initializer, all others in train() method
                max_runtime_secs = params.pop('max_runtime_secs', 0)
                train_kwargs = dict(max_runtime_secs=max_runtime_secs)
            if valid_frame is not None:
                train_kwargs['validation_frame'] = valid_frame
            if sample_weight is not None:
                train_kwargs['weights_column'] = self.weight

            # Don't ever use the offset column as a feature
            offset_col = None  # if no column is called offset we will pass "None" and not use this feature
            cols_to_train = []  # list of all non-offset columns

            for col in list(train_X.names):
                if not col.lower() == "offset":
                    cols_to_train.append(col)
                else:
                    offset_col = col

            orig_cols = cols_to_train  # not training on offset

            trials = 2
            for trial in range(0, trials):
                try:
                    # Models that can use an offset column
                    model = self.make_instance(**params)
                    if isinstance(model, H2OGBMModel) | isinstance(
                            model, H2ODLModel) | isinstance(
                                model, H2OGLMModel):
                        model.train(x=cols_to_train,
                                    y=self.target,
                                    training_frame=train_frame,
                                    offset_column=offset_col,
                                    **train_kwargs)
                    else:
                        model.train(x=train_X.names,
                                    y=self.target,
                                    training_frame=train_frame,
                                    **train_kwargs)
                    break
                except Exception as e:
                    print(str(e))
                    t, v, tb = sys.exc_info()
                    ex = ''.join(traceback.format_exception(t, v, tb))
                    if 'Training data must have at least 2 features' in str(
                            ex) and X.ncols != 0:
                        # if had non-zero features but h2o-3 saw as constant, ignore h2o-3 in that case
                        raise IgnoreEntirelyError
                    elif "min_rows: The dataset size is too small to split for min_rows" in str(
                            e):
                        # then h2o-3 counted as rows some reduced set, since we already protect against actual rows vs. min_rows
                        params['min_rows'] = 1  # go down to lowest value
                        # permit another trial
                    else:
                        raise
                    if trial == trials - 1:
                        # if at end of trials, raise no matter what
                        raise

            if isinstance(model, H2OAutoML):
                model = model.leader
            self.id = model.model_id
            model_path = os.path.join(user_dir(),
                                      "h2o_model." + str(uuid.uuid4()))
            model_path = h2o.save_model(model=model, path=model_path)
            with open(model_path, "rb") as f:
                raw_model_bytes = f.read()

        finally:
            if model_path is not None:
                remove(model_path)
            for xx in [
                    train_frame, train_X, train_y, model, valid_frame, valid_X,
                    valid_y
            ]:
                if xx is not None:
                    if isinstance(xx, H2OAutoML):
                        h2o.remove(xx.project_name)
                    else:
                        h2o.remove(xx)

        df_varimp = model.varimp(True)
        if df_varimp is None:
            varimp = np.ones(len(orig_cols))
        else:
            df_varimp.index = df_varimp['variable']
            df_varimp = df_varimp.iloc[:, 1]  # relative importance
            for missing in [
                    x for x in orig_cols if x not in list(df_varimp.index)
            ]:
                # h2o3 doesn't handle raw strings all the time, can hit:
                # KeyError: "None of [Index(['0_Str:secret_ChangeTemp'], dtype='object', name='variable')] are in the [index]"
                df_varimp[missing] = 0
            varimp = df_varimp[orig_cols].values  # order by fitted features
            varimp = np.nan_to_num(varimp)

        self.set_model_properties(model=raw_model_bytes,
                                  features=orig_cols,
                                  importances=varimp,
                                  iterations=self.get_iterations(model))
Пример #22
0
    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        """Train an H2O-3 model and store its bytes, features and importances on this object.

        The inputs are converted to H2OFrames, the estimator returned by
        ``self.make_instance`` is trained, the trained model is saved to a
        temporary path under ``user_dir()`` and read back as raw bytes, and the
        result is handed to ``self.set_model_properties``. The temporary model
        file and every H2O object listed in the ``finally`` block are removed
        whether or not training succeeds.

        A column whose lower-cased name is ``offset`` is never reported as a
        feature; for GBM / DL / GLM models it is passed as ``offset_column``.

        :param X: training features, converted with ``dt.Frame(X)``
        :param y: training target values
        :param sample_weight: optional per-row training weights
        :param eval_set: optional validation data; only ``eval_set[0]`` (an
            ``(X, y)`` pair) is used
        :param sample_weight_eval_set: optional validation weights; only the
            first element is used, and it defaults to all ones when
            ``sample_weight`` is given without it
        :param kwargs: accepted but not used here
        """
        X = dt.Frame(X)

        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = None

        if isinstance(self, H2ONBModel):
            # NB can only handle weights of 0 / 1
            if sample_weight is not None:
                sample_weight = (sample_weight != 0).astype(int)
            if sample_weight_eval_set is not None:
                sample_weight_eval_set = [(sample_weight_eval_set[0] != 0).astype(int)]

        train_X = h2o.H2OFrame(X.to_pandas())
        self.col_types = train_X.types
        train_y = h2o.H2OFrame(y,
                               column_names=[self.target],
                               column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
        train_frame = train_X.cbind(train_y)
        if sample_weight is not None:
            train_w = h2o.H2OFrame(sample_weight,
                                   column_names=[self.weight],
                                   column_types=['numeric'])
            train_frame = train_frame.cbind(train_w)
        valid_frame = None
        valid_X = None
        valid_y = None
        model = None
        if eval_set is not None:
            # reuse the training column types so both frames agree
            valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
            valid_y = h2o.H2OFrame(eval_set[0][1],
                                   column_names=[self.target],
                                   column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
            valid_frame = valid_X.cbind(valid_y)
            if sample_weight is not None:
                if sample_weight_eval_set is None:
                    # training is weighted, so the validation frame needs a weight column too
                    sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
                valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                                       column_names=[self.weight],
                                       column_types=['numeric'])
                valid_frame = valid_frame.cbind(valid_w)

        try:
            train_kwargs = dict()
            params = copy.deepcopy(self.params)
            if not isinstance(self, H2OAutoMLModel):
                # AutoML needs max_runtime_secs in initializer, all others in train() method.
                # Default to 0 when the key is absent instead of raising KeyError,
                # matching the other fit() implementations in this file.
                max_runtime_secs = params.pop('max_runtime_secs', 0)
                train_kwargs = dict(max_runtime_secs=max_runtime_secs)
            if valid_frame is not None:
                train_kwargs['validation_frame'] = valid_frame
            if sample_weight is not None:
                train_kwargs['weights_column'] = self.weight
            model = self.make_instance(**params)

            # Don't ever use the offset column as a feature
            offset_col = None  # if no column is called offset we will pass "None" and not use this feature
            cols_to_train = []  # list of all non-offset columns

            for col in list(train_X.names):
                if not col.lower() == "offset":
                    cols_to_train.append(col)
                else:
                    offset_col = col

            orig_cols = cols_to_train  # not training on offset

            # Models that can use an offset column
            if isinstance(model, (H2OGBMModel, H2ODLModel, H2OGLMModel)):
                model.train(x=cols_to_train, y=self.target, training_frame=train_frame, offset_column=offset_col,
                            **train_kwargs)
            else:
                model.train(x=train_X.names, y=self.target, training_frame=train_frame, **train_kwargs)

            if isinstance(model, H2OAutoML):
                # keep only the best model of the AutoML run
                model = model.leader
            self.id = model.model_id
            model_path = os.path.join(user_dir(), "h2o_model." + str(uuid.uuid4()))
            # save_model returns the actual path written, which replaces the requested one
            model_path = h2o.save_model(model=model, path=model_path)
            with open(model_path, "rb") as f:
                raw_model_bytes = f.read()

        finally:
            if model_path is not None:
                remove(model_path)
            for xx in [train_frame, train_X, train_y, model, valid_frame, valid_X, valid_y]:
                if xx is not None:
                    if isinstance(xx, H2OAutoML):
                        h2o.remove(xx.project_name)
                    else:
                        h2o.remove(xx)

        # NOTE(review): model was passed to h2o.remove() above; presumably varimp()
        # still works from the client-side object - confirm.
        df_varimp = model.varimp(True)
        if df_varimp is None:
            varimp = np.ones(len(orig_cols))
        else:
            df_varimp.index = df_varimp['variable']
            df_varimp = df_varimp.iloc[:, 1]  # relative importance
            for missing in [x for x in orig_cols if x not in list(df_varimp.index)]:
                # h2o3 doesn't handle raw strings all the time, can hit:
                # KeyError: "None of [Index(['0_Str:secret_ChangeTemp'], dtype='object', name='variable')] are in the [index]"
                df_varimp[missing] = 0
            varimp = df_varimp[orig_cols].values  # order by fitted features
            varimp = np.nan_to_num(varimp)

        self.set_model_properties(model=raw_model_bytes,
                                  features=orig_cols,
                                  importances=varimp,
                                  iterations=self.get_iterations(model))
    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        """Train an H2O-3 model on X/y and record the serialized model on this object.

        Inputs are turned into H2OFrames, the estimator built by
        ``self.make_instance`` is trained on all columns of ``X``, the model is
        saved under ``user_dir()``, read back as bytes and passed to
        ``self.set_model_properties`` together with the per-feature
        importances. The temp model file and the H2O objects created here are
        removed in the ``finally`` block.

        :param X: training features, converted with ``dt.Frame(X)``
        :param y: training target values
        :param sample_weight: optional per-row training weights
        :param eval_set: optional validation data; only ``eval_set[0]`` is used
        :param sample_weight_eval_set: optional validation weights; only the
            first element is used, all ones if omitted while ``sample_weight``
            is given
        :param kwargs: accepted but not used here
        """
        X = dt.Frame(X)
        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = None

        orig_cols = list(X.names)
        train_X = h2o.H2OFrame(X.to_pandas())
        self.col_types = train_X.types
        train_y = h2o.H2OFrame(
            y,
            column_names=[self.target],
            column_types=['categorical' if self.num_classes >= 2 else 'numeric'],
        )
        train_frame = train_X.cbind(train_y)
        use_weights = sample_weight is not None
        if use_weights:
            train_w = h2o.H2OFrame(sample_weight, column_names=[self.weight], column_types=['numeric'])
            train_frame = train_frame.cbind(train_w)

        valid_frame = valid_X = valid_y = None
        model = None
        if eval_set is not None:
            eval_X, eval_y = eval_set[0][0], eval_set[0][1]
            valid_X = h2o.H2OFrame(eval_X.to_pandas(), column_types=self.col_types)
            valid_y = h2o.H2OFrame(
                eval_y,
                column_names=[self.target],
                column_types=['categorical' if self.num_classes >= 2 else 'numeric'],
            )
            valid_frame = valid_X.cbind(valid_y)
            if use_weights:
                if sample_weight_eval_set is None:
                    # weighted training without validation weights: weigh every row equally
                    sample_weight_eval_set = [np.ones(len(eval_y))]
                valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                                       column_names=[self.weight],
                                       column_types=['numeric'])
                valid_frame = valid_frame.cbind(valid_w)

        try:
            train_kwargs = {'max_runtime_secs': self.params.get('max_runtime_secs', 0)}
            if valid_frame is not None:
                train_kwargs['validation_frame'] = valid_frame
            if use_weights:
                train_kwargs['weights_column'] = self.weight

            model = self.make_instance(**self.params)
            model.train(x=train_X.names, y=self.target, training_frame=train_frame, **train_kwargs)
            self.id = model.model_id

            # save_model returns the path it actually wrote to
            requested_path = os.path.join(user_dir(), "h2o_model." + str(uuid.uuid4()))
            model_path = h2o.save_model(model=model, path=requested_path)
            with open(model_path, "rb") as model_file:
                raw_model_bytes = model_file.read()
        finally:
            if model_path is not None:
                remove(model_path)
            for h2o_obj in (train_frame, train_X, train_y, model, valid_frame, valid_X, valid_y):
                if h2o_obj is None:
                    continue
                h2o.remove(h2o_obj)

        df_varimp = model.varimp(True)
        if df_varimp is not None:
            df_varimp.index = df_varimp['variable']
            df_varimp = df_varimp.iloc[:, 1]  # relative importance
            reported = list(df_varimp.index)
            for col in orig_cols:
                if col not in reported:
                    # h2o3 doesn't handle raw strings all the time, can hit:
                    # KeyError: "None of [Index(['0_Str:secret_ChangeTemp'], dtype='object', name='variable')] are in the [index]"
                    df_varimp[col] = 0
            # order by fitted features, NaN importances become 0
            varimp = np.nan_to_num(df_varimp[orig_cols].values)
        else:
            varimp = np.ones(len(orig_cols))

        self.set_model_properties(model=raw_model_bytes,
                                  features=orig_cols,
                                  importances=varimp,
                                  iterations=self.get_iterations(model))
Пример #24
0
    def transcribe_params(self, params=None, **kwargs):
        """Convert LightGBM/XGBoost-style parameters into CatBoost constructor parameters.

        Works on a shallow copy of ``params`` (``self.params`` when ``params``
        is None), so the passed dict itself is not mutated. Keys that the
        CatBoost estimator constructor does not accept are dropped,
        objective/metric names are mapped to CatBoost names, CPU/GPU specific
        restrictions and upper limits are applied, and system settings are
        filled in from ``self.params_base`` and ``config``.

        :param params: parameter dict to transcribe, or None to use ``self.params``
        :param kwargs: fit-time keyword arguments; only passed to ``self.have_eval_set``
        :return: new dict of parameters
        :raises AssertionError: if ``n_estimators`` or ``learning_rate`` is not
            among the parameters that survived filtering
        """
        if params is None:
            params = self.params  # reference
        params = params.copy()  # don't contaminate DAI params, since we know we use lgbm-xgb as base

        has_eval_set = self.have_eval_set(kwargs)  # only needs (and does) operate at fit-time
        from catboost import CatBoostClassifier, CatBoostRegressor

        estimator_cls = CatBoostRegressor if self.num_classes == 1 else CatBoostClassifier
        fullspec = inspect.getfullargspec(estimator_cls)
        # ``defaults`` lines up with the LAST len(defaults) entries of ``args``
        # (``args`` starts with ``self``), so take the trailing names rather than
        # zipping from the front, which would drop the final argument.
        n_defaults = len(fullspec.defaults or ())
        allowed_params = set(fullspec.args[len(fullspec.args) - n_defaults:])

        for k in list(params):
            if k not in allowed_params:
                del params[k]

        # now transcribe
        k = 'boosting_type'
        if k in params:
            params[k] = 'Plain'

        k = 'grow_policy'
        if k in params:
            params[k] = 'Depthwise' if params[k] == 'depthwise' else 'Lossguide'

        k = 'eval_metric'
        if k in params and params[k] is not None and params[k].upper() == 'AUC':
            params[k] = 'AUC'

        # lgbm/xgb objective and metric names -> CatBoost names
        name_map = {
            'regression': 'RMSE',
            'mse': 'RMSE',
            'mae': 'MAE',
            "mape": 'MAPE',
            "huber": 'Huber',
            "fair": 'FairLoss',
            "rmse": "RMSE",
            "gamma": "RMSE",  # unsupported by catboost
            "tweedie": "Tweedie",
            "poisson": "Poisson",
            "quantile": "Quantile",
            'binary': 'Logloss',
            'auc': 'AUC',
            "xentropy": 'CrossEntropy',
            'multiclass': 'MultiClass'
        }

        k = 'objective'
        if k in params and params[k] in name_map:
            params[k] = name_map[params[k]]

        k = 'eval_metric'
        if k in params and params[k] is not None and params[k] in name_map:
            params[k] = name_map[params[k]]

        if 'objective' in params:
            # don't randomly choose these since then model not stable GA -> final
            # but backup shouldn't really be used AFAIK
            if params['objective'] == 'Huber':
                backup = float(config.huber_alpha_list[0])
                params['delta'] = params.pop('alpha', backup)
            if params['objective'] == 'Quantile':
                backup = float(config.quantile_alpha[0])
                params['delta'] = params.pop('alpha', backup)
            if params['objective'] == 'Tweedie':
                backup = float(config.tweedie_variance_power_list[0])
                params['tweedie_variance_power'] = params.pop(
                    'tweedie_variance_power', backup)
            if params['objective'] == 'FairLoss':
                backup = float(config.fair_c_list[0])
                params['smoothness'] = params.pop('fair_c', backup)

        params.pop('verbose', None)
        params.pop('verbose_eval', None)
        params.pop('logging_level', None)

        if 'grow_policy' in params:
            if params['grow_policy'] == 'Lossguide':
                params.pop('max_depth', None)
            if params['grow_policy'] == 'Depthwise':
                params.pop('num_leaves', None)
        else:
            params['grow_policy'] = 'SymmetricTree'

        # NOTE(review): 'task_type' is read unguarded below; presumably
        # get_uses_gpus() sets it on params - confirm.
        uses_gpus, n_gpus = self.get_uses_gpus(params)

        if params['task_type'] == 'CPU':
            params.pop('grow_policy', None)
            params.pop('num_leaves', None)
            params.pop('max_leaves', None)
            params.pop('min_data_in_leaf', None)
            params.pop('min_child_samples', None)

        if params['task_type'] == 'GPU':
            params.pop('colsample_bylevel', None)  # : 0.35

        if 'grow_policy' in params and params['grow_policy'] in [
                'Depthwise', 'SymmetricTree'
        ]:
            if 'max_depth' in params and params['max_depth'] in [0, -1]:
                # NOTE(review): natural log is used here, not log2 - confirm this is intended
                params['max_depth'] = max(
                    2, int(np.log(params.get('num_leaves', 2**6))))
        else:
            params.pop('max_depth', None)
            params.pop('depth', None)
        if 'grow_policy' in params and params['grow_policy'] == 'Lossguide':
            # if 'num_leaves' in params and params['num_leaves'] == -1:
            #    params['num_leaves'] = 2 ** params.get('max_depth', 6)
            if 'max_leaves' in params and params['max_leaves'] in [0, -1]:
                params['max_leaves'] = 2**params.get('max_depth', 6)
        else:
            params.pop('max_leaves', None)
        if 'num_leaves' in params and 'max_leaves' in params:
            params.pop('num_leaves', None)
        # apply limits
        if 'max_leaves' in params:
            params['max_leaves'] = min(params['max_leaves'], 65536)
        if 'max_depth' in params:
            params['max_depth'] = min(params['max_depth'], 16)

        params.update({
            'train_dir': user_dir(),
            'allow_writing_files': False,
            'thread_count': self.params_base.get('n_jobs', 4)
        })

        if 'reg_lambda' in params and params['reg_lambda'] <= 0.0:
            params['reg_lambda'] = 3.0  # assume meant unset

        if self._can_handle_categorical:
            if 'max_cat_to_onehot' in params:
                params['one_hot_max_size'] = params['max_cat_to_onehot']
                params.pop('max_cat_to_onehot', None)
            if uses_gpus:
                params['one_hot_max_size'] = min(
                    params.get('one_hot_max_size', 255), 255)
            else:
                params['one_hot_max_size'] = min(
                    params.get('one_hot_max_size', 65535), 65535)

        if 'one_hot_max_size' in params:
            params['one_hot_max_size'] = max(self._min_one_hot_max_size,
                                             params['one_hot_max_size'])

        params['max_bin'] = params.get('max_bin', 254)
        if params['task_type'] == 'CPU':
            params['max_bin'] = min(
                params['max_bin'],
                254)  # https://github.com/catboost/catboost/issues/1010
        if params['task_type'] == 'GPU':
            params['max_bin'] = min(
                params['max_bin'],
                127)  # https://github.com/catboost/catboost/issues/1010

        if uses_gpus:
            # https://catboost.ai/docs/features/training-on-gpu.html
            params['devices'] = "%d-%d" % (self.params_base.get(
                'gpu_id', 0), self.params_base.get('gpu_id', 0) + n_gpus - 1)
            #params['gpu_ram_part'] = 0.3  # per-GPU, assumes GPU locking or no other experiments running

        if self.num_classes > 2:
            params.pop("eval_metric", None)

        # overrides the train_dir set from user_dir() above
        params['train_dir'] = self.context.experiment_tmp_dir
        params['allow_writing_files'] = False

        # assume during fit self.params_base could have been updated
        assert 'n_estimators' in params
        assert 'learning_rate' in params
        params['n_estimators'] = self.params_base.get('n_estimators', 100)
        params['learning_rate'] = self.params_base.get(
            'learning_rate', config.min_learning_rate)
        params['learning_rate'] = min(
            params['learning_rate'],
            0.5)  # 1.0 leads to illegal access on GPUs
        params['learning_rate'] = max(
            config.min_learning_rate,
            max(self._min_learning_rate_catboost, params['learning_rate']))
        if 'early_stopping_rounds' not in params and has_eval_set:
            params['early_stopping_rounds'] = 150  # temp fix
            # assert 'early_stopping_rounds' in params

        if uses_gpus:
            params.pop('sampling_frequency', None)

        # 'bootstrap_type' and 'objective' may be absent, so read them with .get()
        # instead of raising KeyError before the "not in params" handling below.
        if not uses_gpus and params.get('bootstrap_type') == 'Poisson':
            params['bootstrap_type'] = 'Bayesian'  # revert to default
        if uses_gpus and params.get('bootstrap_type') == 'MVS':
            params['bootstrap_type'] = 'Bayesian'  # revert to default

        if params.get('bootstrap_type') not in ['Poisson', 'Bernoulli']:
            params.pop(
                'subsample',
                None)  # only allowed for those 2 bootstrap_type settings

        if params.get('bootstrap_type') not in ['Bayesian']:
            params.pop('bagging_temperature', None)

        if not (self.num_classes == 2 and params.get('objective') == 'Logloss'):
            params.pop('scale_pos_weight', None)

        # go back to some default eval_metric
        if self.num_classes == 1:
            if 'eval_metric' not in params or params['eval_metric'] not in [
                    'MAE', 'MAPE', 'Poisson', 'Quantile', 'RMSE',
                    'LogLinQuantile', 'Lq', 'Huber', 'Expectile', 'FairLoss',
                    'NumErrors', 'SMAPE', 'R2', 'MSLE', 'MedianAbsoluteError'
            ]:
                params['eval_metric'] = 'RMSE'
        elif self.num_classes == 2:
            if 'eval_metric' not in params or params['eval_metric'] not in [
                    'Logloss', 'CrossEntropy', 'Precision', 'Recall', 'F1',
                    'BalancedAccuracy', 'BalancedErrorRate', 'MCC', 'Accuracy',
                    'CtrFactor', 'AUC', 'NormalizedGini', 'BrierScore',
                    'HingeLoss', 'HammingLoss', 'ZeroOneLoss', 'Kappa',
                    'WKappa', 'LogLikelihoodOfPrediction'
            ]:
                params['eval_metric'] = 'Logloss'
        else:
            if 'eval_metric' not in params or params['eval_metric'] not in [
                    'MultiClass', 'MultiClassOneVsAll', 'Precision', 'Recall',
                    'F1', 'TotalF1', 'MCC', 'Accuracy', 'HingeLoss',
                    'HammingLoss', 'ZeroOneLoss', 'Kappa', 'WKappa', 'AUC'
            ]:
                params['eval_metric'] = 'MultiClass'

        # set system stuff here
        params['silent'] = self.params_base.get('silent', True)
        if config.debug_daimodel_level >= 1:
            params['silent'] = False  # Can enable for tracking improvement in console/dai.log if have access
        params['random_state'] = self.params_base.get('random_state', 1234)
        params['thread_count'] = self.params_base.get(
            'n_jobs', max(1, physical_cores_count))  # -1 is not supported

        return params
Пример #25
0
import os
import uuid
from collections import OrderedDict
from zipfile import ZipFile

from h2oaicore.data import CustomData

import pandas as pd
import datatable as dt

from h2oaicore.systemutils import user_dir
from h2oaicore.systemutils_more import download
# Working directory for the downloaded archive: a sub-folder of user_dir() named
# after the first 6 characters of a random uuid4. Computed once at import time.
tmp_dir = os.path.join(user_dir(), str(uuid.uuid4())[:6])
# URL of the zipped competition data that create_data() downloads
path_to_zip = "https://files.slack.com/files-pri/T0329MHH6-F0150BK8L01/download/m5-forecasting-accuracy.zip?pub_secret=acfcbf3386"

# Named hold-out splits; each value is a range of 28 consecutive integers
# (presumably day indices of the M5 series - confirm against the consumer).
holdout_splits = {
    'm5_private': range(1942, 1942 + 28)  # private LB
}


class PrepareM5Data(CustomData):
    """ Prepare data for m5 Kaggle Time-Series Forecast competition"""
    @staticmethod
    def create_data(X: dt.Frame = None):
        # Download the competition archive into tmp_dir and unpack it in place
        file = download(url=path_to_zip, dest_path=tmp_dir)
        with ZipFile(file, 'r') as zip_ref:
            zip_ref.extractall(tmp_dir)

        # NOTE(review): presumably the number of leading identifier columns in
        # the sales file; it is not used in the visible part of this method.
        num_id_cols = 6
        # Load the extracted sales table with datatable
        main_data = dt.fread(
            os.path.join(tmp_dir, "sales_train_evaluation.csv"))
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        """Download the Two Sigma rental-listing Kaggle data and build train/test frames.

        Uses the ``kaggle`` command line tool with the module-level
        ``kaggle_username`` / ``kaggle_key`` credentials to fetch
        ``train.json.zip`` and ``test.json.zip``, derives a few text/count
        features and returns both sets as datatable Frames.

        :param X: not used
        :return: ``[]`` when no Kaggle user name is configured, otherwise a
            dict with keys ``two_sigma_train`` and ``two_sigma_test``
            (NOTE: the return annotation does not list ``dict``)
        :raises TimeoutError: if a download takes longer than the timeout
        :raises subprocess.CalledProcessError: if the ``kaggle`` command fails
        """
        import os
        from h2oaicore.systemutils import config

        if kaggle_username == "XXX" or not kaggle_username:
            return []

        # credentials are read from the environment by the kaggle CLI child process
        os.putenv("KAGGLE_USERNAME", kaggle_username)
        os.putenv("KAGGLE_KEY", kaggle_key)

        # find sample submission file
        temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
        os.makedirs(temp_path, exist_ok=True)
        sub_file_dir = os.path.join(temp_path,
                                    "kaggle_%s" % str(uuid.uuid4())[:4])

        timeout_secs = 120
        for archive in ('train.json.zip', 'test.json.zip'):
            # build the argument list directly so a download path containing
            # whitespace is not broken apart by str.split()
            cmd = [
                'kaggle', 'competitions', 'download',
                '-c', 'two-sigma-connect-rental-listing-inquiries',
                '-f', archive,
                '-p', sub_file_dir,
                '-q',
            ]
            try:
                subprocess.check_output(cmd, timeout=timeout_secs)
            except subprocess.TimeoutExpired as e:
                # check_output signals a timeout with subprocess.TimeoutExpired,
                # which is not a subclass of the builtin TimeoutError
                raise TimeoutError(
                    "Took longer than %s seconds, increase timeout" %
                    timeout_secs) from e

        train = pd.read_json(os.path.join(sub_file_dir, 'train.json.zip'))
        test = pd.read_json(os.path.join(sub_file_dir, 'test.json.zip'))

        for df in [train, test]:
            df['str_features'] = df['features'].apply(lambda x: ' . '.join(x))
            df['nb_features'] = df['features'].apply(len)
            df['nb_photos'] = df['photos'].apply(len)
            df['cat_address'] = df['street_address'] + ' ' + df[
                'display_address']

        features = [
            'bathrooms', 'bedrooms', 'building_id', 'created', 'description',
            'display_address', 'latitude', 'listing_id', 'longitude',
            'manager_id', 'price', 'street_address', 'str_features',
            'nb_features', 'nb_photos', 'cat_address'
        ]

        # only the training frame carries the target column
        return {
            'two_sigma_train': dt.Frame(train[features + ['interest_level']]),
            'two_sigma_test': dt.Frame(test[features])
        }
    def __init__(self, **kwargs):
        """Set the preprocessing switches and make sure the needed NLTK resources exist.

        The NLTK packages are downloaded into a ``nltk_data`` folder below
        ``user_dir()`` while holding a file lock. For each enabled step
        (stemming, lemmatization, stop-word removal) the NLTK objects are
        created and exercised once; on ``LookupError`` the resource archives
        are fetched directly, unpacked and copied into the data folder, and
        the objects are created again.

        :param kwargs: forwarded unchanged to the parent initializer
        """
        super().__init__(**kwargs)
        # switches for the individual steps; turn off as needed
        self.do_stemming = True
        self.do_lemmatization = True
        self.remove_stopwords = True

        import nltk
        nltk_data_path = os.path.join(user_dir(),
                                      config.contrib_env_relative_directory,
                                      "nltk_data")
        nltk_temp_path = os.path.join(user_dir(), "nltk_data")
        nltk.data.path.append(nltk_data_path)
        os.makedirs(nltk_data_path, exist_ok=True)
        lock_file = os.path.join(nltk_data_path, "nltk.lock")
        with filelock.FileLock(lock_file):
            for package in ('stopwords',
                            'punkt',
                            'averaged_perceptron_tagger',
                            'maxent_treebank_pos_tagger',
                            'wordnet',
                            'sonoritysequencing'):
                nltk.download(package, download_dir=nltk_data_path)

        # stemming resources, with manual download as fallback
        if self.do_stemming:
            try:
                self.stemmer = nltk.stem.porter.PorterStemmer()
                self.stemmer.stem("test")
            except LookupError:
                os.makedirs(nltk_data_path, exist_ok=True)
                os.makedirs(nltk_temp_path, exist_ok=True)
                tokenizer_path = os.path.join(nltk_data_path, "tokenizers")
                os.makedirs(tokenizer_path, exist_ok=True)
                punkt_zip = download(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip",
                    dest_path=nltk_temp_path)
                self.unzip_file(punkt_zip, tokenizer_path)
                self.atomic_copy(punkt_zip, tokenizer_path)
                self.stemmer = nltk.stem.porter.PorterStemmer()
                self.stemmer.stem("test")

        # lemmatization resources, with manual download as fallback
        if self.do_lemmatization:
            try:
                from nltk.corpus import wordnet
                self.lemmatizer = nltk.stem.WordNetLemmatizer()
                self.pos_tagger = nltk.pos_tag
                self.lemmatizer.lemmatize("test", wordnet.NOUN)
                self.pos_tagger("test")
            except LookupError:
                os.makedirs(nltk_data_path, exist_ok=True)
                os.makedirs(nltk_temp_path, exist_ok=True)
                tagger_path = os.path.join(nltk_data_path, "taggers")
                corpora_path = os.path.join(nltk_data_path, "corpora")
                os.makedirs(tagger_path, exist_ok=True)
                os.makedirs(corpora_path, exist_ok=True)
                # (archive url, folder it belongs in)
                resources = [
                    ("https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip",
                     tagger_path),
                    ("https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/maxent_treebank_pos_tagger.zip",
                     tagger_path),
                    ("https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/wordnet.zip",
                     corpora_path),
                ]
                # all downloads first, then all unzips, then all copies
                fetched = []
                for url, target_dir in resources:
                    fetched.append((download(url, dest_path=nltk_temp_path), target_dir))
                for archive, target_dir in fetched:
                    self.unzip_file(archive, target_dir)
                for archive, target_dir in fetched:
                    self.atomic_copy(archive, target_dir)
                from nltk.corpus import wordnet
                self.lemmatizer = nltk.stem.WordNetLemmatizer()
                self.pos_tagger = nltk.pos_tag
                self.lemmatizer.lemmatize("test", wordnet.NOUN)
                self.pos_tagger("test")
            # first letter of a POS tag -> wordnet part of speech
            self.wordnet_map = dict(N=wordnet.NOUN,
                                    V=wordnet.VERB,
                                    J=wordnet.ADJ,
                                    R=wordnet.ADV,
                                    O=wordnet.NOUN)

        # stop-word list, with manual download as fallback
        if self.remove_stopwords:
            try:
                self.stopwords = set(nltk.corpus.stopwords.words('english'))
            except LookupError:
                os.makedirs(nltk_data_path, exist_ok=True)
                os.makedirs(nltk_temp_path, exist_ok=True)
                corpora_path = os.path.join(nltk_data_path, "corpora")
                os.makedirs(corpora_path, exist_ok=True)
                stopwords_zip = download(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip",
                    dest_path=nltk_temp_path)
                self.unzip_file(stopwords_zip, corpora_path)
                self.atomic_copy(stopwords_zip, corpora_path)
                self.stopwords = set(nltk.corpus.stopwords.words('english'))
Пример #28
0
def _setup_recipe():
    """Install the daal4py runtime into the contrib environment (currently disabled).

    The function returns ``True`` right after its imports, so the installation
    steps below are intentionally unreachable for now and are only kept for a
    later migration.  When re-enabled they:

    * refuse to run on ``ppc64le`` (``True`` under ``config.hard_asserts``,
      ``RuntimeError`` otherwise),
    * skip all work when the ``daal_is_installed`` marker file exists,
    * download four archives (falling back to the S3 mirror if any primary
      download fails), extract them into the contrib environment directory,
    * copy the libfabric and TBB shared objects into ``<env>/lib`` unless a
      file of that name is already there,
    * write the marker file and remove the downloaded archives.

    All imports stay inside the function body because, per the note on the
    ``arch_type`` import, this function is parsed out and run separately.

    Returns:
        bool: ``True`` on every non-raising path.

    Raises:
        RuntimeError: on ``ppc64le`` when ``config.hard_asserts`` is falsy
            (only reachable once the early return is removed).
    """
    # for DAI 1.7.0 one is required to run this function manually
    # in DAI >=1.7.1, this function will be run by DAI itself
    import glob
    import os
    import shutil

    from h2oaicore.systemutils_more import extract, download
    from h2oaicore.systemutils import config, remove
    from h2oaicore.systemutils import user_dir

    from h2oaicore.systemutils_more import arch_type  # don't remove this import, setup_recipe parsed-out separately
    return True  # WIP: Disable daal for now in general, just leave recipe floating there for migration purposes

    if arch_type == "ppc64le":
        if config.hard_asserts:
            # in CI testing just ignore
            return True
        # for user use, raise
        raise RuntimeError("Cannot use daal on PPC")

    env_dir = os.path.join(user_dir(), config.contrib_env_relative_directory)
    daal_is_installed_path = os.path.join(env_dir, "daal")
    daal_is_installed_file = os.path.join(daal_is_installed_path,
                                          "daal_is_installed")
    if os.path.isfile(daal_is_installed_file):
        # Marker present: a previous run completed. Previously this path fell
        # off the end of the function and returned None instead of True.
        return True

    daal_temp_path = os.path.join(user_dir(),
                                  config.contrib_relative_directory,
                                  "daal")
    os.makedirs(daal_temp_path, exist_ok=True)

    prefix = "https://anaconda.org/intel"
    primary_urls = (
        "%s/daal4py/2021.2.0/download/linux-64/daal4py-2021.2.0-py38_intel_358.tar.bz2"
        % prefix,
        "%s/impi_rt/2021.2.0/download/linux-64/impi_rt-2021.2.0-intel_215.tar.bz2"
        % prefix,
        "%s/daal/2021.2.0/download/linux-64/daal-2021.2.0-intel_358.tar.bz2"
        % prefix,
        "https://github.com/intel/daal/releases/download/2019_u4/l_daal_oss_p_2019.4.007.tgz",
    )
    mirror_urls = (
        "https://0xdata-public.s3.amazonaws.com/daal4py-2019.4-py36h7b7c402_6.tar.bz2",
        "https://0xdata-public.s3.amazonaws.com/impi_rt-2019.4-intel_243.tar.bz2",
        "https://0xdata-public.s3.amazonaws.com/daal-2019.4-intel_243.tar.bz2",
        "https://0xdata-public.s3.amazonaws.com/l_daal_oss_p_2019.4.007.tgz",
    )
    try:
        archives = [
            download(url, dest_path=daal_temp_path) for url in primary_urls
        ]
    except Exception:
        # Any failed primary download restarts the whole set from the mirror.
        # `except Exception` (not a bare except) lets Ctrl-C / SystemExit
        # abort the setup instead of silently triggering the fallback.
        archives = [
            download(url, dest_path=daal_temp_path) for url in mirror_urls
        ]

    os.makedirs(os.path.join(env_dir, "info"), exist_ok=True)

    # The first three archives are the .tar.bz2 packages; the fourth is the
    # .tgz bundle and is the only one extracted with the extra "gz" argument.
    for archive in archives[:3]:
        extract(archive, env_dir)
    extract(archives[3], env_dir, "gz")

    # Expose the bundled shared objects in <env>/lib without overwriting
    # anything that is already there.
    lib_dir = os.path.join(env_dir, "lib")
    shared_lib_dirs = (
        os.path.join(env_dir, "lib/libfabric/"),
        os.path.join(
            env_dir,
            "l_daal_oss_p_2019.4.007/daal_prebuild/linux/tbb/lib/intel64_lin/gcc4.4/"
        ),
    )
    for src_dir in shared_lib_dirs:
        for lib_file in glob.glob(os.path.join(src_dir, "*.so*")):
            new_file = os.path.join(lib_dir, os.path.basename(lib_file))
            if not os.path.isfile(new_file):
                shutil.copy(lib_file, new_file)

    # Write the marker only after everything above succeeded.
    os.makedirs(daal_is_installed_path, exist_ok=True)
    with open(daal_is_installed_file, "wt") as f:
        f.write("DONE")

    for archive in archives:
        remove(archive)
    return True
Пример #29
0
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        """Train the explainable neural network (XNN) and export diagnostics.

        Two identically configured networks are built.  The first is trained
        with a 30% validation split and early stopping on ``val_loss`` to find
        out how many epochs to run; the second is then trained on all rows for
        that many epochs and is the model handed to
        ``self.set_model_properties``.  Afterwards the projection-layer
        weights, the ridge functions and the ridge-function importances are
        plotted and written as PNG/CSV files into the folder returned by
        ``self._create_tmp_folder``.

        Args:
            X: Feature frame.  It is accessed through ``names``, ``shape``,
                ``X[:, col]`` and ``to_numpy()``.  Missing values are replaced
                column by column and written back into ``X`` before the
                conversion to numpy.
            y: Target values; label-encoded against ``self.labels`` when
                ``self.num_classes >= 2``.
            sample_weight: Not used by this implementation.
            eval_set: Not used by this implementation.
            sample_weight_eval_set: Not used by this implementation.
            **kwargs: Not used by this implementation.
        """

        # Get column names
        orig_cols = list(X.names)

        # got_cpu_tf / got_gpu_tf / tf are not referenced below; the imports
        # are kept as in the original (presumably for their import-time
        # effects -- TODO confirm).
        from h2oaicore.tensorflow_dynamic import got_cpu_tf, got_gpu_tf
        import tensorflow as tf
        import shap
        import pandas as pd

        self.setup_keras_session()

        import h2oaicore.keras as keras
        import matplotlib.pyplot as plt

        if not hasattr(self, 'save_model_path'):
            self.save_model_path = os.path.join(user_dir(),
                                                "custom_xnn_model.hdf5")

        np.random.seed(self.random_state)

        my_init = keras.initializers.RandomUniform(seed=self.random_state)

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Set up temp folder
        tmp_folder = self._create_tmp_folder(logger)

        def _mlp(inputs, idx, arch=(20, 12), activation='relu'):
            # Set up subnetwork `idx`: one Dense layer per entry of `arch`,
            # followed by a single linear, L1-regularised output unit.
            mlp = keras.layers.Dense(arch[0],
                                     activation=activation,
                                     name='mlp_{}_dense_0'.format(idx),
                                     kernel_initializer=my_init)(inputs)
            for depth, units in enumerate(arch[1:], start=1):
                mlp = keras.layers.Dense(units,
                                         activation=activation,
                                         name='mlp_{}_dense_{}'.format(
                                             idx, depth),
                                         kernel_initializer=my_init)(mlp)

            # Output of the MLP
            return keras.layers.Dense(
                1,
                activation='linear',
                name='mlp_{}_dense_last'.format(idx),
                kernel_regularizer=keras.regularizers.l1(1e-3),
                kernel_initializer=my_init)(mlp)

        # define base model
        def xnn_initialize(features,
                           ridge_functions=3,
                           arch=(20, 12),
                           learning_rate=0.01,
                           bg_samples=100,
                           beta1=0.9,
                           beta2=0.999,
                           dec=0.0,
                           ams=True,
                           bseed=None,
                           is_categorical=False):
            # Build and compile one XNN.  `bg_samples` and `bseed` are
            # accepted but not used.

            # Input to the network, our observation containing all the features
            main_input = keras.layers.Input(shape=(features, ),
                                            name='main_input')

            # Record current column names
            loggerinfo(logger, "XNN LOG")
            loggerdata(logger, "Feature list:")
            loggerdata(logger, str(orig_cols))

            # Input to ridge function number i is the dot product of our
            # original input vector times coefficients
            projection = keras.layers.Dense(ridge_functions,
                                            name="projection_layer",
                                            activation='linear')(main_input)

            # Each subnetwork uses only 1 neuron from the projection layer as
            # input so we need to split it
            ridge_networks = [
                _mlp(ridge_input, idx, arch) for idx, ridge_input in
                enumerate(SplitLayer(ridge_functions)(projection))
            ]

            added = keras.layers.Concatenate(
                name='concatenate_1')(ridge_networks)

            # Add the correct output layer and loss for the problem
            if is_categorical:
                out_activation, loss = 'sigmoid', 'binary_crossentropy'
            else:
                out_activation, loss = 'linear', 'mean_squared_error'
            out = keras.layers.Dense(1,
                                     activation=out_activation,
                                     input_shape=(ridge_functions, ),
                                     name='main_output')(added)

            model = keras.models.Model(inputs=main_input, outputs=out)

            optimizer = keras.optimizers.Adam(lr=learning_rate,
                                              beta_1=beta1,
                                              beta_2=beta2,
                                              decay=dec,
                                              amsgrad=ams)
            model.compile(loss={'main_output': loss}, optimizer=optimizer)

            return model

        def get_shap(X, model):
            # Mean absolute Shapley value per feature, using at most 1000
            # randomly chosen rows of X as the DeepExplainer background.
            np.random.seed(24)
            bg_samples = min(X.shape[0], 1000)
            rows = np.random.choice(X.shape[0], bg_samples, replace=False)
            if isinstance(X, pd.DataFrame):
                background = X.iloc[rows]
            else:
                background = X[rows]

            # Explain predictions of the model on the subset
            explainer = shap.DeepExplainer(model, background)
            shap_values = explainer.shap_values(X)

            return np.abs(shap_values[0]).mean(axis=0)

        # Initialize the xnn's
        features = X.shape[1]
        self.is_cat = self.num_classes >= 2
        if self.is_cat:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

        def _new_xnn():
            # Both networks share one configuration.  beta2 used to be fed
            # from params["beta_1"], so Adam's second-moment decay could never
            # be configured; params["beta_2"] is honoured when present and the
            # previous value is used as fallback otherwise.
            return xnn_initialize(features=features,
                                  ridge_functions=features,
                                  arch=self.params["arch"],
                                  learning_rate=self.params["lr"],
                                  beta1=self.params["beta_1"],
                                  beta2=self.params.get(
                                      "beta_2", self.params["beta_1"]),
                                  dec=self.params["decay"],
                                  ams=self.params["amsgrad"],
                                  is_categorical=self.is_cat)

        xnn1 = _new_xnn()  # used only to find the number of epochs
        xnn = _new_xnn()  # the model that is kept

        # Replace missing values with a value smaller than all observed values
        self.min = dict()
        for col in X.names:
            XX = X[:, col]
            self.min[col] = XX.min1()
            if self.min[col] is None or np.isnan(self.min[col]):
                self.min[col] = -1e10
            else:
                self.min[col] -= 1
            XX.replace(None, self.min[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        X = X.to_numpy()

        inputs = {'main_input': X}
        verbose = 0

        # Train the neural network once with early stopping and a validation set
        history_cb = keras.callbacks.History()
        es = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min')

        history = xnn1.fit(inputs,
                           y,
                           epochs=self.params["n_estimators"],
                           batch_size=self.params["batch_size"],
                           validation_split=0.3,
                           verbose=verbose,
                           callbacks=[history_cb, es])

        # Train again on the full data
        number_of_epochs_it_ran = len(history.history['loss'])

        xnn.fit(inputs,
                y,
                epochs=number_of_epochs_it_ran,
                batch_size=self.params["batch_size"],
                validation_split=0.0,
                verbose=verbose)

        # Get the mean absolute Shapley values
        importances = np.array(get_shap(X, xnn))

        x_labels = ['x' + str(k) for k in range(features)]

        # Record and plot the projection weights
        #
        weight_list = []
        # Bound up front so the CSV export after the loop cannot hit an
        # unbound name.
        weight_list2 = []
        for layer in xnn.layers:

            layer_name = layer.get_config()['name']
            if layer_name != "main_input":
                print(layer_name)
                # Layers exposing fewer than two weight arrays have no bias.
                if len(layer.get_weights()) < 2:
                    print("No Bias")

            # Plot the projection layers
            if "projection_layer" in layer_name:
                # One row per ridge function after the transpose
                projection_rows = np.transpose(layer.get_weights()[0])

                for i, weight in enumerate(projection_rows):
                    flat_weight = np.reshape(weight, (1, features))[0]
                    weight_list.append(weight)
                    weight_list2.append(list(flat_weight))

                    # Plot weights
                    plt.bar(orig_cols, abs(flat_weight), 1, color="blue")
                    plt.ylabel("Coefficient value")
                    plt.title("Projection Layer Weights {}".format(i),
                              fontdict={'fontsize': 10})
                    plt.xticks(rotation=90)
                    # Save before show(): on interactive backends show() can
                    # leave an empty current figure, so the former
                    # show-then-save order could write blank images.
                    plt.savefig(os.path.join(
                        tmp_folder, 'projection_layer_' + str(i) + '.png'),
                                bbox_inches="tight")
                    plt.show()
                    plt.clf()

            if "main_output" in layer_name:
                weights_main = layer.get_weights()
                print(weights_main)

        pd.DataFrame(weight_list2).to_csv(os.path.join(tmp_folder,
                                                       "projection_data.csv"),
                                          index=False)

        # Output of every subnetwork on the training data
        intermediate_output = [
            keras.models.Model(
                inputs=xnn.input,
                outputs=xnn.get_layer('mlp_' + str(feature_num) +
                                      '_dense_last').output).predict(X)
            for feature_num in range(features)
        ]

        # Record and plot the ridge functions
        ridge_x = []
        ridge_y = []
        for weight_number, weight in enumerate(weight_list):
            # Projection of every row onto this ridge direction
            projected = sum(X[:, ii] * weight[ii] for ii in range(features))
            ridge_x.append(list(projected))
            ridge_y.append(list(intermediate_output[weight_number]))

            plt.plot(projected, intermediate_output[weight_number], 'o')
            plt.xlabel("Input")
            plt.ylabel("Subnetwork " + str(weight_number))
            # The title used the stale index `i` left over from the
            # projection loop, labelling every plot with the same number.
            plt.title("Ridge Function {}".format(weight_number),
                      fontdict={'fontsize': 10})
            plt.savefig(
                os.path.join(tmp_folder,
                             'ridge_' + str(weight_number) + '.png'))
            plt.show()
            plt.clf()

        # Output the ridge function importance: mean absolute contribution of
        # each subnetwork output after scaling by its main_output weight.
        # The kernel is fetched by layer name instead of relying on a variable
        # leaked from the layer loop above.
        output_kernel = xnn.get_layer('main_output').get_weights()[0]
        weights2 = np.array([row[0] for row in output_kernel])

        concat_activations = keras.models.Model(
            inputs=xnn.input,
            outputs=xnn.get_layer('concatenate_1').output).predict(X)
        output_activations = np.abs(concat_activations *
                                    weights2).mean(axis=0)
        loggerinfo(logger, str(output_activations))
        pd.DataFrame(output_activations).to_csv(os.path.join(
            tmp_folder, "ridge_weights.csv"),
                                                index=False)

        plt.bar(x_labels, output_activations, 1, color="blue")
        plt.xlabel("Ridge function number")
        plt.ylabel("Feature importance")
        plt.title("Ridge function importance", fontdict={'fontsize': 10})
        plt.savefig(os.path.join(tmp_folder, 'Ridge_function_importance.png'))
        plt.show()

        pd.DataFrame(ridge_y).applymap(lambda x: x[0]).to_csv(os.path.join(
            tmp_folder, "ridge_y.csv"),
                                                              index=False)
        pd.DataFrame(ridge_x).to_csv(os.path.join(tmp_folder, "ridge_x.csv"),
                                     index=False)

        pd.DataFrame(orig_cols).to_csv(os.path.join(tmp_folder,
                                                    "input_columns.csv"),
                                       index=False)

        self.set_model_properties(model=xnn,
                                  features=orig_cols,
                                  importances=importances.tolist(),
                                  iterations=self.params['n_estimators'])