Example #1
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        import nltk
        # look for NLTK models in the recipe-specific data directory
        nltk_data_path = os.path.join(config.data_directory, config.contrib_env_relative_directory, "nltk_data")
        nltk_temp_path = os.path.join(temporary_files_path, "nltk_data")
        nltk.data.path.append(nltk_data_path)
        try:
            # probe the tagger; raises LookupError if the model is missing
            self.pos_tagger = nltk.pos_tag
            self.pos_tagger("test")
        except LookupError:
            # model not found: download and unpack it, then retry the probe
            os.makedirs(nltk_data_path, exist_ok=True)
            os.makedirs(nltk_temp_path, exist_ok=True)
            tagger_path = os.path.join(nltk_data_path, "taggers")
            os.makedirs(tagger_path, exist_ok=True)
            file1 = download("https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip",
                             dest_path=nltk_temp_path)
            file2 = download("https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/maxent_treebank_pos_tagger.zip",
                             dest_path=nltk_temp_path)
            self.unzip_file(file1, tagger_path)
            self.unzip_file(file2, tagger_path)
            self.atomic_move(file1, tagger_path)
            self.atomic_move(file2, tagger_path)
            self.pos_tagger = nltk.pos_tag
            self.pos_tagger("test")
Example #2
    def set_tagger(self):
        import nltk
        nltk_data_path = os.path.join(user_dir(),
                                      config.contrib_env_relative_directory,
                                      "nltk_data")
        nltk_temp_path = os.path.join(user_dir(), "nltk_data")
        nltk.data.path.append(nltk_data_path)
        nltk.download('averaged_perceptron_tagger',
                      download_dir=nltk_data_path)
        try:
            self.pos_tagger = nltk.pos_tag
            self.pos_tagger("test")
        except LookupError:
            os.makedirs(nltk_data_path, exist_ok=True)
            os.makedirs(nltk_temp_path, exist_ok=True)
            tagger_path = os.path.join(nltk_data_path, "taggers")
            os.makedirs(tagger_path, exist_ok=True)
            file1 = download(
                "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip",
                dest_path=nltk_temp_path)
            file2 = download(
                "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/maxent_treebank_pos_tagger.zip",
                dest_path=nltk_temp_path)
            self.unzip_file(file1, tagger_path)
            self.unzip_file(file2, tagger_path)
            self.atomic_copy(file1, tagger_path)
            self.atomic_copy(file2, tagger_path)
            self.pos_tagger = nltk.pos_tag
            self.pos_tagger("test")
Example #3
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config, user_dir
        import bz2

        def extract_bz2(file, output_file):
            # decompress the archive and write the raw CSV next to it
            with bz2.BZ2File(file) as zipfile, open(output_file, 'wb') as out:
                out.write(zipfile.read())

        temp_path = os.path.join(user_dir(), config.contrib_relative_directory,
                                 "airlines")
        os.makedirs(temp_path, exist_ok=True)

        link = AirlinesData.base_url + "1990.csv.bz2"
        file = download(link, dest_path=temp_path)
        output_file1 = file.replace(".bz2", "")
        print("%s %s" % (file, output_file1))
        extract_bz2(file, output_file1)

        link = AirlinesData.base_url + "1991.csv.bz2"
        file = download(link, dest_path=temp_path)
        output_file2 = file.replace(".bz2", "")
        print("%s %s" % (file, output_file2))
        extract_bz2(file, output_file2)

        return [output_file1, output_file2]
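The `extract_bz2` helper reads the entire archive into memory before writing it out. For the multi-hundred-MB airlines files, a streaming variant may be gentler on RAM; a sketch using only the standard library:

    import bz2
    import shutil

    def extract_bz2_streaming(file, output_file, chunk_size=1 << 20):
        # decompress in chunks instead of loading the whole archive into memory
        with bz2.BZ2File(file, "rb") as src, open(output_file, "wb") as dst:
            shutil.copyfileobj(src, dst, length=chunk_size)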
Example #4
    def create_data(X: dt.Frame = None) -> Union[
        str, List[str],
        dt.Frame, List[dt.Frame],
        np.ndarray, List[np.ndarray],
        pd.DataFrame, List[pd.DataFrame],
        Dict[str, str],  # {data set names : paths}
        Dict[str, dt.Frame],  # {data set names : dt frames}
        Dict[str, np.ndarray],  # {data set names : np arrays}
        Dict[str, pd.DataFrame],  # {data set names : pd frames}
    ]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config, user_dir

        # Location in the DAI file system where we will save the data set
        temp_path = os.path.join(user_dir(), config.contrib_relative_directory)
        os.makedirs(temp_path, exist_ok=True)

        # URLs of the desired data; these come from the IMDb datasets site
        link_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
        link_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"
        link_episodes = "https://datasets.imdbws.com/title.episode.tsv.gz"

        # Download the files
        file_basics = download(link_basics, dest_path=temp_path)
        file_ratings = download(link_ratings, dest_path=temp_path)
        file_episodes = download(link_episodes, dest_path=temp_path)

        # parse the downloaded IMDb files with datatable
        basics = dt.fread(file_basics, fill=True)
        ratings = dt.fread(file_ratings, fill=True)
        episodes = dt.fread(file_episodes, na_strings=['\\N'], fill=True)

        # remove files
        os.remove(file_basics)
        os.remove(file_ratings)
        os.remove(file_episodes)

        # Create Title with Ratings dataset
        # join titles with non-null ratings
        ratings = ratings[~dt.isna(dt.f.averageRating), :]
        ratings.key = "tconst"
        basics_ratings = basics[:, :, dt.join(ratings)]

        # Create Episodes dataset
        episodes = episodes[~dt.isna(dt.f.seasonNumber) & ~dt.isna(dt.f.episodeNumber), :]
        episode_ratings = episodes[:, :, dt.join(ratings)]
        episode_ratings.names = {'tconst': 'episodeTconst', 'parentTconst': 'tconst', 'averageRating': 'episodeAverageRating', 'numVotes': 'episodeNumVotes'}
        basics_ratings.key = 'tconst'
        title_episode_ratings = episode_ratings[:, :, dt.join(basics_ratings)]

        # enumerate series episodes from 1 to N
        title_episode_ratings = title_episode_ratings[:, :, dt.sort(dt.f.tconst, dt.f.seasonNumber, dt.f.episodeNumber)]
        result = title_episode_ratings[:, dt.count(), dt.by(dt.f.tconst)][:, 'count'].to_list()
        from itertools import chain
        cumcount = chain.from_iterable([i + 1 for i in range(n)] for n in result[0])
        title_episode_ratings['episodeSequence'] = dt.Frame(tuple(cumcount))

        # return datasets
        return {f"imdb_title_ratings": basics_ratings,
                f"imdb_episode_ratings": title_episode_ratings}
Example #5
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config
        import bz2

        def extract_bz2(file, output_file):
            with bz2.BZ2File(file) as zipfile, open(output_file, 'wb') as out:
                out.write(zipfile.read())

        temp_path = os.path.join(config.data_directory,
                                 config.contrib_relative_directory, "airlines")
        os.makedirs(temp_path, exist_ok=True)

        link = "http://stat-computing.org/dataexpo/2009/1987.csv.bz2"
        file = download(link, dest_path=temp_path)
        output_file = file.replace(".bz2", "")
        print("%s %s" % (file, output_file))
        extract_bz2(file, output_file)

        return output_file
Example #6
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config

        # Location in DAI file system where we will save the data set
        temp_path = os.path.join(config.data_directory,
                                 config.contrib_relative_directory)
        os.makedirs(temp_path, exist_ok=True)

        # URL of desired data, this comes from the City of Seattle
        link = "https://data.seattle.gov/resource/rdtp-hzy3.csv"

        # Download the file
        file = download(link, dest_path=temp_path)

        # Give the file a descriptive name for the UI
        output_file = file.replace("rdtp-hzy3", "seattle_monthly_rain_raw")
        os.rename(file, output_file)

        # Return the location on the DAI server for this data set
        return output_file
Example #7
    def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                                 dt.Frame, List[dt.Frame],
                                                 np.ndarray, List[np.ndarray],
                                                 pd.DataFrame, List[pd.DataFrame]]:
        import os
        import uuid
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config, user_dir

        temp_path = os.path.join(user_dir(), config.contrib_relative_directory, "testdata_%s" % str(uuid.uuid4()))
        os.makedirs(temp_path, exist_ok=True)

        link = "http://data.un.org/_Docs/SYB/CSV/SYB63_226_202009_Net%20Disbursements%20from%20Official%20ODA%20to%20Recipients.csv"
        output_file1 = download(link, dest_path=temp_path)

        link = "http://data.un.org/_Docs/SYB/CSV/SYB63_223_202009_Net%20Disbursements%20from%20Official%20ODA%20from%20Donors.csv"
        output_file2 = download(link, dest_path=temp_path)

        return [output_file1, output_file2]
Example #8
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        import os
        import uuid
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config, user_dir

        temp_path = os.path.join(user_dir(), config.contrib_relative_directory,
                                 "testdata_%s" % str(uuid.uuid4()))
        os.makedirs(temp_path, exist_ok=True)

        link = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
        output_file1 = download(link, dest_path=temp_path)

        link = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/bezdekIris.data"
        output_file2 = download(link, dest_path=temp_path)

        return [output_file1, output_file2]
Example #9
    def create_data(X: dt.Frame = None) -> Union[str, List[str],
                                                 dt.Frame, List[dt.Frame],
                                                 np.ndarray, List[np.ndarray],
                                                 pd.DataFrame, List[pd.DataFrame]]:
        import os
        import uuid
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config, user_dir

        temp_path = os.path.join(user_dir(), config.contrib_relative_directory, "testdata_%s" % str(uuid.uuid4()))
        os.makedirs(temp_path, exist_ok=True)

        link = TestData.url
        file = download(link, dest_path=temp_path)

        return file
Example #10
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config, user_dir
        import bz2

        def extract_bz2(file, output_file):
            with bz2.BZ2File(file) as zipfile, open(output_file, 'wb') as out:
                out.write(zipfile.read())

        temp_path = os.path.join(user_dir(), "recipe_tmp", "airlines")
        os.makedirs(temp_path, exist_ok=True)
        dt.options.nthreads = 8

        # specify which years are used for training and testing
        training = list(range(2005, 2008))
        testing = [2008]

        # download and unzip files
        files = []
        for f in ["%d.csv.bz2" % year for year in training + testing]:
            link = AirlinesData.base_url + "%s" % f
            file = download(link, dest_path=temp_path)
            output_file = file.replace(".bz2", "")
            if not os.path.exists(output_file):
                extract_bz2(file, output_file)
            files.append(output_file)

        # parse with datatable
        X = dt.rbind(*[dt.fread(x) for x in files])

        # add date
        date_col = 'Date'
        X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f[
            'DayofMonth']
        cols_to_keep = ['Date']

        # add number of flights in/out for each airport per given interval
        timeslice_mins = 60
        for name, new_col, col, group in [
            ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
            ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
        ]:
            X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
            group_cols = [date_col, group, new_col]
            new_name = 'flights_%s_per_%d_min' % (name, timeslice_mins)
            flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
            flights.key = group_cols
            cols_to_keep.append(new_name)
            X = X[:, :, dt.join(flights)]

        # select flights leaving from SFO only
        X = X[dt.f['Origin'] == 'SFO', :]

        # Fill NaNs in DepDelay column
        X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0

        # create binary target column
        depdelay_threshold_mins = 15
        target = 'DepDelay%dm' % depdelay_threshold_mins
        X[:, target] = dt.f['DepDelay'] > depdelay_threshold_mins
        cols_to_keep.extend([
            target,
            'Year',
            'Month',
            'DayofMonth',
            'DayOfWeek',
            'CRSDepTime',
            'UniqueCarrier',
            'FlightNum',
            'TailNum',
            'CRSElapsedTime',
            'Origin',
            'Dest',
            'Distance',
            # Leaks for delay
            # 'DepTime',
            # 'ArrTime', #'CRSArrTime',
            # 'ActualElapsedTime',
            # 'AirTime', #'ArrDelay', #'DepDelay',
            # 'TaxiIn', #'TaxiOut', #'Cancelled', #'CancellationCode', #'Diverted', #'CarrierDelay',
            # #'WeatherDelay', #'NASDelay', #'SecurityDelay', #'LateAircraftDelay',
        ])
        X = X[:, cols_to_keep]

        # Join in some extra info
        join_files = [('UniqueCarrier', 'carriers.csv', 'Code'),
                      ('Origin', 'airports.csv', 'iata'),
                      ('Dest', 'airports.csv', 'iata'),
                      ('TailNum', 'plane-data.csv', 'tailnum')]

        for join_key, file, col in join_files:
            file = download(
                'https://0xdata-public.s3.amazonaws.com/data_recipes_data/%s' %
                file,
                dest_path=temp_path)
            X_join = dt.fread(file, fill=True)
            X_join.names = {col: join_key}
            X_join.names = [join_key] + [
                join_key + "_" + x for x in X_join.names if x != join_key
            ]
            X_join.key = join_key
            X = X[:, :, dt.join(X_join)]
            del X[:, join_key]

        split = True
        if not split:
            filename = os.path.join(
                temp_path, "flight_delays_data_recipe_%d-%d.csv" %
                (min(training), max(testing)))
            X.to_csv(filename)
            return filename
        else:
            # prepare splits (by year) and create binary .jay files for import into Driverless AI
            output_files = []
            for condition, name in [
                ((min(training) <= dt.f['Year']) &
                 (dt.f['Year'] <= max(training)), 'training'),
                ((min(testing) <= dt.f['Year']) &
                 (dt.f['Year'] <= max(testing)), 'test'),
            ]:
                X_split = X[condition, :]
                filename = os.path.join(
                    temp_path,
                    "augmented_flights_%s-%d_%s.csv" %
                    (X_split[:, 'Year'].min1(), X_split[:, 'Year'].max1(), name))
                X_split.to_csv(filename)
                output_files.append(filename)
            return output_files
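The flights-per-timeslice block above is a group-count followed by a keyed join back onto the original frame: count rows per (date, airport, timeslice) group, key the counts, and join so every flight row carries its group's count. The same pattern on a toy frame, assuming only `datatable`:

    import datatable as dt

    X = dt.Frame(Date=[1, 1, 1, 2], Origin=["SFO", "SFO", "OAK", "SFO"])
    group_cols = ["Date", "Origin"]

    # count rows per group, then join the counts back row-by-row
    counts = X[:, {"flights_out": dt.count()}, dt.by(*group_cols)]
    counts.key = group_cols
    X = X[:, :, dt.join(counts)]
    print(X)  # every row now carries its group's flight count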
Example #11
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.do_stemming = True  # turn off as needed
        self.do_lemmatization = True  # turn off as needed
        self.remove_stopwords = True  # turn off as needed

        import nltk
        nltk_data_path = os.path.join(config.data_directory,
                                      config.contrib_env_relative_directory,
                                      "nltk_data")
        nltk_temp_path = os.path.join(temporary_files_path, "nltk_data")
        nltk.data.path.append(nltk_data_path)

        # download resources for stemming if needed
        if self.do_stemming:
            try:
                self.stemmer = nltk.stem.porter.PorterStemmer()
                self.stemmer.stem("test")
            except LookupError:
                os.makedirs(nltk_data_path, exist_ok=True)
                os.makedirs(nltk_temp_path, exist_ok=True)
                tokenizer_path = os.path.join(nltk_data_path, "tokenizers")
                os.makedirs(tokenizer_path, exist_ok=True)
                file1 = download(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/tokenizers/punkt.zip",
                    dest_path=nltk_temp_path)
                self.unzip_file(file1, tokenizer_path)
                self.atomic_move(file1, tokenizer_path)
                self.stemmer = nltk.stem.porter.PorterStemmer()
                self.stemmer.stem("test")

        # download resources for lemmatization if needed
        if self.do_lemmatization:
            try:
                from nltk.corpus import wordnet
                self.lemmatizer = nltk.stem.WordNetLemmatizer()
                self.pos_tagger = nltk.pos_tag
                self.lemmatizer.lemmatize("test", wordnet.NOUN)
                self.pos_tagger("test")
            except LookupError:
                os.makedirs(nltk_data_path, exist_ok=True)
                os.makedirs(nltk_temp_path, exist_ok=True)
                tagger_path = os.path.join(nltk_data_path, "taggers")
                corpora_path = os.path.join(nltk_data_path, "corpora")
                os.makedirs(tagger_path, exist_ok=True)
                os.makedirs(corpora_path, exist_ok=True)
                file1 = download(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/averaged_perceptron_tagger.zip",
                    dest_path=nltk_temp_path)
                file2 = download(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/taggers/maxent_treebank_pos_tagger.zip",
                    dest_path=nltk_temp_path)
                file3 = download(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/wordnet.zip",
                    dest_path=nltk_temp_path)
                self.unzip_file(file1, tagger_path)
                self.unzip_file(file2, tagger_path)
                self.unzip_file(file3, corpora_path)
                self.atomic_move(file1, tagger_path)
                self.atomic_move(file2, tagger_path)
                self.atomic_move(file3, corpora_path)
                from nltk.corpus import wordnet
                self.lemmatizer = nltk.stem.WordNetLemmatizer()
                self.pos_tagger = nltk.pos_tag
                self.lemmatizer.lemmatize("test", wordnet.NOUN)
                self.pos_tagger("test")
            self.wordnet_map = {
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "J": wordnet.ADJ,
                "R": wordnet.ADV,
                "O": wordnet.NOUN
            }

        # download resources for stopwords if needed
        if self.remove_stopwords:
            try:
                self.stopwords = set(nltk.corpus.stopwords.words('english'))
            except LookupError:
                os.makedirs(nltk_data_path, exist_ok=True)
                os.makedirs(nltk_temp_path, exist_ok=True)
                corpora_path = os.path.join(nltk_data_path, "corpora")
                os.makedirs(corpora_path, exist_ok=True)
                file1 = download(
                    "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/packages/corpora/stopwords.zip",
                    dest_path=nltk_temp_path)
                self.unzip_file(file1, corpora_path)
                self.atomic_move(file1, corpora_path)
                self.stopwords = set(nltk.corpus.stopwords.words('english'))
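The `wordnet_map` built above translates the first letter of a Penn Treebank POS tag (the output of `nltk.pos_tag`) into a WordNet POS constant, so the lemmatizer can be applied per token with the right part of speech. A sketch of how a transformer would typically use it, assuming the NLTK tagger and WordNet models are already available:

    import nltk
    from nltk.corpus import wordnet

    wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ,
                   "R": wordnet.ADV, "O": wordnet.NOUN}
    lemmatizer = nltk.stem.WordNetLemmatizer()

    tokens = ["cats", "running", "quickly"]
    tagged = nltk.pos_tag(tokens)  # e.g. [('cats', 'NNS'), ('running', 'VBG'), ...]
    lemmas = [lemmatizer.lemmatize(tok, wordnet_map.get(tag[0], wordnet.NOUN))
              for tok, tag in tagged]
    print(lemmas)  # e.g. ['cat', 'run', 'quickly']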
Example #12
def _setup_recipe():
    # for DAI 1.7.0 one is required to run this function manually
    # in DAI >=1.7.1, this function will be run by DAI itself
    import os
    from h2oaicore.systemutils_more import extract, download
    from h2oaicore.systemutils import config, remove
    from h2oaicore.systemutils import user_dir
    import shutil

    from h2oaicore.systemutils_more import arch_type  # don't remove this import, setup_recipe parsed-out separately
    return True  # WIP: Disable daal for now in general, just leave recipe floating there for migration purposes
    if arch_type == "ppc64le":
        if config.hard_asserts:
            # in CI testing just ignore
            return True
        else:
            # for user use, raise
            raise RuntimeError("Cannot use daal on PPC")

    daal_is_installed_path = os.path.join(
        user_dir(), config.contrib_env_relative_directory, "daal")
    daal_is_installed_file = os.path.join(daal_is_installed_path,
                                          "daal_is_installed")
    if not os.path.isfile(daal_is_installed_file):
        daal_temp_path = os.path.join(user_dir(),
                                      config.contrib_relative_directory,
                                      "daal")
        os.makedirs(daal_temp_path, exist_ok=True)
        prefix = "https://anaconda.org/intel"
        try:
            file1 = download(
                "%s/daal4py/2021.2.0/download/linux-64/daal4py-2021.2.0-py38_intel_358.tar.bz2"
                % prefix,
                dest_path=daal_temp_path)
            file2 = download(
                "%s/impi_rt/2021.2.0/download/linux-64/impi_rt-2021.2.0-intel_215.tar.bz2"
                % prefix,
                dest_path=daal_temp_path)
            file3 = download(
                "%s/daal/2021.2.0/download/linux-64/daal-2021.2.0-intel_358.tar.bz2"
                % prefix,
                dest_path=daal_temp_path)
            file4 = download(
                "https://github.com/intel/daal/releases/download/2019_u4/l_daal_oss_p_2019.4.007.tgz",
                dest_path=daal_temp_path)
        except Exception:
            # fall back to mirrored copies if the Anaconda downloads fail
            file1 = download(
                "https://0xdata-public.s3.amazonaws.com/daal4py-2019.4-py36h7b7c402_6.tar.bz2",
                dest_path=daal_temp_path)
            file2 = download(
                "https://0xdata-public.s3.amazonaws.com/impi_rt-2019.4-intel_243.tar.bz2",
                dest_path=daal_temp_path)
            file3 = download(
                "https://0xdata-public.s3.amazonaws.com/daal-2019.4-intel_243.tar.bz2",
                dest_path=daal_temp_path)
            file4 = download(
                "https://0xdata-public.s3.amazonaws.com/l_daal_oss_p_2019.4.007.tgz",
                dest_path=daal_temp_path)
        temp_path = os.path.join(user_dir(),
                                 config.contrib_env_relative_directory, "info")
        os.makedirs(temp_path, exist_ok=True)
        python_site_packages_path = os.path.join(
            user_dir(), config.contrib_env_relative_directory)
        extract(file1, python_site_packages_path)
        python_site_packages_path2 = os.path.join(
            user_dir(), config.contrib_env_relative_directory)
        extract(file2, python_site_packages_path2)
        extract(file3, python_site_packages_path2)
        extract(file4, python_site_packages_path2, "gz")

        other_path = os.path.join(python_site_packages_path2, "lib/libfabric/")
        import glob
        for file in glob.glob(os.path.join(other_path, "*.so*")):
            new_file = os.path.join(python_site_packages_path2, "lib",
                                    os.path.basename(file))
            if not os.path.isfile(new_file):
                shutil.copy(file, new_file)

        other_path = os.path.join(
            python_site_packages_path2,
            "l_daal_oss_p_2019.4.007/daal_prebuild/linux/tbb/lib/intel64_lin/gcc4.4/"
        )
        import glob
        for file in glob.glob(os.path.join(other_path, "*.so*")):
            new_file = os.path.join(python_site_packages_path2, "lib",
                                    os.path.basename(file))
            if not os.path.isfile(new_file):
                shutil.copy(file, new_file)
        os.makedirs(daal_is_installed_path, exist_ok=True)
        with open(daal_is_installed_file, "wt") as f:
            f.write("DONE")
        remove(file1)
        remove(file2)
        remove(file3)
        remove(file4)
        return True
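The `daal_is_installed` marker file is what makes `_setup_recipe` idempotent: the heavy download/extract path runs once, and every later call short-circuits on the file check. The pattern in isolation, as a minimal sketch with hypothetical `install_dir` and `do_install` arguments:

    import os

    def setup_once(install_dir, do_install):
        # marker file records that installation already completed
        marker = os.path.join(install_dir, "is_installed")
        if os.path.isfile(marker):
            return True  # previous run finished; nothing to do
        do_install()  # expensive step: downloads, extraction, copying
        os.makedirs(install_dir, exist_ok=True)
        with open(marker, "wt") as f:
            f.write("DONE")
        return True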
Example #13
def _setup_recipe():
    # for DAI 1.7.0 one is required to run this function manually
    # in DAI >=1.7.1, this function will be run by DAI itself
    import os
    from h2oaicore.systemutils_more import extract, download
    from h2oaicore.systemutils import config
    import shutil

    daal_is_installed_path = os.path.join(
        config.data_directory, config.contrib_env_relative_directory, "daal")
    daal_is_installed_file = os.path.join(daal_is_installed_path,
                                          "daal_is_installed")
    if not os.path.isfile(daal_is_installed_file):
        daal_temp_path = os.path.join(config.data_directory,
                                      config.contrib_relative_directory,
                                      "daal")
        os.makedirs(daal_temp_path, exist_ok=True)
        prefix = "https://anaconda.org/intel"
        file1 = download(
            "%s/daal4py/2019.4/download/linux-64/daal4py-2019.4-py36h7b7c402_6.tar.bz2"
            % prefix,
            dest_path=daal_temp_path)
        file2 = download(
            "%s/impi_rt/2019.4/download/linux-64/impi_rt-2019.4-intel_243.tar.bz2"
            % prefix,
            dest_path=daal_temp_path)
        file3 = download(
            "%s/daal/2019.4/download/linux-64/daal-2019.4-intel_243.tar.bz2" %
            prefix,
            dest_path=daal_temp_path)
        file4 = download(
            "https://github.com/intel/daal/releases/download/2019_u1.1/l_daal_oss_p_2019.1.004.tgz",
            dest_path=daal_temp_path)
        temp_path = os.path.join(config.data_directory,
                                 config.contrib_env_relative_directory, "info")
        os.makedirs(temp_path, exist_ok=True)
        python_site_packages_path = os.path.join(
            config.data_directory, config.contrib_env_relative_directory)
        extract(file1, python_site_packages_path)
        python_site_packages_path2 = os.path.join(
            config.data_directory, config.contrib_env_relative_directory)
        extract(file2, python_site_packages_path2)
        extract(file3, python_site_packages_path2)
        extract(file4, python_site_packages_path2, "gz")

        other_path = os.path.join(python_site_packages_path2, "lib/libfabric/")
        import glob
        for file in glob.glob(os.path.join(other_path, "*.so*")):
            new_file = os.path.join(python_site_packages_path2, "lib",
                                    os.path.basename(file))
            if not os.path.isfile(new_file):
                shutil.copy(file, new_file)

        other_path = os.path.join(
            python_site_packages_path2,
            "l_daal_oss_p_2019.1.004/daal_prebuild/linux/tbb/lib/intel64_lin/gcc4.4/"
        )
        import glob
        for file in glob.glob(os.path.join(other_path, "*.so*")):
            new_file = os.path.join(python_site_packages_path2, "lib",
                                    os.path.basename(file))
            if not os.path.isfile(new_file):
                shutil.copy(file, new_file)
        os.makedirs(daal_is_installed_path, exist_ok=True)
        with open(daal_is_installed_file, "wt") as f:
            f.write("DONE")
        return True
Example #14
    def create_data(X: dt.Frame = None):
        # relies on module-level imports (dt, pd, ZipFile, OrderedDict, download)
        # and constants (path_to_zip, tmp_dir, holdout_splits) defined elsewhere
        # in the recipe
        file = download(url=path_to_zip, dest_path=tmp_dir)
        with ZipFile(file, 'r') as zip_ref:
            zip_ref.extractall(tmp_dir)

        num_id_cols = 6
        main_data = dt.fread(
            os.path.join(tmp_dir, "sales_train_evaluation.csv"))
        all_cols = list(main_data.names)
        id_cols = all_cols[:num_id_cols]
        date_cols = all_cols[num_id_cols + 1125:]

        # training data
        target = "target"
        data = pd.melt(main_data.to_pandas(),
                       id_vars=id_cols,
                       value_vars=date_cols,
                       var_name="d",
                       value_name=target)
        data[target] = data[target].astype(float)
        data = dt.Frame(data)
        data_splits = [data]
        names = ["m5_train"]

        # test data for submission
        submission = dt.fread(os.path.join(tmp_dir, "sample_submission.csv"))

        for name, ranges in holdout_splits.items():
            test_cls = ["d_" + str(k) for k in ranges]
            test_data = []
            ids = submission["id"].to_list()[0]
            new_test_cols = ["d"] + id_cols
            for i in range(len(ids)):
                id = ids[i]
                splits = ids[i].split("_")
                item_id = splits[0] + "_" + splits[1] + "_" + splits[2]
                dept_id = splits[0] + "_" + splits[1]
                cat_id = splits[0]
                store_id = splits[3] + "_" + splits[4]
                state_id = splits[3]
                id_values = [id, item_id, dept_id, cat_id, store_id, state_id]
                for j in range(len(test_cls)):
                    row_values = [test_cls[j]] + id_values
                    test_data.append(row_values)

            test_data = pd.DataFrame(test_data, columns=new_test_cols)
            test_data = dt.Frame(test_data)
            data_splits.append(test_data)
            names.append(name)

        weather_data = dt.fread(os.path.join(tmp_dir, "calendar.csv"))
        weather_data.key = "d"

        price_data = dt.fread(os.path.join(tmp_dir, "sell_prices.csv"))
        price_data.key = ["store_id", "item_id", "wm_yr_wk"]

        ret = OrderedDict()
        for n, f in zip(names, data_splits):
            f = f[:, :, dt.join(weather_data)]
            f = f[:, :, dt.join(price_data)]
            ret[n] = f
        return ret
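`pd.melt` does the heavy lifting in the training-data step above: it reshapes one column per day (`d_1`, `d_2`, ...) into one row per (id, day) pair, which is the long format a time-series experiment expects. A toy illustration:

    import pandas as pd

    wide = pd.DataFrame({"item_id": ["A", "B"],
                         "d_1": [3, 0],
                         "d_2": [1, 5]})
    long = pd.melt(wide, id_vars=["item_id"], value_vars=["d_1", "d_2"],
                   var_name="d", value_name="target")
    print(long)  # four rows: one per (item_id, d) combination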
Example #15
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        # import packages
        import os
        import gc
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config
        import zipfile

        # define constants
        train_data_url = "https://files.slack.com/files-pri/T0329MHH6-F012UF3T2J0/download/bosch_train_full.zip?pub_secret=c59d0f381a"
        test_data_url = "https://files.slack.com/files-pri/T0329MHH6-F013ES4F6N4/download/bosch_test_full.zip?pub_secret=8726e8b7e2"

        # function for unzipping data
        def extract_zip(file, output_directory):
            with zipfile.ZipFile(file, "r") as zip_ref:
                zip_ref.extractall(output_directory)

        # download and unzip files
        temp_path = os.path.join(config.data_directory, "recipe_tmp", "bosch")
        os.makedirs(temp_path, exist_ok=True)

        for link in [train_data_url, test_data_url]:
            raw_file = download(link, dest_path=temp_path)
            extract_zip(raw_file, temp_path)

        # parse with datatable
        train_path = os.path.join(temp_path, "bosch_train_full.csv")
        test_path = os.path.join(temp_path, "bosch_test_full.csv")

        X_train = dt.fread(train_path)
        X_test = dt.fread(test_path)

        # add leak features
        train = X_train[:, ["Id", "Response"]].to_pandas()
        test = X_test[:, ["Id"]].to_pandas()

        date_features = [colname for colname in X_test.names if "D" in colname]

        train["Min_Date"] = X_train[:, date_features].to_pandas().min(
            axis=1).values
        test["Min_Date"] = X_test[:,
                                  date_features].to_pandas().min(axis=1).values

        ntrain = train.shape[0]
        train_test = pd.concat([train, test]).reset_index(drop=True)

        train_test.sort_values(by=["Min_Date", "Id"],
                               ascending=True,
                               inplace=True)

        train_test["Leak_1"] = train_test["Id"].diff()
        train_test["Leak_2"] = train_test["Id"].iloc[::-1].diff()

        train_test["Leak_3"] = train_test["Response"].shift(1)
        train_test["Leak_4"] = train_test["Response"].shift(-1)

        train_test = dt.Frame(train_test.drop("Response", axis=1))
        train_test.key = "Id"

        X_train = X_train[:, :, dt.join(train_test)]
        X_test = X_test[:, :, dt.join(train_test)]

        return {"bosch_train_leak": X_train, "bosch_test_leak": X_test}
Example #16
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame]]:
        import os
        from h2oaicore.systemutils_more import download
        from h2oaicore.systemutils import config
        import bz2

        def extract_bz2(file, output_file):
            with bz2.BZ2File(file) as zipfile, open(output_file, 'wb') as out:
                out.write(zipfile.read())

        temp_path = os.path.join(config.data_directory,
                                 config.contrib_relative_directory, "airlines")
        os.makedirs(temp_path, exist_ok=True)

        # specify which years are used for training and testing
        training = [2007]
        testing = [2008]

        # download and unzip files
        files = []
        for f in ["%d.csv.bz2" % year for year in training + testing]:
            link = "http://stat-computing.org/dataexpo/2009/%s" % f
            file = download(link, dest_path=temp_path)
            output_file = file.replace(".bz2", "")
            extract_bz2(file, output_file)
            files.append(output_file)

        # parse with datatable
        X = dt.rbind(*[dt.fread(x) for x in files])

        # add date
        date_col = 'Date'
        X[:, date_col] = dt.f['Year'] * 10000 + dt.f['Month'] * 100 + dt.f[
            'DayofMonth']
        cols_to_keep = ['Date']

        # add number of flights in/out for each airport per given interval
        timeslice_mins = 60
        for name, new_col, col, group in [
            ("out", "CRSDepTime_mod", "CRSDepTime", "Origin"),
            ("in", "CRSArrTime_mod", "CRSArrTime", "Dest")
        ]:
            X[:, new_col] = X[:, dt.f[col] // timeslice_mins]
            group_cols = [date_col, group, new_col]
            new_name = 'flights_%s' % name
            flights = X[:, {new_name: dt.count()}, dt.by(*group_cols)]
            flights.key = group_cols
            cols_to_keep.append(new_name)
            X = X[:, :, dt.join(flights)]

        # Fill NaNs with 0s
        X[dt.isna(dt.f['DepDelay']), 'DepDelay'] = 0
        cols_to_keep.extend([
            'DepDelay',
            'Year',
            'Month',
            'DayofMonth',
            'DayOfWeek',
            'CRSDepTime',
            'UniqueCarrier',
            'FlightNum',
            'TailNum',
            'CRSElapsedTime',
            'Origin',
            'Dest',
            'Distance',
            # Leaks for delay
            # 'DepTime',
            # 'ArrTime', #'CRSArrTime',
            # 'ActualElapsedTime',
            # 'AirTime', #'ArrDelay', #'DepDelay',
            # 'TaxiIn', #'TaxiOut', #'Cancelled', #'CancellationCode', #'Diverted', #'CarrierDelay',
            # #'WeatherDelay', #'NASDelay', #'SecurityDelay', #'LateAircraftDelay',
        ])
        X = X[:, cols_to_keep]

        # Join in some extra info
        join_files = [('UniqueCarrier', 'carriers.csv', 'Code'),
                      ('Origin', 'airports.csv', 'iata'),
                      ('Dest', 'airports.csv', 'iata'),
                      ('TailNum', 'plane-data.csv', 'tailnum')]

        for join_key, file, col in join_files:
            file = download('http://stat-computing.org/dataexpo/2009/%s' % file,
                            dest_path=temp_path)
            X_join = dt.fread(file, fill=True)
            X_join.names = {col: join_key}
            X_join.names = [join_key] + [
                join_key + "_" + x for x in X_join.names if x != join_key
            ]
            X_join.key = join_key
            X = X[:, :, dt.join(X_join)]
            del X[:, join_key]

        split = False

        if not split:
            filename = os.path.join(
                temp_path, "flight_delays_regression_%d-%d.jay" %
                (min(training), max(testing)))
            X.to_jay(filename)
            return filename
        else:
            # prepare splits (by year) and create binary .jay files for import into Driverless AI
            output_files = []
            for condition, name in [
                ((min(training) <= dt.f['Year']) &
                 (dt.f['Year'] <= max(training)), 'training'),
                ((min(testing) <= dt.f['Year']) &
                 (dt.f['Year'] <= max(testing)), 'test'),
            ]:
                X_split = X[condition, :]
                filename = os.path.join(temp_path,
                                        "flight_delays_%s.jay" % name)
                X_split.to_jay(filename)
                output_files.append(filename)
            return output_files
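The `.jay` files written at the end are datatable's binary format; they re-import almost instantly because the file is memory-mapped rather than parsed. A quick round-trip sketch with a hypothetical file name:

    import datatable as dt

    X = dt.Frame(Year=[2007, 2008], DepDelay=[12.0, 0.0])
    X.to_jay("flights_sample.jay")      # write the binary .jay file
    Y = dt.fread("flights_sample.jay")  # memory-mapped, near-instant re-import
    print(Y)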