Example #1
File: __init__.py Project: yarenty/ludwig
    def write_output_dataset(self, labels, images, output_dir):
        """Create output directories where we write out the images.

        :args:
            labels (str) : the labels for the image
            data (np.array) : the binary array corresponding to the image
            output_dir (str) : the output directory that we need to write to
            path (str): the raw dataset path
        :returns:
            A tuple of the label for the image, the file array, the size and rows and columns for the image
        """
        # create child image output directories
        output_dirs = [
            os.path.join(output_dir, str(i)) for i in range(NUM_LABELS)
        ]

        for output_dir in output_dirs:
            makedirs(output_dir, exist_ok=True)

        def write_processed_image(t):
            i, label = t
            output_filename = os.path.join(output_dirs[label], str(i) + ".png")
            torch_image = torch.from_numpy(images[i].copy()).view(1, 28, 28)
            self.write_png(torch_image, output_filename)

        # write out image data
        tasks = list(enumerate(labels))
        pool = ThreadPool(NUM_LABELS)
        pool.map(write_processed_image, tasks)
        pool.close()
        pool.join()
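The thread-pooled write pattern above stands on its own; below is a minimal self-contained sketch of the same idea, assuming Pillow for PNG encoding and synthetic `labels`/`images` arrays in place of the loader's state:

import os
from multiprocessing.pool import ThreadPool

import numpy as np
from PIL import Image  # assumption: Pillow is installed

NUM_LABELS = 10
labels = np.random.randint(0, NUM_LABELS, size=100)                     # synthetic labels
images = np.random.randint(0, 256, size=(100, 28, 28), dtype=np.uint8)  # synthetic images

# one child directory per label, mirroring write_output_dataset
output_dirs = [os.path.join("output", str(i)) for i in range(NUM_LABELS)]
for d in output_dirs:
    os.makedirs(d, exist_ok=True)

def write_processed_image(t):
    i, label = t  # each task is an (index, label) pair from enumerate(labels)
    Image.fromarray(images[i]).save(os.path.join(output_dirs[label], f"{i}.png"))

pool = ThreadPool(NUM_LABELS)
pool.map(write_processed_image, list(enumerate(labels)))
pool.close()
pool.join()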
Example #2
File: __init__.py Project: yarenty/ludwig
    def process_downloaded_dataset(self):
        df = pd.read_csv(os.path.join(self.raw_dataset_path, "UCI CBM Dataset",
                                      "data.txt"),
                         header=None,
                         sep="   ")

        columns = [
            "lp",
            "v",
            "gtt",
            "gtn",
            "ggn",
            "ts",
            "tp",
            "t48",
            "t1",
            "t2",
            "p48",
            "p1",
            "p2",
            "pexh",
            "tic",
            "mf",
            "gtcdsc",
            "gttdsc",
        ]
        df.columns = columns

        makedirs(self.processed_temp_path, exist_ok=True)
        df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename),
                  index=False)
        rename(self.processed_temp_path, self.processed_dataset_path)
Example #3
    def process_downloaded_dataset(self):
        downloaded_files = self.download_filenames
        filetype = self.download_file_type

        train_files = ["train_identity.csv", "train_transaction.csv"]
        test_files = ["test_identity.csv", "test_transaction.csv"]

        train_dfs, test_dfs = {}, {}

        for split_name, filename in downloaded_files.items():
            file_df = self.read_file(filetype, filename, header=0)
            if filename in train_files:
                train_dfs[split_name] = file_df
            elif filename in test_files:
                test_dfs[split_name] = file_df

        # Merge on TransactionID
        final_train = pd.merge(train_dfs["train_transaction"],
                               train_dfs["train_identity"],
                               on="TransactionID",
                               how="left")

        makedirs(self.processed_dataset_path, exist_ok=True)
        # Only save train split as test split has no ground truth labels
        final_train.to_csv(os.path.join(self.processed_dataset_path,
                                        self.csv_filename),
                           index=False)
Example #4
File: dask_df_utils.py Project: cxz/ludwig
def dask_to_tfrecords(df,
                      folder,
                      compression_type="GZIP",
                      compression_level=9):
    """Store Dask.dataframe to TFRecord files."""
    makedirs(folder, exist_ok=True)
    compression_ext = get_compression_ext(compression_type)
    filenames = [
        get_part_filename(i, compression_ext) for i in range(df.npartitions)
    ]

    # Also write a meta data file
    write_meta(df, folder, compression_type)

    dsk = {}
    name = "to-tfrecord-" + tokenize(df, folder)
    part_tasks = []
    kwargs = {}

    for d, filename in enumerate(filenames):
        dsk[(name, d)] = (apply, pandas_df_to_tfrecords, [
            (df._name, d),
            os.path.join(folder, filename), compression_type, compression_level
        ], kwargs)
        part_tasks.append((name, d))

    dsk[name] = (lambda x: None, part_tasks)

    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[df])
    out = Delayed(name, graph)
    out = out.compute()
    return out
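A hedged usage sketch for `dask_to_tfrecords`, assuming the helper is importable alongside the TFRecord utilities it depends on; the frame and output folder name are illustrative:

import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"x": range(100), "y": range(100)})
ddf = dd.from_pandas(pdf, npartitions=2)  # two partitions -> two part files

# writes one gzip-compressed TFRecord part per partition plus a meta file
dask_to_tfrecords(ddf, "tfrecords_out", compression_type="GZIP", compression_level=9)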
Example #5
    def process_downloaded_dataset(self):
        df = pd.read_csv(
            os.path.join(self.raw_dataset_path, 'HIGGS.csv.gz'),
            header=None
        )

        df.columns = [
            "label", "lepton_pT", "lepton_eta", "lepton_phi",
            "missing_energy_magnitude", "missing_energy_phi",
            "jet_1_pt", "jet_1_eta", "jet_1_phi", "jet_1_b-tag",
            "jet_2_pt", "jet_2_eta", "jet_2_phi", "jet_2_b-tag",
            "jet_3_pt", "jet_3_eta", "jet_3_phi", "jet_3_b-tag",
            "jet_4_pt", "jet_4_eta", "jet_4_phi", "jet_4_b-tag",
            "m_jj", "m_jjj", "m_lv", "m_jlv", "m_bb", "m_wbb", "m_wwbb"
        ]

        df['label'] = df['label'].astype('int32')
        # split convention: 0 = train, 1 = validation, 2 = test
        if self.add_validation_set:
            df['split'] = [0] * 10000000 + [1] * 500000 + [2] * 500000
        else:
            df['split'] = [0] * 10500000 + [2] * 500000

        makedirs(self.processed_temp_path, exist_ok=True)
        df.to_parquet(os.path.join(self.processed_temp_path, self.parquet_filename),
                      engine='pyarrow',
                      row_group_size=50000,
                      index=False)

        rename(self.processed_temp_path, self.processed_dataset_path)
Example #6
File: process.py Project: kanishk16/ludwig
    def process_downloaded_dataset(self, header=0):
        """Processes dataset

        :param header: indicates whether raw data files contain headers
        """
        downloaded_files = self.download_filenames
        filetype = self.download_file_type
        all_files = []
        for split_name, filename in downloaded_files.items():
            file_df = self.read_file(filetype, filename, header)
            if split_name == 'train_file':
                file_df['split'] = 0
            elif split_name == 'val_file':
                file_df['split'] = 1
            elif split_name == 'test_file':
                file_df['split'] = 2
            else:
                raise ValueError(f'Unrecognized split name: {split_name}')
            all_files.append(file_df)

        concat_df = pd.concat(all_files, ignore_index=True)
        makedirs(self.processed_dataset_path, exist_ok=True)
        concat_df.to_csv(os.path.join(self.processed_dataset_path,
                                      self.csv_filename),
                         index=False)
Example #7
    def process_downloaded_dataset(self):
        train_df = pd.read_csv(os.path.join(self.raw_dataset_path,
                                            "adult.data"),
                               header=None)
        test_df = pd.read_csv(os.path.join(self.raw_dataset_path,
                                           "adult.test"),
                              header=None,
                              skiprows=1)

        # age: continuous.
        # workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked. # noqa: E501
        # fnlwgt: continuous.
        # education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool. # noqa: E501
        # education-num: continuous.
        # marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.    # noqa: E501
        # occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces. # noqa: E501
        # relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
        # race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
        # sex: Female, Male.
        # capital-gain: continuous.
        # capital-loss: continuous.
        # hours-per-week: continuous.
        # native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.   # noqa: E501
        # income: >50K, <=50K.
        columns = [
            "age",
            "workclass",
            "fnlwgt",
            "education",
            "education-num",
            "marital-status",
            "occupation",
            "relationship",
            "race",
            "sex",
            "capital-gain",
            "capital-loss",
            "hours-per-week",
            "native-country",
            "income",
        ]
        train_df.columns = columns
        test_df.columns = columns
        # Remove the trailing period on the income field in adult.test (not in adult.data)
        test_df["income"] = test_df["income"].str.rstrip(".")

        train_df["split"] = 0
        test_df["split"] = 2

        df = pd.concat([train_df, test_df])
        # remove trailing space
        df["income"] = df["income"].str.strip()

        makedirs(self.processed_temp_path, exist_ok=True)
        df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename),
                  index=False)

        rename(self.processed_temp_path, self.processed_dataset_path)
Example #8
    def process_downloaded_dataset(self):
        """The final method where we create a concatenated CSV file with both training ant dest data."""
        train_file = self.config["split_filenames"]["train_file"]

        df = pd.read_csv(os.path.join(self.raw_dataset_path, train_file))
        # assign each row to train (0), validation (1), or test (2) with 70/10/20 probabilities;
        # np.random.choice(3, p=...) returns a scalar, so the column casts cleanly to int8
        df[SPLIT] = df.index.to_series().map(
            lambda x: np.random.choice(3, p=(0.7, 0.1, 0.2))).astype(np.int8)

        makedirs(self.processed_temp_path, exist_ok=True)
        df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename), index=False)
        rename(self.processed_temp_path, self.processed_dataset_path)
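If reproducibility matters, the per-row lambda can be replaced by one vectorized, seeded draw; a sketch, assuming the same 70/10/20 proportions and that `SPLIT` names the split column:

import numpy as np
import pandas as pd

SPLIT = "split"  # assumption: matches the SPLIT constant used above

df = pd.DataFrame({"feature": range(10)})  # stand-in for the loaded CSV
rng = np.random.default_rng(42)            # fixed seed for reproducibility
# one draw per row in a single vectorized call
df[SPLIT] = rng.choice(3, size=len(df), p=(0.7, 0.1, 0.2)).astype(np.int8)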
Example #9
File: pandas.py Project: kanishk16/ludwig
    def to_tfrecord(self, df, path):
        compression_type = "GZIP"
        compression_ext = get_compression_ext(compression_type)

        makedirs(path, exist_ok=True)
        write_meta(df, path, compression_type)

        filename = os.path.join(path, get_part_filename(0, compression_ext))
        pandas_df_to_tfrecords(df,
                               filename,
                               compression_type=compression_type,
                               compression_level=9)
Example #10
    def process_downloaded_dataset(self):
        """Read the training and test directories and write out a CSV containing each image path and its
        label."""
        makedirs(self.processed_temp_path, exist_ok=True)
        for dataset in ["training", "testing"]:
            print(f">>> create ludwig formatted {dataset} data")
            labels, data = self.read_source_dataset(dataset,
                                                    self.raw_dataset_path)
            self.write_output_dataset(
                labels, data, os.path.join(self.processed_temp_path, dataset))
        self.output_training_and_test_data()
        rename(self.processed_temp_path, self.processed_dataset_path)
        print(">>> completed data preparation")
Example #11
    def process_downloaded_dataset(self, header=0):
        zip_file = ZipFile(
            os.path.join(self.raw_dataset_path, "orange_small_train.data.zip"))
        train_df = pd.read_csv(zip_file.open("orange_small_train.data"),
                               sep="\t")

        zip_file = ZipFile(
            os.path.join(self.raw_dataset_path, "orange_small_test.data.zip"))
        test_df = pd.read_csv(zip_file.open("orange_small_test.data"),
                              sep="\t")

        train_df = process_categorical_features(train_df, categorical_features)
        train_df = process_numerical_features(train_df, categorical_features)

        targets = pd.read_csv(
            os.path.join(self.raw_dataset_path,
                         f"orange_small_train_{self.task_name}.labels"),
            header=None)[0].astype(str).apply(
                lambda x: "true" if x == "1" else "false")

        train_idcs = pd.read_csv(
            os.path.join(self.raw_dataset_path,
                         f"stratified_train_idx_{self.task_name}.txt"),
            header=None)[0]

        val_idcs = pd.read_csv(
            os.path.join(self.raw_dataset_path,
                         f"stratified_test_idx_{self.task_name}.txt"),
            header=None)[0]

        processed_train_df = train_df.iloc[train_idcs].copy()
        processed_train_df["target"] = targets.iloc[train_idcs]
        processed_train_df["split"] = 0

        processed_val_df = train_df.iloc[val_idcs].copy()
        processed_val_df["target"] = targets.iloc[val_idcs]
        processed_val_df["split"] = 1

        test_df["target"] = ""
        test_df["split"] = 2

        df = pd.concat([processed_train_df, processed_val_df, test_df])

        makedirs(self.processed_temp_path, exist_ok=True)
        df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename),
                  index=False)

        rename(self.processed_temp_path, self.processed_dataset_path)
Example #12
    def process_downloaded_dataset(self):
        raw_df = pd.read_csv(
            os.path.join(self.raw_dataset_path, self.data_filename))
        columns = [
            "sepal_length_cm",
            "sepal_width_cm",
            "petal_length_cm",
            "petal_width_cm",
            "class",
        ]
        raw_df.columns = columns

        makedirs(self.processed_dataset_path, exist_ok=True)
        raw_df.to_csv(os.path.join(self.processed_dataset_path,
                                   self.csv_filename),
                      index=False)
Example #13
File: __init__.py Project: skanjila/ludwig
    def process_downloaded_dataset(self):
        stores_df = pd.read_csv(os.path.join(self.raw_dataset_path, "store.csv"))

        train_df = pd.read_csv(os.path.join(self.raw_dataset_path, "train.csv"), low_memory=False)
        train_df = preprocess_df(train_df, stores_df)

        train_df["split"] = -1
        train_df.loc[train_df["Year"] == 2014, "split"] = 0
        train_df.loc[train_df["Year"] == 2015, "split"] = 2
        train_df.drop(train_df[train_df["split"] == -1].index, inplace=True)
        df = train_df

        makedirs(self.processed_temp_path, exist_ok=True)
        df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename), index=False)
        rename(self.processed_temp_path, self.processed_dataset_path)
Example #14
    def process_downloaded_dataset(self):
        """The final method where we create a concatenated CSV file with both training ant dest data."""
        train_file = self.config["split_filenames"]["train_file"]
        test_file = self.config["split_filenames"]["test_file"]

        train_df = pd.read_csv(os.path.join(self.raw_dataset_path, train_file))
        test_df = pd.read_csv(os.path.join(self.raw_dataset_path, test_file))

        train_df["split"] = 0
        test_df["split"] = 2

        df = pd.concat([train_df, test_df])

        makedirs(self.processed_temp_path, exist_ok=True)
        df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename),
                  index=False)
        rename(self.processed_temp_path, self.processed_dataset_path)
Example #15
    def process_downloaded_dataset(self):
        makedirs(self.processed_temp_path, exist_ok=True)

        # create a dictionary mapping image_path --> list of captions
        image_to_caption = defaultdict(list)
        with open(
            f"{self.raw_dataset_path}/Flickr8k.token.txt",
            "r"
        ) as captions_file:
            for line in captions_file:
                line = line.split("#")
                # strip the caption-index suffix, then double any quotes and wrap
                # the caption in quotes so it embeds safely in a CSV field
                line[1] = line[1].strip("\n01234.\t ")
                line[1] = re.sub('"', '""', line[1])
                line[1] = '"' + line[1] + '"'
                image_to_caption[line[0]].append(line[1])
        # create csv file with 7 columns: image_path, 5 captions, and split
        with open(
                os.path.join(self.processed_temp_path, self.csv_filename),
                'w'
        ) as output_file:
            output_file.write('image_path,caption0,caption1,caption2,')
            output_file.write('caption3,caption4,split\n')
            splits = ["train", "dev", "test"]
            for i in range(len(splits)):
                split = splits[i]
                with open(
                    f"{self.raw_dataset_path}/Flickr_8k.{split}Images.txt",
                    "r"
                ) as split_file:
                    for image_name in split_file:
                        image_name = image_name.strip('\n')
                        if image_name in image_to_caption:
                            output_file.write('{},{},{},{},{},{},{}\n'.format(
                                # Note: image folder is named Flicker8k_Dataset
                                "{}/Flicker8k_Dataset/{}".format(
                                    self.raw_dataset_path, image_name
                                ),
                                *image_to_caption[image_name],
                                i
                            ))
        # Note: csv is stored in /processed while images are stored in /raw
        rename(self.processed_temp_path, self.processed_dataset_path)
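The manual quote-doubling above is easy to get wrong; Python's csv module applies the same CSV quoting rules automatically. A sketch of one row write under that approach, with hypothetical values in place of the parsed captions:

import csv

# hypothetical row data standing in for one image's five captions
image_path = "raw/Flicker8k_Dataset/example.jpg"
captions = ['A "quoted" caption', "two", "three", "four", "five"]
split = 0

with open("captions.csv", "w", newline="") as output_file:
    writer = csv.writer(output_file)  # quotes any field containing commas or quotes
    writer.writerow(["image_path", "caption0", "caption1", "caption2",
                     "caption3", "caption4", "split"])
    writer.writerow([image_path, *captions, split])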
Example #16
    def process_downloaded_dataset(self):
        makedirs(self.processed_temp_path, exist_ok=True)

        dataset_name = self.config["kaggle_dataset_name"]
        for url in self.config["kaggle_dataset_files"]:
            file_name = os.path.join(self.raw_dataset_path, dataset_name, url)
            # TODO(shreya): DataFrame created twice: here + CSVMixin. Figure out
            # options for using it once.
            df = pd.read_csv(
                file_name,
                header=0,
                names=[
                    "image_path",
                    "insurance_company",
                    "cost_of_vehicle",
                    "min_coverage",
                    "expiry_date",
                    "max_coverage",
                    "condition",
                    "amount",
                ],
            )
            df["image_path"] = df["image_path"].apply(
                lambda x: os.path.join(self.raw_dataset_path, dataset_name, "trainImages", os.path.basename(x))
            )
            df.to_csv(
                os.path.join(self.processed_temp_path, self.csv_filename),
                columns=[
                    "image_path",
                    "insurance_company",
                    "cost_of_vehicle",
                    "min_coverage",
                    "expiry_date",
                    "max_coverage",
                    "condition",
                    "amount",
                ],
                index=False,
            )

        # Note: csv is stored in /processed while images are stored in /raw
        rename(self.processed_temp_path, self.processed_dataset_path)
Example #17
    def process_downloaded_dataset(self):
        df = pd.read_csv(os.path.join(self.raw_dataset_path, "covtype.data.gz"), header=None)

        # Elevation                               quantitative    meters                       Elevation in meters
        # Aspect                                  quantitative    azimuth                      Aspect in degrees azimuth
        # Slope                                   quantitative    degrees                      Slope in degrees
        # Horizontal_Distance_To_Hydrology        quantitative    meters                       Horz Dist to nearest surface water features      # noqa: E501
        # Vertical_Distance_To_Hydrology          quantitative    meters                       Vert Dist to nearest surface water features      # noqa: E501
        # Horizontal_Distance_To_Roadways         quantitative    meters                       Horz Dist to nearest roadway                     # noqa: E501
        # Hillshade_9am                           quantitative    0 to 255 index               Hillshade index at 9am, summer solstice          # noqa: E501
        # Hillshade_Noon                          quantitative    0 to 255 index               Hillshade index at noon, summer soltice          # noqa: E501
        # Hillshade_3pm                           quantitative    0 to 255 index               Hillshade index at 3pm, summer solstice          # noqa: E501
        # Horizontal_Distance_To_Fire_Points      quantitative    meters                       Horz Dist to nearest wildfire ignition points    # noqa: E501
        # Wilderness_Area (4 binary columns)      qualitative     0 (absence) or 1 (presence)  Wilderness area designation                      # noqa: E501
        # Soil_Type (40 binary columns)           qualitative     0 (absence) or 1 (presence)  Soil Type designation
        # Cover_Type (7 types)                    integer         1 to 7                       Forest Cover Type designation                    # noqa: E501
        columns = [
            "Elevation",
            "Aspect",
            "Slope",
            "Horizontal_Distance_To_Hydrology",
            "Vertical_Distance_To_Hydrology",
            "Horizontal_Distance_To_Roadways",
            "Hillshade_9am",
            "Hillshade_Noon",
            "Hillshade_3pm",
            "Horizontal_Distance_To_Fire_Points",
            *[f"Wilderness_Area_{i}" for i in range(1, 5)],
            *[f"Soil_Type_{i}" for i in range(1, 41)],
            "Cover_Type",
        ]
        df.columns = columns

        # Map the 40 soil types to a single integer
        # instead of 40 binary columns
        st_cols = [f"Soil_Type_{i}" for i in range(1, 41)]
        st_vals = []
        for _, row in df[st_cols].iterrows():
            st_vals.append(row.to_numpy().nonzero()[0].item(0))
        df = df.drop(columns=st_cols)
        df["Soil_Type"] = st_vals

        # Map the 4 wilderness areas to a single integer
        # instead of 4 binary columns
        wa_cols = ["Wilderness_Area_1", "Wilderness_Area_2", "Wilderness_Area_3", "Wilderness_Area_4"]
        wa_vals = []
        for _, row in df[wa_cols].iterrows():
            wa_vals.append(row.to_numpy().nonzero()[0].item(0))
        df = df.drop(columns=wa_cols)
        df["Wilderness_Area"] = wa_vals

        if not self.use_tabnet_split:
            # first 11340 records used for training data subset
            # next 3780 records used for validation data subset
            # last 565892 records used for testing data subset
            df["split"] = [0] * 11340 + [1] * 3780 + [2] * 565892
        else:
            # Split used in the tabNet paper
            # https://github.com/google-research/google-research/blob/master/tabnet/download_prepare_covertype.py
            train_val_indices, test_indices = train_test_split(range(len(df)), test_size=0.2, random_state=0)
            train_indices, val_indices = train_test_split(train_val_indices, test_size=0.2 / 0.6, random_state=0)

            df["split"] = 0
            df.loc[val_indices, "split"] = 1
            df.loc[test_indices, "split"] = 2

        makedirs(self.processed_temp_path, exist_ok=True)
        df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename), index=False)
        rename(self.processed_temp_path, self.processed_dataset_path)
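The two iterrows loops above walk every row in Python; the same one-hot-to-integer collapse can be done in one vectorized call. A sketch, assuming indicator columns with exactly one 1 per row, as in the dataset above:

import numpy as np
import pandas as pd

# toy one-hot frame with exactly one 1 per row
df = pd.DataFrame({"Soil_Type_1": [1, 0, 0],
                   "Soil_Type_2": [0, 1, 0],
                   "Soil_Type_3": [0, 0, 1]})
st_cols = list(df.columns)

# argmax over the indicator columns yields the position of the single 1
df["Soil_Type"] = np.argmax(df[st_cols].to_numpy(), axis=1)
df = df.drop(columns=st_cols)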
Example #18
File: run.py Project: zhisbug/ludwig
def hyperopt(
    config: Union[str, dict],
    dataset: Union[str, dict, pd.DataFrame] = None,
    training_set: Union[str, dict, pd.DataFrame] = None,
    validation_set: Union[str, dict, pd.DataFrame] = None,
    test_set: Union[str, dict, pd.DataFrame] = None,
    training_set_metadata: Union[str, dict] = None,
    data_format: str = None,
    experiment_name: str = 'hyperopt',
    model_name: str = 'run',
    skip_save_training_description: bool = False,
    skip_save_training_statistics: bool = False,
    skip_save_model: bool = False,
    skip_save_progress: bool = False,
    skip_save_log: bool = False,
    skip_save_processed_input: bool = True,
    skip_save_unprocessed_output: bool = False,
    skip_save_predictions: bool = False,
    skip_save_eval_stats: bool = False,
    skip_save_hyperopt_statistics: bool = False,
    output_directory: str = 'results',
    gpus: Union[str, int, List[int]] = None,
    gpu_memory_limit: int = None,
    allow_parallel_threads: bool = True,
    backend: Union[Backend, str] = None,
    random_seed: int = default_random_seed,
    debug: bool = False,
    **kwargs,
) -> HyperoptResults:
    """This method performs an hyperparameter optimization.

    # Inputs

    :param config: (Union[str, dict]) config which defines
        the different parameters of the model, features, preprocessing and
        training.  If `str`, filepath to yaml configuration file.
    :param dataset: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing the entire dataset to be used in the experiment.
        If it has a split column, it will be used for splitting (0 for train,
        1 for validation, 2 for test), otherwise the dataset will be
        randomly split.
    :param training_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing training data.
    :param validation_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing validation data.
    :param test_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing test data.
    :param training_set_metadata: (Union[str, dict], default: `None`)
        metadata JSON file or loaded metadata.  Intermediate preprocessed
        structure containing the mappings of the input
        dataset created the first time an input file is used in the same
        directory with the same name and a '.meta.json' extension.
    :param data_format: (str, default: `None`) format to interpret data
        sources. Will be inferred automatically if not specified.  Valid
        formats are `'auto'`, `'csv'`, `'df'`, `'dict'`, `'excel'`, `'feather'`,
        `'fwf'`, `'hdf5'` (cache file produced during previous training),
        `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
        `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
        `'stata'`, `'tsv'`.
    :param experiment_name: (str, default: `'hyperopt'`) name for
        the experiment.
    :param model_name: (str, default: `'run'`) name of the model that is
        being used.
    :param skip_save_training_description: (bool, default: `False`) disables
        saving the description JSON file.
    :param skip_save_training_statistics: (bool, default: `False`) disables
        saving training statistics JSON file.
    :param skip_save_model: (bool, default: `False`) disables
        saving model weights and hyperparameters each time the model
        improves. By default Ludwig saves model weights after each epoch
        the validation metric improves, but if the model is really big
        that can be time consuming. If you do not want to keep
        the weights and just find out what performance a model can get
        with a set of hyperparameters, use this parameter to skip it,
        but the model will not be loadable later on and the returned model
        will have the weights obtained at the end of training, instead of
        the weights of the epoch with the best validation performance.
    :param skip_save_progress: (bool, default: `False`) disables saving
        progress each epoch. By default Ludwig saves weights and stats
        after each epoch to enable resuming of training, but if
        the model is really big that can be time consuming and will use
        twice as much space; use this parameter to skip it, but training
        cannot be resumed later on.
    :param skip_save_log: (bool, default: `False`) disables saving
        TensorBoard logs. By default Ludwig saves logs for the TensorBoard,
        but if it is not needed turning it off can slightly increase the
        overall speed.
    :param skip_save_processed_input: (bool, default: `True`) if an input
        dataset is provided it is preprocessed and cached by saving HDF5
        and JSON files to avoid running the preprocessing again. If this
        parameter is `True`, the HDF5 and JSON files are not saved.
    :param skip_save_unprocessed_output: (bool, default: `False`) by default
        predictions and their probabilities are saved in both raw
        unprocessed numpy files containing tensors and as postprocessed
        CSV files (one for each output feature). If this parameter is True,
        only the CSV ones are saved and the numpy ones are skipped.
    :param skip_save_predictions: (bool, default: `False`) skips saving test
        predictions CSV files.
    :param skip_save_eval_stats: (bool, default: `False`) skips saving test
        statistics JSON file.
    :param skip_save_hyperopt_statistics: (bool, default: `False`) skips saving
        hyperopt stats file.
    :param output_directory: (str, default: `'results'`) the directory that
        will contain the training statistics, TensorBoard logs, the saved
        model and the training progress files.
    :param gpus: (list, default: `None`) list of GPUs that are available
        for training.
    :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to
        allocate per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow
        to use multithreading parallelism to improve performance at
        the cost of determinism.
    :param backend: (Union[Backend, str]) `Backend` or string name
        of backend to use to execute preprocessing / training steps.
    :param random_seed: (int, default: 42) random seed used for weights
        initialization, splits and any other random function.
    :param debug: (bool, default: `False`) if `True` turns on `tfdbg` with
        `inf_or_nan` checks.

    # Return

    :return: (HyperoptResults) The results for each trial, ordered by
        descending performance on the target metric.
    """
    backend = initialize_backend(backend)

    # check if config is a path or a dict
    if isinstance(config, str):  # assume path
        with open_file(config, 'r') as def_file:
            config_dict = yaml.safe_load(def_file)
    else:
        config_dict = config

    # merge config with defaults
    config = merge_with_defaults(config_dict)

    if HYPEROPT not in config:
        raise ValueError("Hyperopt Section not present in config")

    hyperopt_config = config["hyperopt"]

    update_hyperopt_params_with_defaults(hyperopt_config)

    # print hyperopt config
    logger.info(pformat(hyperopt_config, indent=4))
    logger.info('\n')

    sampler = hyperopt_config["sampler"]
    executor = hyperopt_config["executor"]
    parameters = hyperopt_config["parameters"]
    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    ######################
    # check validity of output_feature / metric/ split combination
    ######################
    if split == TRAINING:
        if training_set is None and (
                config['preprocessing']['split_probabilities'][0] <= 0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                'was not provided, '
                'or the split amount specified in the preprocessing section '
                'of the config is not greater than 0'.format(split))
    elif split == VALIDATION:
        if validation_set is None and (
                config['preprocessing']['split_probabilities'][1] <= 0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                'was not provided, '
                'or the split amount specified in the preprocessing section '
                'of the config is not greater than 0'.format(split))
    elif split == TEST:
        if test_set is None and (
                config['preprocessing']['split_probabilities'][2] <= 0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                'was not provided, '
                'or the split amount specified in the preprocessing section '
                'of the config is not greater than 0'.format(split))
    else:
        raise ValueError('unrecognized hyperopt split "{}". '
                         'Please provide one of: {}'.format(
                             split, {TRAINING, VALIDATION, TEST}))
    if output_feature == COMBINED:
        if metric != LOSS:
            raise ValueError(
                'The only valid metric for "combined" output feature is "loss"'
            )
    else:
        output_feature_names = set(of['name']
                                   for of in config['output_features'])
        if output_feature not in output_feature_names:
            raise ValueError('The output feature specified for hyperopt "{}" '
                             'cannot be found in the config. '
                             'Available ones are: {} and "combined"'.format(
                                 output_feature, output_feature_names))

        output_feature_type = None
        for of in config['output_features']:
            if of['name'] == output_feature:
                output_feature_type = of[TYPE]
        feature_class = get_from_registry(output_feature_type,
                                          output_type_registry)
        if metric not in feature_class.metric_functions:
            # todo v0.4: allow users to also specify metrics from the overall
            #  and per-class metrics from the training stats, and in general
            #  any postprocessed metric
            raise ValueError(
                'The specified metric for hyperopt "{}" is not a valid metric '
                'for the specified output feature "{}" of type "{}". '
                'Available metrics are: {}'.format(
                    metric, output_feature, output_feature_type,
                    feature_class.metric_functions.keys()))

    hyperopt_sampler = get_build_hyperopt_sampler(sampler[TYPE])(goal,
                                                                 parameters,
                                                                 **sampler)

    hyperopt_executor = get_build_hyperopt_executor(executor[TYPE])(
        hyperopt_sampler, output_feature, metric, split, **executor)

    hyperopt_results = hyperopt_executor.execute(
        config,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        # model_load_path=None,
        # model_resume_path=None,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        backend=backend,
        random_seed=random_seed,
        debug=debug,
        **kwargs)

    if backend.is_coordinator():
        print_hyperopt_results(hyperopt_results)

        if not skip_save_hyperopt_statistics:
            makedirs(output_directory, exist_ok=True)

            hyperopt_stats = {
                'hyperopt_config':
                hyperopt_config,
                'hyperopt_results':
                [t.to_dict() for t in hyperopt_results.ordered_trials],
            }

            save_hyperopt_stats(hyperopt_stats, output_directory)
            logger.info('Hyperopt stats saved to: {}'.format(output_directory))

    logger.info('Finished hyperopt')

    return hyperopt_results
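A hedged usage sketch for this version of the API; the file names are illustrative, and the config is assumed to already contain a valid hyperopt section:

# assumption: config.yaml defines input/output features plus a hyperopt section
results = hyperopt(
    "config.yaml",
    dataset="train.csv",          # a split column is used if present, else random split
    output_directory="results",
    random_seed=42,
)
for trial in results.ordered_trials:  # ordered by performance on the target metric
    print(trial.to_dict())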
Example #19
File: run.py Project: ludwig-ai/ludwig
def hyperopt(
    config: Union[str, dict],
    dataset: Union[str, dict, pd.DataFrame] = None,
    training_set: Union[str, dict, pd.DataFrame] = None,
    validation_set: Union[str, dict, pd.DataFrame] = None,
    test_set: Union[str, dict, pd.DataFrame] = None,
    training_set_metadata: Union[str, dict] = None,
    data_format: str = None,
    experiment_name: str = "hyperopt",
    model_name: str = "run",
    resume: Optional[bool] = None,
    skip_save_training_description: bool = False,
    skip_save_training_statistics: bool = False,
    skip_save_model: bool = False,
    skip_save_progress: bool = False,
    skip_save_log: bool = False,
    skip_save_processed_input: bool = True,
    skip_save_unprocessed_output: bool = False,
    skip_save_predictions: bool = False,
    skip_save_eval_stats: bool = False,
    skip_save_hyperopt_statistics: bool = False,
    output_directory: str = "results",
    gpus: Union[str, int, List[int]] = None,
    gpu_memory_limit: int = None,
    allow_parallel_threads: bool = True,
    callbacks: List[Callback] = None,
    backend: Union[Backend, str] = None,
    random_seed: int = default_random_seed,
    hyperopt_log_verbosity: int = 3,
    **kwargs,
) -> HyperoptResults:
    """This method performs an hyperparameter optimization.

    # Inputs

    :param config: (Union[str, dict]) config which defines
        the different parameters of the model, features, preprocessing and
        training.  If `str`, filepath to yaml configuration file.
    :param dataset: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing the entire dataset to be used in the experiment.
        If it has a split column, it will be used for splitting (0 for train,
        1 for validation, 2 for test), otherwise the dataset will be
        randomly split.
    :param training_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing training data.
    :param validation_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing validation data.
    :param test_set: (Union[str, dict, pandas.DataFrame], default: `None`)
        source containing test data.
    :param training_set_metadata: (Union[str, dict], default: `None`)
        metadata JSON file or loaded metadata.  Intermediate preprocessed
        structure containing the mappings of the input
        dataset created the first time an input file is used in the same
        directory with the same name and a '.meta.json' extension.
    :param data_format: (str, default: `None`) format to interpret data
        sources. Will be inferred automatically if not specified.  Valid
        formats are `'auto'`, `'csv'`, `'df'`, `'dict'`, `'excel'`, `'feather'`,
        `'fwf'`, `'hdf5'` (cache file produced during previous training),
        `'html'` (file containing a single HTML `<table>`), `'json'`, `'jsonl'`,
        `'parquet'`, `'pickle'` (pickled Pandas DataFrame), `'sas'`, `'spss'`,
        `'stata'`, `'tsv'`.
    :param experiment_name: (str, default: `'hyperopt'`) name for
        the experiment.
    :param model_name: (str, default: `'run'`) name of the model that is
        being used.
    :param resume: (bool) if `True`, continue hyperopt from the state of the previous
        run in the output directory with the same experiment name. If `False`, create
        new trials, ignoring any previous state, even if it exists in the output_directory.
        By default, will attempt to resume if there is already an existing experiment with
        the same name, and will create new trials if not.
    :param skip_save_training_description: (bool, default: `False`) disables
        saving the description JSON file.
    :param skip_save_training_statistics: (bool, default: `False`) disables
        saving training statistics JSON file.
    :param skip_save_model: (bool, default: `False`) disables
        saving model weights and hyperparameters each time the model
        improves. By default Ludwig saves model weights after each epoch
        the validation metric improves, but if the model is really big
        that can be time consuming. If you do not want to keep
        the weights and just find out what performance a model can get
        with a set of hyperparameters, use this parameter to skip it,
        but the model will not be loadable later on and the returned model
        will have the weights obtained at the end of training, instead of
        the weights of the epoch with the best validation performance.
    :param skip_save_progress: (bool, default: `False`) disables saving
        progress each epoch. By default Ludwig saves weights and stats
        after each epoch to enable resuming of training, but if
        the model is really big that can be time consuming and will use
        twice as much space; use this parameter to skip it, but training
        cannot be resumed later on.
    :param skip_save_log: (bool, default: `False`) disables saving
        TensorBoard logs. By default Ludwig saves logs for the TensorBoard,
        but if it is not needed turning it off can slightly increase the
        overall speed.
    :param skip_save_processed_input: (bool, default: `True`) if an input
        dataset is provided it is preprocessed and cached by saving HDF5
        and JSON files to avoid running the preprocessing again. If this
        parameter is `True`, the HDF5 and JSON files are not saved.
    :param skip_save_unprocessed_output: (bool, default: `False`) by default
        predictions and their probabilities are saved in both raw
        unprocessed numpy files containing tensors and as postprocessed
        CSV files (one for each output feature). If this parameter is True,
        only the CSV ones are saved and the numpy ones are skipped.
    :param skip_save_predictions: (bool, default: `False`) skips saving test
        predictions CSV files.
    :param skip_save_eval_stats: (bool, default: `False`) skips saving test
        statistics JSON file.
    :param skip_save_hyperopt_statistics: (bool, default: `False`) skips saving
        hyperopt stats file.
    :param output_directory: (str, default: `'results'`) the directory that
        will contain the training statistics, TensorBoard logs, the saved
        model and the training progress files.
    :param gpus: (list, default: `None`) list of GPUs that are available
        for training.
    :param gpu_memory_limit: (int, default: `None`) maximum memory in MB to
        allocate per GPU device.
    :param allow_parallel_threads: (bool, default: `True`) allow TensorFlow
        to use multithreading parallelism to improve performance at
        the cost of determinism.
    :param callbacks: (list, default: `None`) a list of
        `ludwig.callbacks.Callback` objects that provide hooks into the
        Ludwig pipeline.
    :param backend: (Union[Backend, str]) `Backend` or string name
        of backend to use to execute preprocessing / training steps.
    :param random_seed: (int, default: 42) random seed used for weights
        initialization, splits and any other random function.
    :param hyperopt_log_verbosity: (int, default: 3) controls verbosity of
        ray tune log messages.  Valid values: 0 = silent, 1 = only status updates,
        2 = status and brief trial results, 3 = status and detailed trial results.

    # Return

    :return: (HyperoptResults) The results for each trial, ordered by
        descending performance on the target metric.
    """
    from ludwig.hyperopt.execution import get_build_hyperopt_executor, RayTuneExecutor

    # check if config is a path or a dict
    if isinstance(config, str):  # assume path
        with open_file(config, "r") as def_file:
            config_dict = yaml.safe_load(def_file)
    else:
        config_dict = config

    # Get mapping of input/output features that don't have an encoder for shared parameters
    features_eligible_for_shared_params = {
        INPUT_FEATURES:
        get_features_eligible_for_shared_params(config_dict, INPUT_FEATURES),
        OUTPUT_FEATURES:
        get_features_eligible_for_shared_params(config_dict, OUTPUT_FEATURES),
    }

    # merge config with defaults
    config = merge_with_defaults(config_dict)

    if HYPEROPT not in config:
        raise ValueError("Hyperopt Section not present in config")

    hyperopt_config = config[HYPEROPT]

    update_hyperopt_params_with_defaults(hyperopt_config)

    # print hyperopt config
    logging.info("Hyperopt config")
    logging.info(pformat(hyperopt_config, indent=4))
    logging.info("\n")

    logging.info(
        "Features that may be updated in hyperopt trials if default parameters are specified in the search space"
    )
    logging.info(pformat(dict(features_eligible_for_shared_params), indent=4))
    logging.info("\n")

    search_alg = hyperopt_config["search_alg"]
    executor = hyperopt_config[EXECUTOR]
    parameters = hyperopt_config["parameters"]
    split = hyperopt_config["split"]
    output_feature = hyperopt_config["output_feature"]
    metric = hyperopt_config["metric"]
    goal = hyperopt_config["goal"]

    ######################
    # check validity of output_feature / metric/ split combination
    ######################
    splitter = get_splitter(**config[PREPROCESSING]["split"])
    if split == TRAINING:
        if training_set is None and not splitter.has_split(0):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                "was not provided, "
                "or the split amount specified in the preprocessing section "
                "of the config is not greater than 0".format(split))
    elif split == VALIDATION:
        if validation_set is None and not splitter.has_split(1):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                "was not provided, "
                "or the split amount specified in the preprocessing section "
                "of the config is not greater than 0".format(split))
    elif split == TEST:
        if test_set is None and not splitter.has_split(2):
            raise ValueError(
                'The data for the specified split for hyperopt "{}" '
                "was not provided, "
                "or the split amount specified in the preprocessing section "
                "of the config is not greater than 0".format(split))
    else:
        raise ValueError('unrecognized hyperopt split "{}". '
                         "Please provide one of: {}".format(
                             split, {TRAINING, VALIDATION, TEST}))
    if output_feature == COMBINED:
        if metric != LOSS:
            raise ValueError(
                'The only valid metric for "combined" output feature is "loss"'
            )
    else:
        output_feature_names = {of[NAME] for of in config[OUTPUT_FEATURES]}
        if output_feature not in output_feature_names:
            raise ValueError('The output feature specified for hyperopt "{}" '
                             "cannot be found in the config. "
                             'Available ones are: {} and "combined"'.format(
                                 output_feature, output_feature_names))

        output_feature_type = None
        for of in config[OUTPUT_FEATURES]:
            if of[NAME] == output_feature:
                output_feature_type = of[TYPE]
        feature_class = get_from_registry(output_feature_type,
                                          output_type_registry)
        if metric not in feature_class.metric_functions:
            # todo v0.4: allow users to also specify metrics from the overall
            #  and per-class metrics from the training stats, and in general
            #  any postprocessed metric
            raise ValueError(
                'The specified metric for hyperopt "{}" is not a valid metric '
                'for the specified output feature "{}" of type "{}". '
                "Available metrics are: {}".format(
                    metric, output_feature, output_feature_type,
                    feature_class.metric_functions.keys()))

    hyperopt_executor = get_build_hyperopt_executor(executor[TYPE])(
        parameters,
        output_feature,
        metric,
        goal,
        split,
        search_alg=search_alg,
        **executor)

    # Explicitly default to a local backend to avoid picking up Ray or Horovod
    # backend from the environment.
    backend = backend or config_dict.get("backend") or "local"
    backend = initialize_backend(backend)
    if not (isinstance(backend, LocalBackend) or
            (isinstance(hyperopt_executor, RayTuneExecutor)
             and isinstance(backend, RayBackend))):
        raise ValueError(
            "Hyperopt requires using a `local` backend at this time, or "
            "`ray` backend with `ray` executor.")

    for callback in callbacks or []:
        callback.on_hyperopt_init(experiment_name)

    if not should_tune_preprocessing(config):
        # preprocessing is not being tuned, so generate it once before starting trials
        for callback in callbacks or []:
            callback.on_hyperopt_preprocessing_start(experiment_name)

        model = LudwigModel(
            config=config,
            backend=backend,
            gpus=gpus,
            gpu_memory_limit=gpu_memory_limit,
            allow_parallel_threads=allow_parallel_threads,
            callbacks=callbacks,
        )

        training_set, validation_set, test_set, training_set_metadata = model.preprocess(
            dataset=dataset,
            training_set=training_set,
            validation_set=validation_set,
            test_set=test_set,
            training_set_metadata=training_set_metadata,
            data_format=data_format,
            skip_save_processed_input=skip_save_processed_input,
            random_seed=random_seed,
        )
        dataset = None

        for callback in callbacks or []:
            callback.on_hyperopt_preprocessing_end(experiment_name)

    for callback in callbacks or []:
        callback.on_hyperopt_start(experiment_name)

    hyperopt_results = hyperopt_executor.execute(
        config,
        dataset=dataset,
        training_set=training_set,
        validation_set=validation_set,
        test_set=test_set,
        training_set_metadata=training_set_metadata,
        data_format=data_format,
        experiment_name=experiment_name,
        model_name=model_name,
        resume=resume,
        skip_save_training_description=skip_save_training_description,
        skip_save_training_statistics=skip_save_training_statistics,
        skip_save_model=skip_save_model,
        skip_save_progress=skip_save_progress,
        skip_save_log=skip_save_log,
        skip_save_processed_input=skip_save_processed_input,
        skip_save_unprocessed_output=skip_save_unprocessed_output,
        skip_save_predictions=skip_save_predictions,
        skip_save_eval_stats=skip_save_eval_stats,
        output_directory=output_directory,
        gpus=gpus,
        gpu_memory_limit=gpu_memory_limit,
        allow_parallel_threads=allow_parallel_threads,
        callbacks=callbacks,
        backend=backend,
        random_seed=random_seed,
        hyperopt_log_verbosity=hyperopt_log_verbosity,
        features_eligible_for_shared_params=features_eligible_for_shared_params,
        **kwargs,
    )

    if backend.is_coordinator():
        print_hyperopt_results(hyperopt_results)

        if not skip_save_hyperopt_statistics:
            results_directory = os.path.join(output_directory, experiment_name)
            makedirs(results_directory, exist_ok=True)

            hyperopt_stats = {
                "hyperopt_config":
                hyperopt_config,
                "hyperopt_results":
                [t.to_dict() for t in hyperopt_results.ordered_trials],
            }

            save_hyperopt_stats(hyperopt_stats, results_directory)
            logging.info(f"Hyperopt stats saved to: {results_directory}")

    for callback in callbacks or []:
        callback.on_hyperopt_end(experiment_name)
        callback.on_hyperopt_finish(experiment_name)

    logging.info("Finished hyperopt")

    return hyperopt_results
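For the newer API above, a similarly hedged sketch showing the resume and verbosity knobs; the file names and experiment name are illustrative:

# assumption: config.yaml contains a hyperopt section configured for this version
results = hyperopt(
    "config.yaml",
    dataset="train.csv",
    experiment_name="my_experiment",
    resume=True,                # continue prior trials with the same experiment name
    backend="local",            # explicit local backend, matching the function's default
    hyperopt_log_verbosity=1,   # status updates only
)
print(len(results.ordered_trials))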