def process_downloaded_dataset(self):
    df = pd.read_csv(os.path.join(self.raw_dataset_path, "HIGGS.csv.gz"), header=None)
    df.columns = [
        "label",
        "lepton_pT",
        "lepton_eta",
        "lepton_phi",
        "missing_energy_magnitude",
        "missing_energy_phi",
        "jet_1_pt",
        "jet_1_eta",
        "jet_1_phi",
        "jet_1_b-tag",
        "jet_2_pt",
        "jet_2_eta",
        "jet_2_phi",
        "jet_2_b-tag",
        "jet_3_pt",
        "jet_3_eta",
        "jet_3_phi",
        "jet_3_b-tag",
        "jet_4_pt",
        "jet_4_eta",
        "jet_4_phi",
        "jet_4_b-tag",
        "m_jj",
        "m_jjj",
        "m_lv",
        "m_jlv",
        "m_bb",
        "m_wbb",
        "m_wwbb",
    ]
    df["label"] = df["label"].astype("int32")
    if self.add_validation_set:
        df["split"] = [0] * 10000000 + [1] * 500000 + [2] * 500000
    else:
        df["split"] = [0] * 10500000 + [2] * 500000
    makedirs(self.processed_temp_path, exist_ok=True)
    df.to_parquet(
        os.path.join(self.processed_temp_path, self.parquet_filename),
        engine="pyarrow",
        row_group_size=50000,
        index=False,
    )
    rename(self.processed_temp_path, self.processed_dataset_path)
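# A minimal standalone sketch (not part of the loader above) of how the "split"
# column written by process_downloaded_dataset is typically consumed:
# 0 = training, 1 = validation, 2 = test. The parquet path below is hypothetical.
import pandas as pd

higgs_df = pd.read_parquet("processed/higgs.parquet")  # hypothetical path
train_df = higgs_df[higgs_df["split"] == 0]
val_df = higgs_df[higgs_df["split"] == 1]
test_df = higgs_df[higgs_df["split"] == 2]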
def process_downloaded_dataset(self):
    """Read the training and test directories and write out a csv containing the training path and the label."""
    makedirs(self.processed_temp_path, exist_ok=True)
    for dataset in ["training", "testing"]:
        print(f'>>> create ludwig formatted {dataset} data')
        labels, data = self.read_source_dataset(dataset, self.raw_dataset_path)
        self.write_output_dataset(labels, data, os.path.join(self.processed_temp_path, dataset))
    self.output_training_and_test_data()
    rename(self.processed_temp_path, self.processed_dataset_path)
    print('>>> completed data preparation')
def process_downloaded_dataset(self):
    train_df = pd.read_csv(os.path.join(self.raw_dataset_path, "adult.data"), header=None)
    test_df = pd.read_csv(os.path.join(self.raw_dataset_path, "adult.test"), header=None, skiprows=1)

    # age: continuous.
    # workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay,
    #            Never-worked.
    # fnlwgt: continuous.
    # education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th,
    #            Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
    # education-num: continuous.
    # marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent,
    #                 Married-AF-spouse.
    # occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty,
    #             Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving,
    #             Priv-house-serv, Protective-serv, Armed-Forces.
    # relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
    # race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
    # sex: Female, Male.
    # capital-gain: continuous.
    # capital-loss: continuous.
    # hours-per-week: continuous.
    # native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc),
    #                 India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland,
    #                 Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador,
    #                 Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia,
    #                 El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
    # income: >50K, <=50K.
    columns = [
        "age",
        "workclass",
        "fnlwgt",
        "education",
        "education-num",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "capital-gain",
        "capital-loss",
        "hours-per-week",
        "native-country",
        "income",
    ]
    train_df.columns = columns
    test_df.columns = columns

    # Remove the trailing period on the income field in adult.test (not present in adult.data).
    test_df["income"] = test_df["income"].str.rstrip(".")

    train_df["split"] = 0
    test_df["split"] = 2
    df = pd.concat([train_df, test_df])

    makedirs(self.processed_temp_path, exist_ok=True)
    df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename), index=False)
    rename(self.processed_temp_path, self.processed_dataset_path)
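# Toy illustration of the trailing-period cleanup above: labels in adult.test
# look like ">50K." / "<=50K." while adult.data uses ">50K" / "<=50K".
# The series below is made-up data, not read from the dataset.
import pandas as pd

income = pd.Series([">50K.", "<=50K."])
print(income.str.rstrip(".").tolist())  # ['>50K', '<=50K']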
def process_downloaded_dataset(self, header=0):
    zip_file = ZipFile(os.path.join(self.raw_dataset_path, "orange_small_train.data.zip"))
    train_df = pd.read_csv(zip_file.open("orange_small_train.data"), sep="\t")

    zip_file = ZipFile(os.path.join(self.raw_dataset_path, "orange_small_test.data.zip"))
    test_df = pd.read_csv(zip_file.open("orange_small_test.data"), sep="\t")

    train_df = process_categorical_features(train_df, categorical_features)
    train_df = process_numerical_features(train_df, categorical_features)

    targets = (
        pd.read_csv(
            os.path.join(self.raw_dataset_path, f"orange_small_train_{self.task_name}.labels"),
            header=None,
        )[0]
        .astype(str)
        .apply(lambda x: "true" if x == "1" else "false")
    )

    train_idcs = pd.read_csv(
        os.path.join(self.raw_dataset_path, f"stratified_train_idx_{self.task_name}.txt"),
        header=None,
    )[0]
    val_idcs = pd.read_csv(
        os.path.join(self.raw_dataset_path, f"stratified_test_idx_{self.task_name}.txt"),
        header=None,
    )[0]

    processed_train_df = train_df.iloc[train_idcs].copy()
    processed_train_df["target"] = targets.iloc[train_idcs]
    processed_train_df["split"] = 0

    processed_val_df = train_df.iloc[val_idcs].copy()
    processed_val_df["target"] = targets.iloc[val_idcs]
    processed_val_df["split"] = 1

    test_df["target"] = ""
    test_df["split"] = 2

    df = pd.concat([processed_train_df, processed_val_df, test_df])

    makedirs(self.processed_temp_path, exist_ok=True)
    df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename), index=False)
    rename(self.processed_temp_path, self.processed_dataset_path)
def process_downloaded_dataset(self):
    stores_df = pd.read_csv(os.path.join(self.raw_dataset_path, "store.csv"))
    train_df = pd.read_csv(os.path.join(self.raw_dataset_path, "train.csv"), low_memory=False)

    train_df = preprocess_df(train_df, stores_df)

    # Use 2014 data for training and 2015 data for testing; drop rows from any other year.
    train_df["split"] = -1
    train_df.loc[train_df["Year"] == 2014, "split"] = 0
    train_df.loc[train_df["Year"] == 2015, "split"] = 2
    train_df.drop(train_df[train_df["split"] == -1].index, inplace=True)

    df = train_df
    makedirs(self.processed_temp_path, exist_ok=True)
    df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename), index=False)
    rename(self.processed_temp_path, self.processed_dataset_path)
def process_downloaded_dataset(self):
    """The final step, where we create a single CSV file concatenating the training and test data."""
    train_file = self.config["split_filenames"]["train_file"]
    test_file = self.config["split_filenames"]["test_file"]

    train_df = pd.read_csv(os.path.join(self.raw_dataset_path, train_file))
    test_df = pd.read_csv(os.path.join(self.raw_dataset_path, test_file))

    train_df["split"] = 0
    test_df["split"] = 2

    df = pd.concat([train_df, test_df])
    makedirs(self.processed_temp_path, exist_ok=True)
    df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename), index=False)
    rename(self.processed_temp_path, self.processed_dataset_path)
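# Hypothetical illustration of the config keys the method above expects; the
# actual filenames come from the dataset's own config and are assumptions here.
config = {
    "split_filenames": {
        "train_file": "train.csv",  # assumed name
        "test_file": "test.csv",    # assumed name
    }
}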
def process_downloaded_dataset(self):
    makedirs(self.processed_temp_path, exist_ok=True)

    # create a dictionary matching image_path --> list of captions
    image_to_caption = defaultdict(list)
    with open(f"{self.raw_dataset_path}/Flickr8k.token.txt", "r") as captions_file:
        for line in captions_file:
            line = line.split("#")
            # the regex is to format the string to fit properly in a csv
            line[1] = line[1].strip("\n01234.\t ")
            line[1] = re.sub('\"', '\"\"', line[1])
            line[1] = '\"' + line[1] + '\"'
            image_to_caption[line[0]].append(line[1])

    # create csv file with 7 columns: image_path, 5 captions, and split
    with open(os.path.join(self.processed_temp_path, self.csv_filename), 'w') as output_file:
        output_file.write('image_path,caption0,caption1,caption2,')
        output_file.write('caption3,caption4,split\n')
        splits = ["train", "dev", "test"]
        for i in range(len(splits)):
            split = splits[i]
            with open(f"{self.raw_dataset_path}/Flickr_8k.{split}Images.txt", "r") as split_file:
                for image_name in split_file:
                    image_name = image_name.strip('\n')
                    if image_name in image_to_caption:
                        output_file.write('{},{},{},{},{},{},{}\n'.format(
                            # Note: image folder is named Flicker8k_Dataset
                            "{}/Flicker8k_Dataset/{}".format(self.raw_dataset_path, image_name),
                            *image_to_caption[image_name],
                            i
                        ))

    # Note: csv is stored in /processed while images are stored in /raw
    rename(self.processed_temp_path, self.processed_dataset_path)
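# Standalone sketch with toy data: the manual quote-escaping above could
# alternatively be delegated to the standard csv module, which quotes and
# escapes fields as needed. The dict contents, output path, and split value
# below are assumptions for illustration only.
import csv

image_to_caption = {"example.jpg": ["a child playing in the grass"] * 5}  # toy data
with open("flickr8k_sketch.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["image_path", "caption0", "caption1", "caption2", "caption3", "caption4", "split"])
    for image_name, captions in image_to_caption.items():
        writer.writerow([f"Flicker8k_Dataset/{image_name}", *captions, 0])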
def process_downloaded_dataset(self):
    makedirs(self.processed_temp_path, exist_ok=True)
    dataset_name = self.config["kaggle_dataset_name"]
    for url in self.config["kaggle_dataset_files"]:
        file_name = os.path.join(self.raw_dataset_path, dataset_name, url)
        # TODO(shreya): DataFrame created twice: here + CSVMixin. Figure out
        # options for using it once.
        df = pd.read_csv(
            file_name,
            header=0,
            names=[
                "image_path",
                "insurance_company",
                "cost_of_vehicle",
                "min_coverage",
                "expiry_date",
                "max_coverage",
                "condition",
                "amount",
            ],
        )
        df["image_path"] = df["image_path"].apply(
            lambda x: os.path.join(self.raw_dataset_path, dataset_name, "trainImages", os.path.basename(x))
        )
        df.to_csv(
            os.path.join(self.processed_temp_path, self.csv_filename),
            columns=[
                "image_path",
                "insurance_company",
                "cost_of_vehicle",
                "min_coverage",
                "expiry_date",
                "max_coverage",
                "condition",
                "amount",
            ],
        )
    # Note: csv is stored in /processed while images are stored in /raw
    rename(self.processed_temp_path, self.processed_dataset_path)
def process_downloaded_dataset(self):
    df = pd.read_csv(os.path.join(self.raw_dataset_path, "covtype.data.gz"), header=None)

    # Elevation                              quantitative  meters          Elevation in meters
    # Aspect                                 quantitative  azimuth         Aspect in degrees azimuth
    # Slope                                  quantitative  degrees         Slope in degrees
    # Horizontal_Distance_To_Hydrology       quantitative  meters          Horz Dist to nearest surface water features
    # Vertical_Distance_To_Hydrology         quantitative  meters          Vert Dist to nearest surface water features
    # Horizontal_Distance_To_Roadways        quantitative  meters          Horz Dist to nearest roadway
    # Hillshade_9am                          quantitative  0 to 255 index  Hillshade index at 9am, summer solstice
    # Hillshade_Noon                         quantitative  0 to 255 index  Hillshade index at noon, summer solstice
    # Hillshade_3pm                          quantitative  0 to 255 index  Hillshade index at 3pm, summer solstice
    # Horizontal_Distance_To_Fire_Points     quantitative  meters          Horz Dist to nearest wildfire ignition points
    # Wilderness_Area (4 binary columns)     qualitative   0 or 1          Wilderness area designation
    # Soil_Type (40 binary columns)          qualitative   0 or 1          Soil Type designation
    # Cover_Type (7 types)                   integer       1 to 7          Forest Cover Type designation
    columns = [
        "Elevation",
        "Aspect",
        "Slope",
        "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am",
        "Hillshade_Noon",
        "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
        "Wilderness_Area_1",
        "Wilderness_Area_2",
        "Wilderness_Area_3",
        "Wilderness_Area_4",
    ]
    columns += [f"Soil_Type_{i}" for i in range(1, 41)]
    columns += ["Cover_Type"]
    df.columns = columns

    # Map the 40 one-hot soil type columns to a single integer column.
    st_cols = [f"Soil_Type_{i}" for i in range(1, 41)]
    st_vals = []
    for _, row in df[st_cols].iterrows():
        st_vals.append(row.to_numpy().nonzero()[0].item(0))
    df = df.drop(columns=st_cols)
    df["Soil_Type"] = st_vals

    # Map the 4 one-hot wilderness area columns to a single integer column.
    wa_cols = ["Wilderness_Area_1", "Wilderness_Area_2", "Wilderness_Area_3", "Wilderness_Area_4"]
    wa_vals = []
    for _, row in df[wa_cols].iterrows():
        wa_vals.append(row.to_numpy().nonzero()[0].item(0))
    df = df.drop(columns=wa_cols)
    df["Wilderness_Area"] = wa_vals

    if not self.use_tabnet_split:
        # First 11340 records used for the training subset,
        # next 3780 records used for the validation subset,
        # last 565892 records used for the test subset.
        df["split"] = [0] * 11340 + [1] * 3780 + [2] * 565892
    else:
        # Split used in the TabNet paper:
        # https://github.com/google-research/google-research/blob/master/tabnet/download_prepare_covertype.py
        train_val_indices, test_indices = train_test_split(range(len(df)), test_size=0.2, random_state=0)
        train_indices, val_indices = train_test_split(train_val_indices, test_size=0.2 / 0.6, random_state=0)
        df["split"] = 0
        df.loc[val_indices, "split"] = 1
        df.loc[test_indices, "split"] = 2

    makedirs(self.processed_temp_path, exist_ok=True)
    df.to_csv(os.path.join(self.processed_temp_path, self.csv_filename), index=False)
    rename(self.processed_temp_path, self.processed_dataset_path)
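# Standalone sketch (toy data) of a vectorized equivalent of the per-row
# nonzero() loops above: for one-hot rows, argmax along axis=1 recovers the
# index of the single active column.
import pandas as pd

toy = pd.DataFrame({
    "Soil_Type_1": [1, 0, 0],
    "Soil_Type_2": [0, 0, 1],
    "Soil_Type_3": [0, 1, 0],
})
soil_type = toy.to_numpy().argmax(axis=1)  # array([0, 2, 1])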
def process_downloaded_dataset(self):
    rename(self.raw_dataset_path, self.processed_dataset_path)
import logging
import os
import shutil

import yaml

from ludwig.api import LudwigModel
from ludwig.datasets import twitter_bots
from ludwig.utils.fs_utils import rename
from ludwig.visualize import confusion_matrix, learning_curves

if __name__ == "__main__":
    # Cleans out prior results
    shutil.rmtree("./results", ignore_errors=True)
    shutil.rmtree("./visualizations", ignore_errors=True)

    # Loads the dataset
    dataset = twitter_bots.TwitterBots(cache_dir=".")
    training_set, val_set, test_set = dataset.load(split=True)

    # Moves profile images into the local directory, so relative paths in the dataset will be resolved.
    rename(os.path.join(dataset.processed_dataset_path, "profile_images"), "./profile_images")

    with open("./config.yaml") as f:
        config = yaml.safe_load(f.read())

    model = LudwigModel(config, logging_level=logging.INFO)
    train_stats, preprocessed_data, output_directory = model.train(dataset=training_set)

    # Generates predictions and performance statistics for the test set.
    test_stats, predictions, output_directory = model.evaluate(
        test_set, collect_predictions=True, collect_overall_stats=True
    )

    confusion_matrix(
        [test_stats],