def insert_missing_values(df, percent_rows, random_state=None):
    """
    Inserts missing values into a data frame.

    :param df: the data frame we're operating on
    :param percent_rows: the percentage of rows that should have a missing value
    :param random_state: the numpy RandomState
    :return: a df with missing values
    """
    # get the initialized random_state (if not already initialized)
    random_state = get_random_state(random_state)
    df = df.copy()

    def _insert_random_null(x):
        """
        Choose a random column in a df row to null. This operates in-place,
        but on the copy, so the caller's frame is unaffected.

        :param x: a row of the data frame
        """
        # randint's upper bound is exclusive, so the last column (always y) is never nulled
        x[random_state.randint(0, len(x) - 1)] = np.nan
        return x

    # this is a "truthy" check: if percent_rows is zero, None or False, do nothing
    if not percent_rows:
        return df
    else:
        # otherwise validate that it's a float in (0, 1]
        percent_rows = assert_valid_percent(percent_rows, eq_upper=True)  # eq_lower not necessary because != 0

        # null one random column in each of a random sample of rows
        sample_index = df.sample(frac=percent_rows, random_state=random_state).index
        df.loc[sample_index] = df.loc[sample_index].apply(_insert_random_null, axis=1)
    return df
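# A minimal usage sketch for insert_missing_values on a toy frame whose last
# column is the label. The module path is assumed from the package layout used
# by the tests below; the values are illustrative:
import numpy as np
import pandas as pd
from snape.make_dataset import insert_missing_values

toy = pd.DataFrame({'x0': np.arange(10, dtype=float),
                    'x1': np.arange(10, dtype=float),
                    'y': [0, 1] * 5})
messy = insert_missing_values(toy, percent_rows=0.5, random_state=42)
print(messy.isnull().sum())  # ~5 rows get a NaN in x0 or x1 (never in y, the last column)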
    # _ImageNet.__init__ (see make_image_dataset below): choose which synsets to
    # download from the ILSVRC-1000 list, using the seeded random state
    def __init__(self, n_classes, weights, n_samples, output_dir, random_state=None):
        self.ilsvrc_synsets = self.get_ilsvrc_1000_synsets()
        self.random_state = get_random_state(random_state)
        self.chosen_synsets = self.random_state.choice(self.ilsvrc_synsets, n_classes, replace=False)
        self.n_samples = n_samples
        self.output_dir = output_dir
        self.weights = weights
def create_regression_dataset(n_samples, n_features, n_informative, effective_rank,
                              tail_strength, noise, random_state=None, shuffle=True):
    """
    Creates a regression dataset.

    :param n_samples: number of observations
    :param n_features: number of features
    :param n_informative: number of informative features
    :param effective_rank: approximate number of singular vectors required to explain data
    :param tail_strength: relative importance of the fat noisy tail of the singular values profile
    :param noise: standard deviation of the gaussian noise applied to the output
    :param random_state: the numpy RandomState
    :param shuffle: whether to shuffle the samples and features
    :return: the requested dataframe

    Note: n_targets is fixed at 1, so the y output is always a scalar per sample.
    """
    random_state = get_random_state(random_state)

    X, y = make_regression(n_samples=n_samples, n_features=n_features,
                           n_informative=n_informative, n_targets=1,
                           effective_rank=effective_rank, tail_strength=tail_strength,
                           noise=noise, random_state=random_state, shuffle=shuffle)

    # cast to a data frame
    df = pd.DataFrame(X)
    # rename X columns
    df = rename_columns(df)
    # and add the y
    df['y'] = y
    return df
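# A quick sketch of create_regression_dataset in use (illustrative parameters;
# module path assumed as above):
from snape.make_dataset import create_regression_dataset

reg_df = create_regression_dataset(n_samples=100, n_features=5, n_informative=3,
                                   effective_rank=None, tail_strength=0.5,
                                   noise=0.1, random_state=42)
print(reg_df.shape)  # (100, 6): five renamed feature columns plus 'y'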
def create_classification_dataset(n_samples, n_features, n_informative, n_redundant,
                                  n_repeated, n_clusters_per_class, weights, n_classes,
                                  random_state=None, shuffle=True):
    """
    Creates a classification dataset.

    :param n_samples: number of observations
    :param n_features: number of features
    :param n_informative: number of informative features
    :param n_redundant: number of multicollinear features (linear combinations of the informative ones)
    :param n_repeated: number of perfectly collinear (duplicated) features
    :param n_clusters_per_class: gaussian clusters per class
    :param weights: list of class balances, e.g. [.5, .5]
    :param n_classes: the number of class levels
    :param random_state: the numpy RandomState
    :param shuffle: whether to shuffle the samples and features
    :return: the requested dataframe
    """
    random_state = get_random_state(random_state)

    X, y = make_classification(n_samples=n_samples, n_features=n_features,
                               n_informative=n_informative, n_redundant=n_redundant,
                               n_repeated=n_repeated,
                               n_clusters_per_class=n_clusters_per_class,
                               weights=weights,
                               # draw scales from the seeded random_state (not np.random)
                               # so results are reproducible
                               scale=(random_state.rand(n_features) * 10),
                               n_classes=n_classes, random_state=random_state,
                               shuffle=shuffle)

    # cast to a data frame
    df = pd.DataFrame(X)
    # rename X columns
    df = rename_columns(df)
    # and add the y
    df['y'] = y
    return df
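# And the classification counterpart (illustrative parameters; note the sklearn
# constraint n_informative + n_redundant + n_repeated <= n_features):
from snape.make_dataset import create_classification_dataset

clf_df = create_classification_dataset(n_samples=100, n_features=10, n_informative=4,
                                       n_redundant=2, n_repeated=0,
                                       n_clusters_per_class=2, weights=[.7, .3],
                                       n_classes=2, random_state=42)
print(clf_df['y'].value_counts())  # roughly a 70/30 class balance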
def insert_special_char(character, df, random_state=None):
    """
    Chooses a numeric column at random and reformats it as currency or percentage,
    embedding a literal '$' or '%' in the values, to make cleaning harder.

    :param character: either '$' or '%'
    :param df: the dataframe we're operating on
    :param random_state: the numpy RandomState
    :return: a dataframe with a single column, chosen at random, converted to a % or $ format
    """
    # get the initialized random_state (if not already initialized)
    random_state = get_random_state(random_state)
    df = df.copy()

    # assert that character is a string and that it's in ('$', '%')
    assert_is_type(character, six.string_types)
    if character not in ('$', '%'):
        raise ValueError('expected `character` to be in ("$", "%"), but got {0}'.format(character))

    # choose a column at random that isn't y, and only from numeric columns
    # (so we don't pick a column that has already been mangled)
    chosen_col = random_state.choice(
        [col for col in df.select_dtypes(include=['number']).columns if col != 'y'])

    # standardize the column first
    df[chosen_col] = (df[chosen_col] - df[chosen_col].mean()) / df[chosen_col].std()

    # then the character-specific div/mul operations
    # (note: use == rather than `is` for string comparison)
    if character == "$":
        # multiply by 1000, then prepend a $
        df[chosen_col] = (df[chosen_col] * 1000).round(decimals=2).map(lambda x: "$" + str(x))
    else:  # character == "%"
        # divide by 100, then append a %
        df[chosen_col] = (df[chosen_col] / 100).round(decimals=2).map(lambda x: str(x) + "%")

    return df
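# insert_special_char in action: one numeric non-y column becomes strings like
# "$1234.56". A self-contained sketch (module path assumed as above):
import numpy as np
import pandas as pd
from snape.make_dataset import insert_special_char

toy = pd.DataFrame({'x0': np.random.rand(5), 'x1': np.random.rand(5),
                    'y': [0, 1, 0, 1, 0]})
print(insert_special_char('$', toy, random_state=42))  # x0 or x1 is now a '$' string column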
def create_categorical_features(df, label_list, random_state=None, label_name='y'):
    """
    Creates random categorical variables.

    :param df: the data frame we're operating on
    :param label_list: a list of lists, where each inner list holds the labels
                       for one categorical variable
    :param random_state: the numpy RandomState
    :param label_name: the column name of the label, if any. Default is 'y'
    :return: a modified dataframe

    Example:
        create_categorical_features(df, [['a', 'b'], ['red', 'blue']])
    """
    random_state = get_random_state(random_state)

    df = df.copy()
    n_categorical = len(label_list)

    # get the numeric columns ONCE so we don't recompute them on every iteration
    numer_cols = [col for col in df.select_dtypes(include=['number']).columns
                  if col != label_name]

    for i in range(n_categorical):
        # we might run out of numeric columns!
        if not numer_cols:
            break

        # choose a random numeric column that isn't y, and remove it from the pool
        chosen_col = random_state.choice(numer_cols)
        numer_cols.remove(chosen_col)

        # use cut to convert that column to categorical
        df[chosen_col] = pd.cut(df[chosen_col], bins=len(label_list[i]), labels=label_list[i])

    return df
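# create_categorical_features in use, echoing the docstring example: two numeric
# columns are binned into 2-level categoricals (self-contained sketch):
import numpy as np
import pandas as pd
from snape.make_dataset import create_categorical_features

toy = pd.DataFrame({'x0': np.random.rand(6), 'x1': np.random.rand(6), 'y': [0, 1] * 3})
cat_df = create_categorical_features(toy, [['a', 'b'], ['red', 'blue']], random_state=42)
print(cat_df.dtypes)  # x0 and x1 are now 2-level categoricals; y is untouched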
def make_image_dataset(config=None):
    if config is None:
        # called from the command line, so parse configuration
        args = parse_args(sys.argv[1:])
        config = load_config(args['config'])

    random_state = get_random_state(config["random_seed"])

    if config["image_source"] == "imagenet":
        _ImageNet(n_classes=config["n_classes"],
                  weights=config["weights"],
                  n_samples=config["n_samples"],
                  output_dir=config["out_path"],
                  random_state=random_state).get_images()
    elif config["image_source"] in ("openimages", "googlesearch"):
        print("Not yet supported. The only image_source currently supported is 'imagenet'")
    else:
        print(config["image_source"], "is not a supported image_source")
        print("The only image_source currently supported is 'imagenet'")
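# A minimal usage sketch for make_image_dataset. The config keys mirror the test
# configuration below; out_path should already exist (the test creates it first),
# and the values here are illustrative:
from snape.make_image_dataset import make_image_dataset

config = {
    "n_classes": 2,
    "n_samples": 11,
    "out_path": "./test_images/",
    "weights": [.8, .2],
    "image_source": "imagenet",
    "random_seed": 42
}
make_image_dataset(config)  # downloads ~11 ImageNet images split 80/20 across 2 synsets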
import os
import shutil

from nose.tools import assert_raises

from snape.make_image_dataset import *
from snape.make_image_dataset import _ImageNet, _ImageGrabber
from snape.utils import get_random_state

conf = {
    "n_classes": 2,
    "n_samples": 11,
    "out_path": "./test_images/",
    "weights": [.8, .2],
    "image_source": "imagenet",
    "random_seed": 42
}

random_state = get_random_state(conf["random_seed"])


def test_make_image_dataset():
    os.mkdir(conf["out_path"])
    try:
        make_image_dataset(conf)
        sub_dir = conf["out_path"] + os.listdir(conf["out_path"])[0]
        print("SUBDIR:", sub_dir)
        n_images = len(os.listdir(sub_dir))
        class1_size = int(conf["n_samples"] * conf["weights"][0])
        assert class1_size == n_images, "Did not download n images"
        assert len(os.listdir(conf["out_path"])) == conf["n_classes"], \
            "Did not produce the specified # of classes"
    except:
        # the original snippet is truncated here; re-raising and then cleaning up
        # the downloaded images is the assumed intent
        raise
    finally:
        shutil.rmtree(conf["out_path"])
def make_dataset(config=None):
    """
    Creates a machine learning dataset based on command line arguments passed.

    :param config: a configuration dictionary, or None if called from the command line
    :return: None
    """
    if config is None:
        # called from the command line, so parse configuration
        args = parse_args(sys.argv[1:])
        config = load_config(args['config'])

    print('-' * 80)

    # avoid multiple lookups -- this fails with a KeyError if 'type' is not present
    c_type = config['type']
    if c_type not in ('regression', 'classification'):
        raise ValueError('type must be in ("regression", "classification"), '
                         'but got %s' % c_type)
    reg = c_type == 'regression'

    # get defaults -- these are the defaults from sklearn
    def _safe_get_with_default(cfg, key, default):
        if key not in cfg:
            print("Warning: %s not in configuration, defaulting to %r" % (key, default))
            return default
        return cfg[key]

    n_samples = _safe_get_with_default(config, 'n_samples', 100)
    n_features = _safe_get_with_default(config, 'n_features',
                                        20 if not reg else 100)  # diff defaults in sklearn
    n_informative = _safe_get_with_default(config, 'n_informative',
                                           2 if not reg else 10)  # diff defaults in sklearn
    n_redundant = _safe_get_with_default(config, 'n_redundant', 2)
    n_repeated = _safe_get_with_default(config, 'n_repeated', 0)
    n_clusters_per_class = _safe_get_with_default(config, 'n_clusters_per_class', 2)
    weights = _safe_get_with_default(config, 'weights', None)
    n_classes = _safe_get_with_default(config, 'n_classes', 2)
    effective_rank = _safe_get_with_default(config, 'effective_rank', None)
    tail_strength = _safe_get_with_default(config, 'tail_strength', 0.5)
    noise = _safe_get_with_default(config, 'noise', 0.)
    seed = _safe_get_with_default(config, 'random_seed', 42)
    shuffle = _safe_get_with_default(config, 'shuffle', True)

    # get the random state
    random_state = get_random_state(seed)

    # create the base dataset
    if not reg:
        print('Creating Classification Dataset...')
        df = create_classification_dataset(n_samples=n_samples, n_features=n_features,
                                           n_informative=n_informative,
                                           n_redundant=n_redundant, n_repeated=n_repeated,
                                           n_clusters_per_class=n_clusters_per_class,
                                           weights=weights, n_classes=n_classes,
                                           random_state=random_state, shuffle=shuffle)
    else:  # c_type == 'regression'
        print('Creating Regression Dataset...')
        df = create_regression_dataset(n_samples=n_samples, n_features=n_features,
                                       n_informative=n_informative,
                                       effective_rank=effective_rank,
                                       tail_strength=tail_strength, noise=noise,
                                       random_state=random_state, shuffle=shuffle)

    # make sure to use safe lookups to avoid KeyErrors!!!
    label_list = _safe_get_with_default(config, 'label_list', None)
    do_categorical = label_list is not None and len(label_list) > 0
    if do_categorical:
        print("Creating Categorical Features...")
        df = create_categorical_features(df, label_list, random_state=random_state)

    # insert entropy ('$'/'%' formatting and missing values) if requested
    insert_dollar = _safe_get_with_default(config, 'insert_dollar', "No")
    insert_percent = _safe_get_with_default(config, 'insert_percent', "No")
    if any(entropy == "Yes" for entropy in (insert_dollar, insert_percent)):
        print("Inserting Requested Entropy...")

        # add a $ or % column if requested
        if insert_dollar == "Yes":
            df = insert_special_char('$', df, random_state=random_state)
        if insert_percent == "Yes":
            df = insert_special_char('%', df, random_state=random_state)

    # insert missing values
    pct_missing = _safe_get_with_default(config, 'pct_missing', None)
    df = insert_missing_values(df, pct_missing, random_state=random_state)

    # convert the dataset to a star schema if requested
    star_schema = _safe_get_with_default(config, 'star_schema', "No")
    outpath = _safe_get_with_default(config, 'out_path', "." + os.path.sep)
    if star_schema == "Yes":
        # a star schema needs categorical variables to split off into dimension tables
        if do_categorical:
            df = make_star_schema(df, outpath)
        else:
            print("No categorical variables added. Dataset cannot be transformed into "
                  "a star schema. Dataset will be generated as a single-table dataset...")

    print("Writing Train/Test Datasets")
    write_dataset(df, _safe_get_with_default(config, 'output', 'my_dataset'), outpath)
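# A minimal usage sketch for make_dataset. The keys shown are exactly the ones
# the function looks up above; anything omitted falls back to the printed
# defaults. Values here are illustrative:
from snape.make_dataset import make_dataset

make_dataset(config={
    "type": "classification",
    "n_samples": 1000,
    "n_features": 10,
    "n_informative": 5,
    "n_classes": 2,
    "label_list": [["a", "b"], ["red", "blue"]],
    "insert_dollar": "Yes",
    "insert_percent": "No",
    "pct_missing": 0.1,
    "star_schema": "No",
    "out_path": "./",
    "output": "my_dataset",
    "random_seed": 42
})  # writes the train/test files for 'my_dataset' into out_path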
import os

import pandas as pd
from numpy.testing import assert_almost_equal

from snape.score_dataset import *
from snape.utils import get_random_state, assert_valid_percent

random_state = get_random_state(42)

y_rand = random_state.rand(200)
r = {'y': y_rand * 10,
     'y_hat': y_rand * 10 - y_rand}
regression_df = pd.DataFrame(r)

c = {'y': [1, 1, 1, 1, 0, 0, 0, 0],
     'y_hat': [1, 0.9, 0.4, 0.95, 0, 0.1, 0.6, 0.15]}
classification_df = pd.DataFrame(c)

m = {'y': [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3],
     'y_hat': [0, 1, 0, 1, 1, 3, 1, 2, 2, 3, 2, 3]}
multiclass_df = pd.DataFrame(m)


def test_guess_problem_type():
    # note: `assert f(x), 'regression'` only checks truthiness (the string is
    # merely the assert message), so compare with == instead
    assert guess_problem_type(regression_df['y']) == 'regression'
    assert guess_problem_type(classification_df['y']) == 'binary'
    assert guess_problem_type(multiclass_df['y']) == 'multiclass'
import pytest

from snape.utils import get_random_state


# the original snippet omits the decorator; presumably this is parametrized with
# values that are not valid seeds, so the values below are an assumption
@pytest.mark.parametrize("x", ['not-a-seed', 1.5, []])
def test_random_state_fails(x):
    with pytest.raises(TypeError):
        get_random_state(x)
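# For contrast with the failure test above, a sketch of the inputs
# get_random_state accepts, inferred from how the snippets above call it
# (an int seed, an existing RandomState passed through, or None for a default):
import numpy as np
from snape.utils import get_random_state

rs = get_random_state(42)        # int seed -> a seeded RandomState
same = get_random_state(rs)      # an existing RandomState passes through
default = get_random_state(None) # None falls back to a default state
print(rs.rand(3))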