def get_multiclass_dataset(
        source_df, dataset_name, column_set_name, column_names,
        test_frac=.2, balanced=False, random_seed=42):
    """
    Return a dataset dict for multi-class data.

    Rows with no positive label in `column_names` are dropped. Rows with
    more than one positive label are always placed in the test set (when
    the split is made here); train and val contain single-label rows only.

    TODO: support multi-label: stick them all in test.

    Parameters
    ----------
    source_df: pandas.DataFrame
        Must have an object-dtype index. If it has a '_split' column, rows
        whose '_split' value is 'test' form the test set; otherwise the
        test set is sampled here.
    dataset_name: string
    column_set_name: string
        Name for the given set of columns. For example, 'artists'
    column_names: sequence of string
    test_frac: float
        Use this fraction of the examples to test.
        Will use the same amount for validation.
    balanced: bool [False]
        If True, val set will have nearly equal class distribution.
    random_seed: int [42]
        Seeds the global numpy random state.

    Returns
    -------
    dataset: dict
        Has keys 'train_df', 'val_df', 'test_df', 'dataset_name', 'name',
        'task', 'num_labels', 'column_names' and 'salient_parts'.

    Raises
    ------
    Exception
        If `balanced` is True and the balanced pool is smaller than the
        requested validation set.
    """
    # Kept as assertions so the exception type seen by callers is unchanged.
    assert(source_df.index.dtype == object)
    np.random.seed(random_seed)

    # Per-row count of positive labels; computed once and reused below.
    label_totals = source_df[column_names].sum(1)

    # Drop rows with no positive labels (ava_style has these...)
    ind = label_totals == 0
    logging.info('Dropping {} rows with no positive labels.'.format(ind.sum()))
    source_df = source_df[~ind]
    label_totals = label_totals[~ind]

    N = source_df.shape[0]
    num_test = int(round(test_frac * N))
    num_val = num_test
    num_labels = len(column_names)
    task = 'clf'

    # If source_df does not have split info, do the split here.
    if '_split' not in source_df.columns:
        # Put all multi-label examples in test, even if
        # test_frac is exceeded.
        test_ind = np.where(label_totals.values > 1)[0].tolist()
        num_remaining = num_test - len(test_ind)
        if num_remaining > 0:
            singlelabel_ind = np.where(label_totals.values == 1)[0]
            test_ind += np.random.choice(
                singlelabel_ind, num_remaining, replace=False).tolist()
        test_ids = source_df.index[test_ind]

    # Otherwise, just take the given test ids.
    else:
        test_ids = source_df[source_df['_split'] == 'test'].index

    # Explicit set difference: the `-` operator on an Index no longer
    # means set difference in current pandas.
    trainval_ids = source_df.index.difference(test_ids)

    # Split into single-label trainval and possible multi-label test.
    # `.loc` replaces the removed `.ix`; copy so that adding columns below
    # never writes into a view of source_df.
    test_df = source_df[column_names].loc[test_ids].copy()
    # test_df needs dummy 'label' and 'importance' columns for vw_filter
    test_df['label'] = 1
    test_df['importance'] = 1.
    trainval_df = source_df[column_names].loc[trainval_ids]
    assert(np.all(trainval_df.sum(1) == 1))

    # Split trainval into train and val.
    label = trainval_df.values.argmax(1)
    df = pd.DataFrame({'label': label}, trainval_df.index)
    ids = df.index[np.random.permutation(df.shape[0])]

    if balanced:
        # Construct a balanced validation set: draw the same number of
        # examples (the size of the rarest class) from every class.
        counts = trainval_df.sum(0).astype(int)
        min_count = counts.min()
        label_values = df['label'].values
        min_size_balanced_set = np.concatenate([
            np.where(label_values == l)[0][
                np.random.permutation(count)[:min_count]]
            for l, count in enumerate(counts)
        ])
        P = min_size_balanced_set.shape[0]
        if P < num_val:
            raise Exception('Not enough balanced data for validation set.')
        min_size_balanced_set = np.random.permutation(min_size_balanced_set)
        val_ids = df.index[min_size_balanced_set[:num_val]]
    else:
        val_ids = ids[:num_val]

    # `Index.difference` is the set difference; `Index.diff` is not.
    train_ids = ids.difference(val_ids)
    train_ids = train_ids[np.random.permutation(len(train_ids))]

    # Assert that there is no overlap between the sets.
    assert(len(train_ids.intersection(val_ids)) == 0)
    assert(len(train_ids.intersection(test_ids)) == 0)
    assert(len(val_ids.intersection(test_ids)) == 0)

    # Add 1 to 'label', since VW needs values on [1, K].
    df['label'] += 1

    # Get the train/val/test datasets.
    dataset = {
        'train_df': get_split_df(df, train_ids, num_labels).join(trainval_df),
        'val_df': get_split_df(df, val_ids, num_labels).join(trainval_df),
        'test_df': test_df
    }

    # Add all relevant info to the data dict to return.
    num_train = dataset['train_df'].shape[0]
    dataset.update({
        'dataset_name': dataset_name,
        'name': '{}_{}_train_{}'.format(
            dataset_name, column_set_name, num_train),
        'task': task,
        'num_labels': num_labels,
        'column_names': column_names,
        'salient_parts': {
            'data': '{}_{}'.format(dataset_name, column_set_name),
            'num_train': num_train,
            'num_val': dataset['val_df'].shape[0],
            'num_test': dataset['test_df'].shape[0]
        }
    })
    return dataset
def get_multiclass_dataset(
        source_df, dataset_name, column_set_name, column_names,
        test_frac=.2, balanced=False, random_seed=42):
    """
    Return a dataset dict for multi-class data.

    Rows with no positive label in `column_names` are dropped. Rows with
    more than one positive label are always placed in the test set (when
    the split is made here); train and val contain single-label rows only.

    TODO: support multi-label: stick them all in test.

    Parameters
    ----------
    source_df: pandas.DataFrame
        Must have an object-dtype index. If it has a '_split' column, rows
        whose '_split' value is 'test' form the test set; otherwise the
        test set is sampled here.
    dataset_name: string
    column_set_name: string
        Name for the given set of columns. For example, 'artists'
    column_names: sequence of string
    test_frac: float
        Use this fraction of the examples to test.
        Will use the same amount for validation.
    balanced: bool [False]
        If True, val set will have nearly equal class distribution.
    random_seed: int [42]
        Seeds the global numpy random state.

    Returns
    -------
    dataset: dict
        Has keys 'train_df', 'val_df', 'test_df', 'dataset_name', 'name',
        'task', 'num_labels', 'column_names' and 'salient_parts'.

    Raises
    ------
    Exception
        If `balanced` is True and the balanced pool is smaller than the
        requested validation set.
    """
    # Kept as assertions so the exception type seen by callers is unchanged.
    assert(source_df.index.dtype == object)
    np.random.seed(random_seed)

    # Per-row count of positive labels; computed once and reused below.
    label_totals = source_df[column_names].sum(1)

    # Drop rows with no positive labels (ava_style has these...)
    ind = label_totals == 0
    logging.info('Dropping {} rows with no positive labels.'.format(ind.sum()))
    source_df = source_df[~ind]
    label_totals = label_totals[~ind]

    N = source_df.shape[0]
    num_test = int(round(test_frac * N))
    num_val = num_test
    num_labels = len(column_names)
    task = 'clf'

    # If source_df does not have split info, do the split here.
    if '_split' not in source_df.columns:
        # Put all multi-label examples in test, even if
        # test_frac is exceeded.
        test_ind = np.where(label_totals.values > 1)[0].tolist()
        num_remaining = num_test - len(test_ind)
        if num_remaining > 0:
            singlelabel_ind = np.where(label_totals.values == 1)[0]
            test_ind += np.random.choice(
                singlelabel_ind, num_remaining, replace=False).tolist()
        test_ids = source_df.index[test_ind]

    # Otherwise, just take the given test ids.
    else:
        test_ids = source_df[source_df['_split'] == 'test'].index

    # Explicit set difference: the `-` operator on an Index no longer
    # means set difference in current pandas.
    trainval_ids = source_df.index.difference(test_ids)

    # Split into single-label trainval and possible multi-label test.
    # `.loc` replaces the removed `.ix`; copy so that adding columns below
    # never writes into a view of source_df.
    test_df = source_df[column_names].loc[test_ids].copy()
    # test_df needs dummy 'label' and 'importance' columns for vw_filter
    test_df['label'] = 1
    test_df['importance'] = 1.
    trainval_df = source_df[column_names].loc[trainval_ids]
    assert(np.all(trainval_df.sum(1) == 1))

    # Split trainval into train and val.
    label = trainval_df.values.argmax(1)
    df = pd.DataFrame({'label': label}, trainval_df.index)
    ids = df.index[np.random.permutation(df.shape[0])]

    if balanced:
        # Construct a balanced validation set: draw the same number of
        # examples (the size of the rarest class) from every class.
        counts = trainval_df.sum(0).astype(int)
        min_count = counts.min()
        label_values = df['label'].values
        min_size_balanced_set = np.concatenate([
            np.where(label_values == l)[0][
                np.random.permutation(count)[:min_count]]
            for l, count in enumerate(counts)
        ])
        P = min_size_balanced_set.shape[0]
        if P < num_val:
            raise Exception('Not enough balanced data for validation set.')
        min_size_balanced_set = np.random.permutation(min_size_balanced_set)
        val_ids = df.index[min_size_balanced_set[:num_val]]
    else:
        val_ids = ids[:num_val]

    # `Index.difference` is the set difference; `Index.diff` is not.
    train_ids = ids.difference(val_ids)
    train_ids = train_ids[np.random.permutation(len(train_ids))]

    # Assert that there is no overlap between the sets.
    assert(len(train_ids.intersection(val_ids)) == 0)
    assert(len(train_ids.intersection(test_ids)) == 0)
    assert(len(val_ids.intersection(test_ids)) == 0)

    # Add 1 to 'label', since VW needs values on [1, K].
    df['label'] += 1

    # Get the train/val/test datasets.
    dataset = {
        'train_df': get_split_df(df, train_ids, num_labels).join(trainval_df),
        'val_df': get_split_df(df, val_ids, num_labels).join(trainval_df),
        'test_df': test_df
    }

    # Add all relevant info to the data dict to return.
    num_train = dataset['train_df'].shape[0]
    dataset.update({
        'dataset_name': dataset_name,
        'name': '{}_{}_train_{}'.format(
            dataset_name, column_set_name, num_train),
        'task': task,
        'num_labels': num_labels,
        'column_names': column_names,
        'salient_parts': {
            'data': '{}_{}'.format(dataset_name, column_set_name),
            'num_train': num_train,
            'num_val': dataset['val_df'].shape[0],
            'num_test': dataset['test_df'].shape[0]
        }
    })
    return dataset
def get_binary_or_regression_dataset(
        source_df, dataset_name, column_name,
        test_frac=.2, min_pos_frac=.1, random_seed=42):
    """
    Return a dataset dict suitable for the prediction code of binary or
    regression data in column_name column of source_df.
    Whether the data is binary or regression is inferred from dtype.

    # TODO: add ability to pass a filter to use for the AVA delta stuff

    Parameters
    ----------
    source_df: pandas.DataFrame
        Must have an object-dtype index. If it has a '_split' column, that
        column is passed on to the binary-classification split helper.
    dataset_name: string
    column_name: string
    test_frac: float
        Use this fraction of the positive examples to test.
        Will use the same amount for validation.
    min_pos_frac: float
        Subsample negative data s.t. pos/neg ratio is at least this.
        Only relevant if the data is binary, obviously.
        Ignored if < 0.
    random_seed: int [42]
        Seeds the global numpy random state.

    Returns
    -------
    dataset: dict
        Has keys 'train_df', 'val_df', 'test_df', 'dataset_name', 'name',
        'task', 'num_labels' and 'salient_parts'. 'task' is 'clf' with
        num_labels 2 for binary data, or 'regr' with num_labels -1 for
        float data.

    Raises
    ------
    Exception
        If the column is neither boolean, two-valued, nor float.
    """
    # Kept as assertions so the exception type seen by callers is unchanged.
    assert(source_df.index.dtype == object)
    np.random.seed(random_seed)

    df = pd.DataFrame({'label': source_df[column_name]}, source_df.index)

    # Establish whether the data is for binary or regression,
    # and split the dataset into train/val/test appropriately.
    labels = df['label']
    unique_labels = labels.unique()
    if labels.dtype == bool or len(unique_labels) == 2:
        task = 'clf'
        num_labels = 2
        if labels.dtype != bool:
            # Two-valued non-bool data must be encoded as +1 / -1.
            assert(1 in unique_labels and -1 in unique_labels)
            # Map +1 -> True and -1 -> False in one vectorised step.
            # Chained assignment (df['label'][mask] = ...) may write to a
            # temporary copy, which would leave every label truthy.
            df['label'] = labels == 1

        if '_split' in source_df.columns:
            df, train_ids, val_ids, test_ids = \
                _process_df_for_binary_clf_with_split(
                    df, source_df['_split'], test_frac, min_pos_frac)
        else:
            df, train_ids, val_ids, test_ids = _process_df_for_binary_clf(
                df, test_frac, min_pos_frac)

    elif labels.dtype == float:
        task = 'regr'
        num_labels = -1
        df, train_ids, val_ids, test_ids = _process_df_for_regression(
            df, test_frac)

    else:
        raise Exception("Can only deal with binary or float values.")

    # Get the train/val/test datasets.
    dataset = {
        'train_df': get_split_df(df, train_ids, num_labels),
        'val_df': get_split_df(df, val_ids, num_labels),
        'test_df': get_split_df(df, test_ids, num_labels)
    }

    # Add all relevant info to the data dict to return.
    num_train = dataset['train_df'].shape[0]
    dataset.update({
        'dataset_name': dataset_name,
        'name': '{}_{}_train_{}'.format(dataset_name, column_name, num_train),
        'task': task,
        'num_labels': num_labels,
        'salient_parts': {
            'data': '{}_{}'.format(dataset_name, column_name),
            'num_train': num_train,
            'num_val': dataset['val_df'].shape[0],
            'num_test': dataset['test_df'].shape[0]
        }
    })
    return dataset
def get_binary_or_regression_dataset(
        source_df, dataset_name, column_name,
        test_frac=.2, min_pos_frac=.1, random_seed=42):
    """
    Return a dataset dict suitable for the prediction code of binary or
    regression data in column_name column of source_df.
    Whether the data is binary or regression is inferred from dtype.

    # TODO: add ability to pass a filter to use for the AVA delta stuff

    Parameters
    ----------
    source_df: pandas.DataFrame
        Must have an object-dtype index. If it has a '_split' column, that
        column is passed on to the binary-classification split helper.
    dataset_name: string
    column_name: string
    test_frac: float
        Use this fraction of the positive examples to test.
        Will use the same amount for validation.
    min_pos_frac: float
        Subsample negative data s.t. pos/neg ratio is at least this.
        Only relevant if the data is binary, obviously.
        Ignored if < 0.
    random_seed: int [42]
        Seeds the global numpy random state.

    Returns
    -------
    dataset: dict
        Has keys 'train_df', 'val_df', 'test_df', 'dataset_name', 'name',
        'task', 'num_labels' and 'salient_parts'. 'task' is 'clf' with
        num_labels 2 for binary data, or 'regr' with num_labels -1 for
        float data.

    Raises
    ------
    Exception
        If the column is neither boolean, two-valued, nor float.
    """
    # Kept as assertions so the exception type seen by callers is unchanged.
    assert(source_df.index.dtype == object)
    np.random.seed(random_seed)

    df = pd.DataFrame({'label': source_df[column_name]}, source_df.index)

    # Establish whether the data is for binary or regression,
    # and split the dataset into train/val/test appropriately.
    labels = df['label']
    unique_labels = labels.unique()
    if labels.dtype == bool or len(unique_labels) == 2:
        task = 'clf'
        num_labels = 2
        if labels.dtype != bool:
            # Two-valued non-bool data must be encoded as +1 / -1.
            assert(1 in unique_labels and -1 in unique_labels)
            # Map +1 -> True and -1 -> False in one vectorised step.
            # Chained assignment (df['label'][mask] = ...) may write to a
            # temporary copy, which would leave every label truthy.
            df['label'] = labels == 1

        if '_split' in source_df.columns:
            df, train_ids, val_ids, test_ids = \
                _process_df_for_binary_clf_with_split(
                    df, source_df['_split'], test_frac, min_pos_frac)
        else:
            df, train_ids, val_ids, test_ids = _process_df_for_binary_clf(
                df, test_frac, min_pos_frac)

    elif labels.dtype == float:
        task = 'regr'
        num_labels = -1
        df, train_ids, val_ids, test_ids = _process_df_for_regression(
            df, test_frac)

    else:
        raise Exception("Can only deal with binary or float values.")

    # Get the train/val/test datasets.
    dataset = {
        'train_df': get_split_df(df, train_ids, num_labels),
        'val_df': get_split_df(df, val_ids, num_labels),
        'test_df': get_split_df(df, test_ids, num_labels)
    }

    # Add all relevant info to the data dict to return.
    num_train = dataset['train_df'].shape[0]
    dataset.update({
        'dataset_name': dataset_name,
        'name': '{}_{}_train_{}'.format(dataset_name, column_name, num_train),
        'task': task,
        'num_labels': num_labels,
        'salient_parts': {
            'data': '{}_{}'.format(dataset_name, column_name),
            'num_train': num_train,
            'num_val': dataset['val_df'].shape[0],
            'num_test': dataset['test_df'].shape[0]
        }
    })
    return dataset