示例#1
0
def get_multiclass_dataset(source_df,
                           dataset_name,
                           column_set_name,
                           column_names,
                           test_frac=.2,
                           balanced=False,
                           random_seed=42):
    """
    Return a dataset dict for multi-class data.

    Rows with no positive label are dropped. Rows with more than one
    positive label are only ever allowed in the test set: the train and
    val sets must consist of single-label rows (this is asserted).

    Parameters
    ----------
    source_df: pandas.DataFrame
        Must have an object-dtype index (asserted).
        If it has a '_split' column, the rows whose '_split' value is
        'test' form the test set; otherwise the test set is drawn here.
    dataset_name: string
    column_set_name: string
        Name for the given set of columns. For example, 'artists'
    column_names: sequence of string
    test_frac: float
        Use this fraction of the examples (counted after dropping the
        rows without labels) to test, when the split is done here.
        Will use the same amount for validation.
    balanced: bool [False]
        If True, val set will have nearly equal class distribution.
    random_seed: int [42]
        Seed for the global numpy random state.

    Returns
    -------
    dict with keys
        'train_df', 'val_df': output of get_split_df joined with the
            label columns of the corresponding rows.
        'test_df': label columns of the test rows, plus constant
            'label' (1) and 'importance' (1.0) columns.
        'dataset_name', 'name', 'task' (always 'clf'), 'num_labels',
        'column_names', 'salient_parts'.

    Raises
    ------
    Exception
        If balanced is True and the balanced pool holds fewer rows than
        the requested validation set.
    """
    assert (source_df.index.dtype == object)
    np.random.seed(random_seed)

    # Select with a list so that pandas always treats this as a set of
    # columns (a tuple would be looked up as one single key).
    columns = list(column_names)

    # Drop rows with no positive labels (ava_style has these...)
    ind = source_df[columns].sum(axis=1) == 0
    logging.info('Dropping {} rows with no positive labels.'.format(ind.sum()))
    source_df = source_df[~ind]

    N = source_df.shape[0]
    num_test = int(round(test_frac * N))
    num_val = num_test
    num_labels = len(columns)
    task = 'clf'

    # If source_df does not have split info, do the split here.
    if '_split' not in source_df.columns:
        labels_per_row = source_df[columns].sum(axis=1)
        # Put all multi-label examples in test, even if
        # test_frac is exceeded.
        test_ind = np.where(labels_per_row > 1)[0].tolist()
        num_remaining = num_test - len(test_ind)
        if num_remaining > 0:
            # Fill the rest of the test set with random single-label rows.
            singlelabel_ind = np.where(labels_per_row == 1)[0]
            test_ind += np.random.choice(singlelabel_ind,
                                         num_remaining,
                                         replace=False).tolist()
        test_ids = source_df.index[test_ind]

    # Otherwise, just take the given test ids.
    else:
        test_ids = source_df[source_df['_split'] == 'test'].index

    # Index.difference is the explicit form of the set difference that
    # `index - other` used to perform.
    trainval_ids = source_df.index.difference(test_ids)

    # Split into single-label trainval and possible multi-label test.
    # Copy so that the dummy columns below are written to an independent
    # frame rather than to a slice of source_df.
    test_df = source_df.loc[test_ids, columns].copy()
    # test_df needs dummy 'label' and 'importance' columns for vw_filter
    test_df['label'] = 1
    test_df['importance'] = 1.
    trainval_df = source_df.loc[trainval_ids, columns]
    assert (np.all(trainval_df.sum(axis=1) == 1))

    # Split trainval into train and val.
    # Each row has exactly one positive column, so argmax is its class.
    label = trainval_df.values.argmax(1)
    df = pd.DataFrame({'label': label}, trainval_df.index)

    ids = df.index[np.random.permutation(df.shape[0])]

    if balanced:
        # Construct a balanced validation set: sample as many rows from
        # every class as the rarest class has.
        counts = trainval_df.sum(axis=0).astype(int)
        min_count = int(counts.min())
        min_size_balanced_set = np.concatenate([
            np.where(df['label'].values == l)[0][
                np.random.permutation(count)[:min_count]]
            for l, count in enumerate(counts)
        ])
        P = min_size_balanced_set.shape[0]
        if P < num_val:
            raise Exception('Not enough balanced data for validation set.')
        min_size_balanced_set = np.random.permutation(min_size_balanced_set)
        val_ids = df.index[min_size_balanced_set[:num_val]]
    else:
        val_ids = ids[:num_val]

    # Everything in trainval that is not in val is train; the difference
    # comes back sorted, so shuffle it again.
    train_ids = ids.difference(val_ids)
    train_ids = train_ids[np.random.permutation(len(train_ids))]

    # Assert that there is no overlap between the sets.
    assert (len(train_ids.intersection(val_ids)) == 0)
    assert (len(train_ids.intersection(test_ids)) == 0)
    assert (len(val_ids.intersection(test_ids)) == 0)

    # Add 1 to 'label', since VW needs values on [1, K].
    df['label'] += 1

    # Get the train/val/test datasets.
    dataset = {
        'train_df': get_split_df(df, train_ids, num_labels).join(trainval_df),
        'val_df': get_split_df(df, val_ids, num_labels).join(trainval_df),
        'test_df': test_df
    }

    # Add all relevant info to the data dict to return.
    num_train = dataset['train_df'].shape[0]
    dataset.update({
        'dataset_name': dataset_name,
        'name': '{}_{}_train_{}'.format(dataset_name, column_set_name,
                                        num_train),
        'task': task,
        'num_labels': num_labels,
        'column_names': column_names,
        'salient_parts': {
            'data': '{}_{}'.format(dataset_name, column_set_name),
            'num_train': num_train,
            'num_val': dataset['val_df'].shape[0],
            'num_test': dataset['test_df'].shape[0]
        }
    })

    return dataset
示例#2
0
def get_multiclass_dataset(
        source_df, dataset_name, column_set_name, column_names,
        test_frac=.2, balanced=False, random_seed=42):
    """
    Return a dataset dict for multi-class data.

    Rows with no positive label are dropped. Rows with more than one
    positive label are only ever allowed in the test set: the train and
    val sets must consist of single-label rows (this is asserted).

    Parameters
    ----------
    source_df: pandas.DataFrame
        Must have an object-dtype index (asserted).
        If it has a '_split' column, the rows whose '_split' value is
        'test' form the test set; otherwise the test set is drawn here.
    dataset_name: string
    column_set_name: string
        Name for the given set of columns. For example, 'artists'
    column_names: sequence of string
    test_frac: float
        Use this fraction of the examples (counted after dropping the
        rows without labels) to test, when the split is done here.
        Will use the same amount for validation.
    balanced: bool [False]
        If True, val set will have nearly equal class distribution.
    random_seed: int [42]
        Seed for the global numpy random state.

    Returns
    -------
    dict with keys
        'train_df', 'val_df': output of get_split_df joined with the
            label columns of the corresponding rows.
        'test_df': label columns of the test rows, plus constant
            'label' (1) and 'importance' (1.0) columns.
        'dataset_name', 'name', 'task' (always 'clf'), 'num_labels',
        'column_names', 'salient_parts'.

    Raises
    ------
    Exception
        If balanced is True and the balanced pool holds fewer rows than
        the requested validation set.
    """
    assert(source_df.index.dtype == object)
    np.random.seed(random_seed)

    # Select with a list so that pandas always treats this as a set of
    # columns (a tuple would be looked up as one single key).
    columns = list(column_names)

    # Drop rows with no positive labels (ava_style has these...)
    ind = source_df[columns].sum(axis=1) == 0
    logging.info('Dropping {} rows with no positive labels.'.format(ind.sum()))
    source_df = source_df[~ind]

    N = source_df.shape[0]
    num_test = int(round(test_frac * N))
    num_val = num_test
    num_labels = len(columns)
    task = 'clf'

    # If source_df does not have split info, do the split here.
    if '_split' not in source_df.columns:
        labels_per_row = source_df[columns].sum(axis=1)
        # Put all multi-label examples in test, even if
        # test_frac is exceeded.
        test_ind = np.where(labels_per_row > 1)[0].tolist()
        num_remaining = num_test - len(test_ind)
        if num_remaining > 0:
            # Fill the rest of the test set with random single-label rows.
            singlelabel_ind = np.where(labels_per_row == 1)[0]
            test_ind += np.random.choice(
                singlelabel_ind, num_remaining, replace=False).tolist()
        test_ids = source_df.index[test_ind]

    # Otherwise, just take the given test ids.
    else:
        test_ids = source_df[source_df['_split'] == 'test'].index

    # Index.difference is the explicit form of the set difference that
    # `index - other` used to perform.
    trainval_ids = source_df.index.difference(test_ids)

    # Split into single-label trainval and possible multi-label test.
    # Copy so that the dummy columns below are written to an independent
    # frame rather than to a slice of source_df.
    test_df = source_df.loc[test_ids, columns].copy()
    # test_df needs dummy 'label' and 'importance' columns for vw_filter
    test_df['label'] = 1
    test_df['importance'] = 1.
    trainval_df = source_df.loc[trainval_ids, columns]
    assert(np.all(trainval_df.sum(axis=1) == 1))

    # Split trainval into train and val.
    # Each row has exactly one positive column, so argmax is its class.
    label = trainval_df.values.argmax(1)
    df = pd.DataFrame({'label': label}, trainval_df.index)

    ids = df.index[np.random.permutation(df.shape[0])]

    if balanced:
        # Construct a balanced validation set: sample as many rows from
        # every class as the rarest class has.
        counts = trainval_df.sum(axis=0).astype(int)
        min_count = int(counts.min())
        min_size_balanced_set = np.concatenate([
            np.where(df['label'].values == l)[0][
                np.random.permutation(count)[:min_count]]
            for l, count in enumerate(counts)
        ])
        P = min_size_balanced_set.shape[0]
        if P < num_val:
            raise Exception('Not enough balanced data for validation set.')
        min_size_balanced_set = np.random.permutation(min_size_balanced_set)
        val_ids = df.index[min_size_balanced_set[:num_val]]
    else:
        val_ids = ids[:num_val]

    # Everything in trainval that is not in val is train; the difference
    # comes back sorted, so shuffle it again.
    train_ids = ids.difference(val_ids)
    train_ids = train_ids[np.random.permutation(len(train_ids))]

    # Assert that there is no overlap between the sets.
    assert(len(train_ids.intersection(val_ids)) == 0)
    assert(len(train_ids.intersection(test_ids)) == 0)
    assert(len(val_ids.intersection(test_ids)) == 0)

    # Add 1 to 'label', since VW needs values on [1, K].
    df['label'] += 1

    # Get the train/val/test datasets.
    dataset = {
        'train_df': get_split_df(df, train_ids, num_labels).join(trainval_df),
        'val_df': get_split_df(df, val_ids, num_labels).join(trainval_df),
        'test_df': test_df
    }

    # Add all relevant info to the data dict to return.
    num_train = dataset['train_df'].shape[0]
    dataset.update({
        'dataset_name': dataset_name,
        'name': '{}_{}_train_{}'.format(
            dataset_name, column_set_name, num_train),
        'task': task,
        'num_labels': num_labels,
        'column_names': column_names,
        'salient_parts': {
            'data': '{}_{}'.format(dataset_name, column_set_name),
            'num_train': num_train,
            'num_val': dataset['val_df'].shape[0],
            'num_test': dataset['test_df'].shape[0]
        }
    })

    return dataset
示例#3
0
def get_binary_or_regression_dataset(source_df,
                                     dataset_name,
                                     column_name,
                                     test_frac=.2,
                                     min_pos_frac=.1,
                                     random_seed=42):
    """
    Return a dataset dict suitable for the prediction code of binary
    or regression data in column_name column of source_df.
    Whether the data is binary or regression is inferred from dtype:
    a bool column, or any column with exactly two unique values, is
    treated as binary; a float column is treated as regression.

    # TODO: add ability to pass a filter to use for the AVA delta stuff

    Parameters
    ----------
    source_df: pandas.DataFrame
        Must have an object-dtype index (asserted).
        If it has a '_split' column, that column is handed to the
        binary splitting helper instead of splitting from scratch.
    dataset_name: string
    column_name: string
        Non-bool binary columns must hold exactly the values 1 and -1
        (asserted).
    test_frac: float
        Use this fraction of the positive examples to test.
        Will use the same amount for validation.
    min_pos_frac: float
        Subsample negative data s.t. pos/neg ratio is at least this.
        Only relevant if the data is binary, obviously.
        Ignored if < 0.
    random_seed: int [42]
        Seed for the global numpy random state.

    Returns
    -------
    dict with keys
        'train_df', 'val_df', 'test_df': outputs of get_split_df.
        'dataset_name', 'name', 'task' ('clf' or 'regr'),
        'num_labels' (2 for binary, -1 for regression), 'salient_parts'.

    Raises
    ------
    Exception
        If the column is neither binary nor float.
    """
    assert (source_df.index.dtype == object)

    np.random.seed(random_seed)

    df = pd.DataFrame({'label': source_df[column_name]}, source_df.index)

    # Establish whether the data is for binary or regression,
    # and split the dataset into train/val/test appropriately.
    unique_labels = df['label'].unique()
    if df['label'].dtype == bool or len(unique_labels) == 2:
        task = 'clf'
        num_labels = 2

        if df['label'].dtype != bool:
            assert (1 in unique_labels and -1 in unique_labels)
            # The only values are 1 and -1, so a single comparison maps
            # 1 -> True and -1 -> False. This avoids chained assignment,
            # which silently does nothing under pandas copy-on-write and
            # would leave -1 to be cast to True.
            df['label'] = df['label'] == 1

        if '_split' in source_df.columns:
            df, train_ids, val_ids, test_ids = \
                _process_df_for_binary_clf_with_split(
                    df, source_df['_split'], test_frac, min_pos_frac)
        else:
            df, train_ids, val_ids, test_ids = _process_df_for_binary_clf(
                df, test_frac, min_pos_frac)

    elif df['label'].dtype == float:
        task = 'regr'
        # No discrete classes for regression; -1 is passed on as a marker.
        num_labels = -1
        df, train_ids, val_ids, test_ids = _process_df_for_regression(
            df, test_frac)

    else:
        raise Exception("Can only deal with binary or float values.")

    # Get the train/val/test datasets.
    dataset = {
        'train_df': get_split_df(df, train_ids, num_labels),
        'val_df': get_split_df(df, val_ids, num_labels),
        'test_df': get_split_df(df, test_ids, num_labels)
    }

    # Add all relevant info to the data dict to return.
    num_train = dataset['train_df'].shape[0]
    dataset.update({
        'dataset_name': dataset_name,
        'name': '{}_{}_train_{}'.format(dataset_name, column_name,
                                        num_train),
        'task': task,
        'num_labels': num_labels,
        'salient_parts': {
            'data': '{}_{}'.format(dataset_name, column_name),
            'num_train': num_train,
            'num_val': dataset['val_df'].shape[0],
            'num_test': dataset['test_df'].shape[0]
        }
    })

    return dataset
示例#4
0
def get_binary_or_regression_dataset(
        source_df, dataset_name, column_name,
        test_frac=.2, min_pos_frac=.1, random_seed=42):
    """
    Return a dataset dict suitable for the prediction code of binary
    or regression data in column_name column of source_df.
    Whether the data is binary or regression is inferred from dtype:
    a bool column, or any column with exactly two unique values, is
    treated as binary; a float column is treated as regression.

    # TODO: add ability to pass a filter to use for the AVA delta stuff

    Parameters
    ----------
    source_df: pandas.DataFrame
        Must have an object-dtype index (asserted).
        If it has a '_split' column, that column is handed to the
        binary splitting helper instead of splitting from scratch.
    dataset_name: string
    column_name: string
        Non-bool binary columns must hold exactly the values 1 and -1
        (asserted).
    test_frac: float
        Use this fraction of the positive examples to test.
        Will use the same amount for validation.
    min_pos_frac: float
        Subsample negative data s.t. pos/neg ratio is at least this.
        Only relevant if the data is binary, obviously.
        Ignored if < 0.
    random_seed: int [42]
        Seed for the global numpy random state.

    Returns
    -------
    dict with keys
        'train_df', 'val_df', 'test_df': outputs of get_split_df.
        'dataset_name', 'name', 'task' ('clf' or 'regr'),
        'num_labels' (2 for binary, -1 for regression), 'salient_parts'.

    Raises
    ------
    Exception
        If the column is neither binary nor float.
    """
    assert(source_df.index.dtype == object)

    np.random.seed(random_seed)

    df = pd.DataFrame(
        {'label': source_df[column_name]}, source_df.index)

    # Establish whether the data is for binary or regression,
    # and split the dataset into train/val/test appropriately.
    unique_labels = df['label'].unique()
    if df['label'].dtype == bool or len(unique_labels) == 2:
        task = 'clf'
        num_labels = 2

        if df['label'].dtype != bool:
            assert(1 in unique_labels and -1 in unique_labels)
            # The only values are 1 and -1, so a single comparison maps
            # 1 -> True and -1 -> False. This avoids chained assignment,
            # which silently does nothing under pandas copy-on-write and
            # would leave -1 to be cast to True.
            df['label'] = df['label'] == 1

        if '_split' in source_df.columns:
            df, train_ids, val_ids, test_ids = \
                _process_df_for_binary_clf_with_split(
                    df, source_df['_split'], test_frac, min_pos_frac)
        else:
            df, train_ids, val_ids, test_ids = _process_df_for_binary_clf(
                df, test_frac, min_pos_frac)

    elif df['label'].dtype == float:
        task = 'regr'
        # No discrete classes for regression; -1 is passed on as a marker.
        num_labels = -1
        df, train_ids, val_ids, test_ids = _process_df_for_regression(
            df, test_frac)

    else:
        raise Exception("Can only deal with binary or float values.")

    # Get the train/val/test datasets.
    dataset = {
        'train_df': get_split_df(df, train_ids, num_labels),
        'val_df': get_split_df(df, val_ids, num_labels),
        'test_df': get_split_df(df, test_ids, num_labels)
    }

    # Add all relevant info to the data dict to return.
    num_train = dataset['train_df'].shape[0]
    dataset.update({
        'dataset_name': dataset_name,
        'name': '{}_{}_train_{}'.format(
            dataset_name, column_name, num_train),
        'task': task,
        'num_labels': num_labels,
        'salient_parts': {
            'data': '{}_{}'.format(dataset_name, column_name),
            'num_train': num_train,
            'num_val': dataset['val_df'].shape[0],
            'num_test': dataset['test_df'].shape[0]
        }
    })

    return dataset