Example #1
def fewer_bias(questions, test_concepts, ratio, logger):
    """Bias a question set by randomly dropping a (1 - ratio) fraction
    of the questions that involve each test concept."""
    logger('Biasing a set of questions')
    with logger.levelup():
        logger(f'Original size: {len(questions)}')
        # For each test concept, collect the indexes of the questions
        # whose keywords mention it.
        involved_indexes = {
            concept: [
                ind for ind, qst in questions.questions.items()
                if concept in qst['keywords']
            ]
            for concept in test_concepts
        }
        # Sample a (1 - ratio) fraction of each concept's questions and
        # pool the samples into one removal set.
        remove = union(
            *tuple(
                random_choice_ratio(indexes, 1 - ratio)
                for indexes in involved_indexes.values()),
            as_set=True,
        )
        full = union(
            *tuple(involved_indexes.values()),
            as_set=True,
        )
        logger(f'Removed: {len(remove)} out of {len(full)}', resume=True)
        # Return a copy of the question set without the removed indexes.
        output = questions.copy()
        output.set_indexes(difference(output.indexes, remove))
    return output
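The set-algebra and sampling helpers used throughout these examples (`union`, `intersection`, `difference`, `random_choice_ratio`, `random_choice`, `random_one`) are project utilities whose source is not shown here. A minimal sketch of plausible implementations, assuming list-like inputs; the real signatures may differ:

import random

def union(*iterables, as_set=False):
    # Concatenate all inputs; optionally deduplicate into a set.
    merged = [x for it in iterables for x in it]
    return set(merged) if as_set else merged

def intersection(a, b):
    # Elements of `a` that also occur in `b`, preserving order.
    lookup = set(b)
    return [x for x in a if x in lookup]

def difference(a, b):
    # Elements of `a` that do not occur in `b`, preserving order.
    lookup = set(b)
    return [x for x in a if x not in lookup]

def random_choice_ratio(items, ratio):
    # Sample a `ratio` fraction of `items` without replacement.
    items = list(items)
    return random.sample(items, round(len(items) * ratio))

def random_choice(items, num):
    # Sample `num` items without replacement (whether the second
    # argument is a count or a ratio in the real helper is an
    # assumption here).
    return random.sample(list(items), num)

def random_one(items):
    # Pick a single random element.
    return random.choice(list(items))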
Example #2
def split_train_val(visual_dataset, logger, args):
    """
    This splitting method takes only the train & val part of the visual
    dataset. The train split is re-split into train and val, and the original
    val split is regarded as test split.
    """

    logger('Splitting with train-val parts')
    train_dataset = visual_dataset.copy().filter(
        lambda scene: scene['split'] == 'train')
    test_dataset = visual_dataset.copy().filter(
        lambda scene: scene['split'] == 'val')

    # val starts as a copy of train; after train keeps a random 6/7 of
    # its indexes, val keeps the complementary 1/7.
    val_dataset = train_dataset.copy()
    train_dataset.set_indexes(random_choice_ratio(train_dataset.indexes,
                                                  6 / 7))
    val_dataset.set_indexes(
        difference(val_dataset.indexes, train_dataset.indexes))

    visual_datasets = {
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset
    }

    show_split_sizes(visual_datasets, logger)
    return visual_datasets
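Examples #2 through #6 assume a dataset object exposing `indexes`, `copy`, `filter`, `set_indexes`, `add_indexes`, `remove_indexes`, and `resplit`. The real class is not shown; the following is a minimal sketch of the assumed interface, with indexes kept as a list of scene-graph keys:

class VisualDatasetSketch:
    """Hypothetical stand-in for the visual dataset in these examples."""

    def __init__(self, scene_graphs):
        self.sceneGraphs = scene_graphs
        self.indexes = list(scene_graphs)

    def copy(self):
        # Shallow copy: scene graphs are shared, indexes are duplicated.
        new = VisualDatasetSketch(self.sceneGraphs)
        new.indexes = list(self.indexes)
        return new

    def filter(self, predicate):
        # Keep only the scenes satisfying `predicate`; returns self so
        # calls can be chained off `copy()` as in Example #2.
        self.indexes = [i for i in self.indexes
                        if predicate(self.sceneGraphs[i])]
        return self

    def set_indexes(self, indexes):
        self.indexes = list(indexes)

    def add_indexes(self, indexes):
        self.indexes.extend(indexes)

    def remove_indexes(self, indexes):
        dropped = set(indexes)
        self.indexes = [i for i in self.indexes if i not in dropped]

    def resplit(self, split_fn):
        # Partition the scenes by the split name `split_fn` assigns them.
        grouped = {}
        for i in self.indexes:
            grouped.setdefault(split_fn(self.sceneGraphs[i]), []).append(i)
        datasets = {}
        for name, idxs in grouped.items():
            part = self.copy()
            part.set_indexes(idxs)
            datasets[name] = part
        return datasets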
Example #3
def split_by_visual_bias(visual_dataset, logger, args, visual_bias):
    """
    Splitting the visual dataset as sepcified in `default_bias`
    """

    logger('Splitting by visual bias')
    logger(visual_bias, resume=True, pretty=True)

    isinstanceof_stats = load_knowledge(args.task, 'isinstanceof')

    def resplit_fn(scene):
        return get_split_by_visual_bias(scene, visual_bias[args.task],
                                        isinstanceof_stats)

    resplited = visual_dataset.resplit(resplit_fn)
    train_dataset = resplited['train']
    val_dataset = train_dataset.copy()  # val is carved out of train below
    test_dataset = resplited['test']

    # Keep 6/7 of the biased train scenes; the remaining 1/7 become val.
    train_dataset.set_indexes(random_choice_ratio(train_dataset.indexes,
                                                  6 / 7))
    val_dataset.set_indexes(
        difference(val_dataset.indexes, train_dataset.indexes))

    visual_datasets = {
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset
    }
    show_split_sizes(visual_datasets, logger)
    return visual_datasets
Example #4
def cub_split(visual_dataset, logger, args):
    """
    Splitting method specially for CUB dataset
    """

    logger('Special splitting function for CUB dataset\n'
           'Splitting by ratio specified in args')

    def get_species(scene):
        # The species label is read from object '0'; each CUB scene is
        # assumed to contain a single bird object.
        return scene['objects']['0']['name']

    species_by_index = {
        image_id: get_species(scene)
        for image_id, scene in visual_dataset.sceneGraphs.items()
    }
    index_by_species = {
        species: [
            image_id for image_id, name in species_by_index.items()
            if name == species
        ]
        for species in set(species_by_index.values())
    }
    train_indexes = union(
        *tuple(
            random_choice_ratio(indexes, args.split_ratio['train'])
            for indexes in index_by_species.values()),
        as_set=True,
    )
    val_indexes = union(
        *tuple(
            random_choice_ratio(
                difference(indexes, train_indexes),
                args.split_ratio['val'] / (1 - args.split_ratio['train']))
            for indexes in index_by_species.values()),
        as_set=True,
    )
    # Everything not drawn for train or val falls into test.
    test_indexes = difference(visual_dataset.indexes,
                              union(train_indexes, val_indexes))
    # `set_indexes` is used as a statement elsewhere in these examples,
    # so avoid chaining it off `copy()`, which would bind None if it
    # does not return self.
    train_dataset = visual_dataset.copy()
    train_dataset.set_indexes(train_indexes)
    val_dataset = visual_dataset.copy()
    val_dataset.set_indexes(val_indexes)
    test_dataset = visual_dataset.copy()
    test_dataset.set_indexes(test_indexes)

    visual_datasets = {
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset
    }
    show_split_sizes(visual_datasets, logger)
    return visual_datasets
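The rescaled val ratio above is the only subtle arithmetic here: val is sampled after the train images have been removed from each species, so the target overall fraction must be divided by what remains. A small sanity check with illustrative ratios (the actual `args.split_ratio` values are not given in the source):

# Hypothetical ratios: 60% train, 20% val, remainder test.
split_ratio = {'train': 0.6, 'val': 0.2}
# Val is drawn from the 40% of each species left after the train draw,
# so the conditional ratio must be rescaled to hit 20% overall:
val_ratio_within_rest = split_ratio['val'] / (1 - split_ratio['train'])
assert abs(val_ratio_within_rest - 0.5) < 1e-9  # 0.2 / 0.4 == 0.5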
Example #5
def get_testConcepts_zeroshot(source, args, logger):
    """Pick zero-shot test concepts: one random member from half of the
    synonym groups that contribute at least two concepts to `source`."""
    synonym_stats = load_knowledge(args.task, 'synonym')
    # Restrict each synonym set to the concepts present in `source`.
    syn_groups = [
        intersection(source, synset)
        for synset in synonym_stats.values()
    ]
    # Sample half of the groups; from each sampled group with more than
    # one member, hold out one random concept for testing.
    test_concepts = [
        random_one(synset)
        for synset in random_choice_ratio(syn_groups, 0.5)
        if len(synset) > 1
    ]
    logger(f'Selecting test concepts: \n{test_concepts}')
    logger(f'num = {len(test_concepts)}', resume=True)
    return test_concepts
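A toy walk-through of the selection above, assuming `synonym_stats` maps an exemplar concept to its synonym set (this data shape is an assumption; only the helper names appear in the source):

source = ['red', 'crimson', 'big', 'large', 'cube']
synonym_stats = {'red': ['red', 'crimson', 'scarlet'],
                 'big': ['big', 'large', 'huge']}
# Intersecting with `source` keeps only the concepts the questions use:
syn_groups = [[c for c in synset if c in source]
              for synset in synonym_stats.values()]
# syn_groups == [['red', 'crimson'], ['big', 'large']]. Half of the
# groups are then sampled, and from each sampled group with more than
# one member, one random concept is held out for zero-shot testing
# while its synonyms remain available for training.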
Example #6
def split_by_visual_bias_leaked(visual_dataset, logger, args, visual_bias):
    """
    Splitting the visual dataset as sepcified in `default_bias`
    """

    logger('Splitting by visual bias, with a few unbiased samples')
    logger(visual_bias, resume=True, pretty=True)

    isinstanceof_stats = load_knowledge(args.task, 'isinstanceof')

    def resplit_fn(scene):
        raw_split = get_split_by_visual_bias(scene, visual_bias[args.task],
                                             isinstanceof_stats)
        # Disabled alternative: leak at resplit time by randomly
        # re-assigning test scenes to train, e.g.
        #   if raw_split == 'test' and \
        #           np.random.rand() < args.debiasing_leak:
        #       return 'train'
        # The leak is applied after splitting instead (see below).
        return raw_split

    resplited = visual_dataset.resplit(resplit_fn)
    train_dataset = resplited['train']
    val_dataset = train_dataset.copy()  # val is carved out of train below
    test_dataset = resplited['test']

    # Keep 6/7 of the biased train scenes; the remaining 1/7 become val.
    train_dataset.set_indexes(random_choice_ratio(train_dataset.indexes,
                                                  6 / 7))
    val_dataset.set_indexes(
        difference(val_dataset.indexes, train_dataset.indexes))

    # Leak a few unbiased samples: move a random selection of test
    # scenes into the train split.
    leaked_indexes = random_choice(test_dataset.indexes, args.debiasing_leak)
    train_dataset.add_indexes(leaked_indexes)
    test_dataset.remove_indexes(leaked_indexes)

    visual_datasets = {
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset
    }
    show_split_sizes(visual_datasets, logger)
    return visual_datasets
Example #7
def get_testConcepts(source, args, logger):
    """Hold out a random `args.generalization_ratio` fraction of the
    source concepts for testing."""
    test_concepts = random_choice_ratio(
        source, args.generalization_ratio)
    logger(f'Selecting test concepts: \n{test_concepts}')
    logger(f'num = {len(test_concepts)}', resume=True)
    return test_concepts
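A hypothetical invocation, reusing the `random_choice_ratio` sketch from Example #1 plus a bare-bones logger; none of these stand-ins appear in the source:

from types import SimpleNamespace

def print_logger(message, **kwargs):
    # Minimal stand-in for the project logger, which also supports
    # `resume=`, `pretty=`, and `levelup()`.
    print(message)

args = SimpleNamespace(generalization_ratio=0.25)  # illustrative value
concepts = ['red', 'blue', 'cube', 'sphere', 'metal', 'rubber']
held_out = get_testConcepts(concepts, args, print_logger)
# `held_out` is a random quarter of `concepts`; the rest stay for training.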