def split_by_visual_bias(visual_dataset, logger, args, visual_bias):
    """ Splitting the visual dataset as specified in `default_bias` """
    logger('Splitting by visual bias')
    logger(visual_bias, resume=True, pretty=True)
    isinstanceof_stats = load_knowledge(args.task, 'isinstanceof')

    def resplit_fn(scene):
        return get_split_by_visual_bias(
            scene, visual_bias[args.task], isinstanceof_stats)

    resplited = visual_dataset.resplit(resplit_fn)
    train_dataset = resplited['train']
    val_dataset = train_dataset.copy()
    test_dataset = resplited['test']
    # carve a 6:1 train/val split out of the biased training indexes
    train_dataset.set_indexes(
        random_choice_ratio(train_dataset.indexes, 6 / 7))
    val_dataset.set_indexes(
        difference(val_dataset.indexes, train_dataset.indexes))
    visual_datasets = {
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset,
    }
    show_split_sizes(visual_datasets, logger)
    return visual_datasets
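# --- Hedged example (illustrative, not used by the pipeline) ----------------
# A minimal sketch of the 6:1 train/val carve-up above, assuming
# `random_choice_ratio` samples a fraction of the items without replacement
# and `difference` keeps the remainder; `_demo_train_val_split` is a
# hypothetical name.
def _demo_train_val_split(indexes, ratio=6 / 7):
    import random
    indexes = list(indexes)
    train = random.sample(indexes, int(len(indexes) * ratio))
    train_set = set(train)
    val = [i for i in indexes if i not in train_set]
    return train, val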
def register_visual_concepts(
        visual_dataset,
        concepts,
        args,
        register_synonyms,
        register_hypernyms,
        register_meronyms,
        forbidden_concepts,
        logger,
        experiment_name,
):
    # registering from visual and conceptual datasets
    if forbidden_concepts is not None:
        logger('Filtering out forbidden concepts:')
        logger(forbidden_concepts, resume=True, pretty=True)
        concepts.filter_out(forbidden_concepts)
    logger('Registering visual dataset')
    visual_dataset.register_concepts(concepts)
    if args.most_frequent != -1:
        word_count = load_knowledge(args.task, 'most_common', logger)
        concepts.filter_most_frequent(word_count, args.most_frequent)
        # visual_dataset.filter_concepts(concepts)

    # registering from knowledge
    if register_synonyms:
        logger('Registering synonyms')
        with logger.levelup():
            concepts.register_related(load_knowledge(args.task, 'synonym'))
    if register_hypernyms:
        logger('Registering hypernyms')
        with logger.levelup():
            concepts.register_related(load_knowledge(args.task, 'hypernym'))
    if register_meronyms:
        logger('Registering meronyms')
        with logger.levelup():
            concepts.register(
                all_secondary_elements(load_knowledge(args.task, 'meronym')),
                multiple=True,
            )
    return concepts
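# Hedged sketch: the meronym knowledge is registered through
# `all_secondary_elements`, which presumably flattens the part lists of a
# {whole: [parts]} mapping (an assumption based on the name; the real helper
# lives elsewhere in the repo).
def _demo_all_secondary_elements(mapping):
    return [part for parts in mapping.values() for part in parts]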
def get_testConcepts_zeroshot(source, args, logger):
    synonym_stats = load_knowledge(args.task, 'synonym')
    # restrict each synonym group to the concepts present in `source`
    syn_groups = [
        intersection(source, synset)
        for synset in synonym_stats.values()
    ]
    # from half of the groups, hold out one concept per multi-member group
    test_concepts = [
        random_one(group)
        for group in random_choice_ratio(syn_groups, 0.5)
        if len(group) > 1
    ]
    logger(f'Selecting test concepts: \n{test_concepts}')
    logger(f'num = {len(test_concepts)}', resume=True)
    return test_concepts
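# Hedged sketch of the zero-shot selection above: from half of the synonym
# groups, hold out one concept per group that has more than one member.
# `random` stands in for the repo's random_choice_ratio / random_one helpers
# (an assumption about their semantics).
def _demo_zeroshot_pick(syn_groups, ratio=0.5):
    import random
    chosen = random.sample(syn_groups, int(len(syn_groups) * ratio))
    return [random.choice(sorted(g)) for g in chosen if len(g) > 1]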
def select_by_visual_bias(visual_dataset, visual_bias, args):
    # `args` was previously read from an enclosing scope; it is passed
    # explicitly here. The per-scene split uses get_split_by_visual_bias,
    # matching the resplit functions above.
    iio_stats = load_knowledge(args.task, 'isinstanceof')
    splits = {
        image_id: get_split_by_visual_bias(scene, visual_bias, iio_stats)
        for image_id, scene in visual_dataset.sceneGraphs.items()
        # for image_id, scene in visual_dataset.local_sceneGraphs.items()
    }
    resplit = {
        split: [
            image_id for image_id, this_split in splits.items()
            if this_split == split
        ]
        for split in ['train', 'val', 'test']
    }
    return resplit
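# Hedged note on the inversion above: the three-scan dict comprehension can
# be done in a single pass; `_demo_invert_splits` is a hypothetical
# equivalent, shown only to document the intended shape of `resplit`.
def _demo_invert_splits(splits):
    out = {'train': [], 'val': [], 'test': []}
    for image_id, split in splits.items():
        out[split].append(image_id)
    return out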
def split_by_visual_bias_leaked(visual_dataset, logger, args, visual_bias):
    """ Splitting the visual dataset as specified in `default_bias` """
    logger('Splitting by visual bias, with a few unbiased samples')
    logger(visual_bias, resume=True, pretty=True)
    isinstanceof_stats = load_knowledge(args.task, 'isinstanceof')

    def resplit_fn(scene):
        raw_split = get_split_by_visual_bias(
            scene, visual_bias[args.task], isinstanceof_stats)
        # A probabilistic per-scene leak was disabled in favor of the
        # fixed-count leak below:
        # if raw_split == 'test' and \
        #         np.random.rand() < args.debiasing_leak:
        #     return 'train'
        return raw_split

    resplited = visual_dataset.resplit(resplit_fn)
    train_dataset = resplited['train']
    val_dataset = train_dataset.copy()
    test_dataset = resplited['test']
    train_dataset.set_indexes(
        random_choice_ratio(train_dataset.indexes, 6 / 7))
    val_dataset.set_indexes(
        difference(val_dataset.indexes, train_dataset.indexes))
    # leak a fixed number of biased test samples into the training split
    leaked_indexes = random_choice(test_dataset.indexes, args.debiasing_leak)
    train_dataset.add_indexes(leaked_indexes)
    test_dataset.remove_indexes(leaked_indexes)
    visual_datasets = {
        'train': train_dataset,
        'val': val_dataset,
        'test': test_dataset,
    }
    show_split_sizes(visual_datasets, logger)
    return visual_datasets
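# Hedged sketch of the leak step above: move a fixed number of randomly
# chosen test indexes into the training split. Assumes `args.debiasing_leak`
# is an integer count, matching its use as the size argument of
# `random_choice`.
def _demo_leak(train_idx, test_idx, k):
    import random
    leaked = random.sample(list(test_idx), k)
    leaked_set = set(leaked)
    return (list(train_idx) + leaked,
            [i for i in test_idx if i not in leaked_set])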
def build_dataset(args):
    """ Main function for building the question dataset """
    local_dir = os.path.join(args.dataset_dir, args.name)
    make_dir(local_dir)
    logger = Logger(local_dir, is_main=True)
    prepare.print_args(args, logger)
    dataset_config = args.dataset_config[args.task]
    config = lazy_import()[args.mode, args.task, args.experiment].config

    # loading visual dataset
    logger('Loading dataset')
    with logger.levelup():
        visual_dataset = prepare.load_visual_dataset(
            args, logger, dataset_config['scene_process'])

    logger('Splitting dataset')
    with logger.levelup():
        # task-specific visual split
        visual_splits = dataset_config['visual_split_fn'](
            visual_dataset, logger, args)
        visual_dataset.mark_splits(get_split_indexes(visual_splits))
        # experiment-specific visual split
        visual_splits = config['visual_split_fn'](
            visual_dataset, logger, args, **config['split_kwarg'])
        split_indexes = get_split_indexes(visual_splits)

    # registering concepts, building the exist-checktable
    tools = register.init_word2index(logger)
    logger('Registering visual concepts')
    with logger.levelup():
        register.register_visual_concepts(
            visual_dataset, tools.concepts, args,
            config['register_synonyms'], config['register_hypernyms'],
            config['register_meronyms'],
            load_knowledge(args.task, 'forbidden'),
            logger, args.experiment)
        logger(f'Num of concepts: {len(tools.concepts)}')
    logger('Building exist-checktable')
    with logger.levelup():
        exist_checktable = misc.exist_checktable(tools.concepts, args, logger)

    # building conceptual and visual questions
    builders = register_builders(args, tools.concepts, config)
    logger('Building conceptual questions')
    with logger.levelup():
        conceptual_questions = build_all_conceptual_questions(
            args, builders, tools.concepts,
            config['conceptual_question_types'], logger)
    logger('Building visual questions')
    with logger.levelup():
        visual_questions = build_all_visual_questions(
            args, config, builders, tools.concepts, visual_splits,
            config['visual_question_types'], exist_checktable, logger)

    # registering question tokens
    iter_conceptual = list(
        q for questions in conceptual_questions.values() for q in questions)
    iter_visual = list(
        q for one_split in visual_questions.values()
        for questions in one_split.values() for q in questions)
    register.register_question_token(iter_conceptual, tools, logger)
    register.register_question_token(iter_visual, tools, logger)

    # save
    logger('Saving')
    with logger.levelup():
        save(local_dir, logger, conceptual_questions, visual_questions,
             visual_dataset.sceneGraphs, tools, split_indexes)

    # drop into an interactive IPython shell for inspection
    embed()
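if __name__ == '__main__':
    # Hedged usage sketch: build_dataset is driven by a parsed-args
    # namespace that must provide dataset_dir, name, task, mode, experiment,
    # dataset_config, most_frequent, debiasing_leak, etc. (names inferred
    # from the attribute accesses above). `parse_args` is assumed to be the
    # repo's argument parser; adjust to the actual entry point.
    # args = parse_args()
    # build_dataset(args)
    pass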
def task_knowledge(name):
    # `args` is taken from the enclosing scope
    return load_knowledge(args.task, name)
def exist_checktable(all_concepts, args, logger):
    """
    This function returns a look-up table for determining the entailment
    and mutual exclusion among concepts.
    """
    synonym_stats = load_knowledge(args.task, 'synonym', logger)
    isinstanceof_stats = load_knowledge(args.task, 'isinstanceof', logger)
    hierarchy = load_knowledge(
        args.task, 'hierarchy', logger, from_source=True)

    all_concepts = set(all_concepts)
    cues = {
        concept: {
            True: set([concept]),
            False: set(),
        }
        for concept in all_concepts
    }
    results = copy.deepcopy(cues)

    # Dealing with synonyms first
    if synonym_stats is not None:
        logger('Dealing with synonyms first')
        if isinstanceof_stats is not None:
            logger('expand the isinstanceof stats', resume=True)
        ambiguous = 0
        for examplar, synset in synonym_stats.items():
            group = set(synset)
            group.add(examplar)
            for x in group:
                for y in group:
                    if x in all_concepts:
                        cues[x][True].add(y)
                    if y in all_concepts:
                        results[y][True].add(x)
            # expanding isinstanceof knowledge
            if isinstanceof_stats is not None:
                categories = {
                    belongs_to(isinstanceof_stats, name) for name in group}
                if None in categories:
                    categories.remove(None)
                if len(categories) == 1:
                    cat = list(categories)[0]
                    isinstanceof_stats[cat] = \
                        union(isinstanceof_stats[cat], group)
                else:
                    ambiguous += 1
        logger(f'{ambiguous} out of {len(synonym_stats)} synsets '
               'are ambiguous', resume=True)

    # Dealing with hierarchy information then, by walking through the forest
    trace_line = set()

    def trace_down(forest):
        # tracing down the current tree
        for sub_root, sub_forest in forest.items():
            trace_line.add(sub_root)
            for hyper in trace_line:
                if hyper in all_concepts:
                    cues[hyper][True].add(sub_root)
                if sub_root in all_concepts:
                    results[sub_root][True].add(hyper)
            if sub_forest is not None:
                trace_down(sub_forest)
            trace_line.remove(sub_root)

    if hierarchy is not None:
        logger('Dealing with hierarchy information')
        if isinstanceof_stats is not None:
            logger('expand the isinstanceof_stats', resume=True)
        ambiguous = 0
        for root, forest in hierarchy.items():
            trace_down({root: forest})
            # expand the isinstanceof stats
            if isinstanceof_stats is not None:
                all_nodes = all_in_hierarchy({root: forest})
                categories = {
                    belongs_to(isinstanceof_stats, name)
                    for name in all_nodes}
                if None in categories:
                    categories.remove(None)
                if len(categories) == 1:
                    cat = list(categories)[0]
                    isinstanceof_stats[cat] = \
                        union(isinstanceof_stats[cat], all_nodes)
                else:
                    ambiguous += 1
        logger(f'{ambiguous} out of {len(hierarchy)} hierarchies '
               'are ambiguous', resume=True)

    # Dealing with the isinstanceof information: concepts in the same
    # category that are neither synonyms nor hierarchically related are
    # treated as mutually exclusive.
    if isinstanceof_stats is not None:
        logger('Dealing with the isinstanceof information')
        for group in isinstanceof_stats.values():
            for x in group:
                if x in all_concepts:
                    for y in group:
                        if y in all_concepts:
                            if y not in cues[x][True] and \
                                    y not in results[x][True]:
                                cues[x][False].add(y)
                                results[y][False].add(x)

    return {'cues': cues, 'results': results}
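# Hedged usage sketch: querying the returned table. Observing concept `x`
# entails every concept in cues[x][True] and rules out every concept in
# cues[x][False]; `results` maps the reverse direction. The helper names are
# hypothetical, inferred only from the table's structure.
def _demo_entails(table, x, y):
    return y in table['cues'][x][True]


def _demo_excludes(table, x, y):
    return y in table['cues'][x][False]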