def normalise_datadict(datadict, cut_to):
    """Balance a groundtruth dictionary so each kept class has `cut_to` items.

    Items are grouped by their class. A class with fewer than `cut_to` items
    is discarded; every other class (including one with exactly `cut_to`
    items) is reduced to a random sample of `cut_to` items.

    Args:
        datadict: dictionary mapping an item to its class label.
        cut_to: number of items to keep for every retained class.

    Returns:
        A tuple ``(newdatadict, removed)``:
        ``newdatadict`` maps each sampled item to its class;
        ``removed`` maps each item of a retained class that was not sampled
        to its class. Items of a discarded class appear in neither
        dictionary.

    Raises:
        ValueError: propagated from ``random.sample`` when `cut_to` is
            negative and `datadict` is not empty.
    """
    by_class = collections.defaultdict(list)
    for item, cls in datadict.items():
        by_class[cls].append(item)

    newdataset = {}
    remaining = {}
    for cls, items in by_class.items():
        # Only classes with strictly fewer than `cut_to` items are dropped;
        # a class with exactly `cut_to` items is kept in full.
        if len(items) < cut_to:
            continue
        # Items are dictionary keys, hence unique and hashable, so a set
        # gives O(1) membership tests while preserving the input order below.
        sampled = set(random.sample(items, cut_to))
        for item in items:
            if item in sampled:
                newdataset[item] = cls
            else:
                remaining[item] = cls
    return newdataset, remaining
def normalise_datadict(datadict, cut_to):
    """Balance a groundtruth dictionary so each kept class has `cut_to` items.

    Items are grouped by their class. A class with fewer than `cut_to` items
    is discarded; every other class (including one with exactly `cut_to`
    items) is reduced to a random sample of `cut_to` items.

    Args:
        datadict: dictionary mapping an item to its class label.
        cut_to: number of items to keep for every retained class.

    Returns:
        A tuple ``(newdatadict, removed)``:
        ``newdatadict`` maps each sampled item to its class;
        ``removed`` maps each item of a retained class that was not sampled
        to its class. Items of a discarded class appear in neither
        dictionary.

    Raises:
        ValueError: propagated from ``random.sample`` when `cut_to` is
            negative and `datadict` is not empty.
    """
    by_class = collections.defaultdict(list)
    for item, cls in datadict.items():
        by_class[cls].append(item)

    newdataset = {}
    remaining = {}
    for cls, items in by_class.items():
        # Only classes with strictly fewer than `cut_to` items are dropped;
        # a class with exactly `cut_to` items is kept in full.
        if len(items) < cut_to:
            continue
        # Items are dictionary keys, hence unique and hashable, so a set
        # gives O(1) membership tests while preserving the input order below.
        sampled = set(random.sample(items, cut_to))
        for item in items:
            if item in sampled:
                newdataset[item] = cls
            else:
                remaining[item] = cls
    return newdataset, remaining