def create_segmentations(directory, data_sets, splits, assignments, size,
                         segmentation_size, crop, cats, test_limit,
                         single_process, verbose):
    '''
    Phase 5 of unification.  Create the normalized segmentation files.
    '''
    if size is not None and segmentation_size is None:
        segmentation_size = size
    # Get assignments into a nice form, once, here.
    # (dataset, category): [numpy array with new indexes]
    index_max = build_histogram(
        [((ds, cat), i) for ds, cat, i in assignments.keys()], max)
    index_mapping = dict((k, numpy.zeros(i + 1, dtype=numpy.int16))
                         for k, i in index_max.items())
    for (ds, cat, oldindex), newindex in assignments.items():
        index_mapping[(ds, cat)][oldindex] = newindex
    # Count frequency and coverage for each individual image
    segmented = map_in_pool(
        partial(translate_segmentation,
                directory=directory,
                mapping=index_mapping,
                size=size,
                segmentation_size=segmentation_size,
                categories=cats,
                crop=crop,
                verbose=verbose),
        all_dataset_segmentations(data_sets, test_limit),
        single_process=single_process,
        verbose=verbose)
    # Sort nonempty items randomly+reproducibly by md5 hash of the filename.
    ordered = sorted([(hashed_float(r['image']), r) for r in segmented if r])
    # Assign splits, pulling out the last 20% for validation.
    cutoffs = cumulative_splits(splits)
    for floathash, record in ordered:
        for name, cutoff in cutoffs:
            if floathash <= cutoff:
                record['split'] = name
                break
        else:
            assert False, 'hash %f exceeds last split %f' % (
                floathash, cutoffs[-1][1])
    # Now write one row per image and one column per category
    with open(os.path.join(directory, 'index.csv'), 'w') as csvfile:
        fields = ['image', 'split', 'ih', 'iw', 'sh', 'sw'] + cats
        writer = DictUnicodeWriter(csvfile, fieldnames=fields)
        writer.writeheader()
        for f, record in ordered:
            writer.writerow(record)
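
# The split assignment above relies on two helpers defined elsewhere in
# this module: hashed_float and cumulative_splits.  Below is a minimal
# sketch of plausible implementations, reconstructed from how they are
# used (md5-based hashing is named in the comment above; the exact
# signatures and details here are assumptions, not the canonical code).
# `splits` is assumed to be an ordered mapping such as
# {'train': 0.7, 'val': 0.3}.

import hashlib
import struct


def hashed_float(s):
    '''Maps a string to a reproducible pseudo-random float in [0, 1).'''
    if not isinstance(s, bytes):
        s = s.encode('utf-8')
    # Interpret the first 8 bytes of the md5 digest as a big-endian
    # unsigned integer, then scale into the unit interval.
    [number] = struct.unpack('>Q', hashlib.md5(s).digest()[:8])
    return number / 2.0 ** 64


def cumulative_splits(splits):
    '''Converts split fractions into cumulative (name, cutoff) pairs,
    e.g. {'train': 0.7, 'val': 0.3} -> [('train', 0.7), ('val', 1.0)].'''
    result, total = [], 0.0
    for name, fraction in splits.items():
        total += fraction
        result.append((name, total))
    return result
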
def write_label_files(directory, names, assignments, frequency, coverage,
                      syns, verbose):
    '''
    Phase 4 of unification.  Collate some stats and then write them to
    two metadata files.
    '''
    # Make lists of synonyms claimed by each label
    synmap = invert_dict(
        dict((w, assignments[lab]) for w, lab in syns.items()))
    # We need an (index, category) count
    ic_freq = join_histogram_fn(frequency, lambda x: (assignments[x], x[1]))
    ic_cov = join_histogram_fn(coverage, lambda x: (assignments[x], x[1]))
    for z in [(j, cat) for j, cat in ic_freq if j == 0]:
        del ic_freq[z]
        del ic_cov[z]
    catstats = [[] for n in names]
    # For each index, get a (category, frequency) list in descending order
    for (ind, cat), f in sorted(ic_freq.items(), key=lambda x: -x[1]):
        catstats[ind].append((cat, f))
    index_coverage = join_histogram(coverage, assignments)
    with open(os.path.join(directory, 'label.csv'), 'w') as csvfile:
        fields = [
            'number', 'name', 'category', 'frequency', 'coverage', 'syns'
        ]
        writer = DictUnicodeWriter(csvfile, fieldnames=fields)
        writer.writeheader()
        for ind, name in enumerate(names):
            if ind == 0:
                continue
            writer.writerow(
                dict(number='%d' % ind,
                     name=name,
                     category=';'.join('%s(%d)' % s for s in catstats[ind]),
                     frequency='%d' % sum(f for c, f in catstats[ind]),
                     coverage='%f' % index_coverage[ind],
                     syns=';'.join([s for s in synmap[ind] if s != name])))
    # For each category, figure the first, last, and other stats
    cat_ind = [(cat, ind) for ind, cat in ic_freq.keys()]
    first_index = build_histogram(cat_ind, min)
    last_index = build_histogram(cat_ind, max)
    count_labels = build_histogram([(cat, 1) for cat, _ in cat_ind])
    cat_freq = join_histogram_fn(ic_freq, lambda x: x[1])
    cats = sorted(first_index.keys(), key=lambda x: first_index[x])
    with open(os.path.join(directory, 'category.csv'), 'w') as csvfile:
        fields = ['name', 'first', 'last', 'count', 'frequency']
        writer = DictUnicodeWriter(csvfile, fieldnames=fields)
        writer.writeheader()
        for cat in cats:
            writer.writerow(
                dict(name=cat,
                     first=first_index[cat],
                     last=last_index[cat],
                     count=count_labels[cat],
                     frequency=cat_freq[cat]))
    # And for each category, create a dense coding file.
    for cat in cats:
        dense_code = [0] + sorted(
            [i for i, c in ic_freq if c == cat],
            key=lambda i: (-ic_freq[(i, cat)], -ic_cov[(i, cat)]))
        fields = ['code', 'number', 'name', 'frequency', 'coverage']
        with open(os.path.join(directory, 'c_%s.csv' % cat), 'w') as csvfile:
            writer = DictUnicodeWriter(csvfile, fieldnames=fields)
            writer.writeheader()
            for code, i in enumerate(dense_code):
                if code == 0:
                    continue
                writer.writerow(
                    dict(code=code,
                         number=i,
                         name=names[i],
                         frequency=ic_freq[(i, cat)],
                         coverage=ic_cov[(i, cat)]))
    return cats
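
# The statistics above lean on a few small histogram/dict helpers defined
# elsewhere in this module.  Below is a minimal sketch of the behavior the
# code relies on, reconstructed from usage; the canonical definitions may
# differ in detail.

import operator


def build_histogram(pairs, reducer=operator.add):
    '''Aggregates (key, value) pairs into a dict, combining values for
    repeated keys with `reducer`.  The default (addition) means that
    [(cat, 1), ...] counts occurrences; passing min/max tracks extremes,
    as in the first_index/last_index computations above.'''
    result = {}
    for k, v in pairs:
        result[k] = v if k not in result else reducer(result[k], v)
    return result


def join_histogram_fn(histogram, makekey):
    '''Rekeys a histogram through makekey(key), summing collided values.
    E.g. mapping (dataset, category, index) keys to (new_index, category)
    merges per-dataset counts into unified per-label counts.'''
    return build_histogram((makekey(k), v) for k, v in histogram.items())


def join_histogram(histogram, mapping):
    '''Like join_histogram_fn, but rekeys through a mapping dict.'''
    return join_histogram_fn(histogram, lambda k: mapping[k])


def invert_dict(d):
    '''Inverts {key: value} into {value: [keys]}, so a word->index map
    becomes an index->[synonym words] map.'''
    result = {}
    for k, v in d.items():
        result.setdefault(v, []).append(k)
    return result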