# Imports needed by this section; project-local helpers (it_consumes,
# categories, construct_filename, ensure_is_dir, handle_db,
# handle_spreadsheet, get_or_generate_and_store_random_list,
# dr_spoc_datasets, cache, logger, just, generated_types, pickled_cache,
# symlinks) are assumed to be defined or imported earlier in the module.
from contextlib import suppress
from itertools import chain, groupby
from operator import itemgetter
from os import makedirs, mkdir, path, symlink
from shutil import rmtree

import numpy as np
import pandas as pd
from six import iteritems, itervalues
from six.moves import filter as ifilter, filterfalse, map as imap, zip as izip


def update_locals(_pickled_cache, locls):
    if type(_pickled_cache) is dict:
        it_consumes(
            locls.__setitem__(k, v) for k, v in iteritems(_pickled_cache))
    else:
        raise TypeError("_pickled_cache isn't a dict")
    # elif key is not None: locls[key] = _pickled_cache
    return _pickled_cache
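# A minimal sketch of the cache-to-locals pattern above (hypothetical cache
# contents). Caveat: mutating the mapping returned by locals() inside a
# function does not reliably create real local variables (and under
# PEP 667 / Python 3.13 each locals() call returns an independent snapshot),
# which is why callers below still fall back to explicit
# pickled_cache[...] lookups.
def _update_locals_example():
    _cache = {"tbl": {"id0": "record0"}, "sas_tbl": {}}
    ns = locals()
    update_locals(_cache, ns)
    assert ns["tbl"] == {"id0": "record0"}  # present in the dict...
    # ...but the bare name `tbl` may still raise NameError here.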
def construct_worst_per_image(series, root_directory, new_df):
    def g(val):
        fname = construct_filename(root_directory, val.image_position,
                                   val.folder_name)
        if pd.isnull(val.choice):
            return val
        new_idx = categories.index(val.choice)
        try:
            cur_fname = new_df.get(fname, None)
        except TypeError:
            cur_fname = None

        if construct_worst_per_image.t > 0:
            construct_worst_per_image.t -= 1
            just = 35
            print(
                "val:".ljust(just), "{!r}\n".format(val),
                "val.choice:".ljust(just), "{!r}\n".format(val.choice),
                "fname:".ljust(just), "{!r}\n".format(fname),
                "new_df[fname]:".ljust(just), "{!r}\n".format(cur_fname),
                sep="",
            )
            print("categories.index(val.choice):".ljust(just),
                  "{!r}\n".format(new_idx),
                  sep="")
            print("categories.index(new_df[fname]):".ljust(just),
                  "{!r}\n".format(cur_fname if cur_fname is None else
                                  categories.index(cur_fname)),
                  sep="")

        cur_idx = (cur_fname if cur_fname is None or pd.isnull(cur_fname)
                   else categories.index(cur_fname))
        # Overwrite when there is no usable existing entry, or when the stored
        # category's index in `categories` is lower than the new one's
        new_df[fname] = (val.choice if any((
            fname not in new_df,
            pd.isnull(cur_fname),
            pd.isnull(cur_idx),
            cur_idx is None or cur_idx < new_idx,
        )) else new_df[fname])
        return val

    it_consumes(map(g, series.values))
    return series


construct_worst_per_image.t = 0  # debug counter; set >0 to trace the first rows
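# A plain-dict sketch of the reduction above, with a hypothetical `categories`
# ordering (the real list is defined elsewhere in this module). For each
# filename, the entry whose category has the highest index wins:
def _worst_per_image_example():
    example_categories = ["mild", "moderate", "severe"]  # hypothetical
    observations = [("img0.jpg", "moderate"), ("img0.jpg", "severe"),
                    ("img0.jpg", "mild")]
    worst = {}
    for fname, choice in observations:
        if fname not in worst or (example_categories.index(worst[fname]) <
                                  example_categories.index(choice)):
            worst[fname] = choice
    return worst  # -> {'img0.jpg': 'severe'}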
def partition_symlink(series):
    # Collects (filename, category) pairs into the module-level `symlinks` list
    def g(filename_category):
        filename, category = filename_category
        if pd.isnull(filename):
            return category
        symlinks.append((filename, category))
        return category

    it_consumes(map(g, series.items()))
    return series
def make_symlinks(dest_dir, filenames, clean_dir=False):
    if path.isdir(dest_dir):
        if clean_dir:
            rmtree(dest_dir)
            makedirs(dest_dir)  # no goto :(
    else:
        makedirs(dest_dir)
    it_consumes(
        imap(
            lambda fname: symlink(fname,
                                  path.join(dest_dir, path.basename(fname))),
            filenames,
        ))
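# Usage sketch for make_symlinks (hypothetical paths): (re)populate a
# directory of symlinks, wiping any previous run's links first. Note each
# link is named after the source's basename, so distinct sources sharing a
# basename would collide.
def _make_symlinks_example():
    make_symlinks("/tmp/linked", ("/data/a.jpg", "/data/b.jpg"),
                  clean_dir=True)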
def main(root_directory, manual_dir):
    # type: (str, str or None) -> (str, pd.DataFrame, pd.Series, pd.DataFrame)
    ensure_is_dir(root_directory)
    if manual_dir is not None:  # guard: the type comment allows None here
        it_consumes(
            map(mkdir,
                filterfalse(path.isdir,
                            (path.dirname(manual_dir), manual_dir))))

    paths = ("Fundus Photographs for AI", "DR SPOC Dataset",
             "DR SPOC Photo Dataset")
    # Normalise root_directory so it points above the nested dataset folders
    if path.basename(root_directory) == paths[0]:
        root_directory = path.dirname(root_directory)
    elif path.basename(root_directory) == paths[1]:
        root_directory = path.dirname(path.dirname(root_directory))
    elif path.basename(root_directory) == paths[2]:
        root_directory = path.dirname(
            path.dirname(path.dirname(root_directory)))

    levels = list(reversed(paths))
    if root_directory.endswith(path.join(*paths)):
        for _ in range(len(levels)):
            root_directory = path.dirname(root_directory)
    prev = path.join(root_directory, levels.pop())
    while len(levels):
        ensure_is_dir(prev)
        prev = path.join(prev, levels.pop())
    del levels, prev

    db_df = handle_db(root_directory=root_directory)
    df = handle_spreadsheet(root_directory=root_directory)
    filename2cat = pd.Series()
    df.apply(construct_worst_per_image, args=(root_directory, filename2cat))
    combined_df = combine_spreadsheet_db(db_df=db_df,
                                         filename2cat=filename2cat)
    # combined_df.apply(partition_symlink, 1)
    if manual_dir is None or (
            path.realpath(manual_dir) == path.realpath(root_directory)
            and "{sep}symlinked_datasets{sep}".format(sep=path.sep)
            not in manual_dir):
        manual_dir = path.join(root_directory, "symlinked_datasets")
    symbolically_link(manual_dir, combined_df)
    return root_directory, df, filename2cat, combined_df
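# For reference, the directory layout main() normalises against
# (reconstructed from the `paths` tuple above):
#
#   <root_directory>/
#     Fundus Photographs for AI/
#       DR SPOC Dataset/
#         DR SPOC Photo Dataset/
#
# Passing any of the three nested folders as `root_directory` walks back up
# to their common parent before processing.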
def combine_spreadsheet_db(filename2cat, db_df):
    # type: (pd.Series, pd.DataFrame) -> pd.DataFrame
    def g(idx_val):
        idx, val = idx_val
        assert isinstance(val, str), "Got type {!r} containing {!r}".format(
            type(val), val)
        if g.t > 0:
            g.t -= 1
            print("val:", val, "\n", "idx:", idx, "\n", sep="")
        if idx in g.db_df.index:
            if g.tt > 0:
                g.tt -= 1
                # print('categories.index({!r}):'.format(val), categories.index(val), '\n',
                #       'categories.index({!r}):'.format(g.db_df.loc[idx].category),
                #       categories.index(g.db_df.loc[idx].category), '\n')
            if categories.index(val) < categories.index(
                    g.db_df.loc[idx].category):
                # Write through with .loc[row, col]; chained
                # `.loc[idx].category = val` may assign to a temporary copy
                g.db_df.loc[idx, "category"] = val
                g.changed_cond += 1
                if g.tt > 0:
                    print("db_df.loc[{!r}].category is now".format(idx),
                          g.db_df.loc[idx].category)
            g.changed += 1
        else:
            if g.tt > 0:
                print("{!r} not found in {!r}".format(idx, g.db_df.index))
            g.db_df = g.db_df.append(pd.Series({"category": val}, name=idx))
            # db_df[idx] = val
        return val

    g.t = 0
    g.tt = 0
    g.changed_cond = 0
    g.changed = 0
    g.db_df = db_df
    it_consumes(map(g, filename2cat.items()))
    # display(HTML(g.db_df.to_html()))
    assert len(g.db_df.index) == 1574, "Actually got {:d}".format(
        len(g.db_df.index))
    return g.db_df
def _vanilla_stats(skip_save=True):
    global pickled_cache
    cache.update_locals(pickled_cache, locals())
    tbl = pickled_cache["tbl"]  # type: dict
    assert len(tbl.keys()) > 0
    sas_tbl = pickled_cache["sas_tbl"]  # type: dict
    id2ideyefname = pickled_cache["id2ideyefname"]

    if "oags1" not in pickled_cache or not len(pickled_cache["oags1"]):
        pickled_cache["oags1"] = oags1 = tuple(
            v.rec.IDNUM for v in itervalues(tbl) if v.rec.oag1)
    else:
        oags1 = pickled_cache[
            "oags1"]  # Weird, this doesn't get into locals() from `update_locals`

    if "loag1" not in pickled_cache or not len(pickled_cache["loag1"]):
        pickled_cache["loag1"] = loag1 = tuple(
            v.rec.IDNUM for v in itervalues(tbl) if v.rec.loag1)
        pickled_cache["roag1"] = roag1 = tuple(
            v.rec.IDNUM for v in itervalues(tbl) if v.rec.roag1)
    else:
        loag1 = pickled_cache["loag1"]
        roag1 = pickled_cache["roag1"]

    if "no_oags1" not in pickled_cache or not len(pickled_cache["no_oags1"]):
        pickled_cache["no_oags1"] = no_oags1 = tuple(
            v.rec.IDNUM for v in itervalues(tbl) if not v.rec.oag1)

    if "_vanilla_stats" not in pickled_cache or not len(
            pickled_cache["_vanilla_stats"]):
        pickled_cache["_vanilla_stats"] = vanilla_stats = "\n".join(
            "{0}{1}".format(*t) for t in (
                ("# total:".ljust(just), len(tbl)),
                ("# with oag1:".ljust(just), len(oags1)),
                ("# with roag1:".ljust(just), len(roag1)),
                ("# with loag1:".ljust(just), len(loag1)),
                (
                    "# with oag1 and roag1 and loag1:".ljust(just),
                    sum(1 for v in itervalues(tbl)
                        if v.rec.oag1 and v.rec.roag1 and v.rec.loag1),
                ),
                (
                    "# with oag1 and roag1 and loag1 and glaucoma4:".ljust(
                        just),
                    sum(1 for v in itervalues(tbl)
                        if v.rec.oag1 and v.rec.roag1 and v.rec.loag1
                        and v.rec.glaucoma4),
                ),
                ("# len(sas_tbl) == len(tbl):".ljust(just),
                 len(sas_tbl) == len(tbl)),
            ))

    skip_save or cache.save(pickled_cache)
    it_consumes(imap(logger.debug,
                     pickled_cache["_vanilla_stats"].split("\n")))
    logger.debug("oags1:".ljust(just) + "{}".format(oags1))
    logger.debug("loag1:".ljust(just) + "{}".format(loag1))

    if "loags_id2fname" not in pickled_cache or not len(
            pickled_cache["loags_id2fname"]):
        # Builds {IDNUM: (fname, ...)} for one eye: filter each patient's
        # records to the requested eye, flatten to (id, fname) pairs, group
        # by id, then zip the flattened key/value stream back into a dict
        id2fname = lambda dataset, eye: (lambda l: dict(izip(l[::2], l[1::2])))(
            tuple(
                chain.from_iterable(
                    imap(
                        lambda idnum_group: (
                            idnum_group[0],
                            tuple(imap(itemgetter(1), idnum_group[1])),
                        ),
                        groupby(
                            chain.from_iterable(
                                imap(
                                    lambda ideyefnames: tuple(
                                        imap(
                                            lambda ideyefname: (
                                                ideyefname.id,
                                                ideyefname.fname,
                                            ),
                                            ideyefnames,
                                        )),
                                    imap(
                                        lambda ideyefnames: ifilter(
                                            lambda ideyefname: ideyefname.eye
                                            == eye,
                                            ideyefnames,
                                        ),
                                        imap(
                                            lambda idnum: id2ideyefname[idnum],
                                            dataset),
                                    ),
                                )),
                            key=itemgetter(0),
                        ),
                    ))))
        pickled_cache["loags_id2fname"] = loags_id2fname = id2fname(
            dataset=loag1, eye="L")
        pickled_cache["roags_id2fname"] = roags_id2fname = id2fname(
            dataset=roag1, eye="R")
    else:
        loags_id2fname = pickled_cache["loags_id2fname"]
        roags_id2fname = pickled_cache["roags_id2fname"]
    # pp(loags_id2fname)

    logger.debug("generated_types.T0._fields:".ljust(just) +
                 "{}".format(generated_types.T0._fields))
    return pickled_cache
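# A readable loop-based equivalent of the `id2fname` lambda chain above; a
# sketch assuming, as the lambdas do, that each value of `id2ideyefname` is
# an iterable of records with `.id`, `.eye`, and `.fname` attributes. One
# difference: this version merges all matches per id, whereas groupby (and
# dict's last-key-wins) only coalesces consecutive runs.
def _id2fname_readable(dataset, eye, id2ideyefname):
    result = {}
    for idnum in dataset:
        for rec in id2ideyefname[idnum]:
            if rec.eye == eye:
                result.setdefault(rec.id, ())
                result[rec.id] += (rec.fname,)
    return result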
def symbolically_link(symlink_dir, df):
    # type: (str, pd.DataFrame) -> pd.DataFrame
    if symbolically_link.t > 0:
        symbolically_link.t -= 1
        print("symbolically_link::symlink_dir:".ljust(just),
              "{!r}".format(symlink_dir))

    vc = df.apply(pd.value_counts)

    # 75% in train
    # 12.5% in test
    # 12.5% in validation
    target_counts = pd.DataFrame({
        "train": pd.Series({
            idx: np.uint16(np.floor(np.multiply(vc.loc[idx].category, 0.75)))
            for idx in vc.category.index
        }),
        "test": pd.Series({
            idx: np.uint16(np.floor(np.multiply(vc.loc[idx].category, 0.125)))
            for idx in vc.category.index
        }),
    })
    target_counts["valid"] = pd.Series({
        idx: vc.loc[idx].category - sum(
            (lambda ser: (ser.train, ser.test))(target_counts.loc[idx]))
        for idx in vc.category.index
    })

    symlinks = []
    if not path.isdir(symlink_dir):
        makedirs(symlink_dir)

    def partition_symlink(series):
        def g(filename_category):
            filename, category = filename_category
            if pd.isnull(filename):
                return category
            symlinks.append((filename, category))
            return category

        it_consumes(map(g, series.items()))
        return series

    partition_symlink.t = 0
    df.apply(partition_symlink)

    _used = set()
    _uniq_syms = tuple((src, dst) for src, dst in symlinks
                       if src not in _used and (_used.add(src) or True))
    random_list = get_or_generate_and_store_random_list(
        len(_uniq_syms),
        path.join(path.dirname(path.dirname(__file__)), "_data", ".cache",
                  "dr_spoc_rand.pkl"),
    )
    uniq_syms = tuple(_uniq_syms[i] for i in random_list)
    assert len(uniq_syms) == len(_uniq_syms)
    target_counts_cp = target_counts.copy()

    def get_next_tier(index):  # type: (str) -> str
        for column in target_counts.columns:
            if target_counts[column][index] > 0:
                target_counts[column][index] -= 1
                return column
        raise StopIteration("No more {!r}".format(index))

    def tier_syms(filename_category):
        filename, category = (filename_category
                              if isinstance(filename_category, tuple) else
                              (filename_category,
                               df.loc[filename_category].category))
        current_tier = get_next_tier(category)
        this_filename = "_".join(
            (path.basename(path.dirname(filename)), path.basename(filename)))
        all_labels_dir = path.join(
            symlink_dir,
            dr_spoc_datasets[dr_spoc_datasets.index("dr_spoc")],
            current_tier,
            category,
        )
        grad_and_no_grad_dir = path.join(
            symlink_dir,
            dr_spoc_datasets[dr_spoc_datasets.index(
                "dr_spoc_grad_and_no_grad")],
            current_tier,
        )
        no_no_grad_dir = path.join(
            symlink_dir,
            dr_spoc_datasets[dr_spoc_datasets.index("dr_spoc_no_no_grad")],
            current_tier,
            category,
        )
        if not path.isdir(all_labels_dir):
            makedirs(all_labels_dir)
        all_labels_dst = path.join(all_labels_dir, this_filename)
        try:
            symlink(filename, all_labels_dst, target_is_directory=False)
        except FileExistsError:
            tier_syms.FileExistsError += 1

        with suppress(FileExistsError):
            label = (category
                     if category == "No gradable image" else "gradable")
            grad_and_no_grad_dir = path.join(grad_and_no_grad_dir, label)
            if not path.isdir(grad_and_no_grad_dir):
                makedirs(grad_and_no_grad_dir)
            grad_and_no_grad_dst = path.join(grad_and_no_grad_dir,
                                             this_filename)
            symlink(filename, grad_and_no_grad_dst,
                    target_is_directory=False)

            if label != "No gradable image":
                if not path.isdir(no_no_grad_dir):
                    makedirs(no_no_grad_dir)
                no_no_grad_dir_dst = path.join(no_no_grad_dir, this_filename)
                symlink(filename, no_no_grad_dir_dst,
                        target_is_directory=False)

        if tier_syms.t > 0:
            tier_syms.t -= 1
            print("filename: {!r}\ncategory: {!r}\n".format(
                filename, category),
                  sep="")

    tier_syms.t = 0
    tier_syms.FileExistsError = 0

    it_consumes(map(tier_syms, uniq_syms))
    # 0 on a fresh run; apparently 1573 when every link already exists
    # from a prior run
    assert tier_syms.FileExistsError in (0, 1573)
    print("symlink_dir:".ljust(20), "{!r}".format(symlink_dir), sep="")
    return target_counts_cp


symbolically_link.t = 0  # debug counter; set >0 to print the target dir once
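# Worked example of the train/test/valid split arithmetic above: for a
# hypothetical class with 1000 images,
#   train = floor(1000 * 0.75)  = 750
#   test  = floor(1000 * 0.125) = 125
#   valid = 1000 - (750 + 125)  = 125
# `valid` takes the remainder, so flooring never loses images overall:
def _split_counts_example(n=1000):
    import math
    train = int(math.floor(n * 0.75))
    test = int(math.floor(n * 0.125))
    valid = n - (train + test)
    assert train + test + valid == n
    return train, test, valid  # -> (750, 125, 125)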