def biggest_dirs(drive):
    """Print the ten largest directories on ``drive`` by total file bytes.

    Args:
        drive: registry object exposing ``dpath_list``, ``dpath_to_fidx``
            and ``fpath_bytes_list`` (presumably a ``Drive`` instance from
            this module — confirm against caller).
    """
    print('Biggest Dirs in %r' % (drive,))
    dpaths = drive.dpath_list
    # File indices grouped per directory, then the byte size of each file.
    grouped_fidxs = ut.dict_take(drive.dpath_to_fidx, dpaths)
    grouped_bytes = ut.list_unflat_take(drive.fpath_bytes_list, grouped_fidxs)
    total_bytes = [sum(group) for group in grouped_bytes]
    # Indices of the ten largest directories, biggest first.
    top_sel = ut.list_argsort(total_bytes)[::-1][0:10]
    top_sizes = ut.take(total_bytes, top_sel)
    top_dpaths = ut.take(dpaths, top_sel)
    summary = list(zip(map(ut.byte_str2, top_sizes), top_dpaths))
    print(ut.list_str(summary, strvals=True))
def biggest_dirs(drive):
    """Print the ten largest directories on ``drive`` by total file bytes.

    Args:
        drive: registry object exposing ``dpath_list``, ``dpath_to_fidx``
            and ``fpath_bytes_list`` (presumably a ``Drive`` instance from
            this module — confirm against caller).
    """
    print('Biggest Dirs in %r' % (drive,))
    dpaths = drive.dpath_list
    # File indices grouped per directory, then the byte size of each file.
    grouped_fidxs = ut.dict_take(drive.dpath_to_fidx, dpaths)
    grouped_bytes = ut.list_unflat_take(drive.fpath_bytes_list, grouped_fidxs)
    total_bytes = [sum(group) for group in grouped_bytes]
    # Indices of the ten largest directories, biggest first.
    top_sel = ut.list_argsort(total_bytes)[::-1][0:10]
    top_sizes = ut.take(total_bytes, top_sel)
    top_dpaths = ut.take(dpaths, top_sel)
    summary = list(zip(map(ut.byte_str2, top_sizes), top_dpaths))
    print(ut.repr2(summary, strvals=True))
def make_name_model(num_annots, num_names=None, verbose=True, mode=1):
    """
    Defines the general name model.

    Builds a pgm model relating per-annotation name variables through
    pairwise match variables and observed score variables.  The ``mode``
    argument selects which CPD structure is built (1/5: name->match->score,
    2: name-pair->score directly, 3/4: adds a duplicate-detection node).

    NOTE(review): ``mode`` may be overridden by the ``--mode`` command-line
    flag below, so the passed-in value is only a default.

    CommandLine:
        python -m wbia.algo.hots.bayes --exec-make_name_model --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from wbia.algo.hots.bayes import *  # NOQA
        >>> defaults = dict(num_annots=2, num_names=2, verbose=True, mode=2)
        >>> kw = ut.argparse_funckw(make_name_model, defaults)
        >>> model = make_name_model(**kw)
        >>> ut.quit_if_noshow()
        >>> show_model(model, show_prior=True)
        >>> ut.show_if_requested()
    """
    # annots = ut.chr_range(num_annots, base='a')
    mode = ut.get_argval('--mode', default=mode)
    # Annotation ids are single characters: 'a', 'b', 'c', ...
    annots = ut.chr_range(num_annots, base=ut.get_argval('--base', default='a'))
    # The indexes of match CPDs will not change if another annotation is added
    upper_diag_idxs = ut.colwise_diag_idxs(num_annots, 2)
    if num_names is None:
        num_names = num_annots

    # -- Define CPD Templates

    def match_pmf(match_type, n1, n2):
        # P(match | n1, n2): deterministic — 'same' iff the two names agree.
        if n1 == n2:
            val = 1.0 if match_type == 'same' else 0.0
            # val = .999 if match_type == 'same' else 0.001
        elif n1 != n2:
            # val = 0.01 if match_type == 'same' else .99
            val = 0.0 if match_type == 'same' else 1.0
        return val

    def score_pmf(score_type, match_type):
        # P(score | match) for the two-evidence-state score CPD (modes 1/5).
        score_lookup = {
            'same': {'low': 0.1, 'high': 0.9, 'veryhigh': 0.9},
            'diff': {'low': 0.9, 'high': 0.09, 'veryhigh': 0.01}
            #'same': {'low': .1, 'high': .9},
            #'diff': {'low': .9, 'high': .1}
        }
        val = score_lookup[match_type][score_type]
        return val

    def score_pmf3(score_type, match_type, isdup='False'):
        # P(score | match, dup) used in modes 3/4; keys are string booleans.
        score_lookup = {
            'False': {
                'same': {'low': 0.1, 'high': 0.5, 'veryhigh': 0.4},
                'diff': {'low': 0.9, 'high': 0.09, 'veryhigh': 0.01},
            },
            'True': {
                'same': {'low': 0.01, 'high': 0.2, 'veryhigh': 0.79},
                'diff': {'low': 0.4, 'high': 0.4, 'veryhigh': 0.2},
            },
        }
        val = score_lookup[isdup][match_type][score_type]
        return val

    def score_pmf2(score_type, n1, n2):
        # P(score | n1, n2) used in mode 2 — score depends on names directly.
        score_lookup = {
            True: {'low': 0.1, 'high': 0.4, 'veryhigh': 0.5},
            False: {'low': 0.9, 'high': 0.09, 'veryhigh': 0.01},
        }
        val = score_lookup[n1 == n2][score_type]
        return val

    def dup_pmf(dupstate, match_type):
        # P(dup | match): duplicates only plausible when names match.
        lookup = {
            'same': {'True': 0.5, 'False': 0.5},
            'diff': {'True': 0.0, 'False': 1.0},
        }
        return lookup[match_type][dupstate]

    def check_pmf(n0, n1, match_type):
        # NOTE(review): defined but never used; placeholder.
        pass

    def trimatch_pmf(match_ab, match_bc, match_ca):
        # Transitivity constraint over a triangle of match variables.
        lookup = {
            'same': {
                'same': {'same': 1, 'diff': 0},
                'diff': {'same': 0, 'diff': 1},
            },
            'diff': {
                'same': {'same': 0, 'diff': 1},
                'diff': {'same': 0.5, 'diff': 0.5},
            },
        }
        return lookup[match_ca][match_bc][match_ab]

    name_cpd_t = pgm_ext.TemplateCPD('name', ('n', num_names), varpref='N',
                                     special_basis_pool=SPECIAL_BASIS_POOL)

    if mode == 1 or mode == 5:
        match_cpd_t = pgm_ext.TemplateCPD(
            'match', ['diff', 'same'], varpref='M',
            evidence_ttypes=[name_cpd_t, name_cpd_t],
            pmf_func=match_pmf,
        )
        if mode == 5:
            trimatch_cpd_t = pgm_ext.TemplateCPD(
                'tri_match', ['diff', 'same'], varpref='T',
                # evidence_ttypes=[match_cpd_t, match_cpd_t, match_cpd_t],
                evidence_ttypes=[match_cpd_t, match_cpd_t],
                pmf_func=trimatch_pmf,
            )
            # NOTE(review): this branch and the else below build identical
            # score templates; the split looks vestigial.
            score_cpd_t = pgm_ext.TemplateCPD(
                #'score', ['low', 'high', 'veryhigh'],
                'score', ['low', 'high'],
                varpref='S',
                evidence_ttypes=[match_cpd_t],
                pmf_func=score_pmf,
            )
        else:
            score_cpd_t = pgm_ext.TemplateCPD(
                #'score', ['low', 'high', 'veryhigh'],
                'score', ['low', 'high'],
                varpref='S',
                evidence_ttypes=[match_cpd_t],
                pmf_func=score_pmf,
            )
    elif mode == 2:
        # NOTE(review): re-creates name_cpd_t with the same arguments as above.
        name_cpd_t = pgm_ext.TemplateCPD('name', ('n', num_names), varpref='N',
                                         special_basis_pool=SPECIAL_BASIS_POOL)
        score_cpd_t = pgm_ext.TemplateCPD(
            #'score', ['low', 'high', 'veryhigh'],
            'score', ['low', 'high'],
            varpref='S',
            evidence_ttypes=[name_cpd_t, name_cpd_t],
            pmf_func=score_pmf2,
        )
    elif mode == 3 or mode == 4:
        match_cpd_t = pgm_ext.TemplateCPD(
            'match', ['diff', 'same'], varpref='M',
            evidence_ttypes=[name_cpd_t, name_cpd_t],
            pmf_func=match_pmf,
        )
        if mode == 3:
            # mode 3: dup node has no parents (prior only).
            dup_cpd_t = pgm_ext.TemplateCPD('dup', ['False', 'True'],
                                            varpref='D')
        else:
            # mode 4: dup node conditioned on the match variable.
            dup_cpd_t = pgm_ext.TemplateCPD(
                'dup', ['False', 'True'], varpref='D',
                evidence_ttypes=[match_cpd_t],
                pmf_func=dup_pmf,
            )
        score_cpd_t = pgm_ext.TemplateCPD(
            'score', ['low', 'high', 'veryhigh'], varpref='S',
            evidence_ttypes=[match_cpd_t, dup_cpd_t],
            pmf_func=score_pmf3,
        )

    # Instanciate templates
    if mode == 1 or mode == 5:
        name_cpds = [name_cpd_t.new_cpd(parents=aid) for aid in annots]
        namepair_cpds = ut.list_unflat_take(name_cpds, upper_diag_idxs)
        match_cpds = [
            match_cpd_t.new_cpd(parents=cpds) for cpds in namepair_cpds
        ]
        score_cpds = [
            score_cpd_t.new_cpd(parents=cpds) for cpds in zip(match_cpds)
        ]
        if mode == 5:
            # triple_idxs = ut.colwise_diag_idxs(num_annots, 3)
            tid2_match = {cpd._template_id: cpd for cpd in match_cpds}
            trimatch_cpds = []
            # such hack
            # For each match (pair), find the two other matches that close a
            # triangle with it; _template_id keys may be in either order.
            for cpd in match_cpds:
                parents = []
                this_ = list(cpd._template_id)
                for aid in annots:
                    if aid in this_:
                        continue
                    for aid2 in this_:
                        key = aid2 + aid
                        if key not in tid2_match:
                            key = aid + aid2
                        parents += [tid2_match[key]]
                trimatch_cpds += [trimatch_cpd_t.new_cpd(parents=parents)]
            # score_cpds = [score_cpd_t.new_cpd(parents=cpds)
            #               for cpds in zip(trimatch_cpds)]
            cpd_list = name_cpds + score_cpds + match_cpds + trimatch_cpds
        else:
            cpd_list = name_cpds + score_cpds + match_cpds
    elif mode == 2:
        name_cpds = [name_cpd_t.new_cpd(parents=aid) for aid in annots]
        namepair_cpds = ut.list_unflat_take(name_cpds, upper_diag_idxs)
        score_cpds = [
            score_cpd_t.new_cpd(parents=cpds) for cpds in namepair_cpds
        ]
        cpd_list = name_cpds + score_cpds
    elif mode == 3 or mode == 4:
        name_cpds = [name_cpd_t.new_cpd(parents=aid) for aid in annots]
        namepair_cpds = ut.list_unflat_take(name_cpds, upper_diag_idxs)
        match_cpds = [
            match_cpd_t.new_cpd(parents=cpds) for cpds in namepair_cpds
        ]
        if mode == 3:
            dup_cpds = [
                dup_cpd_t.new_cpd(parents=''.join(map(str, aids)))
                for aids in ut.list_unflat_take(annots, upper_diag_idxs)
            ]
        else:
            dup_cpds = [
                dup_cpd_t.new_cpd(parents=[mcpds]) for mcpds in match_cpds
            ]
        score_cpds = [
            score_cpd_t.new_cpd(parents=([mcpds] + [dcpd]))
            for mcpds, dcpd in zip(match_cpds, dup_cpds)
        ]
        cpd_list = name_cpds + score_cpds + match_cpds + dup_cpds

    # logger.info('upper_diag_idxs = %r' % (upper_diag_idxs,))
    logger.info('score_cpds = %r' % (ut.list_getattr(score_cpds, 'variable'),))
    # import sys
    # sys.exit(1)

    # Make Model
    model = pgm_ext.define_model(cpd_list)
    model.num_names = num_names
    if verbose:
        model.print_templates()
        # ut.colorprint('\n --- CPD Templates ---', 'blue')
        # for temp_cpd in templates:
        #     ut.colorprint(temp_cpd._cpdstr('psql'), 'cyan')
    # print_ascii_graph(model)
    return model
def name_model_mode1(num_annots, num_names=None, verbose=True):
    r"""
    spaghettii

    Builds the mode-1 name model: a name variable per annotation, a
    deterministic pairwise match variable per annotation pair, and a
    two-state score observation per match variable.

    CommandLine:
        python -m wbia.algo.hots.bayes --exec-name_model_mode1 --show
        python -m wbia.algo.hots.bayes --exec-name_model_mode1
        python -m wbia.algo.hots.bayes --exec-name_model_mode1 --num-annots=3

    Example:
        >>> # DISABLE_DOCTEST
        >>> from wbia.algo.hots.bayes import *  # NOQA
        >>> defaults = dict(num_annots=2, num_names=2, verbose=True)
        >>> kw = ut.argparse_funckw(name_model_mode1, defaults)
        >>> model = name_model_mode1(**kw)
        >>> ut.quit_if_noshow()
        >>> show_model(model, show_prior=False, show_title=False)
        >>> ut.show_if_requested()

    Ignore:
        import nx2tikz
        logger.info(nx2tikz.dumps_tikz(model, layout='layered', use_label=True))
    """
    # Annotation ids are single characters starting at '--base' (default 'a').
    annots = ut.chr_range(num_annots, base=ut.get_argval('--base', default='a'))
    # The indexes of match CPDs will not change if another annotation is added
    upper_diag_idxs = ut.colwise_diag_idxs(num_annots, 2)
    if num_names is None:
        num_names = num_annots

    # +--- Define CPD Templates ---

    # +-- Name Factor ---
    name_cpd_t = pgm_ext.TemplateCPD('name', ('n', num_names), varpref='N',
                                     special_basis_pool=SPECIAL_BASIS_POOL)
    name_cpds = [name_cpd_t.new_cpd(parents=aid) for aid in annots]

    # +-- Match Factor ---
    def match_pmf(match_type, n1, n2):
        # Deterministic: 'same' iff the two name assignments agree.
        return {
            True: {'same': 1.0, 'diff': 0.0},
            False: {'same': 0.0, 'diff': 1.0},
        }[n1 == n2][match_type]

    match_cpd_t = pgm_ext.TemplateCPD(
        'match', ['diff', 'same'], varpref='M',
        evidence_ttypes=[name_cpd_t, name_cpd_t],
        pmf_func=match_pmf,
    )
    namepair_cpds = ut.list_unflat_take(name_cpds, upper_diag_idxs)
    match_cpds = [match_cpd_t.new_cpd(parents=cpds) for cpds in namepair_cpds]

    # +-- Score Factor ---
    def score_pmf(score_type, match_type):
        # 'veryhigh' entries are unused since only ['low','high'] states exist.
        score_lookup = {
            'same': {'low': 0.1, 'high': 0.9, 'veryhigh': 0.9},
            'diff': {'low': 0.9, 'high': 0.09, 'veryhigh': 0.01},
        }
        val = score_lookup[match_type][score_type]
        return val

    score_cpd_t = pgm_ext.TemplateCPD(
        'score', ['low', 'high'], varpref='S',
        evidence_ttypes=[match_cpd_t],
        pmf_func=score_pmf,
    )
    score_cpds = [
        score_cpd_t.new_cpd(parents=cpds) for cpds in zip(match_cpds)
    ]
    # L___ End CPD Definitions ___

    cpd_list = name_cpds + score_cpds + match_cpds
    logger.info('score_cpds = %r' % (ut.list_getattr(score_cpds, 'variable'),))

    # Make Model
    model = pgm_ext.define_model(cpd_list)
    model.num_names = num_names
    if verbose:
        model.print_templates()
    return model
def name_model_mode5(num_annots, num_names=None, verbose=True, mode=1):
    """Build the mode-5 name model: mode-1 structure plus transitive
    tri-match variables, one per match CPD, enforcing triangle consistency.

    NOTE(review): ``mode`` is read from the command line below but never
    used afterwards — it appears vestigial here.
    """
    mode = ut.get_argval('--mode', default=mode)
    annots = ut.chr_range(num_annots, base=ut.get_argval('--base', default='a'))
    # The indexes of match CPDs will not change if another annotation is added
    upper_diag_idxs = ut.colwise_diag_idxs(num_annots, 2)
    if num_names is None:
        num_names = num_annots

    # -- Define CPD Templates
    name_cpd_t = pgm_ext.TemplateCPD('name', ('n', num_names), varpref='N',
                                     special_basis_pool=SPECIAL_BASIS_POOL)
    name_cpds = [name_cpd_t.new_cpd(parents=aid) for aid in annots]

    def match_pmf(match_type, n1, n2):
        # Deterministic: 'same' iff the two name assignments agree.
        return {
            True: {'same': 1.0, 'diff': 0.0},
            False: {'same': 0.0, 'diff': 1.0},
        }[n1 == n2][match_type]

    match_cpd_t = pgm_ext.TemplateCPD(
        'match', ['diff', 'same'], varpref='M',
        evidence_ttypes=[name_cpd_t, name_cpd_t],
        pmf_func=match_pmf,
    )
    namepair_cpds = ut.list_unflat_take(name_cpds, upper_diag_idxs)
    match_cpds = [match_cpd_t.new_cpd(parents=cpds) for cpds in namepair_cpds]

    def trimatch_pmf(match_ab, match_bc, match_ca):
        # Transitivity constraint over a triangle of match variables.
        lookup = {
            'same': {
                'same': {'same': 1, 'diff': 0},
                'diff': {'same': 0, 'diff': 1},
            },
            'diff': {
                'same': {'same': 0, 'diff': 1},
                'diff': {'same': 0.5, 'diff': 0.5},
            },
        }
        return lookup[match_ca][match_bc][match_ab]

    trimatch_cpd_t = pgm_ext.TemplateCPD(
        'tri_match', ['diff', 'same'], varpref='T',
        evidence_ttypes=[match_cpd_t, match_cpd_t],
        pmf_func=trimatch_pmf,
    )
    # triple_idxs = ut.colwise_diag_idxs(num_annots, 3)
    tid2_match = {cpd._template_id: cpd for cpd in match_cpds}
    trimatch_cpds = []
    # such hack
    # For each match (pair), collect the two other matches that close a
    # triangle with it; _template_id keys may be stored in either order.
    for cpd in match_cpds:
        parents = []
        this_ = list(cpd._template_id)
        for aid in annots:
            if aid in this_:
                continue
            for aid2 in this_:
                key = aid2 + aid
                if key not in tid2_match:
                    key = aid + aid2
                parents += [tid2_match[key]]
        trimatch_cpds += [trimatch_cpd_t.new_cpd(parents=parents)]

    def score_pmf(score_type, match_type):
        # 'veryhigh' entries are unused since only ['low','high'] states exist.
        score_lookup = {
            'same': {'low': 0.1, 'high': 0.9, 'veryhigh': 0.9},
            'diff': {'low': 0.9, 'high': 0.09, 'veryhigh': 0.01},
        }
        val = score_lookup[match_type][score_type]
        return val

    score_cpd_t = pgm_ext.TemplateCPD(
        'score', ['low', 'high'], varpref='S',
        evidence_ttypes=[match_cpd_t],
        pmf_func=score_pmf,
    )
    score_cpds = [
        score_cpd_t.new_cpd(parents=cpds) for cpds in zip(match_cpds)
    ]
    # score_cpds = [score_cpd_t.new_cpd(parents=cpds)
    #               for cpds in zip(trimatch_cpds)]
    cpd_list = name_cpds + score_cpds + match_cpds + trimatch_cpds
    logger.info('score_cpds = %r' % (ut.list_getattr(score_cpds, 'variable'),))

    # Make Model
    model = pgm_ext.define_model(cpd_list)
    model.num_names = num_names
    if verbose:
        model.print_templates()
    return model
def make_name_model(num_annots, num_names=None, verbose=True, mode=1,
                    num_scores=2, p_score_given_same=None,
                    hack_score_only=False, score_basis=None,
                    special_names=None):
    r"""
    Builds the (mode-1 only) name model with a configurable number of score
    states and a configurable P(score | same-name) distribution.

    Args:
        num_annots (int): number of annotation nodes.
        num_names (int): number of name states; defaults to ``num_annots``.
        verbose (bool): print CPD templates after construction.
        mode (int): must be 1 (asserted below).
        num_scores (int): number of discrete score states.
        p_score_given_same: sequence of P(score | match='same'); defaults to
            a normalized cumulative ramp over ``num_scores`` states.
        hack_score_only: if truthy, keep only the last N annotation pairs.
        score_basis: optional numeric values used to label score states.
        special_names: basis pool for name states; defaults to
            SPECIAL_BASIS_POOL.

    CommandLine:
        python -m ibeis.algo.hots.bayes --exec-make_name_model --show
        python -m ibeis.algo.hots.bayes --exec-make_name_model
        python -m ibeis.algo.hots.bayes --exec-make_name_model --num-annots=3

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.hots.bayes import *  # NOQA
        >>> defaults = dict(num_annots=2, num_names=2, verbose=True)
        >>> modeltype = ut.get_argval('--modeltype', default='bayes')
        >>> kw = ut.argparse_funckw(make_name_model, defaults)
        >>> model = make_name_model(**kw)
        >>> ut.quit_if_noshow()
        >>> model.show_model(show_prior=False, show_title=False, modeltype=modeltype)
        >>> ut.show_if_requested()
    """
    if special_names is None:
        special_names = SPECIAL_BASIS_POOL

    assert mode == 1, 'only can do mode 1'
    base = ut.get_argval('--base', type_=str, default='a')
    annots = ut.chr_range(num_annots, base=base)
    # The indexes of match CPDs will not change if another annotation is added
    upper_diag_idxs = ut.colwise_diag_idxs(num_annots, 2)
    if hack_score_only:
        upper_diag_idxs = upper_diag_idxs[-hack_score_only:]

    if num_names is None:
        num_names = num_annots

    # +--- Define CPD Templates and Instantiation ---
    cpd_list = []

    # Name Factor
    name_cpd_t = pgm_ext.TemplateCPD(
        'name', ('n', num_names), special_basis_pool=special_names)
    name_cpds = [name_cpd_t.new_cpd(parents=aid) for aid in annots]
    #name_cpds = [name_cpd_t.new_cpd(parents=aid, constrain_state=count)
    #             for count, aid in enumerate(annots, start=1)]
    cpd_list.extend(name_cpds)

    # Match Factor
    def match_pmf(match_type, n1, n2):
        # Deterministic: 'same' iff the two name assignments agree.
        return {
            True: {'same': 1.0, 'diff': 0.0},
            False: {'same': 0.0, 'diff': 1.0},
        }[n1 == n2][match_type]

    match_states = ['diff', 'same']
    match_cpd_t = pgm_ext.TemplateCPD(
        'match', match_states,
        evidence_ttypes=[name_cpd_t, name_cpd_t],
        pmf_func=match_pmf)
    namepair_cpds = ut.list_unflat_take(name_cpds, upper_diag_idxs)
    match_cpds = [match_cpd_t.new_cpd(parents=cpds) for cpds in namepair_cpds]
    cpd_list.extend(match_cpds)

    # Score Factor
    score_states = list(range(num_scores))
    if score_basis is not None:
        score_states = ['%.2f' % (s,) for s in score_basis]
    if p_score_given_same is None:
        # Default: normalized cumulative ramp, increasing with score index.
        tmp = np.arange(num_scores + 1)[1:]
        tmp = np.cumsum(tmp)
        tmp = (tmp / tmp.sum())
        p_score_given_same = tmp

    def score_pmf(score_type, match_type):
        if isinstance(score_type, six.string_types):
            score_type = score_states.index(score_type)
        if match_type == 'same':
            return p_score_given_same[score_type]
        else:
            # 'diff' uses the same distribution reversed.
            return p_score_given_same[-(score_type + 1)]

    score_cpd_t = pgm_ext.TemplateCPD(
        'score', score_states,
        evidence_ttypes=[match_cpd_t],
        pmf_func=score_pmf)
    score_cpds = [score_cpd_t.new_cpd(parents=cpds) for cpds in zip(match_cpds)]
    cpd_list.extend(score_cpds)

    # Optional human-review factor (disabled).
    with_humans = False
    if with_humans:
        human_states = ['diff', 'same']
        human_cpd_t = pgm_ext.TemplateCPD(
            'human', human_states,
            evidence_ttypes=[match_cpd_t],
            pmf_func=[[.9, .1], [.1, .9]])
        human_cpds = [human_cpd_t.new_cpd(parents=cpds)
                      for cpds in zip(match_cpds)]
        cpd_list.extend(human_cpds)

    # Optional rank factor (disabled).
    with_rank = False  # Rank depends on dependant scores
    if with_rank:
        rank_states = ['0', '1', '2', '3']
        rank_cpd_t = pgm_ext.TemplateCPD(
            'rank', rank_states,
            evidence_ttypes=[match_cpd_t],
            pmf_func=None)
        rank_cpds = [rank_cpd_t.new_cpd(parents=cpds)
                     for cpds in zip(match_cpds)]
        cpd_list.extend(rank_cpds)
    # L___ End CPD Definitions ___

    print('score_cpds = %r' % (ut.list_getattr(score_cpds, 'variable'),))

    # Make Model
    model = pgm_ext.define_model(cpd_list)
    model.num_names = num_names
    if verbose:
        model.print_templates(ignore_ttypes=['match'])
    return model
def get_col(table, tbl_rowids, colnames=None):
    """
    Read one or more columns for ``tbl_rowids`` from a depcache table,
    resolving external-storage columns (read via their registered read
    funcs) and re-nesting flattened nested columns.

    Args:
        table: depcache table exposing ``data_colnames``,
            ``external_to_internal``, ``extern_read_funcs``,
            ``nested_to_flat`` and ``get_internal_columns``.
        tbl_rowids: rowids to fetch.
        colnames: None for all data columns, a single column name (result
            is unpacked per row), or a sequence of names.

    Returns:
        list: one entry per rowid; a tuple of column values, or a scalar
        when a single column name was requested.

    colnames = ('mask', 'size')
    FIXME; unpacking is confusing with sql controller
    """
    # print('Get prop of %r, colnames=%r' % (table, colnames))
    try:
        request_unpack = False
        if colnames is None:
            colnames = table.data_colnames
            #table._internal_data_colnames
        else:
            # A bare string means "one column" — remember to unpack later.
            if isinstance(colnames, six.text_type):
                request_unpack = True
                colnames = (colnames,)
        # print('* colnames = %r' % (colnames,))
        eager = True
        nInput = None

        total = 0
        intern_colnames = []
        extern_resolve_colxs = []
        nesting_xs = []

        # Map each requested (external) column onto the flat internal
        # columns that back it, recording positions for later regrouping.
        for c in colnames:
            if c in table.external_to_internal:
                # Externally-stored column: fetch a URI, resolve via read_func.
                intern_colnames.append([table.external_to_internal[c]])
                read_func = table.extern_read_funcs[c]
                extern_resolve_colxs.append((total, read_func))
                nesting_xs.append(total)
                total += 1
            elif c in table.nested_to_flat:
                # Nested column: expands to several flat internal columns.
                nest = table.nested_to_flat[c]
                nesting_xs.append(list(range(total, total + len(nest))))
                intern_colnames.append(nest)
                total += len(nest)
            else:
                # Plain column: passes through unchanged.
                nesting_xs.append(total)
                intern_colnames.append([c])
                total += 1

        flat_intern_colnames = tuple(ut.flatten(intern_colnames))

        # do sql read
        # FIXME: understand unpack_scalars and keepwrap
        raw_prop_list = table.get_internal_columns(
            tbl_rowids, flat_intern_colnames, eager, nInput,
            unpack_scalars=True, keepwrap=True)
        # unpack_scalars=not
        # request_unpack)
        # print('depth(raw_prop_list) = %r' % (ut.depth_profile(raw_prop_list),))

        # Transpose row-major results to column-major for per-column fixups.
        prop_listT = list(zip(*raw_prop_list))
        for extern_colx, read_func in extern_resolve_colxs:
            data_list = []
            for uri in prop_listT[extern_colx]:
                try:
                    # FIXME: only do this for a localpath
                    uri1 = ut.unixjoin(table.depc.cache_dpath, uri)
                    data = read_func(uri1)
                except Exception as ex:
                    ut.printex(ex, 'failed to load external data',
                               iswarning=False)
                    raise
                    # FIXME
                    #data = None
                data_list.append(data)
            prop_listT[extern_colx] = data_list

        # Regroup flat columns back into the requested nesting, re-zipping
        # nested groups row-wise, then transpose back to row-major.
        nested_proplistT = ut.list_unflat_take(prop_listT, nesting_xs)
        for tx in ut.where([isinstance(xs, list) for xs in nesting_xs]):
            nested_proplistT[tx] = list(zip(*nested_proplistT[tx]))
        prop_list = list(zip(*nested_proplistT))
        if request_unpack:
            # Single-column request: strip the 1-tuple wrapper per row.
            prop_list = [None if p is None else p[0] for p in prop_list]
    except Exception as ex:
        ut.printex(ex, 'failed in get col', keys=[
            'table.tablename',
            'request_unpack',
            'tbl_rowids',
            'colnames',
            'raw_prop_list',
            (ut.depth_profile, 'raw_prop_list'),
            'prop_listT',
            (ut.depth_profile, 'prop_listT'),
            'nesting_xs',
            'nested_proplistT',
            'prop_list'])
        raise
    return prop_list
def fix_duplicates(drive):
    r"""
    for every duplicate file passing a (eg avi) filter, remove the file
    that is in the smallest directory. On a tie use the smallest dpath.
    This will filter all duplicate files in a folder into a single folder.

    but... need to look at non-duplicates in that folder and decide if they
    should be moved as well. So, should trigger on folders that have at
    least 50% duplicate. Might not want to move curated folders.

    NOTE(review): despite the name, this currently only *reports*
    duplicates (coupled dirs, duplicate fpaths, duplicate dirs by filename
    set) — no files are removed by the visible code.

    Example:
        cd ~/local/scripts
        >>> from register_files import *  # NOQA
        >>> dpaths = ut.get_argval('--drives', type_=list, default=['E:/'])#'D:/', 'E:/', 'F:/'])
        >>> drives = [Drive(root_dpath) for root_dpath in dpaths]
        >>> E = drive = drives[0]
        >>> #D, E, F = drives
    """
    print('Fixing Duplicates in %r' % (drive,))
    # Group file indices by content hash; keep only groups of size > 1.
    list_ = drive.fpath_hashX_list
    multiindex_dict_ = build_multindex(list_)
    duplicate_hashes = [
        key for key, val in six.iteritems(multiindex_dict_) if len(val) > 1
    ]
    duplicate_idxs = ut.dict_take(multiindex_dict_, duplicate_hashes)
    unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
    # Check if any dups have been removed
    still_exists = ut.unflat_map(exists, unflat_fpaths)
    unflat_idxs2 = ut.zipcompress(duplicate_idxs, still_exists)
    duplicate_idxs = [idxs for idxs in unflat_idxs2 if len(idxs) > 1]
    # Look at duplicate files
    unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
    unflat_sizes = ut.list_unflat_take(drive.fpath_bytes_list, duplicate_idxs)
    # Find highly coupled directories
    if True:
        # Count, over all duplicate groups, every ordered pair of parent
        # directories — frequently co-occurring dirs are "coupled".
        coupled_dirs = []
        for fpaths in unflat_fpaths:
            #basedir = ut.longest_existing_path(commonprefix(fpaths))
            dirs = sorted(list(map(dirname, fpaths)))
            _list = list(range(len(dirs)))
            idxs = ut.upper_diag_self_prodx(_list)
            coupled_dirs.extend(
                list(map(tuple, ut.list_unflat_take(dirs, idxs))))
        hist_ = ut.dict_hist(coupled_dirs)
        coupled_idxs = ut.list_argsort(hist_.values())[::-1]
        most_coupled = ut.take(list(hist_.keys()), coupled_idxs[0:100])
        print('Coupled fpaths: ' + ut.list_str(most_coupled, nl=True))
    print('%d unique files are duplicated' % (len(unflat_sizes),))
    #print('Duplicate sizes: ' + ut.list_str(unflat_sizes[0:10], nl=True))
    #print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths[0:10], nl=True))
    #print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths[0::5], nl=True))
    print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths, nl=True))
    # Find duplicate directories
    dpath_list = list(drive.dpath_to_fidx.keys())
    fidxs_list = ut.dict_take(drive.dpath_to_fidx, drive.dpath_list)
    #exists_list = list(map(exists, drive.fpath_list))
    #unflat_exists = ut.list_unflat_take(exists_list, fidxs_list)
    fname_registry = [basename(fpath) for fpath in drive.fpath_list]
    unflat_fnames = ut.list_unflat_take(fname_registry, fidxs_list)

    def unsorted_list_hash(list_):
        # Order-insensitive hash of a directory's filename set.
        return ut.hashstr27(str(sorted(list_)))

    unflat_fname_sets = list(
        map(unsorted_list_hash, ut.ProgIter(unflat_fnames, freq=10000)))
    fname_based_duplicate_dpaths = []
    # Directories whose filename sets hash identically are likely dup dirs.
    multiindex_dict2_ = build_multindex(unflat_fname_sets)
    fname_based_duplicate_hashes = [
        key for key, val in multiindex_dict2_.items() if len(val) > 1]
    print('#fname_based_duplicate_dpaths = %r' %
          (len(fname_based_duplicate_hashes),))
    fname_based_duplicate_didxs = ut.dict_take(
        multiindex_dict2_, fname_based_duplicate_hashes)
    fname_based_duplicate_dpaths = ut.list_unflat_take(
        dpath_list, fname_based_duplicate_didxs)
    print(ut.repr3(fname_based_duplicate_dpaths[0:10]))
def fix_duplicates(drive):
    r"""
    for every duplicate file passing a (eg avi) filter, remove the file
    that is in the smallest directory. On a tie use the smallest dpath.
    This will filter all duplicate files in a folder into a single folder.

    but... need to look at non-duplicates in that folder and decide if they
    should be moved as well. So, should trigger on folders that have at
    least 50% duplicate. Might not want to move curated folders.

    NOTE(review): despite the name, this currently only *reports*
    duplicates (coupled dirs, duplicate fpaths, duplicate dirs by filename
    set) — no files are removed by the visible code.

    Example:
        cd ~/local/scripts
        >>> from register_files import *  # NOQA
        >>> dpaths = ut.get_argval('--drives', type_=list, default=['E:/'])#'D:/', 'E:/', 'F:/'])
        >>> drives = [Drive(root_dpath) for root_dpath in dpaths]
        >>> E = drive = drives[0]
        >>> #D, E, F = drives
    """
    print('Fixing Duplicates in %r' % (drive,))
    # Group file indices by content hash; keep only groups of size > 1.
    list_ = drive.fpath_hashX_list
    multiindex_dict_ = build_multindex(list_)
    duplicate_hashes = [
        key for key, val in six.iteritems(multiindex_dict_) if len(val) > 1
    ]
    duplicate_idxs = ut.dict_take(multiindex_dict_, duplicate_hashes)
    unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
    # Check if any dups have been removed
    still_exists = ut.unflat_map(exists, unflat_fpaths)
    unflat_idxs2 = ut.zipcompress(duplicate_idxs, still_exists)
    duplicate_idxs = [idxs for idxs in unflat_idxs2 if len(idxs) > 1]
    # Look at duplicate files
    unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
    unflat_sizes = ut.list_unflat_take(drive.fpath_bytes_list, duplicate_idxs)
    # Find highly coupled directories
    if True:
        # Count, over all duplicate groups, every ordered pair of parent
        # directories — frequently co-occurring dirs are "coupled".
        coupled_dirs = []
        for fpaths in unflat_fpaths:
            #basedir = ut.longest_existing_path(commonprefix(fpaths))
            dirs = sorted(list(map(dirname, fpaths)))
            _list = list(range(len(dirs)))
            idxs = ut.upper_diag_self_prodx(_list)
            coupled_dirs.extend(
                list(map(tuple, ut.list_unflat_take(dirs, idxs))))
        hist_ = ut.dict_hist(coupled_dirs)
        coupled_idxs = ut.list_argsort(hist_.values())[::-1]
        most_coupled = ut.take(list(hist_.keys()), coupled_idxs[0:100])
        print('Coupled fpaths: ' + ut.repr2(most_coupled, nl=True))
    print('%d unique files are duplicated' % (len(unflat_sizes),))
    #print('Duplicate sizes: ' + ut.repr2(unflat_sizes[0:10], nl=True))
    #print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths[0:10], nl=True))
    #print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths[0::5], nl=True))
    print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths, nl=True))
    # Find duplicate directories
    dpath_list = list(drive.dpath_to_fidx.keys())
    fidxs_list = ut.dict_take(drive.dpath_to_fidx, drive.dpath_list)
    #exists_list = list(map(exists, drive.fpath_list))
    #unflat_exists = ut.list_unflat_take(exists_list, fidxs_list)
    fname_registry = [basename(fpath) for fpath in drive.fpath_list]
    unflat_fnames = ut.list_unflat_take(fname_registry, fidxs_list)

    def unsorted_list_hash(list_):
        # Order-insensitive hash of a directory's filename set.
        return ut.hashstr27(str(sorted(list_)))

    unflat_fname_sets = list(
        map(unsorted_list_hash, ut.ProgIter(unflat_fnames, freq=10000)))
    fname_based_duplicate_dpaths = []
    # Directories whose filename sets hash identically are likely dup dirs.
    multiindex_dict2_ = build_multindex(unflat_fname_sets)
    fname_based_duplicate_hashes = [
        key for key, val in multiindex_dict2_.items() if len(val) > 1
    ]
    print('#fname_based_duplicate_dpaths = %r' %
          (len(fname_based_duplicate_hashes),))
    fname_based_duplicate_didxs = ut.dict_take(
        multiindex_dict2_, fname_based_duplicate_hashes)
    fname_based_duplicate_dpaths = ut.list_unflat_take(
        dpath_list, fname_based_duplicate_didxs)
    print(ut.repr3(fname_based_duplicate_dpaths[0:10]))
def make_name_model(num_annots, num_names=None, verbose=True, mode=1,
                    num_scores=2, p_score_given_same=None,
                    hack_score_only=False, score_basis=None,
                    special_names=None):
    r"""
    Builds the (mode-1 only) name model with a configurable number of score
    states, using the module-level NAME_TTYPE/MATCH_TTYPE/SCORE_TTYPE
    constants for the CPD type names.

    Args:
        num_annots (int): number of annotation nodes.
        num_names (int): number of name states; defaults to ``num_annots``.
        verbose (bool): print CPD templates after construction.
        mode (int): must be 1 (asserted below).
        num_scores (int): number of discrete score states.
        p_score_given_same: sequence of P(score | match='same'); defaults to
            a normalized cumulative ramp over ``num_scores`` states.
        hack_score_only: if truthy, keep only the last N annotation pairs.
        score_basis: optional numeric values used to label score states.
        special_names: basis pool for name states; defaults to
            SPECIAL_BASIS_POOL.

    CommandLine:
        python -m ibeis.algo.hots.bayes --exec-make_name_model --no-cnn
        python -m ibeis.algo.hots.bayes --exec-make_name_model --show --no-cnn
        python -m ibeis.algo.hots.bayes --exec-make_name_model --num-annots=3

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.algo.hots.bayes import *  # NOQA
        >>> defaults = dict(num_annots=2, num_names=2, verbose=True)
        >>> modeltype = ut.get_argval('--modeltype', default='bayes')
        >>> kw = ut.argparse_funckw(make_name_model, defaults)
        >>> model = make_name_model(**kw)
        >>> ut.quit_if_noshow()
        >>> model.show_model(show_prior=False, show_title=False, modeltype=modeltype)
        >>> ut.show_if_requested()
    """
    if special_names is None:
        special_names = SPECIAL_BASIS_POOL

    assert mode == 1, 'only can do mode 1'
    base = ut.get_argval('--base', type_=str, default='a')
    annots = ut.chr_range(num_annots, base=base)
    # The indexes of match CPDs will not change if another annotation is added
    upper_diag_idxs = ut.colwise_diag_idxs(num_annots, 2)
    if hack_score_only:
        upper_diag_idxs = upper_diag_idxs[-hack_score_only:]

    if num_names is None:
        num_names = num_annots

    # +--- Define CPD Templates and Instantiation ---
    cpd_list = []

    # Name Factor
    name_cpd_t = pgm_ext.TemplateCPD(
        NAME_TTYPE, ('n', num_names), special_basis_pool=special_names)
    name_cpds = [name_cpd_t.new_cpd(parents=aid) for aid in annots]
    #name_cpds = [name_cpd_t.new_cpd(parents=aid, constrain_state=count)
    #             for count, aid in enumerate(annots, start=1)]
    cpd_list.extend(name_cpds)

    # Match Factor
    def match_pmf(match_type, n1, n2):
        # Deterministic: 'same' iff the two name assignments agree.
        return {
            True: {'same': 1.0, 'diff': 0.0},
            False: {'same': 0.0, 'diff': 1.0},
        }[n1 == n2][match_type]

    match_states = ['diff', 'same']
    match_cpd_t = pgm_ext.TemplateCPD(
        MATCH_TTYPE, match_states,
        evidence_ttypes=[name_cpd_t, name_cpd_t],
        pmf_func=match_pmf)
    #match_cpd_t.varpref = 'S'
    namepair_cpds = ut.list_unflat_take(name_cpds, upper_diag_idxs)
    match_cpds = [match_cpd_t.new_cpd(parents=cpds) for cpds in namepair_cpds]
    cpd_list.extend(match_cpds)

    # Score Factor
    score_states = list(range(num_scores))
    if score_basis is not None:
        score_states = ['%.2f' % (s,) for s in score_basis]
    if p_score_given_same is None:
        # Default: normalized cumulative ramp, increasing with score index.
        tmp = np.arange(num_scores + 1)[1:]
        tmp = np.cumsum(tmp)
        tmp = (tmp / tmp.sum())
        p_score_given_same = tmp

    def score_pmf(score_type, match_type):
        if isinstance(score_type, six.string_types):
            score_type = score_states.index(score_type)
        if match_type == 'same':
            return p_score_given_same[score_type]
        else:
            # 'diff' uses the same distribution reversed.
            return p_score_given_same[-(score_type + 1)]

    score_cpd_t = pgm_ext.TemplateCPD(
        SCORE_TTYPE, score_states,
        evidence_ttypes=[match_cpd_t],
        pmf_func=score_pmf)
    #match_cpd_t.varpref = 'P'
    score_cpds = [score_cpd_t.new_cpd(parents=cpds) for cpds in zip(match_cpds)]
    cpd_list.extend(score_cpds)

    # Optional human-review factor (disabled).
    with_humans = False
    if with_humans:
        human_states = ['diff', 'same']
        human_cpd_t = pgm_ext.TemplateCPD(
            'human', human_states,
            evidence_ttypes=[match_cpd_t],
            pmf_func=[[.9, .1], [.1, .9]])
        human_cpds = [human_cpd_t.new_cpd(parents=cpds)
                      for cpds in zip(match_cpds)]
        cpd_list.extend(human_cpds)

    # Optional rank factor (disabled).
    with_rank = False  # Rank depends on dependant scores
    if with_rank:
        rank_states = ['0', '1', '2', '3']
        rank_cpd_t = pgm_ext.TemplateCPD(
            'rank', rank_states,
            evidence_ttypes=[match_cpd_t],
            pmf_func=None)
        rank_cpds = [rank_cpd_t.new_cpd(parents=cpds)
                     for cpds in zip(match_cpds)]
        cpd_list.extend(rank_cpds)
    # L___ End CPD Definitions ___

    print('score_cpds = %r' % (ut.list_getattr(score_cpds, 'variable'),))

    # Make Model
    model = pgm_ext.define_model(cpd_list)
    model.num_names = num_names
    if verbose:
        model.print_templates(ignore_ttypes=[MATCH_TTYPE])
    return model