def make_plots(pics_lt, dirpath, filename):
    """Write Q-Q and histogram diagnostics for a fitted mixture model.

    Loads the JSON model stored at ``dirpath/filename``, re-instantiates a
    ``MixtureModel`` (components sorted by their 'scale' parameter), then for
    several tail-trim cutoffs saves a quantile-quantile plot and an overlaid
    density histogram under ``out/<feature>/``.

    Parameters
    ----------
    pics_lt : pandas.DataFrame
        Contrast data; the column named after the feature (taken from the
        second path component of ``dirpath``) supplies the observed values.
        Zero entries are excluded before computing quantiles.
    dirpath : str
        Directory containing the model file; assumed form '<root>/<feature>'
        (exactly one '/'), since the feature name is recovered by splitting.
    filename : str
        Name of the JSON model file inside ``dirpath``.
    """
    # Load model, sorting components by scale and instantiating distributions from names
    with open(dirpath + '/' + filename) as file:
        model = json.load(file)
    model_params, name = model[:-1], model[-1]
    # zip(*model_params) pairs each component's (dist_name, params, params_fix,
    # weight); sort those tuples by the component scale, then unzip back into
    # four parallel lists. (Previously recomputed model[:-1] here, leaving
    # model_params unused.)
    dist_names, params, params_fix, weights = [
        list(x)
        for x in zip(*sorted(zip(*model_params), key=lambda y: y[1]['scale']))
    ]
    dists = [dists_dict[dist_name] for dist_name in dist_names]
    mixmod = MixtureModel(dists, params, params_fix, weights, name)

    # Calculate quantiles
    _, feature = dirpath.split('/')
    q_obs = pics_lt.loc[pics_lt[feature] != 0, feature].sort_values()
    # Theoretical quantiles at plotting positions (i - 0.5) / n, i = 1..n
    q_dist = [mixmod.ppf((i - 0.5) / len(q_obs)) for i in range(1, len(q_obs) + 1)]

    for cutoff in [5, 2, 1, 0]:
        if cutoff == 0:
            sliver = slice(None)  # Slices do not behave nicely with zeroes
        else:
            idx = ceil(cutoff * len(q_obs) / 100)  # Ceil ensures the cutoff fraction is removed
            sliver = slice(idx, -idx)

        # Plot quantiles
        fig, ax = plt.subplots()
        ax.scatter(q_dist[sliver], q_obs[sliver], s=10, edgecolors='none')
        ax.plot(ax.get_xlim(), ax.get_xlim(), color='k', linewidth=1)  # Plot y=x line
        ax.set_xlabel('Theoretical Quantiles')
        ax.set_ylabel('Observed Quantiles')
        ax.set_title(f'{feature}: {name} with {cutoff}% Trimmed Tails')
        plt.savefig(f'out/{feature}/qq_{cutoff}_{name}.png')
        plt.close()

        # Plot histograms
        fig, ax = plt.subplots()
        ax.hist(q_obs[sliver], bins=50, density=True, color='white',
                linewidth=1, edgecolor='black')
        # Evaluate the fitted densities over the trimmed observed range
        x = linspace(min(q_obs[sliver]), max(q_obs[sliver]), 1000)
        for i in range(len(mixmod.dists)):
            ax.plot(x, mixmod.pdf_comp(x, comp=i), label=mixmod.dists[i].name)
        ax.plot(x, mixmod.pdf(x), label='total')
        ax.set_xlabel(feature)
        ax.set_ylabel('Density')
        ax.set_title(f'{feature}: {name} with {cutoff}% Trimmed Tails')
        ax.legend()
        plt.savefig(f'out/{feature}/hist_{cutoff}_{name}.png')
        plt.close()
def fit_model(rates, feature, model):
    """Fit a mixture model to one feature with multiple random restarts.

    Generates ``num_init`` random initializations (bounded by closed-form
    estimates on bootstrap samples), runs EM on each, keeps the fit with the
    largest log-likelihood, and writes the model JSON plus per-run metadata
    (iterations, log-likelihood, convergence flag) to ``out/<feature>/``.

    Parameters
    ----------
    rates : pandas.DataFrame
        Data table; the ``feature`` column supplies the values to fit.
        Zero entries are excluded.
    feature : str
        Column name in ``rates`` and output subdirectory name.
    model : tuple
        ``(name, dists)`` — a label for the model and the list of component
        distribution objects.
    """
    # Filter data and unpack variables
    data = rates.loc[rates[feature] != 0, feature].sort_values()
    name, dists = model

    # Make output directories for feature models
    cur_dir = f'out/{feature}/'
    if not os.path.exists(cur_dir):
        os.makedirs(cur_dir)  # Recursive folder creation

    # Get maxes for generation of random initials: for each component, take
    # closed-form estimates on 500 random subsamples and set the max to
    # mean + num_std * std of those estimates.
    rand_maxes = []
    for dist in dists:
        cfe = cfes[dist.name]  # Get closed-form estimator
        n = max(1, int(len(data) * random()))  # Random subsample size, at least 1
        sample_params = pd.DataFrame(
            [cfe(np.random.choice(data, n)) for _ in range(500)])
        rand_max = (sample_params.mean() + num_std * sample_params.std()).to_dict()
        rand_maxes.append(rand_max)

    results = []
    excepts = []  # For numerical exceptions (nan, inf)
    while len(results) < num_init:
        mixmod = MixtureModel(dists, name=name,
                              params=get_rand_params(rand_maxes))
        try:
            n, ll = mixmod.fit(data, max_iter=1000)
        except RuntimeError:  # Catch RuntimeErrors from failure to converge
            # BUG FIX: this was `pass`, which fell through to the isnan check —
            # a NameError on the first iteration (n, ll unbound) and stale
            # values re-recorded on later ones. Skip the failed restart instead.
            continue
        # Store output based on ll value
        if np.isnan(ll) or np.isinf(ll):
            excepts.append((n, ll, mixmod))
        else:
            results.append((n, ll, mixmod))

    # Store model with largest log-likelihood
    _, _, mixmod_max = max(results, key=lambda x: x[1])
    model = ([dist.name for dist in mixmod_max.dists], mixmod_max.params,
             mixmod_max.params_fix, mixmod_max.weights, name)
    with open(cur_dir + f'model_{name}.json', 'w') as file:
        json.dump(model, file)

    # Store EM metadata (including the numerically-failed runs)
    results.extend(excepts)
    ns, lls, cons = zip(*[(n, ll, mixmod.converged)
                          for n, ll, mixmod in results])
    df = pd.DataFrame.from_dict({'n_iter': ns, 'll': lls, 'converged': cons})
    df.to_csv(cur_dir + f'meta_{name}.tsv', sep='\t', na_rep='nan')
# For every feature directory under out/, load each stored mixture model and
# plot a stacked heatmap of the posterior component probabilities over the
# sorted (non-zero) observations. NOTE(review): no savefig/close appears in
# this span — presumably the figure is saved after this chunk; confirm.
for feature in os.listdir('out'):
    # One saved model per JSON file in the feature's directory
    model_paths = [
        x for x in os.listdir('out/' + feature) if x.endswith('.json')
    ]
    # NOTE(review): with a single model, plt.subplots(1, 1) returns a bare
    # Axes (not an array), so axs[i] below would fail — confirm that every
    # feature has at least two models, or pass squeeze=False.
    fig, axs = plt.subplots(len(model_paths), 1, figsize=(6, 6))
    fig.subplots_adjust(top=0.875, bottom=0.075)
    fig.suptitle(
        f'{feature}:\nPosterior Probabilities of Mixture Model Components',
        y=0.95, size=10)
    for i, model_path in enumerate(model_paths):
        # Load model
        with open('/'.join(['out', feature, model_path])) as file:
            dist_names, params, params_fix, weights, name = json.load(file)
        dists = [dists_dict[dist_name] for dist_name in dist_names]
        mixmod = MixtureModel(dists, params, params_fix, weights, name)

        # Create heatmap: posterior responsibilities for the sorted non-zero data
        data = rates.loc[rates[feature] != 0, feature].sort_values()
        expts = mixmod.posterior(data)

        # Plot heatmap; extent maps columns to observation rank and rows to components
        ax = axs[i]
        ax.imshow(expts, vmin=0, vmax=1, aspect='auto',
                  extent=[0, len(data), 0, len(mixmod.dists)])
        ax.set_aspect(0.035 * len(data))  # Keep rows thin regardless of data size
        ax.set_title(mixmod.name, size=8)
        ax.tick_params(labelsize=7.5)
# Make output directory if not os.path.exists('out/'): os.mkdir('out/') rates = {} counts = {} fracs = {} for feature in pics_lt: # Load model, sorting components by scale and instantiating distributions from names with open(f'../mixture_pic/out/{feature}/model_{model_paths[feature]}.json') as file: model = json.load(file) model_params, name = model[:-1], model[-1] dist_names, params, params_fix, weights = [list(x) for x in zip(*sorted(zip(*model[:-1]), key=lambda y: y[1]['scale']))] dists = [dists_dict[dist_name] for dist_name in dist_names] mixmod = MixtureModel(dists, params, params_fix, weights, name) # Remove extremes raw = pics_lt[feature] idx = np.flatnonzero(mixmod.posterior(raw)[-1] < thresh) clean = raw[idx] rates[feature] = (clean ** 2).groupby('block_id').mean() # Count extremes counts[feature] = len(raw) - len(clean) fracs[feature] = (len(raw) - len(clean)) / (len(raw) - (raw == 0).sum()) # Fraction of non-zero contrasts # Plot histograms of contrast counts in each block y = clean.groupby('block_id').count().value_counts() plt.bar(y.index, y) plt.title(f'{feature}: Contrast Counts in Blocks')