def map_variant_to_mean_full_relative_gamma(datadir, *, dose='sober', filtered=True):
  """Compute each variant's mean gamma relative to its parent and persist it.

  Args:
    datadir: directory holding the mapping and gamma files read/written here.
    dose: dose tag forwarded to the gamma lookups and the output mapping.
    filtered: if True, mask out entries whose parent gamma does not clear the
      control-derived noise threshold and store the result as 'relgamma';
      if False, keep all entries and store 'unfiltered_relgamma'.
  """
  child_gammas = get_child_gammas(datadir)
  parent_gammas = get_parent_gammas(datadir)
  varcon = mapping_lib.get_mapping('variant', 'control', datadir)
  vargamma = mapping_lib.get_mapping('variant', 'gamma', datadir, dose=dose)
  # Noise scale: std-dev of gamma over control variants only.
  conmask = vargamma.index.intersection(varcon.loc[varcon.control].index)
  congamma = vargamma.loc[conmask]
  sigma = congamma.std().gamma
  z = -sigma  # easier to read
  # Relative gamma: child over parent, centered so "same as parent" is 0.
  unfiltered = (child_gammas / parent_gammas) - 1
  if filtered:
    # Keep only entries whose parent gamma is below _Z_THRESHOLD noise units
    # (z is negative, so this selects sufficiently-negative parent gammas).
    geodelt_gammas = unfiltered.where(parent_gammas < (_Z_THRESHOLD * z))
  else:
    geodelt_gammas = unfiltered
  relgammas = select_dose(geodelt_gammas, dose, datadir)
  # Keep only the '03' columns (presumably the 0->3 timepoint span — TODO
  # confirm against _namespan_func), then average across remaining columns.
  relgammas = relgammas.stack(level='sid', dropna=False)[['03']].unstack()
  if filtered:
    colname = 'relgamma'
  else:
    colname = 'unfiltered_relgamma'
  relgammas = pd.DataFrame(relgammas.mean(axis=1), columns=[colname])
  relgammas.reset_index(inplace=True)
  mapping_lib.make_mapping(relgammas, 'variant', colname, datadir, dose=dose)
def fetch_training_data(datadir):
  """Return sober-dose relgamma rows ready for model training.

  Keeps only trainable (one-off) variants with non-null relgamma, and
  annotates each row with the variant's original sequence and PAM.
  The returned frame is indexed by variant.
  """
  frame = mapping_lib.get_mapping('variant', 'relgamma', datadir, dose='sober')
  frame = training_lib.filter_for_training(frame, datadir).dropna()
  frame.reset_index(inplace=True)
  # Annotate with parent sequence and PAM, keyed by variant.
  originals = mapping_lib.get_mapping('variant', 'original', datadir).original
  pams = mapping_lib.get_mapping('variant', 'pam', datadir).pam
  frame['original'] = frame.variant.map(originals)
  frame['pam'] = frame.variant.map(pams)
  frame.set_index('variant', inplace=True)
  return frame
def map_variant_to_bin(datadir, dose='sober'):
  """Assign each variant's relgamma to a bin and persist variant -> rgbin."""
  varrg = mapping_lib.get_mapping('variant', 'relgamma', datadir, dose=dose)
  # Discretize the relgamma values into the standard bin edges.
  binned = bin_relgammas(varrg.relgamma.values, relgamma_bins())
  frame = pd.DataFrame(binned.T, index=varrg.index, columns=['rgbin'])
  mapping_lib.make_mapping(frame.reset_index(), 'variant', 'rgbin', datadir,
                           dose=dose)
def compute_rough_gammas(datadir):
  """Compute rough per-variant gamma estimates and write them as TSV.

  Normalizes raw counts within each (sid, tp) group, log2-transforms them,
  takes log-count differences across timepoint spans of width 1..3, centers
  each span column on the median over control variants, and writes the
  concatenated table to datadir / _ROUGH_GAMMA_FILE.
  """
  data = mapping_lib.get_countdata(datadir)
  varcon = mapping_lib.get_mapping('variant', 'control', datadir)
  data = data.merge(varcon.reset_index(), how='left', on='variant')
  data = data.merge(mapping_lib.get_sample_tags(datadir), how='left', on='sample')
  data.drop('sample', axis=1, inplace=True)
  def normalize(counts):
    # Rescale a group's counts so they sum to _NORMAL_SIZE.
    return counts * _NORMAL_SIZE / counts.sum()
  data['norm'] = data.groupby(['sid', 'tp']).raw.transform(normalize)
  # Clip at _PSEUDO before log2 to avoid taking the log of zero.
  data['log'] = np.log2(data.norm.clip(_PSEUDO))
  data.set_index(['variant', 'sid', 'tp'], inplace=True)
  grouper = data.groupby(['sid'], group_keys=False)
  relevant = list()
  for i in range(1, 4):
    namespan = _namespan_func(i)
    # Log-count differences between timepoints i apart, within each sid.
    diff = grouper.apply(_diff_by_tp, 'log', k=i, raw_threshold=_THRESHOLD)
    # Center each (sid, span) column on the control-variant median.
    diffcenters = diff.loc[data.control].unstack(level=[-2, -1]).median()
    dg = diff.unstack(level=[-2, -1]).subtract(diffcenters, axis='columns')
    dg.columns = dg.columns.map(namespan)
    relevant.append(dg)
  X = pd.concat(relevant, axis=1)
  X.to_csv(datadir / _ROUGH_GAMMA_FILE, sep='\t')
def derive_child_parent_gammas(datadir):
  """Write row-aligned gamma tables for each variant and its parent.

  Both output files share the same (child-variant) index, so a row in the
  parent file is the gamma of that child's original sequence.
  """
  families = mapping_lib.get_mapping('variant', 'original', datadir)
  families.reset_index(inplace=True)
  gammas = get_normed_gammas(datadir)
  # Select gamma rows for parents and children, then re-key the parent rows
  # by child variant so the two frames line up row-for-row.
  parents = gammas.loc[families.original]
  children = gammas.loc[families.variant]
  parents.index = children.index
  children.to_csv(datadir / _CHILD_GAMMA_FILE_NAME, sep='\t')
  parents.to_csv(datadir / _PARENT_GAMMA_FILE_NAME, sep='\t')
def downsample_families(data, ratio, datadir):
  """Randomly keep roughly `ratio` of guide families from `data`.

  Args:
    data: variant-indexed frame to downsample.
    ratio: fraction (0..1) of families (original sequences) to retain.
    datadir: directory holding the variant -> original mapping.

  Returns:
    The subset of `data` whose variants belong to the sampled families.
  """
  var_orig = mapping_lib.get_mapping('variant', 'original', datadir)
  oanno = pd.merge(data, var_orig, left_on='variant', right_index=True)
  families = set(oanno.original.unique())
  samplesize = int(len(families) * ratio)
  # BUG FIX: random.sample() requires a sequence; passing a set was
  # deprecated in Python 3.9 and raises TypeError on 3.11+.  Sorting first
  # also makes seeded runs reproducible regardless of set iteration order.
  sample = random.sample(sorted(families), samplesize)
  samplevariants = oanno.loc[oanno.original.isin(sample)].index
  littledata = data.loc[data.index.intersection(samplevariants)]
  return littledata
def one_hot_pair_encoder(datadir):
  """Return a closure that one-hot encodes a variant/parent sequence pair.

  The closure maps a variant sequence to ((variant+pam0, original+pam0),
  onehot), where onehot stacks the one-hot encodings of the two extended
  sequences along the last axis.
  """
  originals = mapping_lib.get_mapping('variant', 'original', datadir)
  pams = mapping_lib.get_mapping('variant', 'pam', datadir)
  bases = ['A', 'C', 'G', 'T']
  enc = skpreproc.OneHotEncoder(categories=[bases], sparse=False)
  def as_column(sequence):
    # One character per row: the encoder expects a 2-D (n_samples, 1) array.
    return np.array(list(sequence)).reshape(len(sequence), 1)
  def encoder(seq):
    parent = originals.loc[seq].original
    pam = pams.loc[seq].pam
    # Extend both sequences with the first PAM base before encoding.
    varplus = seq + pam[0]
    origplus = parent + pam[0]
    onehot = np.stack(
        [enc.fit_transform(as_column(varplus)),
         enc.fit_transform(as_column(origplus))],
        axis=-1)
    return ((varplus, origplus), onehot)
  return encoder
# variant -> original # TODO(jsh): fix this (see above) mapping_lib.adapt_orig_map(ORIG_MAP, UNGD) # locus_tag -> gene # TODO(jsh): fix this (see above) mapping_lib.adapt_gene_map(GENE_MAP, UNGD) # count grid mapping_lib.read_countfiles(STATICDIR, COUNT_GLOB, UNGD) mapping_lib.make_sample_tags(UNGD) # locus_tag -> locus_len mapping_lib.map_locus_tag_to_len(GENOME_FILE, UNGD) # locus_tag -> bmk_ess # locus_tag -> bmk_sick mapping_lib.process_bmk_spreadsheet(JMPBMK_ANNOS, UNGD) # variant -> is_oneoff orig_map_frame = mapping_lib.get_mapping('variant', 'original', UNGD) orig_map_frame.reset_index(inplace=True) mapping_lib.map_variant_to_oneoff(orig_map_frame, UNGD) # adapt OD data mapping_lib.adapt_od_data(OD_DATA, UNGD) # variant -> control mapping_lib.make_variant_controltag_map(UNGD)
# Oligo copy counts for each single-gene sub-library.
dfra_copies = 4
muraa_copies = 4
fola_copies = 4
mura_copies = 4
# Adapter ids tagging which sub-library each oligo belongs to.
con_adaptid = 0
bsu_exploit_adaptid = 1
bsu_explore_adaptid = 2
eco_exploit_adaptid = 3
eco_explore_adaptid = 4
dfra_adaptid = 5
muraa_adaptid = 6
fola_adaptid = 7
mura_adaptid = 8
# Control variants are those flagged control=True in the variant mapping.
cmap = mapping_lib.get_mapping('variant', 'control', UNGD)
controls = list(cmap.loc[cmap.control].index)
colis = build_oligos(controls, con_adaptid, ncopies=con_copies)
with open(OLIGOFILE, 'w') as outfile:
  # Accumulate oligos from every sub-library before writing to OLIGOFILE.
  allolis = list()
  allolis.extend(colis)
  allolis.extend(
      oligos_from(BSU_EXPLOIT, bsu_exploit_adaptid, bsu_exploit_copies))
  allolis.extend(
      oligos_from(BSU_EXPLORE, bsu_explore_adaptid, bsu_explore_copies))
  allolis.extend(
      oligos_from(ECO_EXPLOIT, eco_exploit_adaptid, eco_exploit_copies))
  allolis.extend(
      oligos_from(ECO_EXPLORE, eco_explore_adaptid, eco_explore_copies))
  allolis.extend(oligos_from(DFRA_FILE, dfra_adaptid, dfra_copies))
# Target sizes for the follow-up library design.
N_LOCI = 300
N_FAMILIES = 10
EXPLOIT_GUIDES_PER_LOCUS = 9
EXPLORE_GUIDES_PER_FAMILY = 9
EXPLOITFILE = (UNGD / _CODEFILE).with_suffix('.exploit.tsv')
EXPLOREFILE = (UNGD / _CODEFILE).with_suffix('.explore.tsv')

if __name__ == '__main__':
  logging.info('Reading preds from {BSU_PREDFILE}...'.format(**locals()))
  preds = pd.read_csv(BSU_PREDFILE, sep='\t')
  logging.info('Reading comps from {COMPS}...'.format(**locals()))
  comps = pd.read_csv(COMPS, sep='\t')
  logging.info('Reading targets from {BSU_TARGETS}...'.format(**locals()))
  all_targets = pd.read_csv(BSU_TARGETS, sep='\t')
  # Attach measured (unfiltered) relative gammas to the comparison rows,
  # renamed to 'relgamma' for downstream uniformity.
  var_rg = mapping_lib.get_mapping('variant', 'unfiltered_relgamma', UNGD,
                                   dose='sober')
  var_rg.rename(columns={'unfiltered_relgamma': 'relgamma'}, inplace=True)
  comps['relgamma'] = comps.variant.map(var_rg.relgamma)
  important = set(pd.read_csv(BSU_LOCI, sep='\t', header=None)[0])
  essmap = mapping_lib.get_mapping('locus_tag', 'bmk_ess', UNGD)
  # kinda: important loci NOT flagged essential in the benchmark;
  # veryimp: important loci that ARE benchmark-essential.
  kinda = (important - set(essmap.loc[essmap.bmk_ess == True].index))
  veryimp = important - kinda
  fill_size = N_LOCI - len(veryimp)
  # grab mean parent gamma for all such loci
  var_orig = mapping_lib.get_mapping('variant', 'original', UNGD)
  var_loc = mapping_lib.get_mapping('variant', 'locus_tag', UNGD)
  locsub = var_loc.loc[var_loc.locus_tag.isin(kinda)]
  var_orig.reset_index(inplace=True)
  # Parents are variants identical to their own original sequence.
  origs = var_orig.loc[var_orig.variant == var_orig.original]
  bothsub = set(origs.loc[origs.variant.isin(locsub.index)].variant)
# All pipeline artifacts live under this directory.
UNGD = pathlib.Path('/home/jsh/ungd/proj/vecref')
_DIR_PREFIX = pathlib.Path(__file__).parents[1]
_CODEFILE = pathlib.Path(__file__).name
PLOTDIR = (UNGD / _CODEFILE).with_suffix('.plots')
_DATA_FRACTION = 1  # fraction of guide families kept when downsampling
_K_FOLD_SPLITS = 3
_REL_PLOT_MIN = -1.2
_REL_PLOT_MAX = 1
_EPOCHS = 30
_BATCH_SIZE = 32

######################
# Read Relgamma Data #
######################
data = mapping_lib.get_mapping('variant', 'relgamma', UNGD, dose='sober')

###############
# Filter Data #
###############
# Remove non-oneoff guides (parents, off-strand, controls, etc.)
data = training_lib.filter_for_training(data, UNGD)
data = data.dropna()

###################
# Downsample Data #
###################
data = training_lib.downsample_families(data, _DATA_FRACTION, UNGD)

###################
# Preprocess Data #
def filter_for_training(variantframe, datadir):
  """Restrict a variant-indexed frame to one-off variants only."""
  oneoff_map = mapping_lib.get_mapping('variant', 'is_oneoff', datadir)
  keep = oneoff_map.loc[oneoff_map.is_oneoff].index
  # Intersect first so variants absent from the mapping are dropped cleanly.
  return variantframe.loc[variantframe.index.intersection(keep)]
_DIR_PREFIX = pathlib.Path(__file__).parents[1]
_CODEFILE = pathlib.Path(__file__).name
PLOTDIR = (UNGD / _CODEFILE).with_suffix('.plots')
_PLOT_MIN = -1.2
_PLOT_MAX = 0.2
_FIGDPI = 300

# Start from a clean plot directory on every run.
shutil.rmtree(PLOTDIR, ignore_errors=True)
PLOTDIR.mkdir(parents=True, exist_ok=True)

############################
# Re-load/process raw data #
############################
# One gamma column per dose condition.
sober = mapping_lib.get_mapping('variant', 'gamma', UNGD, dose='sober')
sober.columns = ['sober']
low = mapping_lib.get_mapping('variant', 'gamma', UNGD, dose='low')
low.columns = ['low']
high = mapping_lib.get_mapping('variant', 'gamma', UNGD, dose='high')
high.columns = ['high']
familymap = mapping_lib.get_mapping('variant', 'original', UNGD)
locusmap = mapping_lib.get_mapping('variant', 'locus_tag', UNGD)
genemap = mapping_lib.get_mapping('locus_tag', 'gene_name', UNGD)
# Re-key gene names by variant via each variant's locus tag.
geneids = genemap.loc[locusmap.locus_tag]
geneids.index = locusmap.index
data = pd.concat([familymap, sober, low, high, geneids], axis=1, sort=True)
for gene, group in data.groupby(['gene_name']):
UNGD = pathlib.Path('/home/jsh/ungd/proj/vecref')
_DIR_PREFIX = pathlib.Path(__file__).parents[1]
_CODEFILE = pathlib.Path(__file__).name
PLOTDIR = (UNGD / _CODEFILE).with_suffix('.plots')
_DATA_FRACTION = 1  # fraction of guide families kept when downsampling
_K_FOLD_SPLITS = 3
_REL_PLOT_MIN = -1.2
_REL_PLOT_MAX = 1
_BATCH_SIZE = 32
_EPOCHS = 10

######################
# Read Relgamma Data #
######################
data = mapping_lib.get_mapping('variant', 'relgamma', UNGD)

###############
# Filter Data #
###############
# Remove non-oneoff guides (parents, off-strand, controls, etc.)
data = training_lib.filter_for_training(data, UNGD)
data = data.dropna()

###############
# Weight Data #
###############
# NOTE(review): bin-based sample weighting is currently disabled.
# binmap = mapping_lib.get_mapping('variant', 'rgbin', UNGD).loc[data.index]
# binweights = gamma_lib.weight_bins(binmap.rgbin)
# weightmap = binmap.rgbin.map(binweights)
# weightmap.name = 'binweight'
#!/usr/bin/env python
# Author: John Hawkins (jsh) [[email protected]]
#
# Export per-variant mean relative gammas plus mean parent/child gammas
# (one-off variants only) to a TSV for external analysis.

import logging
import pathlib

import mapping_lib
import gamma_lib

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

UNGD = pathlib.Path('/home/jsh/ungd/proj/vecref')
DIR_PREFIX = pathlib.Path(__file__).parents[1]
_HORIA_FILE = 'horia.gammas.tsv'

pg = gamma_lib.get_parent_gammas(UNGD)
cg = gamma_lib.get_child_gammas(UNGD)
rg = gamma_lib.unfiltered_mean_relgammas(UNGD)
oo = mapping_lib.get_mapping('variant', 'is_oneoff', UNGD)
oi = oo.loc[oo.is_oneoff].index
# Mean gamma across samples for the '03' span columns.
mpg = pg.stack(level=0, dropna=False)['03'].unstack().mean(axis=1)
mcg = cg.stack(level=0, dropna=False)['03'].unstack().mean(axis=1)
# BUG FIX: removed leftover interactive debugging hook
# ("import IPython; IPython.embed()") that blocked non-interactive runs.
rg['parent_gamma'] = mpg
rg['child_gamma'] = mcg
rg.loc[oi].to_csv(UNGD / _HORIA_FILE, sep='\t')
format='%(asctime)s %(levelname)s %(message)s')

UNGD = pathlib.Path('/home/jsh/ungd/proj/vecref')
_DIR_PREFIX = pathlib.Path(__file__).parents[1]
_CODEFILE = pathlib.Path(__file__).name
PLOTDIR = (UNGD / _CODEFILE).with_suffix('.plots')
_REL_PLOT_MIN = -1.2
_REL_PLOT_MAX = 1
_FIGDPI = 100

############################
# Re-load/process raw data #
############################
data = mapping_lib.get_mapping('variant', 'relgamma', UNGD)
data = training_lib.filter_for_training(data, UNGD)
data = data.dropna()
familymap = mapping_lib.get_mapping('variant', 'original', UNGD)
familymap = familymap.loc[data.index]
# One-hot encode each (variant, parent) sequence pair for model input.
encoder = training_lib.one_hot_pair_encoder(UNGD)
encodings = [encoder(x)[1] for x in data.index]
X = np.stack(encodings, axis=0)
y = np.array(data[['relgamma']], dtype=float)
cross_predictions = np.full_like(y, np.nan)
# Standardize each (axis-1 position, axis-3 channel) slice of X
# independently, keeping the fitted scalers for later inversion.
X_scaler = dict()
for i in range(X.shape[1]):
  for j in range(X.shape[3]):
    X_scaler[(i,j)] = skpreproc.StandardScaler()
    X[:,i,:,j] = X_scaler[(i,j)].fit_transform(X[:,i,:,j])
UNGD = pathlib.Path('/home/jsh/ungd/proj/vecref')
_DIR_PREFIX = pathlib.Path(__file__).parents[1]
_CODEFILE = pathlib.Path(__file__).name
PLOTDIR = (UNGD / _CODEFILE).with_suffix('.plots')
_REL_PLOT_MIN = -1.2
_REL_PLOT_MAX = 1
_FIGDPI = 300

############################
# Re-load/process raw data #
############################
data = eval_lib.fetch_training_data(UNGD)
familymap = mapping_lib.get_mapping('variant', 'original', UNGD)
familymap = familymap.loc[data.index]
# Featurize with the linear encoder, then scale features and targets.
encoder = training_lib.get_linear_encoder()
X, y = eval_lib.featurize_training_data(encoder, data, UNGD)
cross_predictions = np.full_like(y, np.nan)
y_orig = y  # keep the unscaled targets for later comparison
X_scaler, y_scaler, X, y = eval_lib.scale_training_data_linear(X, y)

########################
# Read Prediction Data #
########################
modeldir = training_lib.LINEAR_MODELDIR
model_template = 'model.{i}.d5'
coverage_template = 'model.{i}.coverage.pickle'
# Loop over models