def inferMissingVariances(data, meta, sample_ids, ctrl_spec, ctrl_geneset):
    """Fill in undefined (NaN) variance entries of ``data`` in place.

    For every sample ``s`` the slice ``data[:, s, 1]`` is inspected for NaNs
    (the existing comments call this slice the variances; ``data[:, s, 0]``
    is used as the matching values).

    * Samples without any NaN are left untouched.
    * Samples that are their own control (``ctrl_spec[sample_id] == sample_id``)
      are left untouched, as JACKS will use the data variance for these at
      model time.
    * For other samples listed in ``ctrl_spec``, when ``ctrl_geneset`` is not
      empty, the NaN entries are replaced by twice the result of
      ``calc_posterior_sd`` computed on the (control, sample) value pair,
      restricted to the guides whose ``meta[:, 1]`` entry is in
      ``ctrl_geneset``.
    * Otherwise a warning is logged and the NaNs are kept.

    Returns the same ``data`` array that was passed in.
    """
    for s, sample_id in enumerate(sample_ids):
        nan_flags = np.isnan(data[:, s, 1])

        # Nothing to infer when every entry is defined.  The previous test,
        # ``sum(np.isnan(...) == 0)``, counted the *defined* entries and so
        # skipped any sample that had only some NaN variances.
        if not nan_flags.any():
            continue

        # If this is a control replicate, ignore it, as JACKS will use the
        # data variance for this at model time
        if sample_id in ctrl_spec and ctrl_spec[sample_id] == sample_id:
            continue

        # If this is a sample, and a ctrl_geneset is specified, use twice the
        # variance between sample and control within this set to infer the
        # mean-variance relationship, then apply to all nan variances
        if sample_id in ctrl_spec and len(ctrl_geneset) > 0:
            guideset_indexs = [
                i for i, x in enumerate(meta[:, 1]) if x in ctrl_geneset
            ]
            ctrl_idx = sample_ids.index(ctrl_spec[sample_id])
            ctrl_data = data[:, [ctrl_idx], 0]
            concat_data = np.concatenate((ctrl_data, data[:, [s], 0]), axis=1)
            sigma_hat = calc_posterior_sd(concat_data,
                                          guideset_indexs=guideset_indexs)
            data[nan_flags, s, 1] = 2 * sigma_hat[nan_flags]
        else:
            LOG.warning(
                'Undefined variances in sample %s, set --ctrl_genes input to JACKS to infer variances from control genes'
                % sample_id)
    return data
def normalizeLogCounts(logcounts, normtype='median', ctrl_guide_indexes=[]):
    """Normalise every column of a 2-D array of log counts.

    ``normtype`` selects the scheme:

    * ``'median'``      - subtract the NaN-ignoring median of each column.
    * ``'zmad'``        - median-centre each column, then divide it by
      1.4826 times the NaN-ignoring median of its absolute values.
    * ``'mode'``        - subtract, per column, the centre of the highest
      bin of a 100-bin histogram smoothed with a 5-point kernel.
    * ``'ctrl_guides'`` - subtract the NaN-ignoring column medians computed
      over the rows listed in ``ctrl_guide_indexes`` only.

    The subtraction is done in place on ``logcounts``; the ``'zmad'``
    division produces a new array.  The normalised array is returned.

    Raises ``Exception`` when ``'ctrl_guides'`` is requested without any
    guide indexes, or when ``normtype`` is not recognised.
    """
    LOG.info('Applying %s normalisation' % normtype)
    n_rows, n_cols = logcounts.shape

    def _repeat_down_rows(per_column):
        # Expand one value per column into a full (n_rows, n_cols) array.
        return np.tile(per_column, (n_rows, 1))

    if normtype == 'median':
        # median-normalize
        logcounts -= _repeat_down_rows(np.nanmedian(logcounts, axis=0))
        return logcounts

    if normtype == 'zmad':
        # median-normalize
        logcounts -= _repeat_down_rows(np.nanmedian(logcounts, axis=0))
        # adjust to median absolute deviation = 1
        scale = _repeat_down_rows(1.4826 * np.nanmedian(abs(logcounts), axis=0))
        logcounts = logcounts / scale
        return logcounts

    if normtype == 'mode':
        for col in range(n_cols):
            hist, bin_edges = np.histogram(logcounts[:, col], bins=100)
            # 5-point weighted smoothing of the histogram counts
            hist_smooth = 0.1 * hist[:-4] + 0.2 * hist[1:-3] + 0.4 * hist[
                2:-2] + 0.2 * hist[3:-1] + 0.1 * hist[4:]
            # centres of the bins that sit in the middle of each window
            bin_middles = 0.5 * bin_edges[3:-2] + 0.5 * bin_edges[2:-3]
            logcounts[:, col] -= bin_middles[np.argmax(hist_smooth)]
        return logcounts

    if normtype == 'ctrl_guides':
        if len(ctrl_guide_indexes) == 0:
            raise Exception('No guides specified for ctrl guide normalization')
        ctrl_medians = np.nanmedian(logcounts[ctrl_guide_indexes, :], axis=0)
        logcounts -= _repeat_down_rows(ctrl_medians)
        return logcounts

    raise Exception('Unrecognised normalisation type %s' % normtype)
def resample_run_jacks(count_tab: Union[pd.DataFrame, Dict[str, pd.DataFrame]],
                       repmap_fn: Union[str, os.PathLike],
                       fractions: List[float],
                       nreps: int,
                       tabulate: bool,
                       working_dir: Union[str, os.PathLike],
                       processors: int = None,
                       do_resample=True,
                       jacks_kwargs=None):
    """Run a resampling experiment.

    If do_resample is True, the count_tab is resampled, to size given in
    fractions, nreps times per fraction. If do_resample is False, a dictionary
    of already resampled counts should be supplied as count_tab.

    Each resampled table is written to ``{working_dir}/count_{k}.tsv`` and
    JACKS is run on it with output prefix ``{working_dir}/jacks_{k}``.
    ``jacks_kwargs`` overrides the default header keywords passed to runJACKS
    (``ctrl_sample_hdr='ctrl'``, ``gene_hdr='gene'``, ``sgrna_hdr='guide'``).

    Returns dict of dict of DF produced by tabulate_score, keyed first by
    fraction and then rep letter, when ``tabulate`` is true; otherwise None.

    repmap is in the JACKS format.

    ``working_dir`` must already exist (checked with an assert).
    """
    # todo make work with other analyses
    from jacks.jacks_io import runJACKS
    from jacks.infer import LOG as jacksLOG
    jacksLOG.setLevel(logging.WARNING)

    if jacks_kwargs is None:
        jacks_kwargs = {}
    jkwgs = dict(ctrl_sample_hdr='ctrl', gene_hdr='gene', sgrna_hdr='guide')
    jkwgs.update(jacks_kwargs)

    assert os.path.isdir(working_dir)

    if do_resample:
        resamped_tabs = get_resampled_tabs(count_tab, fractions, nreps, processors)
    else:
        resamped_tabs = count_tab

    # the output
    tables = {f: {} for f in fractions}

    for frac, letter, k in iter_reps(nreps, fractions):
        tab = resamped_tabs[frac][letter]
        tabpath = f"{working_dir}/count_{k}.tsv"
        # `sep` is passed by keyword: positional arguments to to_csv other
        # than the path are deprecated in recent pandas releases.
        tab.to_csv(tabpath, sep='\t')
        respath = f"{working_dir}/jacks_{k}"
        # The count table doubles as the guide-to-gene mapping file.
        runJACKS(tabpath, repmap_fn, tabpath, 'rep', 'samp', outprefix=respath, **jkwgs)
        if tabulate:
            tables[frac][letter] = tabulate_score(respath, return_ps=True)

    if tabulate:
        return tables
    return None
def set_logger(log_fn):
    """Send log output to the file ``log_fn``.

    A ``logging.FileHandler`` opened in ``'w'`` mode is attached to
    ``pipeLOG`` (whose level is set to INFO) and, on a best-effort basis, to
    ``jacksLOG``.
    """
    hndlr = logging.FileHandler(log_fn, 'w')
    pipeLOG.setLevel(logging.INFO)
    pipeLOG.addHandler(hndlr)
    try:
        jacksLOG.addHandler(hndlr)
    except NameError:
        # jacksLOG is presumably only defined when JACKS could be imported;
        # without it there is simply no JACKS logger to attach to.
        pass
def collateTestControlSamples(data, sample_ids, ctrl_spec):
    """Split ``data`` into matching test and control slices.

    A sample counts as a test sample when ``ctrl_spec`` maps it to a
    different sample id.  For each test sample the position of its control
    in ``sample_ids`` is looked up, and both sets of columns are taken from
    the second axis of ``data``.

    Returns ``(testdata, ctrldata, test_sample_idxs)`` where
    ``test_sample_idxs`` are the positions of the test samples in
    ``sample_ids``.
    """
    test_sample_idxs = []
    for position, sample_id in enumerate(sample_ids):
        if ctrl_spec[sample_id] != sample_id:
            test_sample_idxs.append(position)
    LOG.info('Collating %d samples' % len(test_sample_idxs))

    testdata = data[:, test_sample_idxs, :]

    # Column of the control that belongs to each test sample, in the same order
    ctrl_sample_idxs = []
    for position in test_sample_idxs:
        ctrl_id = ctrl_spec[sample_ids[position]]
        ctrl_sample_idxs.append(sample_ids.index(ctrl_id))
    ctrldata = data[:, ctrl_sample_idxs, :]

    return testdata, ctrldata, test_sample_idxs
def readControlGeneset(ctrl_genes, gene_spec):
    """Return the set of control genes described by ``ctrl_genes``.

    ``ctrl_genes`` is either the path of a file or the name of a single gene.

    * If it is an existing file, the first whitespace-separated token of
      every line is read and kept when it is one of the values of
      ``gene_spec``.  Blank lines are ignored.
    * Otherwise it must itself be one of the values of ``gene_spec`` and is
      returned as a one-element set.

    Raises ``Exception`` when ``ctrl_genes`` is neither a file nor a known
    gene.
    """
    known_genes = {gene_spec[x] for x in gene_spec}
    if os.path.isfile(ctrl_genes):
        geneset = set()
        # The context manager closes the file even if reading fails.
        with io.open(ctrl_genes) as f:
            for line in f:
                fields = line.split()
                # Skip blank lines instead of failing on fields[0].
                if fields and fields[0] in known_genes:
                    geneset.add(fields[0])
        LOG.info('Read %d recognised control genes from %s' %
                 (len(geneset), ctrl_genes))
    else:
        if ctrl_genes not in known_genes:
            raise Exception('Not a file or unrecognised control gene: %s' %
                            ctrl_genes)
        geneset = {ctrl_genes}
        LOG.info('Using %s as control gene' % (ctrl_genes))
    return geneset
def preprocess(countfile, replicatefile, guidemappingfile,
               rep_hdr=REP_HDR_DEFAULT, sample_hdr=SAMPLE_HDR_DEFAULT, common_ctrl_sample=COMMON_CTRL_SAMPLE_DEFAULT,
               ctrl_sample_hdr=None, sgrna_hdr=SGRNA_HDR_DEFAULT, gene_hdr=GENE_HDR_DEFAULT,
               ignore_blank_genes=False, outprefix=OUTPREFIX_DEFAULT, reffile=None):
    """Build the specifications a JACKS run needs from its input files.

    Creates the sample specification (via ``createSampleSpec``) and the
    guide-to-gene specification (via ``createGeneSpec``).  When ``reffile``
    is given, the sgrna reference is loaded with ``loadSgrnaReference`` and
    every guide in the gene specification must be present in it.

    Returns ``(sample_spec, ctrl_spec, gene_spec, x_ref)``; ``x_ref`` is
    ``None`` when no ``reffile`` is supplied.

    Raises ``Exception`` if a guide has no entry in the sgrna reference.
    ``outprefix`` is accepted but not used here.
    """
    # Load the specification of samples to include
    LOG.info('Loading sample specification')
    sample_spec, ctrl_spec, _num_reps = createSampleSpec(
        countfile, replicatefile, rep_hdr, sample_hdr, common_ctrl_sample, ctrl_sample_hdr)

    # Load the mappings from guides to genes
    LOG.info('Loading gene mappings')
    gene_spec = createGeneSpec(guidemappingfile, sgrna_hdr, gene_hdr,
                               ignore_blank_genes=ignore_blank_genes)

    # Without a reference file there is nothing more to load or validate.
    if not reffile:
        return sample_spec, ctrl_spec, gene_spec, None

    # Load the sgrna reference (precomputed X's)
    LOG.info('Loading sgrna reference values')
    x_ref = loadSgrnaReference(reffile)

    # Check that the data to be loaded have sgrna reference values
    LOG.info('Checking sgrna reference identifiers against gene mappings')
    for guide in gene_spec:
        if guide in x_ref:
            continue
        raise Exception('%s has no sgrna reference in %s' % (guide, reffile))

    return sample_spec, ctrl_spec, gene_spec, x_ref
def writeJacksWResults(outprefix, jacks_results, cell_lines, write_types=[''], ctrl_geneset=set(), fdr=None, fdr_thresh_type='REGULAR', pseudo=False):
    """Write gene-level JACKS results, one output file per write type.

    For each entry ``t`` of ``write_types`` the file
    ``outprefix + '_gene%s_JACKS_results.txt' % t`` is created with a header
    line (``Gene`` followed by ``cell_lines``) and one tab-separated line per
    gene, in the order returned by ``getSortedGenes``:

    * ``''``      - ``jacks_results[gene][4]`` (w1); entries that fail the
      FDR filter are left blank.
    * ``'_std'``  - ``sqrt(w2 - w1**2)`` with w2 = ``jacks_results[gene][5]``.
    * ``'_pval'`` / ``'_fdr'`` - values from ``computeW1PvalsAndFDRs``, which
      is only called when one of these two types is requested.

    When ``fdr`` is not None, genes are filtered per cell line using
    ``getFDRGeneSets`` (``fdr_thresh_type='REGULAR'``) or
    ``getLocalFDRGeneSets`` (``'LOCAL_FDR'``); genes significant in no cell
    line are omitted from every file.  This relies on the p-values/FDRs, so
    ``'_pval'`` or ``'_fdr'`` has to be among ``write_types`` in that case.
    Genes whose name contains ``JACKS_PSEUDO_GENE`` are never written.

    The default ``write_types`` and ``ctrl_geneset`` objects are only read,
    never modified.

    Raises ``Exception`` for an unrecognised ``fdr_thresh_type`` or write
    type.
    """
    # Sort genes by w1
    ordered_genes = getSortedGenes(jacks_results)

    fouts = []
    try:
        # Open inside the try block so that every file opened so far is
        # closed again if anything below fails.
        for write_type in write_types:
            fouts.append(io.open(outprefix + '_gene%s_JACKS_results.txt' % write_type, 'w'))
        for fout in fouts:
            fout.write(u'Gene\t%s\n' % ('\t'.join(cell_lines)))

        if '_fdr' in write_types or '_pval' in write_types:
            LOG.info('Computing P-values')
            jacks_w1_pvals, jacks_w1_fdrs = computeW1PvalsAndFDRs(
                jacks_results, cell_lines, noness_genes=ctrl_geneset,
                pseudo=pseudo, compute_fdr=('_fdr' in write_types))

        # Determine threshold sets for fdr cut-offs (blank out non-significant genes)
        if fdr is not None:
            if fdr_thresh_type == 'REGULAR':
                fdr_sets = getFDRGeneSets(jacks_w1_pvals, fdr)
            elif fdr_thresh_type == 'LOCAL_FDR':
                fdr_sets = getLocalFDRGeneSets(jacks_w1_fdrs, fdr)
            else:
                raise Exception('Unrecognised FDR threshold type (expecting REGULAR or LOCAL_FDR): %s' % fdr_thresh_type)

        # Write out one line per gene (all cell lines)
        for _, gene in ordered_genes:
            # Determine whether to include the gene for each cell line (if
            # fdr thresholded).  The flags do not depend on the write type,
            # so they are computed once per gene.
            if fdr is not None:
                sig_gene_flags = [(gene in x) for x in fdr_sets]
            else:
                sig_gene_flags = [True for x in jacks_results[gene][4]]
            if sum(sig_gene_flags) == 0:
                continue

            for write_type, fout in zip(write_types, fouts):
                # Write out the values
                if write_type == '_pval':
                    w1s = ['%5e' % x for x in jacks_w1_pvals[gene]]
                elif write_type == '_fdr':
                    w1s = ['%5e' % x for x in jacks_w1_fdrs[gene]]
                elif write_type == '_std':
                    w1s = ['%5e' % np.sqrt(w2 - w1**2.0)
                           for (w1, w2) in zip(jacks_results[gene][4], jacks_results[gene][5])]
                elif write_type == '':
                    w1s = [('%5e' % w1) if flag else ''
                           for (w1, flag) in zip(jacks_results[gene][4], sig_gene_flags)]
                else:
                    raise Exception('Unrecognised write type: %s' % write_type)
                w1_str = '\t'.join(w1s)
                if 'JACKS_PSEUDO_GENE' not in gene:
                    fout.write(u'%s\t%s\n' % (gene, w1_str))
    finally:
        for fout in fouts:
            fout.close()
def load_data_and_run(sample_spec, gene_spec, ctrl_spec, sgrna_reference_file, x_ref, outprefix, apply_w_hp=APPLY_W_HP_DEFAULT, norm_type=NORM_TYPE_DEFAULT, ctrl_genes=None, fdr=None, fdr_thresh_type = 'REGULAR', n_pseudo=0, count_prior=32 ):
    """Load and preprocess the count data, run JACKS inference and write the results.

    Steps, in order:

    1. Read the control gene set (``readControlGeneset``) when ``ctrl_genes``
       is given, else use an empty set.
    2. Create the directory of ``outprefix`` if it contains a '/' and the
       directory does not exist yet.
    3. Load and preprocess the data (``loadDataAndPreprocess``) and split it
       into test and control samples (``collateTestControlSamples``).
    4. If ``sgrna_reference_file`` is truthy, build the fixed X reference
       from ``x_ref``; otherwise write the log-fold-change mean and std
       files.
    5. Run ``inferJACKS`` on all genes and, when ``n_pseudo > 0`` and a
       control gene set is available, also on ``n_pseudo`` pseudo genes
       created by ``createPseudoNonessGenes``.
    6. Write the gene-level, guide-level and pickled results.

    Output files all start with ``outprefix``.  Nothing is returned.
    """
    # Load negative control genes (if any)
    ctrl_geneset = readControlGeneset(ctrl_genes, gene_spec) if ctrl_genes is not None else set()

    if '/' in outprefix and not os.path.exists(os.path.dirname(outprefix)):
        os.makedirs(os.path.dirname(outprefix))
    outfile_x = outprefix + '_grna_JACKS_results.txt'
    outfile_lfc = outprefix + '_logfoldchange_means.txt'
    outfile_lfc_std = outprefix + '_logfoldchange_std.txt'
    outfile_pickle = outprefix + PICKLE_FILENAME

    # Load the data and preprocess
    LOG.info('Loading data and pre-processing')
    data, meta, sample_ids, genes, gene_index = loadDataAndPreprocess(sample_spec, gene_spec,ctrl_spec=ctrl_spec,normtype=norm_type, ctrl_geneset=ctrl_geneset, prior=count_prior)
    # Map each gene to the guide identifiers (column 0 of meta) at its rows
    gene_grnas = {gene: [x for x in meta[gene_index[gene], 0]] for gene in gene_index}
    testdata, ctrldata, test_sample_idxs = collateTestControlSamples(data, sample_ids, ctrl_spec)
    sample_ids_without_ctrl = [sample_ids[idx] for idx in test_sample_idxs]

    x_reference = None
    if sgrna_reference_file:
        # Create the X reference (in the correct order)
        # NOTE(review): eval() is applied to strings taken from x_ref, which
        # appears to come from the reference file - only safe for trusted
        # input; confirm, and consider float()/ast.literal_eval instead.
        x_reference = {'X1': np.array([eval(x_ref[x]['X1']) for x in meta[:, 0]]),
                       'X2': np.array([eval(x_ref[x]['X2']) for x in meta[:, 0]])}
    else:
        # Fold changes are only written when no sgrna reference is used
        writeFoldChanges(outfile_lfc, testdata, ctrldata, meta, sample_ids_without_ctrl)
        writeFoldChanges(outfile_lfc_std, testdata, ctrldata, meta, sample_ids_without_ctrl, write_std=True)

    #Run all samples against their controls
    LOG.info('Running JACKS inference')
    jacks_results = inferJACKS(gene_index, testdata, ctrldata, apply_w_hp=apply_w_hp, fixed_x=x_reference)

    #Add a set of pseudo genes, created by randomly sampling from guides targeting genes in the control set
    if n_pseudo > 0 and len(ctrl_geneset) > 0:
        LOG.info('Running JACKS inference on %d pseudogenes' % n_pseudo)
        pseudo_gene_index = createPseudoNonessGenes(gene_index, ctrl_geneset, n_pseudo)
        # NOTE(review): unlike the call above, fixed_x is not passed here - confirm this is intended.
        jacks_pseudo_results = inferJACKS(pseudo_gene_index, testdata, ctrldata, apply_w_hp=apply_w_hp)
        writeJacksWResults(outprefix + '_pseudo_noness', jacks_pseudo_results, sample_ids_without_ctrl, write_types=['', '_std'] )
        # Merge the real gene results into the pseudo gene results so both can be written together
        for gene in jacks_results:
            jacks_pseudo_results[gene] = jacks_results[gene]

    # Write out the results
    LOG.info('Writing JACKS results')
    if len(ctrl_geneset) > 0 and n_pseudo > 0:
        # With pseudo genes available, they are passed as the control set and p-values are written too
        writeJacksWResults(outprefix, jacks_pseudo_results, sample_ids_without_ctrl, ctrl_geneset=set([x for x in jacks_pseudo_results if 'JACKS_PSEUDO_GENE' in x]), write_types=['', '_std', '_pval'], fdr=fdr, pseudo=True, fdr_thresh_type=fdr_thresh_type)
    else:
        writeJacksWResults(outprefix, jacks_results, sample_ids_without_ctrl, ctrl_geneset=ctrl_geneset, write_types=['', '_std'])
    writeJacksXResults(outfile_x, jacks_results, gene_grnas)
    pickleJacksFullResults(outfile_pickle, jacks_results, sample_ids_without_ctrl, gene_grnas)
import logging
from jacks.jacks_io import runJACKSFromArgs
from jacks.infer import LOG

# Script entry point: when executed directly, set the JACKS logger to INFO
# and hand over to runJACKSFromArgs().
if __name__ == '__main__':
    LOG.setLevel(logging.INFO)
    runJACKSFromArgs()
def runJACKS(countfile, replicatefile, guidemappingfile, rep_hdr=REP_HDR_DEFAULT, sample_hdr=SAMPLE_HDR_DEFAULT, common_ctrl_sample=COMMON_CTRL_SAMPLE_DEFAULT, ctrl_sample_hdr=None, sgrna_hdr=SGRNA_HDR_DEFAULT, gene_hdr=GENE_HDR_DEFAULT, outprefix=OUTPREFIX_DEFAULT, reffile=None, apply_w_hp=APPLY_W_HP_DEFAULT):
    """Run the complete JACKS workflow from input files to result files.

    Builds the sample and gene specifications, optionally loads and checks
    an sgrna reference (``reffile``), loads and preprocesses the count data,
    runs ``inferJACKS`` for all test samples against their controls and
    writes the gene-level, guide-level and pickled results.  When no
    ``reffile`` is given the log-fold-change mean and std files are written
    as well.  All output paths start with ``outprefix``; its directory is
    created if ``outprefix`` contains a '/' and the directory is missing.

    Raises ``Exception`` if a guide has no entry in the sgrna reference.
    Nothing is returned.
    """
    # NOTE(review): this assignment has no effect.
    outprefix = outprefix
    if '/' in outprefix and not os.path.exists(os.path.dirname(outprefix)):
        os.makedirs(os.path.dirname(outprefix))
    outfile_w = outprefix + '_gene_JACKS_results.txt'
    outfile_w2 = outprefix + '_genestd_JACKS_results.txt'
    outfile_x = outprefix + '_grna_JACKS_results.txt'
    outfile_lfc = outprefix + '_logfoldchange_means.txt'
    outfile_lfc_std = outprefix + '_logfoldchange_std.txt'
    outfile_pickle = outprefix + PICKLE_FILENAME

    # Load the specification of samples to include
    LOG.info('Loading sample specification')
    sample_spec, ctrl_spec, sample_num_reps = createSampleSpec(
        countfile, replicatefile, rep_hdr, sample_hdr, common_ctrl_sample,
        ctrl_sample_hdr)

    # Load the mappings from guides to genes
    LOG.info('Loading gene mappings')
    gene_spec = createGeneSpec(guidemappingfile, sgrna_hdr, gene_hdr)

    sgrna_reference_file = reffile
    if sgrna_reference_file:
        # Load the sgrna reference (precomputed X's)
        LOG.info('Loading sgrna reference values')
        x_ref = loadSgrnaReference(reffile)
        # Check that the data to be loaded have sgrna reference values
        LOG.info('Checking sgrna reference identifiers against gene mappings')
        for guide in gene_spec:
            if guide not in x_ref:
                raise Exception('%s has no sgrna reference in %s' %
                                (guide, sgrna_reference_file))

    # Load the data and preprocess
    LOG.info('Loading data and pre-processing')
    data, meta, sample_ids, genes, gene_index = loadDataAndPreprocess(
        sample_spec, gene_spec)
    # Map each gene to the guide identifiers (column 0 of meta) at its rows
    gene_grnas = {
        gene: [x for x in meta[gene_index[gene], 0]]
        for gene in gene_index
    }

    x_reference = None
    if sgrna_reference_file:
        # Create the X reference (in the correct order)
        # NOTE(review): eval() is applied to strings taken from x_ref, which
        # appears to come from the reference file - only safe for trusted
        # input; confirm, and consider float()/ast.literal_eval instead.
        x_reference = {
            'X1': np.array([eval(x_ref[x]['X1']) for x in meta[:, 0]]),
            'X2': np.array([eval(x_ref[x]['X2']) for x in meta[:, 0]])
        }
    else:
        # Fold changes are only written when no sgrna reference is used
        writeFoldChanges(outfile_lfc, data, meta, sample_ids)
        writeFoldChanges(outfile_lfc_std,
                         data,
                         meta,
                         sample_ids,
                         write_std=True)

    #Run all samples against their controls
    LOG.info('Running JACKS inference')
    testdata, ctrldata, test_sample_idxs = collateTestControlSamples(
        data, sample_ids, ctrl_spec)
    # NOTE(review): x_reference is built above but not passed to inferJACKS
    # here - confirm whether it should be supplied to the inference.
    jacks_results = inferJACKS(gene_index,
                               testdata,
                               ctrldata,
                               apply_w_hp=apply_w_hp)

    # Write out the results
    LOG.info('Writing JACKS results')
    sample_ids_without_ctrl = [sample_ids[idx] for idx in test_sample_idxs]
    writeJacksWResults(outfile_w, jacks_results, sample_ids_without_ctrl)
    writeJacksWResults(outfile_w2,
                       jacks_results,
                       sample_ids_without_ctrl,
                       write_w2=True)
    writeJacksXResults(outfile_x, jacks_results, gene_grnas)
    pickleJacksFullResults(outfile_pickle, jacks_results,
                           sample_ids_without_ctrl, gene_grnas)
parser = getJacksParser() parser.add_argument("--sample_id", type=str, default=None, help="Sample id to run MAGeCK on") parser.add_argument("--v10", type=str, default='', help="Data set label") args = parser.parse_args() inputs_dir = 'input_files' if not os.path.isdir(inputs_dir): os.makedirs(inputs_dir) # Load the specification of samples to include LOG.info('Loading sample specification') sample_spec, ctrl_spec, sample_num_reps = createSampleSpec(args.countfile, args.replicatefile, args.rep_hdr, args.sample_hdr, args.common_ctrl_sample, args.ctrl_sample_hdr) # Load the mappings from guides to genes LOG.info('Loading gene mappings') gene_spec = createGeneSpec(args.guidemappingfile, args.sgrna_hdr, args.gene_hdr) # Sample not specified: re-call self for all samples if args.sample_id is None: for sample_id in ctrl_spec: if ctrl_spec[sample_id] == sample_id: continue cmd = py_cmd + ' ' + ' '.join(sys.argv) + ' --sample_id="%s"' % sample_id os.system(cmd) #Sample specified - run MAGeCK else:
import sys, io, os, random, logging import scipy.stats as ST import numpy as np from jacks.preprocess import subsample_and_preprocess from jacks.jacks_io import readControlGeneset, collateTestControlSamples, createSampleSpec, createGeneSpec from jacks.infer import LOG, inferJACKSGene LOG.setLevel(logging.WARNING) if len(sys.argv) != 8 and len(sys.argv) != 9: print('Usage: sample_jacks_screen.py condensed_input test_line num_replicates(-1 for all) num_celllines(-1 for all) outfile num_samples num_guides(-1 for all) job_idx\n') print('where, condensed_input = countfile#replicatefile:rep_hdr:sample_hdr:ctrl_sample_or_hdr#guidemappingfile:sgrna_hdr:gene_hdr#ctrl_genes(can be blank)') else: #Minimial checks on this, as this is for a script that is intended for use internally only condensed_input = sys.argv[1] countfile, replicatestuff, grnastuff, ctrl_genes = condensed_input.split('#') replicatefile, rep_hdr, sample_hdr, ctrl_sample_or_hdr = replicatestuff.split(':') guidemappingfile, sgrna_hdr, gene_hdr = grnastuff.split(':') ctrl_sample_hdr = ctrl_sample_or_hdr if ctrl_sample_or_hdr == 'Control' else None sample_spec, ctrl_spec, sample_num_reps = createSampleSpec(countfile, replicatefile, rep_hdr, sample_hdr, ctrl_sample_or_hdr, ctrl_sample_hdr) gene_spec = createGeneSpec(guidemappingfile, sgrna_hdr, gene_hdr) test_celllines = [sample_id for sample_id in ctrl_spec if ctrl_spec[sample_id] != sample_id] ctrl_geneset = readControlGeneset(ctrl_genes) if ctrl_genes is not '' else set() normtype = 'median' test_line = sys.argv[2] num_replicates = eval(sys.argv[3]) num_celllines = eval(sys.argv[4]) outfile = sys.argv[5]
for sample_id, colname in sample_spec[filename]: if sample_id == cell_line or sample_id == ctrl_spec[cell_line]: if filename not in new_sample_spec: new_sample_spec[filename] = [] new_sample_spec[filename].append((sample_id, colname)) return new_sample_spec def filterCtrlSpec(ctrl_spec, cell_line): new_ctrl_spec = {} new_ctrl_spec[cell_line] = ctrl_spec[cell_line] #Sample new_ctrl_spec[ctrl_spec[cell_line]] = ctrl_spec[cell_line] #Control return new_ctrl_spec if __name__ == '__main__': LOG.setLevel(logging.WARNING) parser = getJacksParser() parser.add_argument("--cell_line", type=str, default=None, help="cell line to run") parser.add_argument("--separate", action='store_true', default=False, help="Run cell lines separately") args = parser.parse_args() outprefix = args.outprefix if '/' in outprefix and not os.path.exists(os.path.dirname(outprefix)): os.makedirs(os.path.dirname(outprefix)) # Load the specification of samples to include
from argparse import Namespace
from attrdict import AttrDict, AttrMap
import yaml
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from crispr_tools import qc, tools, jacks_tools
import pprint

# JACKS is an optional dependency: if it cannot be imported, a notice is
# printed and runJACKS is replaced by a stub that raises when called.
try:
    from jacks.jacks_io import runJACKS
    from jacks.infer import LOG as jacksLOG
    jacksLOG.setLevel(logging.WARNING)
except ImportError:
    print('To run jacks you need to install JACKS,\n', 'https://github.com/felicityallen/JACKS/tree/master/jacks\n' "You can still run Mageck though, if it's installed.")

    def runJACKS(*a, **k):
        # Stand-in used when JACKS is not installed; accepts any arguments.
        raise ModuleNotFoundError('JACKS not installed!!!')

from crispr_tools.drugz import drugZ_analysis
from crispr_tools.tools import list_not_str

# with open(pathlib.Path(__file__).parent/'version.txt') as f:
#     __version__ = f.readline().replace('\n', '')


class ConfigurationError(Exception):
    """Errors in the configuration file that would prevent the pipeline from running"""
from jacks.jacks_io import createGeneSpec, createSampleSpec, getJacksParser, collateTestControlSamples, writeJacksWResults
from jacks.preprocess import loadDataAndPreprocess
import scipy as SP


def infer_JACKS_meanfc(gene_index, testdata, ctrldata):
    """Compute a mean-fold-change result per gene in the JACKS result layout.

    For every gene, ``y`` is the difference between ``testdata`` and
    ``ctrldata`` (index 0 of the last axis) over the rows listed in
    ``gene_index[gene]``, and ``w1`` is the NaN-ignoring mean of ``y`` over
    those rows.

    Returns a dict mapping each gene to the 6-tuple
    ``(y, -1.0, -1.0, -1.0, w1, -1.0)``; the ``-1.0`` entries are
    placeholders for fields this simple estimate does not compute.
    """
    # numpy.nanmean is used directly: the NumPy aliases in the root scipy
    # namespace (SP.nanmean) are deprecated.  Imported locally so the
    # function does not depend on a module-level numpy import.
    import numpy as np

    results = {}
    for gene in gene_index:
        Ig = gene_index[gene]
        y = (testdata[Ig, :, 0] - ctrldata[Ig, :, 0])
        w1 = np.nanmean(y, axis=0)
        results[gene] = (y, -1.0, -1.0, -1.0, w1, -1.0)
    return results


LOG.setLevel(logging.WARNING)
parser = getJacksParser()
args = parser.parse_args()

# Create the output directory if the prefix points into one that is missing
outprefix = args.outprefix
if '/' in outprefix and not os.path.exists(os.path.dirname(outprefix)):
    os.makedirs(os.path.dirname(outprefix))
outfile_w = outprefix + '_gene_results.txt'
outfile_w2 = outprefix + '_genestd_results.txt'

# Load the specification of samples to include
LOG.info('Loading sample specification')
sample_spec, ctrl_spec, sample_num_reps = createSampleSpec(
    args.countfile, args.replicatefile, args.rep_hdr, args.sample_hdr,
    args.common_ctrl_sample, args.ctrl_sample_hdr)