def load_data_and_run(sample_spec, gene_spec, ctrl_spec, sgrna_reference_file, x_ref, outprefix,
                      apply_w_hp=APPLY_W_HP_DEFAULT, norm_type=NORM_TYPE_DEFAULT,
                      ctrl_genes=None, fdr=None, fdr_thresh_type = 'REGULAR', n_pseudo=0, count_prior=32 ):
    """Load and preprocess the data, run JACKS inference and write the results.

    Every output file name is built by appending a suffix to ``outprefix``.
    If ``outprefix`` contains a '/' and its directory part does not exist,
    that directory is created first.

    Args:
        sample_spec: sample specification, forwarded to loadDataAndPreprocess.
        gene_spec: guide-to-gene specification, forwarded to
            loadDataAndPreprocess (and to readControlGeneset when
            ``ctrl_genes`` is given).
        ctrl_spec: control specification, forwarded to loadDataAndPreprocess
            and collateTestControlSamples.
        sgrna_reference_file: only its truthiness is used here. When truthy,
            ``x_ref`` is used to fix the guide values and no fold-change
            files are written; otherwise the fold-change files are written
            and no reference is used.
        x_ref: lookup indexed by the guide ids in ``meta[:, 0]``; each entry
            must provide 'X1' and 'X2' values as Python expressions in
            string form. Only read when ``sgrna_reference_file`` is truthy.
        outprefix: prefix for all output file names.
        apply_w_hp: forwarded to inferJACKS.
        norm_type: forwarded to loadDataAndPreprocess as ``normtype``.
        ctrl_genes: if not None, passed to readControlGeneset to obtain the
            negative control gene set; otherwise the set is empty.
        fdr: forwarded to writeJacksWResults on the pseudogene path.
        fdr_thresh_type: forwarded to writeJacksWResults on the pseudogene
            path.
        n_pseudo: number of pseudo genes to create; they are only used when
            this is positive and the control gene set is non-empty.
        count_prior: forwarded to loadDataAndPreprocess as ``prior``.
    """
    # Load negative control genes (if any)
    if ctrl_genes is None:
        ctrl_geneset = set()
    else:
        ctrl_geneset = readControlGeneset(ctrl_genes, gene_spec)

    # Make sure the directory part of the output prefix exists
    out_dir = os.path.dirname(outprefix)
    if '/' in outprefix and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    outfile_x = outprefix + '_grna_JACKS_results.txt'
    outfile_lfc = outprefix + '_logfoldchange_means.txt'
    outfile_lfc_std = outprefix + '_logfoldchange_std.txt'
    outfile_pickle = outprefix + PICKLE_FILENAME

    # Load the data and preprocess
    LOG.info('Loading data and pre-processing')
    data, meta, sample_ids, _genes, gene_index = loadDataAndPreprocess(
        sample_spec, gene_spec,
        ctrl_spec=ctrl_spec,
        normtype=norm_type,
        ctrl_geneset=ctrl_geneset,
        prior=count_prior,
    )
    # Guide ids (first meta column) belonging to each gene
    gene_grnas = {}
    for gene_name in gene_index:
        gene_grnas[gene_name] = list(meta[gene_index[gene_name], 0])

    testdata, ctrldata, test_sample_idxs = collateTestControlSamples(data, sample_ids, ctrl_spec)
    sample_ids_without_ctrl = [sample_ids[i] for i in test_sample_idxs]

    if sgrna_reference_file:
        # Create the X reference (in the correct order, i.e. the order of meta)
        # NOTE(review): eval() is applied to values that come from the sgrna
        # reference; only use reference files from a trusted source.
        x1_values = [eval(x_ref[guide]['X1']) for guide in meta[:, 0]]
        x2_values = [eval(x_ref[guide]['X2']) for guide in meta[:, 0]]
        x_reference = {'X1': np.array(x1_values), 'X2': np.array(x2_values)}
    else:
        x_reference = None
        writeFoldChanges(outfile_lfc, testdata, ctrldata, meta, sample_ids_without_ctrl)
        writeFoldChanges(outfile_lfc_std, testdata, ctrldata, meta, sample_ids_without_ctrl,
                         write_std=True)

    # Run all samples against their controls
    LOG.info('Running JACKS inference')
    jacks_results = inferJACKS(gene_index, testdata, ctrldata,
                               apply_w_hp=apply_w_hp, fixed_x=x_reference)

    # Add a set of pseudo genes, created by randomly sampling from guides
    # targeting genes in the control set
    use_pseudo = n_pseudo > 0 and len(ctrl_geneset) > 0
    if use_pseudo:
        LOG.info('Running JACKS inference on %d pseudogenes' % n_pseudo)
        pseudo_gene_index = createPseudoNonessGenes(gene_index, ctrl_geneset, n_pseudo)
        jacks_pseudo_results = inferJACKS(pseudo_gene_index, testdata, ctrldata,
                                          apply_w_hp=apply_w_hp)
        writeJacksWResults(outprefix + '_pseudo_noness', jacks_pseudo_results,
                           sample_ids_without_ctrl, write_types=['', '_std'])
        # Merge the real genes in, so one table holds both pseudo and real genes
        for gene_name in jacks_results:
            jacks_pseudo_results[gene_name] = jacks_results[gene_name]

    # Write out the results
    LOG.info('Writing JACKS results')
    if use_pseudo:
        # The pseudo genes stand in as the control set for the p-value output
        pseudo_names = {name for name in jacks_pseudo_results if 'JACKS_PSEUDO_GENE' in name}
        writeJacksWResults(outprefix, jacks_pseudo_results, sample_ids_without_ctrl,
                           ctrl_geneset=pseudo_names,
                           write_types=['', '_std', '_pval'],
                           fdr=fdr, pseudo=True, fdr_thresh_type=fdr_thresh_type)
    else:
        writeJacksWResults(outprefix, jacks_results, sample_ids_without_ctrl,
                           ctrl_geneset=ctrl_geneset, write_types=['', '_std'])
    writeJacksXResults(outfile_x, jacks_results, gene_grnas)
    pickleJacksFullResults(outfile_pickle, jacks_results, sample_ids_without_ctrl, gene_grnas)
def runJACKS(countfile, replicatefile, guidemappingfile,
             rep_hdr=REP_HDR_DEFAULT, sample_hdr=SAMPLE_HDR_DEFAULT,
             common_ctrl_sample=COMMON_CTRL_SAMPLE_DEFAULT, ctrl_sample_hdr=None,
             sgrna_hdr=SGRNA_HDR_DEFAULT, gene_hdr=GENE_HDR_DEFAULT,
             outprefix=OUTPREFIX_DEFAULT, reffile=None, apply_w_hp=APPLY_W_HP_DEFAULT):
    """Run the full JACKS pipeline from input files to written results.

    Builds the sample and gene specifications, optionally loads a precomputed
    sgrna reference, preprocesses the data, runs inference and writes the
    gene-level, guide-level and pickled results. Every output file name is
    built by appending a suffix to ``outprefix``.

    Args:
        countfile, replicatefile, rep_hdr, sample_hdr, common_ctrl_sample,
        ctrl_sample_hdr: forwarded to createSampleSpec.
        guidemappingfile, sgrna_hdr, gene_hdr: forwarded to createGeneSpec.
        outprefix: prefix for all output file names. If it contains a '/' and
            its directory part does not exist, that directory is created.
        reffile: optional sgrna reference file. When truthy it is loaded with
            loadSgrnaReference, every guide in the gene specification must
            have an entry in it, its X1/X2 values are passed to inferJACKS as
            ``fixed_x``, and no fold-change files are written.
        apply_w_hp: forwarded to inferJACKS.

    Raises:
        Exception: if ``reffile`` is given and a guide in the gene
            specification has no entry in the reference.
    """
    out_dir = os.path.dirname(outprefix)
    if '/' in outprefix and not os.path.exists(out_dir):
        os.makedirs(out_dir)

    outfile_w = outprefix + '_gene_JACKS_results.txt'
    outfile_w2 = outprefix + '_genestd_JACKS_results.txt'
    outfile_x = outprefix + '_grna_JACKS_results.txt'
    outfile_lfc = outprefix + '_logfoldchange_means.txt'
    outfile_lfc_std = outprefix + '_logfoldchange_std.txt'
    outfile_pickle = outprefix + PICKLE_FILENAME

    # Load the specification of samples to include
    LOG.info('Loading sample specification')
    sample_spec, ctrl_spec, _sample_num_reps = createSampleSpec(
        countfile, replicatefile, rep_hdr, sample_hdr, common_ctrl_sample, ctrl_sample_hdr)

    # Load the mappings from guides to genes
    LOG.info('Loading gene mappings')
    gene_spec = createGeneSpec(guidemappingfile, sgrna_hdr, gene_hdr)

    sgrna_reference_file = reffile
    if sgrna_reference_file:
        # Load the sgrna reference (precomputed X's)
        LOG.info('Loading sgrna reference values')
        x_ref = loadSgrnaReference(reffile)

        # Check that the data to be loaded have sgrna reference values
        LOG.info('Checking sgrna reference identifiers against gene mappings')
        for guide in gene_spec:
            if guide not in x_ref:
                raise Exception('%s has no sgrna reference in %s' % (guide, sgrna_reference_file))

    # Load the data and preprocess
    LOG.info('Loading data and pre-processing')
    data, meta, sample_ids, _genes, gene_index = loadDataAndPreprocess(sample_spec, gene_spec)
    # Guide ids (first meta column) belonging to each gene
    gene_grnas = {gene: list(meta[gene_index[gene], 0]) for gene in gene_index}

    x_reference = None
    if sgrna_reference_file:
        # Create the X reference (in the correct order, i.e. the order of meta)
        # NOTE(review): eval() is applied to values read from the reference
        # file; only use reference files from a trusted source.
        x_reference = {
            'X1': np.array([eval(x_ref[guide]['X1']) for guide in meta[:, 0]]),
            'X2': np.array([eval(x_ref[guide]['X2']) for guide in meta[:, 0]]),
        }
    else:
        writeFoldChanges(outfile_lfc, data, meta, sample_ids)
        writeFoldChanges(outfile_lfc_std, data, meta, sample_ids, write_std=True)

    # Run all samples against their controls
    LOG.info('Running JACKS inference')
    testdata, ctrldata, test_sample_idxs = collateTestControlSamples(data, sample_ids, ctrl_spec)
    # Pass the reference through: previously x_reference was built and
    # validated but never handed to the inference, so reffile had no effect.
    jacks_results = inferJACKS(gene_index, testdata, ctrldata,
                               apply_w_hp=apply_w_hp, fixed_x=x_reference)

    # Write out the results
    LOG.info('Writing JACKS results')
    sample_ids_without_ctrl = [sample_ids[idx] for idx in test_sample_idxs]
    writeJacksWResults(outfile_w, jacks_results, sample_ids_without_ctrl)
    writeJacksWResults(outfile_w2, jacks_results, sample_ids_without_ctrl, write_w2=True)
    writeJacksXResults(outfile_x, jacks_results, gene_grnas)
    pickleJacksFullResults(outfile_pickle, jacks_results, sample_ids_without_ctrl, gene_grnas)
# Load negative control guides (if any) ctrl_geneset = readControlGeneset(args.ctrl_genes) if args.ctrl_genes is not None else set() # Load the data and preprocess LOG.info('Loading data and pre-processing') data, meta, sample_ids, genes, gene_index = loadDataAndPreprocess(sample_spec, gene_spec,ctrl_spec=ctrl_spec, normtype=args.norm_type, ctrl_geneset=ctrl_geneset) gene_grnas = {gene: [x for x in meta[gene_index[gene], 0]] for gene in gene_index} testdata, ctrldata, test_sample_idxs = collateTestControlSamples(data, sample_ids, ctrl_spec) sample_ids_without_ctrl = [sample_ids[idx] for idx in test_sample_idxs] #Run all samples against their controls LOG.info('Running Single JACKS inference') single_jacks_results = [] for ts in range(testdata.shape[1]): single_jacks_results.append(inferJACKS(gene_index, testdata[:,[ts],:], ctrldata[:,[ts],:], w_only=True)) jacks_results = combineSingleResults(single_jacks_results) #Add a set of pseudo genes, created by randomly sampling from guides targeting genes in the control set if args.n_pseudo > 0 and len(ctrl_geneset) > 0: LOG.info('Running Single JACKS inference on %d pseudogenes' % args.n_pseudo) pseudo_gene_index = createPseudoNonessGenes(gene_index, ctrl_geneset, args.n_pseudo) pseudo_single_results = [] for ts in range(testdata.shape[1]): pseudo_single_results.append(inferJACKS(pseudo_gene_index, testdata[:,[ts],:], ctrldata[:,[ts],:], w_only=True)) jacks_pseudo_results = combineSingleResults(pseudo_single_results) writeJacksWResults(outprefix + '_pseudo_noness', jacks_pseudo_results, sample_ids_without_ctrl, write_types=['', '_std'] ) # Write out the results
# Load negative control guides (if any) ctrl_geneset = readControlGeneset(args.ctrl_genes) if args.ctrl_genes is not None else set() ##REFERENCE (to collect X's) # Load the data and preprocess LOG.info('Reference: Loading data and pre-processing') data, meta, sample_ids, genes, gene_index = loadDataAndPreprocess(ref_sample_spec, gene_spec,ctrl_spec=ctrl_spec, normtype=args.norm_type, ctrl_geneset=ctrl_geneset) gene_grnas = {gene: [x for x in meta[gene_index[gene], 0]] for gene in gene_index} testdata, ctrldata, test_sample_idxs = collateTestControlSamples(data, sample_ids, ctrl_spec) sample_ids_without_ctrl = [sample_ids[idx] for idx in test_sample_idxs] #Run all samples against their controls LOG.info('Reference: Running JACKS inference') jacks_results = inferJACKS(gene_index, testdata, ctrldata) writeJacksXResults(ref_outfile, jacks_results, gene_grnas) ##TEST (using reference) LOG.info('Test: Loading data and pre-processing') data, meta, sample_ids, genes, gene_index = loadDataAndPreprocess(single_sample_spec, gene_spec,ctrl_spec=single_ctrl_spec, normtype=args.norm_type, ctrl_geneset=ctrl_geneset) gene_grnas = {gene: [x for x in meta[gene_index[gene], 0]] for gene in gene_index} testdata, ctrldata, test_sample_idxs = collateTestControlSamples(data, sample_ids, ctrl_spec) sample_ids_without_ctrl = [sample_ids[idx] for idx in test_sample_idxs] x_ref = loadSgrnaReference(ref_outfile) # Create the X reference (in the correct order) x_reference = {'X1': np.array([eval(x_ref[x]['X1']) for x in meta[:, 0]]), 'X2': np.array([eval(x_ref[x]['X2']) for x in meta[:, 0]])} #Run all samples against their controls